Search code examples
pythonfacebook-graph-api

Trying to extract a month's worth of posts using cursor-based pagination on the Facebook Graph API, but failing


I'm trying to extract a month's worth of Facebook posts into a CSV file. I'm extracting from the 1st of May until the 30th of May 2024, but once my script is done running, it has only extracted posts from the 30th of May 2024 (it started on this date) back to the 12th of May 2024.

It doesn't extract from the 9th May until 1st May 2024.

Below is my code:

# NOTE(review): this is the script exactly as posted in the question; it
# contains the bug being asked about — see the BUG comments below.
import pyfacebook
from datetime import datetime, timedelta, timezone
import json
import pandas as pd
import time
# Initialize Facebook Graph API connection (replace placeholders with your actual credentials)
graph = pyfacebook.GraphAPI(access_token='my-access-token', version='v20.0')

# Define the page ID
page_id = 'my-page-id'

# date 
from_date = '2024-05-01'
to_date = '2024-05-30'

# Construct the API request URL (notice the parameters 'since', 'until' and 'fields')
posts_url = f'/{page_id}/feed?fields=attachments,created_time&since={from_date}T08:00:00&until={to_date}T23:59:59'

# NOTE(review): `_request` is a private pyfacebook method (leading underscore);
# it presumably works but is not part of the library's public API.
posts_data = graph._request(posts_url).json()

# BUG(review): every iteration OVERWRITES `posts_data` with the next page and
# never accumulates the pages already fetched, so when the loop ends only the
# final page of results survives.
while True: 
    if 'paging' in posts_data and 'next' in posts_data['paging']:
        posts_url = posts_data['paging']['next']
        posts_data = graph._request(posts_url).json()
        time.sleep(2)  # Increased delay to be safe
    else:
        break  # No more pages to retrieve
    
# Only the last page reaches the DataFrame / CSV (see BUG note above).
df = pd.DataFrame(posts_data)
df.to_csv('facebook_data.csv', index=False)

Here is the result, it ends on the 9th May 2024 instead of continuing until 1st May 2024:

How do I extract the full month's worth of data?


Solution

  • I solved this using Batch Requests, as described in the Facebook Graph API documentation (see the "Making Batch Requests" page).

    import requests
    import json
    import time
    import pandas as pd

    # The feed endpoint stopped paginating mid-month (see the question), so the
    # month is split into two half-month windows fetched as ONE batch request
    # containing two Graph API calls.  The 'since' date is one day earlier than
    # the first wanted day because the comparison happens on UTC timestamps.
    from_start_date = '2024-04-30'  # needs to be one day earlier
    to_mid_date = '2024-05-16'
    from_mid_date = '2024-05-16'
    to_end_date = '2024-05-31'

    # (page_id, access_token) pairs — replace the placeholders with real values.
    credential = [
        # Rasa
        ('Page_id_1', 'Access_token_1'),
        # Roda Panas
        ('Page_id_2', 'Access_token_2'),
    ]

    batch_url = 'https://graph.facebook.com/v20.0/'  # plain string; no f-string needed

    all_ips_df = []  # one merged DataFrame per page
    for page_id, access_token in credential:
        # Two GET calls bundled into a single batch POST.
        batch_parameters = [
            {
                "method": "GET",
                "relative_url": f"{page_id}/feed?fields=from,attachments,created_time&since={from_mid_date}T16:00:00&until={to_end_date}T15:59:59&limit=25",
            },
            {
                "method": "GET",
                "relative_url": f"{page_id}/feed?fields=from,attachments,created_time&since={from_start_date}T16:00:00&until={to_mid_date}T15:59:59&limit=25",
            },
        ]

        batch_payload = {
            # BUGFIX: the original wrote {access_token}, a one-element SET
            # literal; the token must be passed as the plain string.
            "access_token": access_token,
            "batch": json.dumps(batch_parameters),
        }

        batch_response = requests.post(batch_url, data=batch_payload).json()

        all_posts = []
        for response_item in batch_response:
            if 'body' not in response_item:
                continue  # a failed sub-request carries no body; skip it

            post_data = json.loads(response_item['body'])
            # Each batched sub-response has its own cursor chain; follow it.
            while True:
                for post in post_data['data']:
                    try:
                        attachments = post['attachments']['data']
                        # Keep only video attachments.  A plain substring test
                        # replaces the original pandas Series round-trip.
                        video_attachments = [
                            a for a in attachments if 'video' in a['type'].lower()
                        ]
                        video_attachment = video_attachments[0]  # IndexError when no video
                        all_posts.append({
                            'post_id': post['id'],
                            'ip': post['from']['name'],
                            'title': video_attachment.get('title'),
                            'created_time': post['created_time'],
                            'media_type': video_attachment['type'],
                            'post_url': video_attachment['url'],
                        })
                    except (KeyError, IndexError):
                        # Post without attachments or without a video: report and move on.
                        print(f"Incomplete data found in post: {post['id']}")

                # Pagination for this batch sub-response.
                if 'next' in post_data.get('paging', {}):
                    post_data = requests.get(post_data['paging']['next']).json()
                    time.sleep(2)  # increased delay to stay under rate limits
                else:
                    break

        df = pd.DataFrame(all_posts)

        insight_rows = []
        # DataFrame.get tolerates an empty frame (no 'post_id' column yet);
        # the original `for id in df['post_id']` crashed on it AND shadowed
        # the builtin `id`.
        for fb_post_id in df.get('post_id', []):
            # BUGFIX: no spaces inside the metric list — spaces are not valid
            # in a query string.
            insights_url = (
                batch_url
                + f'{fb_post_id}/insights?access_token={access_token}'
                + '&metric=post_reactions_by_type_total,post_impressions,'
                + 'post_impressions_organic_unique,post_impressions_paid_unique'
            )
            video_data = requests.get(insights_url).json()
            engagements = impressions = reach_organic = reach_paid = None

            for video_insights in video_data['data']:
                metric_name = video_insights['name']
                metric_value = video_insights['values'][0]['value']
                if metric_name == 'post_reactions_by_type_total':
                    # value is a {reaction_type: count} dict; total them
                    engagements = sum(metric_value.values())
                elif metric_name == 'post_impressions':
                    impressions = metric_value
                elif metric_name == 'post_impressions_organic_unique':
                    reach_organic = metric_value
                elif metric_name == 'post_impressions_paid_unique':
                    reach_paid = metric_value

            # BUGFIX: the original raised TypeError when either reach metric
            # was absent (still None); treat a missing component as 0.
            reach = (reach_organic or 0) + (reach_paid or 0)
            insight_rows.append({
                'post_id': fb_post_id,
                'engagements': engagements,
                'impressions': impressions,
                'reach': reach,
            })

        # Merge the per-post insights back onto the posts (skip when nothing
        # was collected — merging an empty, column-less frame would raise).
        if insight_rows:
            df = df.merge(pd.DataFrame(insight_rows), on='post_id', how='left')
        all_ips_df.append(df)

    df = pd.concat(all_ips_df)
    df.to_csv('facebook_data_video.csv', index=False)
    print(df)
    

    Basically, the script extracts the posts in the page feed whose media type is video.

    Since I can't extract the whole month directly, I need to use the batch request feature. A batch request lets you send a single HTTP request that contains multiple Facebook Graph API calls. In the script, I made two Graph API calls: one covers the 1st of May until the 15th of May, and the other covers the 16th of May until the 31st of May.

    I will provide the code for the date below:

    import datetime
    import calendar

    # Any day inside the target month works as the seed; it is snapped to day 1.
    # BUGFIX: the original seeded with datetime.date(2024, 1, 22) (January),
    # so the printed dates contradicted every inline comment claiming May.
    from_date = datetime.date(2024, 5, 22)
    from_date = from_date.replace(day=1)  # 2024-05-01, first day of the month

    # Facebook's 'since' must be one day earlier than the first wanted day
    # (the API compares against UTC timestamps, so local midnight still falls
    # on the previous UTC day for timezones east of UTC).
    from_date_fb = from_date - datetime.timedelta(days=1)  # 2024-04-30

    to_mid_date = from_date + datetime.timedelta(days=15)  # 2024-05-16
    from_mid_date = to_mid_date                            # 2024-05-16

    # Last calendar day of that month (handles 28/29/30/31 automatically).
    _, days_in_month = calendar.monthrange(from_date.year, from_date.month)
    to_date = datetime.date(from_date.year, from_date.month, days_in_month)  # 2024-05-31

    print(from_date_fb, to_mid_date, from_mid_date, to_date)
    

    If you have questions regarding the code, please let me know, and if you'd like to critique my code, please do. Thanks!