Search code examples
pythonfacebook-graph-api

Trying to extract a month's worth of posts using cursor-based pagination on the Facebook Graph API, but failing


I'm trying to extract a month's worth of Facebook posts into a CSV file. I'm extracting from the 1st of May until the 30th of May 2024, but once my script is done running, it has only extracted posts from the 30th of May 2024 (it started on this date) back to the 12th of May 2024.

It doesn't extract from the 9th May until 1st May 2024.

Below is my code:

# NOTE(review): this is the script exactly as posted in the question; it
# contains the bug being asked about — see the BUG comments below.
import pyfacebook
from datetime import datetime, timedelta, timezone
import json
import pandas as pd
import time
# Initialize Facebook Graph API connection (replace placeholders with your actual credentials)
graph = pyfacebook.GraphAPI(access_token='my-access-token', version='v20.0')

# Define the page ID
page_id = 'my-page-id'

# date 
from_date = '2024-05-01'
to_date = '2024-05-30'

# Construct the API request URL (notice the parameters 'since', 'until' and 'fields')
posts_url = f'/{page_id}/feed?fields=attachments,created_time&since={from_date}T08:00:00&until={to_date}T23:59:59'

# NOTE(review): `_request` is a private pyfacebook method (leading underscore);
# it presumably works but is not part of the library's public API.
posts_data = graph._request(posts_url).json()

# BUG(review): every iteration OVERWRITES `posts_data` with the next page and
# never accumulates the pages already fetched, so when the loop ends only the
# final page of results survives.
while True: 
    if 'paging' in posts_data and 'next' in posts_data['paging']:
        posts_url = posts_data['paging']['next']
        posts_data = graph._request(posts_url).json()
        time.sleep(2)  # Increased delay to be safe
    else:
        break  # No more pages to retrieve
    
# Only the last page reaches the DataFrame / CSV (see BUG note above).
df = pd.DataFrame(posts_data)
df.to_csv('facebook_data.csv', index=False)

Here is the result, it ends on the 9th May 2024 instead of continuing until 1st May 2024:

How do I extract the full month's worth of data?


Solution

  • I solved this using Batch Requests, as described in the Facebook Graph API documentation (see the "Making Batch Requests" page).

    import requests
    import json
    import time
    import pandas as pd

    # The feed endpoint stopped paginating mid-month (see the question), so the
    # month is split into two half-month windows fetched as ONE batch request
    # containing two Graph API calls.  The 'since' date is one day earlier than
    # the first wanted day because the comparison happens on UTC timestamps.
    from_start_date = '2024-04-30'  # needs to be one day earlier
    to_mid_date = '2024-05-16'
    from_mid_date = '2024-05-16'
    to_end_date = '2024-05-31'

    # (page_id, access_token) pairs — replace the placeholders with real values.
    credential = [
        # Rasa
        ('Page_id_1', 'Access_token_1'),
        # Roda Panas
        ('Page_id_2', 'Access_token_2'),
    ]

    batch_url = 'https://graph.facebook.com/v20.0/'  # plain string; no f-string needed

    all_ips_df = []  # one merged DataFrame per page
    for page_id, access_token in credential:
        # Two GET calls bundled into a single batch POST.
        batch_parameters = [
            {
                "method": "GET",
                "relative_url": f"{page_id}/feed?fields=from,attachments,created_time&since={from_mid_date}T16:00:00&until={to_end_date}T15:59:59&limit=25",
            },
            {
                "method": "GET",
                "relative_url": f"{page_id}/feed?fields=from,attachments,created_time&since={from_start_date}T16:00:00&until={to_mid_date}T15:59:59&limit=25",
            },
        ]

        batch_payload = {
            # BUGFIX: the original wrote {access_token}, a one-element SET
            # literal; the token must be passed as the plain string.
            "access_token": access_token,
            "batch": json.dumps(batch_parameters),
        }

        batch_response = requests.post(batch_url, data=batch_payload).json()

        all_posts = []
        for response_item in batch_response:
            if 'body' not in response_item:
                continue  # a failed sub-request carries no body; skip it

            post_data = json.loads(response_item['body'])
            # Each batched sub-response has its own cursor chain; follow it.
            while True:
                for post in post_data['data']:
                    try:
                        attachments = post['attachments']['data']
                        # Keep only video attachments.  A plain substring test
                        # replaces the original pandas Series round-trip.
                        video_attachments = [
                            a for a in attachments if 'video' in a['type'].lower()
                        ]
                        video_attachment = video_attachments[0]  # IndexError when no video
                        all_posts.append({
                            'post_id': post['id'],
                            'ip': post['from']['name'],
                            'title': video_attachment.get('title'),
                            'created_time': post['created_time'],
                            'media_type': video_attachment['type'],
                            'post_url': video_attachment['url'],
                        })
                    except (KeyError, IndexError):
                        # Post without attachments or without a video: report and move on.
                        print(f"Incomplete data found in post: {post['id']}")

                # Pagination for this batch sub-response.
                if 'next' in post_data.get('paging', {}):
                    post_data = requests.get(post_data['paging']['next']).json()
                    time.sleep(2)  # increased delay to stay under rate limits
                else:
                    break

        df = pd.DataFrame(all_posts)

        insight_rows = []
        # DataFrame.get tolerates an empty frame (no 'post_id' column yet);
        # the original `for id in df['post_id']` crashed on it AND shadowed
        # the builtin `id`.
        for fb_post_id in df.get('post_id', []):
            # BUGFIX: no spaces inside the metric list — spaces are not valid
            # in a query string.
            insights_url = (
                batch_url
                + f'{fb_post_id}/insights?access_token={access_token}'
                + '&metric=post_reactions_by_type_total,post_impressions,'
                + 'post_impressions_organic_unique,post_impressions_paid_unique'
            )
            video_data = requests.get(insights_url).json()
            engagements = impressions = reach_organic = reach_paid = None

            for video_insights in video_data['data']:
                metric_name = video_insights['name']
                metric_value = video_insights['values'][0]['value']
                if metric_name == 'post_reactions_by_type_total':
                    # value is a {reaction_type: count} dict; total them
                    engagements = sum(metric_value.values())
                elif metric_name == 'post_impressions':
                    impressions = metric_value
                elif metric_name == 'post_impressions_organic_unique':
                    reach_organic = metric_value
                elif metric_name == 'post_impressions_paid_unique':
                    reach_paid = metric_value

            # BUGFIX: the original raised TypeError when either reach metric
            # was absent (still None); treat a missing component as 0.
            reach = (reach_organic or 0) + (reach_paid or 0)
            insight_rows.append({
                'post_id': fb_post_id,
                'engagements': engagements,
                'impressions': impressions,
                'reach': reach,
            })

        # Merge the per-post insights back onto the posts (skip when nothing
        # was collected — merging an empty, column-less frame would raise).
        if insight_rows:
            df = df.merge(pd.DataFrame(insight_rows), on='post_id', how='left')
        all_ips_df.append(df)

    df = pd.concat(all_ips_df)
    df.to_csv('facebook_data_video.csv', index=False)
    print(df)
    

    Basically, the script extracts the posts in the page feed whose media type is video.

    Since I can't extract the whole month directly, I need to use the batch request feature. A batch request lets you send a single HTTP request that contains multiple Facebook Graph API calls. In the script, I made two Graph API calls: one covers the 1st of May until the 15th of May, and the other covers the 16th of May until the 31st of May.

    I will provide the code for the date below:

    import datetime
    import calendar

    # Any day inside the target month works as the seed; it is snapped to day 1.
    # BUGFIX: the original seeded with datetime.date(2024, 1, 22) (January),
    # so the printed dates contradicted every inline comment claiming May.
    from_date = datetime.date(2024, 5, 22)
    from_date = from_date.replace(day=1)  # 2024-05-01, first day of the month

    # Facebook's 'since' must be one day earlier than the first wanted day
    # (the API compares against UTC timestamps, so local midnight still falls
    # on the previous UTC day for timezones east of UTC).
    from_date_fb = from_date - datetime.timedelta(days=1)  # 2024-04-30

    to_mid_date = from_date + datetime.timedelta(days=15)  # 2024-05-16
    from_mid_date = to_mid_date                            # 2024-05-16

    # Last calendar day of that month (handles 28/29/30/31 automatically).
    _, days_in_month = calendar.monthrange(from_date.year, from_date.month)
    to_date = datetime.date(from_date.year, from_date.month, days_in_month)  # 2024-05-31

    print(from_date_fb, to_mid_date, from_mid_date, to_date)
    

    If you have questions regarding the code, please let me know, and if you'd like to critique my code, please do. Thanks!