I am trying to scrape Facebook data from public pages.
The code I was using a while ago (maybe 10 months ago) was working fine. Now that I want to continue that project, the code is not working anymore. I used to use my private user token, which expires after a couple of minutes, but that is enough for my use case. I don't need an App, App Review, etc. to get a permanent token.
Here is the code:
# Engagement fields whose value sits under ``summary.total_count`` in each post.
_SUMMARY_COLUMNS = ("likes", "comments", "Love", "Wow", "Sad", "Angry", "Haha")
# Every integer column that contributes to ``total_reac``.
_COUNT_COLUMNS = ("shares",) + _SUMMARY_COLUMNS


def _summary_count(post, field):
    """Return ``post[field]["summary"]["total_count"]`` as an int, or 0 if absent."""
    summary = (post.get(field) or {}).get("summary") or {}
    return int(summary.get("total_count") or 0)


def _collect_posts(url, start_time):
    """Follow paging links from *url* until a post at or before *start_time*.

    The first post that is not newer than *start_time* is still included
    (as the original loop did); the later date filter decides whether it
    is kept. Paging also stops when a page is empty or has no ``next`` link.
    """
    cutoff = pd.to_datetime(start_time, utc=True)
    collected = []
    while url:
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(url) as connection:
            payload = json.loads(connection.read().decode("utf8"))
        page_posts = payload["data"]
        # Iterate over what the page really contains; pages may hold fewer
        # than the requested 100 posts.
        for index, post in enumerate(page_posts):
            reached_cutoff = pd.to_datetime(post["created_time"], utc=True) <= cutoff
            collected.append(post)
            if reached_cutoff:
                print(" found date at this index: ", index)
                return collected
        url = payload.get("paging", {}).get("next") if page_posts else None
    return collected


def _posts_to_frame(posts, filter_start, filter_end):
    """Flatten raw post dicts into a DataFrame restricted to the date window."""
    records = []
    for post in posts:
        record = dict(post)
        # Read the counts straight from the nested dicts rather than
        # regex-scraping their string form. Missing fields count as 0.
        record["shares"] = int((post.get("shares") or {}).get("count") or 0)
        for field in _SUMMARY_COLUMNS:
            record[field] = _summary_count(post, field)
        date_part, _, time_part = str(post["created_time"]).partition("T")
        record["date"] = date_part
        record["time"] = time_part.replace("+0000", "")
        records.append(record)

    df = pd.DataFrame(records)
    if df.empty:
        return df

    # Sum only the numeric engagement columns, never ids or message text.
    df["total_reac"] = df[list(_COUNT_COLUMNS)].sum(axis=1)
    df = df.sort_values(by="total_reac", ascending=False)

    # Compare timezone-aware values on both sides; the API timestamps carry
    # a +0000 offset, so everything is normalised to UTC.
    df["created_time"] = pd.to_datetime(df["created_time"], utc=True)
    window_start = pd.to_datetime(filter_start, utc=True)
    window_end = pd.to_datetime(filter_end, utc=True)
    in_window = (df["created_time"] > window_start) & (df["created_time"] <= window_end)
    return df[in_window]


def getData(page, urlToConnect, startTime, filterStart, filterEnd):
    """Download a page's posts and return them as a ranked, date-filtered DataFrame.

    Args:
        page: Page name; only used to build the output file name
            ``Facebook_Posts_<page>.csv`` in the current directory.
        urlToConnect: URL of the first result page. Further pages are
            reached through each response's ``paging.next`` link.
        startTime: Paging stops at the first post created at or before
            this date.
        filterStart: Exclusive lower bound on ``created_time`` for the result.
        filterEnd: Inclusive upper bound on ``created_time`` for the result.

    Returns:
        A DataFrame with one row per post inside the window, sorted by
        ``total_reac`` descending. It holds integer ``shares``, ``likes``,
        ``comments``, ``Love``, ``Wow``, ``Sad``, ``Angry`` and ``Haha``
        columns, plus ``date``, ``time`` and ``total_reac``. An empty
        DataFrame is returned if anything fails.
    """
    # Defined up front so the best-effort handler below always has something
    # to return, instead of masking the real error with UnboundLocalError.
    df = pd.DataFrame()
    try:
        posts = _collect_posts(urlToConnect, startTime)
        df = _posts_to_frame(posts, filterStart, filterEnd)
        df.to_csv("Facebook_Posts_" + page + ".csv")
    except Exception as ex:  # best-effort boundary kept from the original
        print(ex)
    return df
# Short-lived user access token copied from the Graph API Explorer.
token="my_User__Token_Here (got from my personal https://developers.facebook.com/tools/explorer)"
sTime = '2018-05-01'
fStart = '2018-05-01'
fEnd = '2018-05-29'
page_id="nytimes"
# Fields requested for each post; reaction counts are aliased per type.
post_fields = ",".join([
    "id",
    "created_time",
    "message",
    "shares.summary(true).limit(0)",
    "comments.summary(true).limit(0)",
    "likes.summary(true)",
    "reactions.type(LOVE).limit(0).summary(total_count).as(Love)",
    "reactions.type(WOW).limit(0).summary(total_count).as(Wow)",
    "reactions.type(HAHA).limit(0).summary(total_count).as(Haha)",
    "reactions.type(SAD).limit(0).summary(1).as(Sad)",
    "reactions.type(ANGRY).limit(0).summary(1).as(Angry)",
])
# The version segment must carry the "v" prefix. With a bare "3.2" the API
# treats it as a node id and answers
# "Unknown path components: /nytimes/posts" (error code 2500).
url = ("https://graph.facebook.com/v3.2/" + page_id + "/posts/?fields=" + post_fields
       + "&access_token=" + token + "&limit=100")
dataNYT = getData(page_id, url, sTime, fStart, fEnd)
dataNYT.to_csv("NYT_posts.csv")
Here is the error I am getting now:
HTTP Error 400: Bad Request
And when I try to type in the url requested in my browser, this error appears:
{
"error": {
"message": "Unknown path components: /nytimes/posts",
"type": "OAuthException",
"code": 2500,
"fbtrace_id": "HsN9zi+byTD"
}
}
Does anyone have an idea?
Not sure why you get that error; when I try that API call in the API Explorer, I get the correct one:
{
"error": {
"message": "(#10) To use 'Page Public Content Access', your use of this endpoint must be reviewed and approved by Facebook. To submit this 'Page Public Content Access' feature for review please read our documentation on reviewable features: https://developers.facebook.com/docs/apps/review.",
"type": "OAuthException",
"code": 10,
"fbtrace_id": "AZJ2HjKFmkW"
}
}
You DO need an App, and you DO need App Review. In order to get access to pages you don't own, you have to get "Page Public Content Access" approved by Facebook. After that, you can even use a never-expiring App Access Token. But you still need an App, for ANY API access, always.
More information: https://developers.facebook.com/docs/apps/review/feature/?locale=de_DE#reference-PAGES_ACCESS