I'm a beginner at Python, and I'm working on a project that parses the comments on an Instagram post to determine who commented first. I have everything working except for parsing the large text file I get when I pull the comments from the Instagram API. I tried using `split` to break the text on each occurrence of "id", but that didn't do what I wanted, and I can't figure out how to do this.
Text file I'm trying to parse is here: https://pastebin.com/SjyaUZ1u
Code is below
# Read the saved API response line by line and split each line on the
# literal substring "id" (at most 10 splits), printing the resulting pieces.
# NOTE: this is a plain substring split, so it cuts wherever the characters
# "id" appear in the text, not only at keys named "id".
with open("comments.txt", "r") as f:
    for line in f:
        # Named `parts` rather than `str` so the built-in `str` type is not
        # shadowed for the rest of the script.
        parts = line.split("id", 10)
        print(parts)
Eventually I'd like to write each of these occurrences, along with the text that follows it, to another file, but this is the main problem right now.
You can use `BeautifulSoup` and its `html.parser` to parse your HTML, and then `json` to load the content inside the `pre` tag:
from bs4 import BeautifulSoup
import json
# Parse the saved page, pull the JSON text out of its <pre> element, and
# print every comment edge found in the decoded payload.

# Open the file in a `with` block so the handle is closed as soon as
# BeautifulSoup has consumed it, instead of being left open.
with open('comments.txt') as content:
    soup = BeautifulSoup(content, 'html.parser')

# find() returns None when there is no matching tag; fail with a clear
# message rather than an AttributeError on None.get_text().
pre = soup.find('pre')
if pre is None:
    raise ValueError("comments.txt contains no <pre> element to read JSON from")

data = json.loads(pre.get_text())

# The comment entries are nested under this key path in the decoded JSON.
comments = data['graphql']['shortcode_media']['edge_media_to_comment']['edges']
for edge in comments:
    print(edge)
output:
{'node': {'id': '17866050361316207', 'text': '@pay2on gimme that coin', 'created_at': 1549838809, 'did_report_as_spam': False, 'owner': {'id': '351807810', 'is_verified': False, 'profile_pic_url': 'https://scontent-dfw5-1.cdninstagram.com/vp/3d9c34d98843652774cbe4ed24111890/5CDB0369/t51.2885-19/s150x150/46791596_1837989999663259_8814990427338833920_n.jpg?_nc_ht=scontent-dfw5-1.cdninstagram.com', 'username': 'pay2on'}, 'viewer_has_liked': False, 'edge_liked_by': {'count': 0}}}
{'node': {'id': '18015390769112027', 'text': 'Gg', 'created_at': 1549838810, 'did_report_as_spam': False, 'owner': {'id': '1620477479', 'is_verified': False, 'profile_pic_url': 'https://scontent-dfw5-1.cdninstagram.com/vp/386991ecd3581167df6fbcdcc3fbffba/5CDA79E7/t51.2885-19/s150x150/43093738_486110321900668_5555852948304560128_n.jpg?_nc_ht=scontent-dfw5-1.cdninstagram.com', 'username': 'a_andrestares'}, 'viewer_has_liked': False, 'edge_liked_by': {'count': 0}}}
{'node': {'id': '18032678767050139', 'text': '💀', 'created_at': 1549838821, 'did_report_as_spam': False, 'owner': {'id': '10336797539', 'is_verified': False, 'profile_pic_url': 'https://scontent-dfw5-1.cdninstagram.com/vp/a76a0efe6e172d370cd1e78d91fd0dd8/5CF3F51A/t51.2885-19/50496250_242828456650749_2095120626497880064_n.jpg?_nc_ht=scontent-dfw5-1.cdninstagram.com', 'username': 'sourpods'}, 'viewer_has_liked': False, 'edge_liked_by': {'count': 0}}}
{'node': {'id': '18031342408039828', 'text': '@pay2on anticoin', 'created_at': 1549838844, 'did_report_as_spam': False, 'owner': {'id': '4726002591', 'is_verified': False, 'profile_pic_url': 'https://scontent-dfw5-1.cdninstagram.com/vp/124d070772b728492cb4a2aa00ff7919/5D01194C/t51.2885-19/s150x150/49303395_379094076216927_6321318684270788608_n.jpg?_nc_ht=scontent-dfw5-1.cdninstagram.com', 'username': 'anti.tv'}, 'viewer_has_liked': False, 'edge_liked_by': {'count': 0}}}
{'node': {'id': '17900497066291896', 'text': 'H9', 'created_at': 1549838912, 'did_report_as_spam': False, 'owner': {'id': '9667789306', 'is_verified': False, 'profile_pic_url': 'https://scontent-dfw5-1.cdninstagram.com/vp/de6dfba0978218656579b080ee53f8c4/5CE83473/t51.2885-19/s150x150/50746304_2379848008910407_7742041896683307008_n.jpg?_nc_ht=scontent-dfw5-1.cdninstagram.com', 'username': 'chronic_wanna_be'}, 'viewer_has_liked': False, 'edge_liked_by': {'count': 0}}}