Search code examples
pythonjsonpandastextdelimiter

Reading Json objects from text file into pandas


I have extracted json objects from an api library and wrote them into a text file. I am now stuck on how to take the json structure saved in the .txt file and read that back into python pandas library.

There are many resources that walk through how to import a json file into pandas but since this is a text file and I'm new to programming and working with json structure I'm not sure how to efficiently perform this task.

There are numerous json objects in the text file and I would share an example but it has a bunch of url shorteners which is preventing me from being able to post this question so unless someone really needs to see the structure Ill hold off. I already tried pd.read_csv() and pd.read_json() but since this is a json structure in a .txt file its not working properly for either so far.

Here has been my best guess so far to get the data back into python:

data = []
with open('tweet_json.txt') as f:
    for line in f:
        data.append(json.loads(line))

But I got the following error message when I tried that: JSONDecodeError: Extra data: line 1 column 4626 (char 4625)

Here are two tweets that you can copy and save to a .txt file to replicate:

{'contributors': None,
 'coordinates': None,
 'created_at': 'Tue Aug 01 16:23:56 +0000 2017',
 'display_text_range': [0, 85],
 'entities': {'hashtags': [],
  'media': [{'display_url': 'pic.twitter.com/MgUWQ76dJU',
    'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
    'id': 892420639486877696,
    'id_str': '892420639486877696',
    'indices': [86, 109],
    'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
    'sizes': {'large': {'h': 528, 'resize': 'fit', 'w': 540},
     'medium': {'h': 528, 'resize': 'fit', 'w': 540},
     'small': {'h': 528, 'resize': 'fit', 'w': 540},
     'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
    'type': 'photo',
    'url': na}],
  'symbols': [],
  'urls': [],
  'user_mentions': []},
 'extended_entities': {'media': [{'display_url': 'pic.twitter.com/MgUWQ76dJU',
    'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
    'id': 892420639486877696,
    'id_str': '892420639486877696',
    'indices': [86, 109],
    'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
    'sizes': {'large': {'h': 528, 'resize': 'fit', 'w': 540},
     'medium': {'h': 528, 'resize': 'fit', 'w': 540},
     'small': {'h': 528, 'resize': 'fit', 'w': 540},
     'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
    'type': 'photo',
    'url': na}]},
 'favorite_count': 39311,
 'favorited': False,
 'full_text': "This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 na ",
 'geo': None,
 'id': 892420643555336193,
 'id_str': '892420643555336193',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'is_quote_status': False,
 'lang': 'en',
 'place': None,
 'possibly_sensitive': False,
 'possibly_sensitive_appealable': False,
 'retweet_count': 8778,
 'retweeted': False,
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'truncated': False,
 'user': {'contributors_enabled': False,
  'created_at': 'Sun Nov 15 21:41:29 +0000 2015',
  'default_profile': False,
  'default_profile_image': False,
  'description': 'Only Legit Source for Professional Dog Ratings STORE: @ShopWeRateDogs | IG, FB & SC: WeRateDogs | MOBILE APP: @GoodDogsGame Business: [email protected]',
  'entities': {'description': {'urls': []},
   'url': {'urls': [{'display_url': 'weratedogs.com',
      'expanded_url': 'http://weratedogs.com',
      'indices': [0, 23],
      'url': na }]}},
  'favourites_count': 126135,
  'follow_request_sent': False,
  'followers_count': 4730764,
  'following': False,
  'friends_count': 109,
  'geo_enabled': True,
  'has_extended_profile': True,
  'id': 4196983835,
  'id_str': '4196983835',
  'is_translation_enabled': False,
  'is_translator': False,
  'lang': 'en',
  'listed_count': 3700,
  'location': 'DM YOUR DOGS. WE WILL RATE',
  'name': 'WeRateDogs™',
  'notifications': False,
  'profile_background_color': '000000',
  'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png',
  'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png',
  'profile_background_tile': False,
  'profile_banner_url': 'https://pbs.twimg.com/profile_banners/4196983835/1510812288',
  'profile_image_url': 'http://pbs.twimg.com/profile_images/936608706107772929/GwbLQRxf_normal.jpg',
  'profile_image_url_https': 'https://pbs.twimg.com/profile_images/936608706107772929/GwbLQRxf_normal.jpg',
  'profile_link_color': 'F5ABB5',
  'profile_sidebar_border_color': '000000',
  'profile_sidebar_fill_color': '000000',
  'profile_text_color': '000000',
  'profile_use_background_image': False,
  'protected': False,
  'screen_name': 'dog_rates',
  'statuses_count': 6301,
  'time_zone': None,
  'translator_type': 'none',
  'url': n/a,
  'utc_offset': None,
  'verified': True}}

{'contributors': None,
 'coordinates': None,
 'created_at': 'Tue Aug 01 00:17:27 +0000 2017',
 'display_text_range': [0, 138],
 'entities': {'hashtags': [],
  'media': [{'display_url': 'pic.twitter.com/0Xxu71qeIV',
    'expanded_url': 'https://twitter.com/dog_rates/status/892177421306343426/photo/1',
    'id': 892177413194625024,
    'id_str': '892177413194625024',
    'indices': [139, 162],
    'media_url': 'http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg',
    'sizes': {'large': {'h': 1600, 'resize': 'fit', 'w': 1407},
     'medium': {'h': 1200, 'resize': 'fit', 'w': 1055},
     'small': {'h': 680, 'resize': 'fit', 'w': 598},
     'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
    'type': 'photo',
    'url': na}],
  'symbols': [],
  'urls': [],
  'user_mentions': []},
 'extended_entities': {'media': [{'display_url': 'pic.twitter.com/0Xxu71qeIV',
    'expanded_url': 'https://twitter.com/dog_rates/status/892177421306343426/photo/1',
    'id': 892177413194625024,
    'id_str': '892177413194625024',
    'indices': [139, 162],
    'media_url': 'http://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg',
    'sizes': {'large': {'h': 1600, 'resize': 'fit', 'w': 1407},
     'medium': {'h': 1200, 'resize': 'fit', 'w': 1055},
     'small': {'h': 680, 'resize': 'fit', 'w': 598},
     'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
    'type': 'photo',
    'url': na}]},
 'favorite_count': 33662,
 'favorited': False,
 'full_text': "This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 na,
 'geo': None,
 'id': 892177421306343426,
 'id_str': '892177421306343426',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'is_quote_status': False,
 'lang': 'en',
 'place': None,
 'possibly_sensitive': False,
 'possibly_sensitive_appealable': False,
 'retweet_count': 6431,
 'retweeted': False,
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'truncated': False,
 'user': {'contributors_enabled': False,
  'created_at': 'Sun Nov 15 21:41:29 +0000 2015',
  'default_profile': False,
  'default_profile_image': False,
  'description': 'Only Legit Source for Professional Dog Ratings STORE: @ShopWeRateDogs | IG, FB & SC: WeRateDogs | MOBILE APP: @GoodDogsGame Business: [email protected]',
  'entities': {'description': {'urls': []},
   'url': {'urls': [{'display_url': 'weratedogs.com',
      'expanded_url': 'http://weratedogs.com',
      'indices': [0, 23],
      'url': na}]}},
  'favourites_count': 126135,
  'follow_request_sent': False,
  'followers_count': 4730865,
  'following': False,
  'friends_count': 109,
  'geo_enabled': True,
  'has_extended_profile': True,
  'id': 4196983835,
  'id_str': '4196983835',
  'is_translation_enabled': False,
  'is_translator': False,
  'lang': 'en',
  'listed_count': 3728,
  'location': 'DM YOUR DOGS. WE WILL RATE',
  'name': 'WeRateDogs™',
  'notifications': False,
  'profile_background_color': '000000',
  'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png',
  'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png',
  'profile_background_tile': False,
  'profile_banner_url': 'https://pbs.twimg.com/profile_banners/4196983835/1510812288',
  'profile_image_url': 'http://pbs.twimg.com/profile_images/936608706107772929/GwbLQRxf_normal.jpg',
  'profile_image_url_https': 'https://pbs.twimg.com/profile_images/936608706107772929/GwbLQRxf_normal.jpg',
  'profile_link_color': 'F5ABB5',
  'profile_sidebar_border_color': '000000',
  'profile_sidebar_fill_color': '000000',
  'profile_text_color': '000000',
  'profile_use_background_image': False,
  'protected': False,
  'screen_name': 'dog_rates',
  'statuses_count': 6301,
  'time_zone': None,
  'translator_type': 'none',
  'url': na,
  'utc_offset': None,
  'verified': True}}

Update

The following code produces this error: JSONDecodeError: Expecting ',' delimiter: line 1 column 4627 (char 4626)

with open('tweet_json.txt', 'r') as f:
    datastore = json.load(f)

This post is the closest I've found so far to help me solve my issue:

Python json.loads shows ValueError: Expecting , delimiter: line 1


Solution

  • Thanks everyone for the feedback. I had to adjust the code regarding how I was extracting the data from the API and then it was pretty straight-forward to get the data into a list of dictionaries after that.

    with open('tweet_json.txt', 'a+', encoding='utf-8') as file:
        for tweet_id in twitter_archive_df['tweet_id']:
            try:
                tweet = api.get_status(id = tweet_id, tweet_mode='extended')
                file.write(json.dumps(tweet))
                file.write('\n')
            except:
                pass
    
    file.close()
    

    then I ran the following code to import the json objects from the .txt file into a list of dictionaries:

    with open('tweet_json.txt') as file:
        status = []
        for line in file:
            status.append(json.loads(line))