Search code examples
python json string unzip

I seem to lose some characters when converting a byte object into a string object


I am currently working on a Python program that will filter out some keywords in the "text" tag of a JSON file. The conversion pipeline in my system is the following: .gz --> open with gzip in mode rb --> transform the b'' (bytes) object into a str --> json.loads(str)

def gzworker(fullpath, condition):
    """Open one .gz file and collect the filtered result of each line.

    Every line is split at the first '|'; the part after it, minus its
    last five characters, is parsed as JSON and handed to ``filter``
    together with ``condition``.  Truthy results are accumulated.

    Args:
        fullpath: Path of the gzip file to read.
        condition: Passed through unchanged as the second argument of
            ``filter``.

    Returns:
        A list of the truthy values returned by ``filter``, in line order.
    """
    print('Opening {}'.format(fullpath))
    buffer = []
    with gzip.open(fullpath, 'rb') as infile:
        for _line in infile:
            # NOTE(review): the file is opened in 'rb' mode, so _line is a
            # bytes object; str() on bytes produces the "b'...'" repr text
            # (prefix, closing quote and escaped backslashes), not the
            # decoded content.
            # NOTE(review): `filter` is called as (data, condition), which is
            # not the builtin's argument order -- presumably a function
            # defined elsewhere in the program; confirm.
            result = filter(json.loads(str(_line).split('|',1)[1][:-5]), condition)
            if result:
                buffer.append(result)
    print('Closing {}'.format(fullpath))
    return buffer

Here the filter function takes the parsed JSON data as its first argument and the condition as its second.

After running through this code multiple times I realised that actually the reason for it not working is that some commas seem to disappear. Does anybody know if it is possible that in the process some information is lost?

Result of what I get using the previous method (invalid JSON) [same result if I use decode]

{"created_at":"Thu Apr 17 04:45:03 +0000 2014","id":456654551114735616,"id_str":"456654551114735616","text":"@cam_clay1 come visit us soon plz \\ud83d\\ude18","source":"\\u003ca href=\\"http:\\/\\/twitter.com\\/download\\/iphone\\" rel=\\"nofollow\\"\\u003eTwitter for iPhone\\u003c\\/a\\u003e","truncated":false,"in_reply_to_status_id":456654343781892098,"in_reply_to_status_id_str":"456654343781892098","in_reply_to_user_id":427007607,"in_reply_to_user_id_str":"427007607","in_reply_to_screen_name":"cam_clay1","user":{"id":335107310,"id_str":"335107310","name":"Roger Krick","screen_name":"roger_krick","location":"Atlanta GA","url":null,"description":"I pushed Regina George in front of the bus.","protected":false,"followers_count":772,"friends_count":235,"listed_count":3,"created_at":"Thu Jul 14 04:49:29 +0000 2011","favourites_count":7192,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"verified":false,"statuses_count":9518,"lang":"en","contributors_enabled":false,"is_translator":false,"is_translation_enabled":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\\/\\/pbs.twimg.com\\/profile_background_images\\/378800000021719152\\/28971ed1e15e606fb52ef9e7af736e60.jpeg","profile_background_image_url_https":"https:\\/\\/pbs.twimg.com\\/profile_background_images\\/378800000021719152\\/28971ed1e15e606fb52ef9e7af736e60.jpeg","profile_background_tile":true,"profile_image_url":"http:\\/\\/pbs.twimg.com\\/profile_images\\/453031044393222144\\/7vIvMWvk_normal.jpeg","profile_image_url_https":"https:\\/\\/pbs.twimg.com\\/profile_images\\/453031044393222144\\/7vIvMWvk_normal.jpeg","profile_banner_url":"https:\\/\\/pbs.twimg.com\\/profile_banners\\/335107310\\/1352964715","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"noti
fications":null},"geo":{"type":"Point","coordinates":[33.75781394,-84.38479358]},"coordinates":{"type":"Point","coordinates":[-84.38479358,33.75781394]},"place":{"id":"8173485c72e78ca5","url":"https:\\/\\/api.twitter.com\\/1.1\\/geo\\/id\\/8173485c72e78ca5.json","place_type":"city","name":"Atlanta","full_name":"Atlanta, GA","country_code":"US","country":"United States","contained_within":[],"bounding_box":{"type":"Polygon","coordinates":[[[-84.5464728,33.647845],[-84.5464728,33.8868859],[-84.289385,33.8868859],[-84.289385,33.647845]]]},"attributes":{}},"contributors":null,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[{"screen_name":"cam_clay1","name":"Cameron Clay","id":427007607,"id_str":"427007607","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"}

What I should be getting (valid JSON):

{"created_at":"Thu Apr 17 04:45:03 +0000 2014","id":456654551114735616,"id_str":"456654551114735616","text":"@cam_clay1 come visit us soon plz \ud83d\ude18","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":456654343781892098,"in_reply_to_status_id_str":"456654343781892098","in_reply_to_user_id":427007607,"in_reply_to_user_id_str":"427007607","in_reply_to_screen_name":"cam_clay1","user":{"id":335107310,"id_str":"335107310","name":"Roger Krick","screen_name":"roger_krick","location":"Atlanta GA","url":null,"description":"I pushed Regina George in front of the bus.","protected":false,"followers_count":772,"friends_count":235,"listed_count":3,"created_at":"Thu Jul 14 04:49:29 +0000 2011","favourites_count":7192,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"verified":false,"statuses_count":9518,"lang":"en","contributors_enabled":false,"is_translator":false,"is_translation_enabled":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/378800000021719152\/28971ed1e15e606fb52ef9e7af736e60.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/378800000021719152\/28971ed1e15e606fb52ef9e7af736e60.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/453031044393222144\/7vIvMWvk_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/453031044393222144\/7vIvMWvk_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/335107310\/1352964715","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":{"type":"Point","
coordinates":[33.75781394,-84.38479358]},"coordinates":{"type":"Point","coordinates":[-84.38479358,33.75781394]},"place":{"id":"8173485c72e78ca5","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/8173485c72e78ca5.json","place_type":"city","name":"Atlanta","full_name":"Atlanta, GA","country_code":"US","country":"United States","contained_within":[],"bounding_box":{"type":"Polygon","coordinates":[[[-84.5464728,33.647845],[-84.5464728,33.8868859],[-84.289385,33.8868859],[-84.289385,33.647845]]]},"attributes":{}},"contributors":null,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[{"screen_name":"cam_clay1","name":"Cameron Clay","id":427007607,"id_str":"427007607","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"}

Solution

  • You are decoding your bytes wrong:

    str(_line)
    

    That converts the object to its printable representation (the same text repr() gives you), which is useful for debugging but not for handling the data:

    >>> 'Føo'.encode('utf8')
    b'F\xc3\xb8o'
    >>> str('Føo'.encode('utf8'))
    "b'F\\xc3\\xb8o'"
    

    Note the b' prefix, the ' suffix, and the escape sequences!

    Decode bytes objects:

    _line.decode('utf8')
    

    I'm assuming that since this is JSON data, it is using the UTF-8 encoding (the JSON standard states that that is the default, the only other permitted options being UTF-16 and UTF-32).

    Better yet, use an io.TextIOWrapper() object to handle the decoding for you.

    Next, you appear to have reversed your condition and data. filter() takes a condition first, data sequence second.

    Corrected code:

    def gzworker(fullpath, condition):
        """Read one .gz file and collect the non-empty filter results per line.

        The file is decoded as UTF-8 text.  Every line is split at the first
        '|'; the part after it, minus its last four characters, is parsed as
        JSON.  The builtin ``filter(condition, data)`` is applied to the
        parsed value and the surviving items are materialized into a list.

        Args:
            fullpath: Path of the gzip file to read.
            condition: Predicate handed to ``filter()``; called once for each
                item obtained by iterating the parsed JSON value.

        Returns:
            A list with one entry per line that had at least one item passing
            ``condition``; each entry is the list of those items.

        Raises:
            IndexError: If a line contains no '|' separator.
            ValueError: If the sliced text is not valid JSON
                (``json.JSONDecodeError`` is a subclass of ``ValueError``).
        """
        print('Opening {}'.format(fullpath))
        buffer = []
        with gzip.open(fullpath, 'rb') as infile:
            # Let the text wrapper decode the bytes instead of str(bytes),
            # which would only produce the "b'...'" repr text.
            decoded = io.TextIOWrapper(infile, encoding='utf8')
            for line in decoded:
                json_data = line.split('|', 1)[1][:-4]
                # In Python 3 filter() returns a lazy iterator that is always
                # truthy; materialize it so the emptiness check below works
                # and callers receive concrete lists.
                result = list(filter(condition, json.loads(json_data)))
                if result:
                    buffer.append(result)
        print('Closing {}'.format(fullpath))
        return buffer
    

    I adjusted your slicing operation, assuming you previously sliced off the ' character introduced by the str() call.