Search code examples
python json rest python-requests imdb

simplejson.scanner.JSONDecodeError: Invalid \X escape sequence u's': line 1 column 468 (char 467)


I am having a problem while making an HTTP request to a REST API: the JSON response I get back is malformed. Most escape sequences in the JSON are interpreted correctly, but one sentence contains the text "inner\spiritual", and the JSON decoder treats the "\s" as an escape sequence even though it is not meant to be one.

Searching through Stack Overflow, I could not find a solution that fit my precise use case, but I worked out a decent hack: catch the decoder's exception, read the index of the offending character from it, and replace that character in the JSON string. I thought it was worth sharing since it might help someone in a similar situation. Happy hacking :)

Full Code:

import requests
import os
import json

base_url = 'http://www.omdbapi.com/'
tv_series = {}
films = {}

#for i in range(1, 9999999):
#imdb_id = 'tt' + str(i).zfill(7)
imdb_id = 'tt0120690'
print imdb_id



payload = {
            'i':imdb_id,
            'plot':'full',
            'r':'json'      
}
response = requests.get(base_url, params=payload)
if response.status_code == 200:     
    result = None
    result = response.json()

    if result != None:
        if result['Response'] != 'False':
            if result['Type'] == 'movie':
                films[result['Title']] = result
            elif result['Type'] == 'series':
                tv_series[result['Title']] = result
            else:
                print '[ERROR] Type:', result['Type']
with open('tv_series.json', 'w') as tv_series_outfile:
    json.dump(tv_series, tv_series_outfile)

with open('films.json', 'w') as films_outfile:
    json.dump(films, films_outfile)

Error:

Traceback (most recent call last):
  File "import_imdb.py", line 41, in <module>
    result = response.json()
  File "C:\Python27\lib\site-packages\requests\models.py", line 797, in json
    return json.loads(self.text, **kwargs)
  File "C:\Python27\lib\site-packages\simplejson\__init__.py", line 516, in loads
    return _default_decoder.decode(s)
  File "C:\Python27\lib\site-packages\simplejson\decoder.py", line 370, in decode
    obj, end = self.raw_decode(s)
  File "C:\Python27\lib\site-packages\simplejson\decoder.py", line 400, in raw_decode
    return self.scan_once(s, idx=_w(s, idx).end())
  File "C:\Python27\lib\site-packages\simplejson\scanner.py", line 127, in scan_once
    return _scan_once(string, idx)
  File "C:\Python27\lib\site-packages\simplejson\scanner.py", line 93, in _scan_once
    _scan_once, object_hook, object_pairs_hook, memo)
  File "C:\Python27\lib\site-packages\simplejson\decoder.py", line 194, in JSONObject
    value, end = scan_once(s, end)
  File "C:\Python27\lib\site-packages\simplejson\scanner.py", line 90, in _scan_once
    return parse_string(string, idx + 1, encoding, strict)
  File "C:\Python27\lib\site-packages\simplejson\decoder.py", line 99, in py_scanstring
    raise JSONDecodeError(msg, s, end)
simplejson.scanner.JSONDecodeError: Invalid \X escape sequence u's': line 1 column 468 (char 467)

Solution

  • My Fix:

    def fix_JSON(json_message=None):
        """Parse ``json_message``, blanking out characters the decoder rejects.

        Each time ``json.loads`` fails, the character at the offset reported
        by the decoder is replaced with a single space and parsing is retried.
        This repairs payloads that contain a stray backslash in front of an
        ordinary letter, provided the decoder reports the backslash's offset.

        Args:
            json_message: JSON text. A Python 3 ``bytes`` value is decoded as
                UTF-8 first (assumed encoding). Any other non-string value,
                including the default ``None``, is handed to ``json.loads``
                unchanged and rejected there.

        Returns:
            The decoded Python object.

        Raises:
            ValueError: the decoding error itself, when it carries no usable
                offset or when blanking the reported character cannot change
                the input (offset past the end, or already a space).
        """
        import re  # local import so the file's import section is untouched

        # On Python 3, bytes cannot be spliced with str pieces; decode once up
        # front. On Python 2 ``bytes`` is ``str`` and this branch is skipped.
        if isinstance(json_message, bytes) and not isinstance(json_message, str):
            json_message = json_message.decode('utf-8')

        # Loop instead of recursing so many bad characters cannot exhaust
        # the recursion limit.
        while True:
            try:
                return json.loads(json_message)
            except ValueError as e:
                # Prefer the structured offset when the exception has one;
                # otherwise expect the message to end with "(char N)".
                idx_to_replace = getattr(e, 'pos', None)
                if idx_to_replace is None:
                    found = re.search(r'\(char (\d+)', str(e))
                    if found is None:
                        raise
                    idx_to_replace = int(found.group(1))

                # Stop when a retry would see exactly the same text again;
                # every successful pass removes one non-space character, so
                # the loop is guaranteed to finish.
                if (idx_to_replace >= len(json_message)
                        or json_message[idx_to_replace] == ' '):
                    raise

                # Replace the offending character with a space.
                json_message = (json_message[:idx_to_replace] + ' '
                                + json_message[idx_to_replace + 1:])
    

    Full Code:

    import requests
    import os
    import json
    
    
    def fix_JSON(json_message=None):
        """Parse ``json_message``, blanking out characters the decoder rejects.

        Each time ``json.loads`` fails, the character at the offset reported
        by the decoder is replaced with a single space and parsing is retried.
        This repairs payloads that contain a stray backslash in front of an
        ordinary letter, provided the decoder reports the backslash's offset.

        Args:
            json_message: JSON text. A Python 3 ``bytes`` value is decoded as
                UTF-8 first (assumed encoding). Any other non-string value,
                including the default ``None``, is handed to ``json.loads``
                unchanged and rejected there.

        Returns:
            The decoded Python object.

        Raises:
            ValueError: the decoding error itself, when it carries no usable
                offset or when blanking the reported character cannot change
                the input (offset past the end, or already a space).
        """
        import re  # local import so the file's import section is untouched

        # On Python 3, bytes cannot be spliced with str pieces; decode once up
        # front. On Python 2 ``bytes`` is ``str`` and this branch is skipped.
        if isinstance(json_message, bytes) and not isinstance(json_message, str):
            json_message = json_message.decode('utf-8')

        # Loop instead of recursing so many bad characters cannot exhaust
        # the recursion limit.
        while True:
            try:
                return json.loads(json_message)
            except ValueError as e:
                # Prefer the structured offset when the exception has one;
                # otherwise expect the message to end with "(char N)".
                idx_to_replace = getattr(e, 'pos', None)
                if idx_to_replace is None:
                    found = re.search(r'\(char (\d+)', str(e))
                    if found is None:
                        raise
                    idx_to_replace = int(found.group(1))

                # Stop when a retry would see exactly the same text again;
                # every successful pass removes one non-space character, so
                # the loop is guaranteed to finish.
                if (idx_to_replace >= len(json_message)
                        or json_message[idx_to_replace] == ' '):
                    raise

                # Replace the offending character with a space.
                json_message = (json_message[:idx_to_replace] + ' '
                                + json_message[idx_to_replace + 1:])
    
    
    base_url = 'http://www.omdbapi.com/'
    tv_series = {}
    films = {}
    for i in range(1, 9999999):
        imdb_id = 'tt' + str(i).zfill(7)
        #imdb_id = 'tt0120690'
        print imdb_id
    
    
    
        payload = {
                    'i':imdb_id,
                    'plot':'full',
                    'r':'json'      
        }
        response = requests.get(base_url, params=payload)
        if response.status_code == 200:     
            result = None
            result = fix_JSON(json_message=response.content)
    
            if result != None:
                if result['Response'] != 'False':
                    if result['Type'] == 'movie':
                        films[result['Title']] = result
                    elif result['Type'] == 'series':
                        tv_series[result['Title']] = result
                    else:
                        print '[ERROR] Type:', result['Type']
    with open('tv_series.json', 'w') as tv_series_outfile:
        json.dump(tv_series, tv_series_outfile)
    
    with open('films.json', 'w') as films_outfile:
        json.dump(films, films_outfile)