Search code examples
pythonjsonpandaspython-os

JSONDecodeError when trying to read and format multiple json files in a directory in Python


I am trying to read and format multiple json files in a directory using Python. I have created a function load_json_to_dataframe to load and format the json data into a pandas dataframe, and another function read_json_files to read and append each dataframe to a list. However, I keep getting a JSONDecodeError when I run the code.

Here is the code I am using:

import os
import pandas as pd
import json

def load_json_to_dataframe(json_file_path):
    with open(json_file_path, 'r') as json_file:
        doc = json.load(json_file)
        return pd.json_normalize(doc)

def read_json_files(folder_path):
    dataframes = []
    json_files = os.listdir(folder_path)
    for json_file in json_files:
        if json_file.endswith('.json'):
            df = load_json_to_dataframe(os.path.join(folder_path, json_file))
            dataframes.append(df)
    return pd.concat(dataframes, ignore_index=True)

folder_path = 'path/to/json/files'
combined_dataframe = read_json_files(folder_path)

And this is the error message I am receiving:

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

I am not sure what is causing this error or how to fix it. Can anyone help me figure out what I am doing wrong and how to fix it? Thanks in advance.

Here a example of my data: https://drive.google.com/file/d/1h2J-e0cF9IbbWVO8ugrXMGdQTn-dGtsA/view?usp=sharing

Update: There was a file with a different format than the others and therefore it was not read correctly, I have deleted it. Now it gives me a different error

---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
Cell In[1], line 20
     17     return pd.concat(dataframes, ignore_index=True)
     19 folder_path = 'C:/Users/gusta/Desktop/business/Emprendimiento'
---> 20 combined_dataframe = read_json_files(folder_path)

Cell In[1], line 17, in read_json_files(folder_path)
     15         df = load_json_to_dataframe(os.path.join(folder_path, json_file))
     16         dataframes.append(df)
---> 17 return pd.concat(dataframes, ignore_index=True)

File c:\Users\gusta\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\util\_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    325 if len(args) > num_allow_args:
    326     warnings.warn(
    327         msg.format(arguments=_format_argument_list(allow_args)),
    328         FutureWarning,
    329         stacklevel=find_stack_level(),
    330     )
--> 331 return func(*args, **kwargs)

File c:\Users\gusta\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\reshape\concat.py:381, in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
    159 """
    160 Concatenate pandas objects along a particular axis.
    161 
...
    186 return self._blknos

File c:\Users\gusta\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\_libs\internals.pyx:718, in pandas._libs.internals.BlockManager._rebuild_blknos_and_blklocs()

MemoryError: Unable to allocate 966. KiB for an array with shape (123696,) and data type int64

Solution

  • Finally, I found the solution through the following code, and the main issues were:

    1. I was not correctly specifying the layers of the .json file, so loading it completely was consuming too much memory.

    2. Due to the above reason, I made the code apply to one product category at a time. This way it worked.

    So, I was able to solve the problem by specifying the correct layers to extract from the .json file, rather than extracting the entire file and filtering later. Additionally, I modified the code to process only one product category at a time to improve memory usage.

    import pandas as pd
    import json
    import os
    
    # Function to load a JSON file into a Pandas DataFrame
    def load_json_to_dataframe(json_file_path):
        with open(json_file_path, 'r') as json_file:
            # Load JSON file into a Python dictionary
            doc = json.load(json_file)
            # Extract the creation time of the file
            file_creation_time = os.path.getctime(json_file_path)
            # Convert the creation time to a datetime object
            file_creation_time = pd.to_datetime(file_creation_time, unit='s')
            # Normalize the JSON data and add the creation time as a new column
            df = pd.json_normalize(doc, meta=['id', 'title', 'condition', 'permalink',
                                              'category_id', 'domain_id', 'thumbnail',
                                              'currency_id', 'price', 'sold_quantity',
                                              'available_quantity', ['seller', 'id'],
                                              ['seller', 'nickname'], ['seller', 'permalink'],
                                              ['address', 'state_name'], ['address', 'city_name']])
            # Reorder the columns to have the file_creation_time column after the id column
            df = df[['id', 'title', 'condition', 'permalink', 'category_id', 'domain_id',
                     'thumbnail', 'currency_id', 'price', 'sold_quantity', 'available_quantity',
                     'seller.id', 'seller.nickname', 'seller.permalink', 'address.state_name',
                     'address.city_name']]
            # Add the file creation time column to the DataFrame
            df['file_creation_time'] = file_creation_time
            return df
    
    # Function to read multiple JSON files into a single Pandas DataFrame
    def read_json_files(folder_path, categories=None, batch_size=1000):
        if categories is None:
            # If no categories are specified, read all files that end in '.json'
            json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]
        else:
            # If categories are specified, read only files that correspond to those categories
            json_files = [f for f in os.listdir(folder_path) if f.endswith('.json') and any(category in f for category in categories)]
        # Split the list of files into batches of a given size
        batches = [json_files[i:i+batch_size] for i in range(0, len(json_files), batch_size)]
        # Read each batch of files into a list of DataFrames
        dfs = []
        for batch in batches:
            batch_dfs = [load_json_to_dataframe(os.path.join(folder_path, f)) for f in batch]
            dfs.append(pd.concat(batch_dfs, ignore_index=True))
        # Concatenate all DataFrames into a single DataFrame
        return pd.concat(dfs, ignore_index=True)
    
    # Specify the categories of files to read and the folder path
    categories = ['MLC4922.json']
    folder_path = 'C:/path/to/folder/files'
    
    # Read the JSON files into a single DataFrame
    combined_dataframe = read_json_files(folder_path, categories)