Tags: python, python-multiprocessing, python-multithreading

Reading thousands of JSON files and processing them with Python multiprocessing


I'm trying to read thousands of JSON files from a directory, process each file separately, and store the results in a dictionary. I already have working code for sequential execution; now I want to leverage multiprocessing to speed up the whole thing.

So far, this is what I have:


import json
import os
from multiprocessing import Process, Manager

def read_file(file_name):
    '''
        Read the given json file and return data
    '''
    
    with open(file_name) as file :
        data = json.load(file)
    
    return data

def do_some_process(data):
    '''
        Some calculation will be done here
        and return the result
    '''

    return some_result

def process_each_file(file, result):

    file_name = file.split('.')[0]    
    # reading data from file
    data = read_file('../data/{}'.format(file))
    processed_result = do_some_process(data)

    
    result[file_name] = processed_result


if __name__ == '__main__':


    manager = Manager()
    result = manager.dict()

    file_list = os.listdir("../data")
    
    all_process = [Process(target=process_each_file, args=(file, result, )) 
                  for file in file_list if file.endswith(".json")]
    

    for p in all_process:
        p.start() 
        
    for p in all_process:
        p.join() 

    '''
        Do some further work with the 'result' variable
    '''

When I run this code, it fails with OSError: [Errno 24] Too many open files.

How can I achieve my goal?


Solution

  • The OSError comes from launching one Process per file: each process needs its own pipes and connections to the Manager, so thousands of simultaneous processes quickly exhaust the OS limit on open file descriptors. To read and process many JSON files with Python's multiprocessing module, use a Pool instead, which keeps a fixed number of worker processes alive and feeds the files to them:

    import os
    import json
    from multiprocessing import Pool

    def process_data(data):
        # Placeholder for the real calculation
        return data

    def process_json_file(filename):
        # Each worker opens only one file at a time, so the limit
        # on open file descriptors is never exceeded
        with open(filename, 'r') as f:
            data = json.load(f)
        processed_data = process_data(data)
        # Return the file name (without extension) together with the
        # result so the parent process can rebuild the dictionary
        return os.path.splitext(filename)[0], processed_data

    if __name__ == '__main__':
        # List all the JSON files in the current directory
        json_files = [f for f in os.listdir('.') if f.endswith('.json')]

        # Create a pool of workers to process the files concurrently;
        # by default the pool size equals the number of CPU cores
        with Pool() as pool:
            # Apply the processing function to each JSON file concurrently
            results = pool.map(process_json_file, json_files)

        # Rebuild the {file_name: processed_result} mapping
        result = dict(results)

        # Do something with the results
        for file_name, processed_result in result.items():
            print(file_name, processed_result)
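
  • For thousands of small files, the per-task communication overhead of pool.map can add up. A minimal variant of the sketch above (assuming the same process_json_file function, and with the worker count and chunksize picked purely for illustration) uses imap_unordered with a chunksize, so many small tasks are batched into one message and results stream back as workers finish:

    import os
    from multiprocessing import Pool

    # process_json_file is assumed to be the function defined in the
    # previous block (it returns a (file_name, processed_data) pair)

    if __name__ == '__main__':
        json_files = [f for f in os.listdir('.') if f.endswith('.json')]

        result = {}
        # Cap the number of workers explicitly; chunksize batches many
        # small tasks together to cut inter-process communication costs
        with Pool(processes=4) as pool:
            for file_name, processed in pool.imap_unordered(
                    process_json_file, json_files, chunksize=50):
                result[file_name] = processed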