Search code examples
pythonjsoncsvpython-3.6yelp

Yelp dataset: Json to csv converter (python 3)


So, I'm trying to work with the Yelp dataset. I'm using python 3! I've found a code to convert a json file to a csv and I've opened cmd on Windows and typed: C:\Users\AppData\Local\Programs\Python\Python36-32>python.exe json_to_csv_converter.py review.json

The command gives me no more errors but it actually didn't convert (there is no csv file as output).

Can you please help me understand why it's not working?

The code inside the converter.py is:

import argparse
import collections
import csv
import json


def read_and_write_file(json_file_path, csv_file_path, column_names):
    with open(csv_file_path, 'w+') as fout:
        csv_file = csv.writer(fout)
    csv_file.writerow(list(column_names))
    with open(json_file_path, encoding="utf8") as fin:

        for line in fin:
            line_contents = json.loads(line)
            csv_file.writerow(get_row(line_contents, column_names))


def get_superset_of_column_names_from_file(json_file_path):
    column_names = set()
    with open(json_file_path) as fin:

        for line in fin:
            line_contents = json.loads(line)
        column_names.update(
            set(get_column_names(line_contents).keys())
            )
        return column_names


def get_column_names(line_contents, parent_key=''):
    column_names = []
    for k, v in line_contents.items():
        column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
    if isinstance(v, collections.MutableMapping):
        column_names.extend(
                get_column_names(v, column_name).items()
                )
    else:
        column_names.append((column_name, v))
    return dict(column_names)


def get_nested_value(d, key):

    if '.' not in key:
        if key not in d:
            return None
    return d[key]
    base_key, sub_key = key.split('.', 1)
    if base_key not in d:
        return None
    sub_dict = d[base_key]
    return get_nested_value(sub_dict, sub_key)


def get_row(line_contents, column_names):
    row = []
    for column_name in column_names:
        line_value = get_nested_value(
            line_contents,
            column_name,
            )
    if isinstance(line_value, str):
        row.append('{0}'.format(line_value.encode('utf-8')))
    elif line_value is not None:
        row.append('{0}'.format(line_value))
    else:
        row.append('')
    return row

    if name == 'main':
        parser = argparse.ArgumentParser(
        description='Convert Yelp data from JSON format to CSV.',
        )

        parser.add_argument(
            'json_file',
            type=str,
            help='The json file to convert.',
            )

        args = parser.parse_args()

        json_file = args.json_file
        csv_file = '{0}.csv'.format(json_file.split('.json')[0])

        column_names = get_superset_of_column_names_from_file(json_file)
        read_and_write_file(json_file, csv_file, column_names)

It might be a silly question (sorry I'm not used to python), but do I need to input the json path, json file name, csv path, csv file name, etc? If so, what is the appropriate way of doing it?

Thank you so much.

Carmem


Solution

  • What's wrong with your code is the following line

    if name == 'main':
    

    i think it was

    if __name__ == '__main__':
    

    so, here is the code that might work.

    import argparse
    import collections
    import csv
    import json
    
    
    def read_and_write_file(json_file_path, csv_file_path, column_names):
        with open(csv_file_path, 'w+') as fout:
            csv_file = csv.writer(fout)
        csv_file.writerow(list(column_names))
        with open(json_file_path, encoding="utf8") as fin:
    
            for line in fin:
                line_contents = json.loads(line)
                csv_file.writerow(get_row(line_contents, column_names))
    
    
    def get_superset_of_column_names_from_file(json_file_path):
        column_names = set()
        with open(json_file_path) as fin:
    
            for line in fin:
                line_contents = json.loads(line)
            column_names.update(
                set(get_column_names(line_contents).keys())
                )
            return column_names
    
    
    def get_column_names(line_contents, parent_key=''):
        column_names = []
        for k, v in line_contents.items():
            column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
        if isinstance(v, collections.MutableMapping):
            column_names.extend(
                    get_column_names(v, column_name).items()
                    )
        else:
            column_names.append((column_name, v))
        return dict(column_names)
    
    
    def get_nested_value(d, key):
    
        if '.' not in key:
            if key not in d:
                return None
        return d[key]
        base_key, sub_key = key.split('.', 1)
        if base_key not in d:
            return None
        sub_dict = d[base_key]
        return get_nested_value(sub_dict, sub_key)
    
    
    def get_row(line_contents, column_names):
        row = []
        for column_name in column_names:
            line_value = get_nested_value(
                line_contents,
                column_name,
                )
        if isinstance(line_value, str):
            row.append('{0}'.format(line_value.encode('utf-8')))
        elif line_value is not None:
            row.append('{0}'.format(line_value))
        else:
            row.append('')
        return row
    
    if __name__ == '__main__':
            parser = argparse.ArgumentParser(
            description='Convert Yelp data from JSON format to CSV.',
            )
    
            parser.add_argument(
                'json_file',
                type=str,
                help='The json file to convert.',
                )
    
            args = parser.parse_args()
    
            json_file = args.json_file
            csv_file = '{0}.csv'.format(json_file.split('.json')[0])
    
            column_names = get_superset_of_column_names_from_file(json_file)
            read_and_write_file(json_file, csv_file, column_names)