I have 30 .bz2 files that I want to read in. Each file is too large to fit in memory in full, so a fixed-size chunk from each file is sufficient. I then want to join the pieces from all 30 files into a single DataFrame.
import os
import glob

import pandas as pd

path = r'/content/drive/My Drive/'  # use your path
all_files = glob.glob(os.path.join(path, "*.bz2"))  # os.path.join keeps the concatenation OS independent
# Below I read 10,000 lines x 11 columns from each file because of the RAM limit,
# and append the chunks together.
# How do I make it also append each of the 30 files together? My attempt is below.
chunks = (pd.read_json(f, lines=True, chunksize=1000) for f in all_files)
i = 0
chunk_list = []
for chunk in chunks:  # NOTE: each item here is a per-file JsonReader, not a DataFrame chunk
    if i >= 11:
        break
    i += 1
    chunk_list.append(chunk)
df = pd.concat(chunk_list, sort=True)
df
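A minimal sketch of why this attempt fails: the generator above yields one JsonReader per file, so chunk_list ends up holding readers rather than DataFrames, and pd.concat raises a TypeError. The 1,000-row chunks of a single file come from iterating that file's reader:

reader = pd.read_json(all_files[0], lines=True, chunksize=1000)
first_chunk = next(iter(reader))  # a DataFrame of up to 1,000 rows
print(type(first_chunk))          # <class 'pandas.core.frame.DataFrame'>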
Sample .bz2 data can be found at: https://csr.lanl.gov/data/2017.html
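To sanity-check that one of these files really is line-delimited JSON before looping over all 30, you can peek at the first records with the standard-library bz2 module; a minimal sketch (the filename is hypothetical):

import bz2
import json

# Stream-decompress and inspect the first two JSON records.
with bz2.open('/content/drive/My Drive/some_file.bz2', 'rt') as fh:
    for _ in range(2):
        print(json.loads(fh.readline()))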
import os
import glob

import pandas as pd

pd.set_option('display.max_columns', None)

path_to_json = '/content/drive/My Drive/'
json_pattern = os.path.join(path_to_json, '*.bz2')
file_list = glob.glob(json_pattern)

frames = []  # collect one partial DataFrame per file, then concatenate once
for file in file_list:
    chunks = pd.read_json(file, lines=True, chunksize=1000)
    chunk_list = []
    # Keep the first 10 chunks (10,000 lines) of each file, then stop.
    for i, chunk in enumerate(chunks):
        if i >= 10:
            break
        chunk_list.append(chunk)
    frames.append(pd.concat(chunk_list, sort=True))

# A single concat at the end is much faster than growing a DataFrame with
# append inside the loop (DataFrame.append is deprecated in modern pandas).
temp = pd.concat(frames, ignore_index=True, sort=True)
temp
This seems to work.
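If only the first N lines of each file are ever needed, pandas 1.1+ also accepts nrows together with lines=True, which removes the chunk loop entirely. A minimal sketch of that alternative, keeping the 10,000-line cutoff assumed above:

import os
import glob

import pandas as pd

path_to_json = '/content/drive/My Drive/'
file_list = glob.glob(os.path.join(path_to_json, '*.bz2'))

# nrows=10000 reads only the first 10,000 JSON lines of each file;
# bz2 decompression is inferred from the file extension.
df = pd.concat(
    (pd.read_json(f, lines=True, nrows=10000) for f in file_list),
    ignore_index=True,
    sort=True,
)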