I have a massive table (several billion rows), and I need to analyze two numerical variables within it by a) creating a frequency table and b) creating distribution plots.
VarA has a range of 0.00 to 1.00 (in 0.01 increments)
VarB is distributed around 0.00 (in 0.01 increments)
I want to read the file in chunks of 1,000 rows and update the frequency table after each chunk. I have tried the following code:
c_size = 1000
result = {'A': dict(), 'B': dict()}
def update_dict(key, val):
    if val not in result[key]:
        result[key][val] = 1
    else:
        result[key][val] += 1

for data_chunk in pd.read_csv('data.csv', names=['ValA','ValB'], skiprows=10, chunksize=c_size):
    for row in data_chunk:
        valA, valB = row
        update_dict('A', valA)
        update_dict('B', valB)
print(result['A'])
print(result['B'])
Here's the working code. Thanks to @Peter-Du for the help.
import pandas as pd
import numpy as np

c_size = 1000
result = {'VarA': dict(), 'VarB': dict()}

def update_dict(key, val):
    # Count each rounded value, starting at 1 the first time it appears.
    if val not in result[key]:
        result[key][val] = 1
    else:
        result[key][val] += 1

reader = pd.read_csv('file.csv', names=['VarA','VarB'], skiprows=10, chunksize=c_size)
for data_chunk in reader:
    # Iterate over the chunk's underlying NumPy array, one row at a time.
    for row in data_chunk.values:
        valA, valB = row
        # Round to 2 decimals so values fall into the 0.01 increments.
        update_dict('VarA', np.round(valA, 2))
        update_dict('VarB', np.round(valB, 2))
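For part b), the accumulated counts can be drawn as bar charts to approximate the distributions. Below is a minimal sketch using matplotlib (not part of the original code), assuming result has been populated by the loop above; the axis labels and figure size are placeholders.

import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, key in zip(axes, ['VarA', 'VarB']):
    # Sort the bin values so the bars appear in numerical order.
    bins = sorted(result[key])
    counts = [result[key][b] for b in bins]
    ax.bar(bins, counts, width=0.01)
    ax.set_title(key)
    ax.set_xlabel('value (0.01 bins)')
    ax.set_ylabel('count')
plt.tight_layout()
plt.show()

Since the frequency dicts are tiny compared to the raw table, plotting them is cheap no matter how many rows were read.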