I have a folder containing around 400 txt files. The maximum size of a txt file is 2 to 2.5 MB.
I am trying to convert these files to csv with Python code. My code works perfectly and quickly converts txt to csv when the txt files are small (even with more than 500 files), but when the files are a little heavier it takes quite a long time.
Obviously heavy data takes longer, but the problem is that this conversion process has been running for 2 days now and not even 50% is complete.
Is there any way to convert these txt files to csv quickly? I mean within a few hours. If it takes more than 2 days, I will not have enough time to analyze the data.
My code is here:
import glob
import pandas as pd
from natsort import natsorted

data_folder = "./all/"
files = natsorted(glob.glob(data_folder + 'dump*.data'))
number_of_files = len(files)
#print(number_of_files)
#files

# read one file to get the column headers
with open("./all/dump80000.data") as f:
    lines = f.readlines()

# removing 'ITEM:'
s = 'ITEM: ATOMS '
lines[8] = lines[8].replace(s, '')

# getting the header names
headers = lines[8].split()
headers.append('TIMESTEP')

df = pd.DataFrame(columns=headers)

counter = 0
for total_files in range(number_of_files):
    with open(files[total_files]) as f:
        lines = f.readlines()
    total_atoms = int(lines[3])
    for i in range(total_atoms):
        row_elements = lines[9 + i].split()
        row_elements.append(int(lines[1]))
        df.loc[counter] = row_elements
        counter = counter + 1

df.to_csv(r'all.csv', index=False)
Any ideas or suggestions?
Thank you.
In case you need a txt sample:
https://raw.githubusercontent.com/Laudarisd/test/main/dump46000.data
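The slow part of the posted code is almost certainly the per-row df.loc[counter] = row_elements assignment: pandas re-allocates the growing frame on every insertion, so the loop gets slower and slower as the DataFrame grows. Here is a minimal sketch that keeps pandas but collects the rows in a plain Python list and builds the DataFrame once at the end (assuming the same 9-line preamble layout as the posted code):

import glob
import pandas as pd
from natsort import natsorted

files = natsorted(glob.glob('./all/dump*.data'))

# build the header from the first file, as before
with open(files[0]) as f:
    lines = f.readlines()
headers = lines[8].replace('ITEM: ATOMS ', '').split()
headers.append('TIMESTEP')

rows = []
for path in files:
    with open(path) as f:
        lines = f.readlines()
    total_atoms = int(lines[3])
    timestep = int(lines[1])
    for i in range(total_atoms):
        row = lines[9 + i].split()
        row.append(timestep)
        rows.append(row)

# construct the DataFrame once instead of growing it row by row
df = pd.DataFrame(rows, columns=headers)
df.to_csv('all.csv', index=False)

Building the frame in one call avoids the quadratic behavior of repeated df.loc insertions.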
How about using a simple readline loop? I suspect readlines and/or pd.DataFrame are consuming most of the time. The following seems to be fast enough for me.
import glob
import time

start = time.time()

data_folder = "./all/"
files = glob.glob(data_folder + 'dump*.data')

# get header from one of the files
with open('all/dump46000.data', 'r') as f:
    for _ in range(8):
        next(f)  # skip first 8 lines
    header = ','.join(f.readline().split()[2:]) + '\n'

with open('all.csv', 'w') as g:
    g.write(header)  # write the header once, not once per file

for file in files:
    with open(file, 'r') as f, open('all.csv', 'a') as g:  # note the 'a'
        for _ in range(9):
            next(f)  # skip first 9 lines
        for line in f:
            g.write(line.rstrip().replace(' ', ',') + '\n')

print(time.time() - start)
# id,type,x,y,z,vx,vy,vz,fx,fy,fz
# 201,1,0.00933075,-0.195667,1.53332,-0.000170702,-0.000265168,0.000185569,0.00852572,-0.00882728,-0.0344813
# 623,1,-0.101572,-0.159675,1.52102,-0.000125008,-0.000129469,6.1561e-05,0.0143586,-0.0020444,-0.0400259
# 851,1,-0.0654623,-0.176443,1.52014,-0.00017815,-0.000224676,0.000329338,0.0101743,0.00116504,-0.0344114
# 159,1,-0.0268728,-0.186269,1.51979,-0.000262947,-0.000386994,0.000254515,0.00961213,-0.00640215,-0.0397847
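One thing to note: the loop above drops the TIMESTEP column that the question's code appended to every row. If you need that column, here is a small variant of the same approach (a sketch, assuming the timestep value sits on the second line of each dump file, as the int(lines[1]) in the question implies):

import glob

files = glob.glob('./all/dump*.data')

# header from the first file, with the extra TIMESTEP column
with open(files[0], 'r') as f:
    for _ in range(8):
        next(f)                              # skip the first 8 lines
    header = ','.join(f.readline().split()[2:]) + ',TIMESTEP\n'

with open('all.csv', 'w') as g:
    g.write(header)
    for file in files:
        with open(file, 'r') as f:
            next(f)                          # first preamble line
            timestep = f.readline().strip()  # second line holds the timestep
            for _ in range(7):
                next(f)                      # skip the rest of the 9-line preamble
            for line in f:
                g.write(line.rstrip().replace(' ', ',') + ',' + timestep + '\n')

Either way, the finished all.csv can then be loaded for analysis with a single pd.read_csv('all.csv') call, which is far cheaper than building a DataFrame row by row.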