I am trying to get the name of the file in which each bad line occurs. I have a function that collects all the bad lines and writes them to a .txt file, but when I pass in a parameter for the filename, it just prints all the filenames.
This is the bad line function:
def badlines_collect(self, bad_line: list[str]) -> None:
    """Record one malformed CSV row and append it to today's bad-line log.

    Used as the ``on_bad_lines`` callback of ``pandas.read_csv`` (python
    engine), which calls it once per row it cannot parse.

    Args:
        bad_line: The fields of the offending row.

    Returns:
        None, which tells pandas to skip the offending row.
    """
    badline_lst.append(bad_line)

    # Read the clock once so the log file's name and the date written inside
    # it cannot disagree when a run crosses midnight.
    now = datetime.datetime.now()
    log_path = "bad_line1_{}.txt".format(now.strftime("%Y%m%d"))

    # Append only the new record. Rewriting the whole list on every call
    # stamped every earlier bad line with whatever file was current at the
    # time of the latest call, so lines were attributed to the wrong file.
    # NOTE(review): `currentfile` is not defined in this function; it is
    # assumed to be a module-level name kept up to date by the caller --
    # confirm, or pass the file name in explicitly.
    # UTF-8 is forced because the CSV is decoded as UTF-8 and a bad row may
    # hold characters the platform default encoding cannot write.
    with open(log_path, "a", encoding="utf-8") as fp:
        fp.write(f"Today's date: {now.date()} {currentfile}: {bad_line}\n")

    print(badline_lst)
    return None
This is the function where I am calling it and passing in a parameter to get the filename:
def getCSV(self, cur_publisher):
    """Download every CSV blob under a prefix and combine the non-empty ones.

    Each blob whose name contains ``.csv`` is downloaded, parsed with
    pandas (malformed rows are routed to ``self.badlines_collect``), tagged
    with a leading ``filename`` column and appended to a combined frame.

    Args:
        cur_publisher: Prefix passed to ``self.bucket.list_blobs``.

    Returns:
        ``self.stack``.
        NOTE(review): the combined frame built here is only printed; it is
        never stored on ``self`` or returned -- confirm that is intended.
    """
    print(bucket_name + '/' + cur_publisher)
    dfm = pd.DataFrame()
    blobs = list(self.bucket.list_blobs(prefix=cur_publisher))
    print(blobs)

    for blob in blobs:
        # Guard clause: ignore anything that is not a CSV object.
        if '.csv' not in str(blob.name):
            continue

        print("Crawling on File {} ......\n".format(blob.name))
        # NOTE(review): `currentfile` is local to this method, so the
        # bad-line callback cannot read it; pass the name to the callback
        # explicitly if it needs to know which file is being parsed.
        currentfile = blob.name
        print(currentfile)

        blop = self.bucket.blob(blob_name=blob.name)
        data = blop.download_as_string()
        df = pd.read_csv(
            io.BytesIO(data),
            encoding='utf-8',
            sep=",",
            engine='python',
            on_bad_lines=self.badlines_collect,
        )

        if df.count().sum() > 0:
            df.insert(0, "filename", blob.name)
            # Concatenate exactly once; the previous duplicated call added
            # every file's rows to the combined frame twice.
            dfm = pd.concat([dfm, df], ignore_index=True)
            dfm = dfm.rename_axis(index='', columns="index")
            print(dfm)
        else:
            print("{} is empty \n".format(blob.name))

    return self.stack
The result I get is that all the filenames in the GCS bucket are written to bad_line1.txt instead of the bad-line errors.
def badlines_collect(self, bad_line: list[str], filename: str) -> None:
    """Record one malformed CSV row and append it, with its file name, to today's log.

    Args:
        bad_line: The fields of the offending row.
        filename: Name of the file the row came from; written at the start
            of the log record.

    Returns:
        None. When used (via a wrapper) as a ``pandas.read_csv``
        ``on_bad_lines`` callback, this tells pandas to skip the row.
    """
    badline_lst.append(bad_line)

    # Read the clock once so the log file's name and the date written inside
    # it cannot disagree when a run crosses midnight.
    now = datetime.datetime.now()
    log_path = "C:\\badline_log_{}.txt".format(now.strftime("%Y%m%d"))

    # Append only the new record. Rewriting the whole list on every call
    # stamped every earlier bad line with the *latest* `filename`, so lines
    # from previous files were attributed to the wrong file.
    # UTF-8 is forced because a bad row may hold characters the platform
    # default encoding cannot write.
    with open(log_path, "a", encoding="utf-8") as fp:
        fp.write(f"{filename} Today's date: {now.date()}: {bad_line}\n")
# pandas calls the on_bad_lines callback with the bad row only, so wrap the
# method in a lambda that also supplies the name of the file being parsed.
df = pd.read_csv(
    io.BytesIO(data),
    encoding='utf-8',
    sep=",",
    engine='python',
    on_bad_lines=lambda bad_row: self.badlines_collect(bad_row, file_name.name),
)
I was able to get the filename by doing it this way.