I am processing a large amount of text data (11 million rows), and I get the error below. Is there a way I can trace the row of text that is causing this error?
My code:
from difflib import ndiff # find differences between strings
import pandas as pd
from tqdm import tqdm # add a timer to pandas apply()
# Hook tqdm into pandas so that progress_apply() becomes available
# on DataFrames (it shows a progress bar while apply() runs).
tqdm.pandas()

# Load every keystroke record from the pipe-delimited CSV file.
dat = pd.read_csv(
    "all_ks_dat_good.csv",
    delimiter="|",
    encoding="ISO-8859-1",
)
# use the ndiff function to find additions to strings, i.e. c[0]=='+'
def diff(x):
    """Return the characters added between two text fields of a row.

    Both ``x['last_text']`` and ``x['scrubbed_text']`` are converted with
    ``str()`` and compared character by character with ``ndiff``. Every
    character that ``ndiff`` marks as an addition (entry starting with
    ``'+'``) is collected, in order, and the result is returned as one
    string.
    """
    before = str(x['last_text'])
    after = str(x['scrubbed_text'])

    added_chars = []
    for entry in ndiff(before, after):
        # ndiff entries look like "+ c", "- c", "  c" or "? ..."; only
        # the "+" entries are additions, and their last character is
        # the added character itself.
        if entry[0] == '+':
            added_chars.append(entry[-1])
    return ''.join(added_chars)
# Compute the added keystrokes for every row and store them in a new
# 'add_ks' column; progress_apply() is tqdm's drop-in for apply().
dat["add_ks"] = dat.progress_apply(diff, axis=1)

# Write the augmented table back out as a pipe-delimited UTF-8 file.
dat.to_csv(
    "all_ks_word_dat.csv",
    sep="|",
    encoding="utf-8",
)
The abridged error:
File "/home/goodkindan/.conda/envs/ks0/lib/python3.11/difflib.py", line 997, in _fancy_helper
yield from g
File "/home/goodkindan/.conda/envs/ks0/lib/python3.11/difflib.py", line 985, in _fancy_replace
yield from self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
File "/home/goodkindan/.conda/envs/ks0/lib/python3.11/difflib.py", line 997, in _fancy_helper
yield from g
File "/home/goodkindan/.conda/envs/ks0/lib/python3.11/difflib.py", line 915, in _fancy_replace
cruncher = SequenceMatcher(self.charjunk)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/goodkindan/.conda/envs/ks0/lib/python3.11/difflib.py", line 182, in __init__
self.set_seqs(a, b)
File "/home/goodkindan/.conda/envs/ks0/lib/python3.11/difflib.py", line 194, in set_seqs
self.set_seq2(b)
File "/home/goodkindan/.conda/envs/ks0/lib/python3.11/difflib.py", line 248, in set_seq2
self.__chain_b()
File "/home/goodkindan/.conda/envs/ks0/lib/python3.11/difflib.py", line 288, in __chain_b
for elt in b2j.keys():
For debugging purposes, you could iterate over the DataFrame with pandas' `iterrows()`, call `diff` on each row inside a `try` block, and print any row that raises an error, like this:
# Debugging aid: run diff() on each row individually and report every
# row for which it raises, instead of letting the whole apply() abort.
for row_label, row in dat.iterrows():
    try:
        diff(row)
    except Exception as exc:
        # A broad catch is deliberate here: the goal is to find out which
        # rows fail and why, whatever the error type turns out to be.
        # Report the row's index label and the exception so the offending
        # record can be located in the source data, then keep scanning.
        print(f"diff() failed on row {row_label!r}: "
              f"{type(exc).__name__}: {exc}")
        print(row)