I have written a preprocessing script in Python that helps with consolidating confidence values. Below is my script:
import pandas as pd
import numpy as np
from pathlib import Path
import glob as glob
# Directory containing the CSV files to pre-process.
inp_dir = Path(r'C:/Users/jtharian/Desktop/bbc/')

# Rewrite every CSV in the directory in place, replacing missing values in
# the 'confidence' column with 0.01.
for file in inp_dir.glob('*.csv'):
    df = pd.read_csv(file, sep=',', quotechar='|', error_bad_lines=False)
    # NOTE: this lookup raises KeyError when a file has no 'confidence'
    # column, and the error does not say which file was being processed.
    df['confidence'] = df['confidence'].replace(np.nan, 0.01)
    # Overwrite the source file; index=False keeps the row index out of it.
    df.to_csv(file, index=False)
Error:
Traceback (most recent call last):
File "C:\Users\jtharian\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 3080, in get_loc
return self._engine.get_loc(casted_key)
File "pandas\_libs\index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 4554, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 4562, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'confidence'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "<ipython-input-1-0cbf17caf540>", line 11, in <module>
df['confidence'] = df['confidence'].replace(np.nan, 0.01)
File "C:\Users\jtharian\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py", line 3024, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Users\jtharian\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 3082, in get_loc
raise KeyError(key) from err
KeyError: 'confidence'
I understand that I am receiving this error because one of the files in my directory does not have the column 'confidence'. But how can I locate that file, or print its file name?
Maybe check whether the column names include 'confidence', and if not,
print the file name and break:
import pandas as pd
import numpy as np
from pathlib import Path
import glob as glob
# Directory containing the CSV files to pre-process.
inp_dir = Path(r'C:/Users/jtharian/Desktop/bbc/')

# Rewrite each CSV in place, replacing missing 'confidence' values with 0.01.
# Processing stops at the first file that lacks the column, after printing
# that file's path so it can be located.
for file in inp_dir.glob('*.csv'):
    df = pd.read_csv(file, sep=',', quotechar='|', error_bad_lines=False)
    if 'confidence' not in df.columns:
        # Report the offending file instead of failing with a bare KeyError.
        print(f'filename: {file}')
        break
    df['confidence'] = df['confidence'].replace(np.nan, 0.01)
    # Overwrite the source file; index=False keeps the row index out of it.
    df.to_csv(file, index=False)