Hi I am having trouble merging .csv files using Jupyter Notebook. The reason is that the csv files contain hex values, so when I load these csv files normally I need to use "encoding = 'latin'" I do not know how to do this when I am using the glob module. Here is the code that I am using. ByDistrict79 is the name of all the csv files and they will load, the issue is with pd.concat where I am getting the error code, thank you. Also the structure of the files are all the same.
My Code
import pandas as pd
from glob import glob
stock_files = sorted(glob('ByDistrict79_*.csv'))
stock_files
pd.concat((pd.read_csv(file).assign(filename = file)
for file in stock_files),ignore_index =True)
Error Code
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_tokens()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_with_dtype()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._string_convert()
pandas\_libs\parsers.pyx in pandas._libs.parsers._string_box_utf8()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 14: invalid start byte
During handling of the above exception, another exception occurred:
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-3-f3eb9c7dddcd> in <module>
1 pd.concat((pd.read_csv(file).assign(filename = file)
----> 2 for file in stock_files),ignore_index =True)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\reshape\concat.py in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, sort, copy)
253 verify_integrity=verify_integrity,
254 copy=copy,
--> 255 sort=sort,
256 )
257
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\reshape\concat.py in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy, sort)
299 objs = [objs[k] for k in keys]
300 else:
--> 301 objs = list(objs)
302
303 if len(objs) == 0:
<ipython-input-3-f3eb9c7dddcd> in <genexpr>(.0)
1 pd.concat((pd.read_csv(file).assign(filename = file)
----> 2 for file in stock_files),ignore_index =True)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
683 )
684
--> 685 return _read(filepath_or_buffer, kwds)
686
687 parser_f.__name__ = name
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
461
462 try:
--> 463 data = parser.read(nrows)
464 finally:
465 parser.close()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
1152 def read(self, nrows=None):
1153 nrows = _validate_integer("nrows", nrows)
-> 1154 ret = self._engine.read(nrows)
1155
1156 # May alter columns / col_dict
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
2057 def read(self, nrows=None):
2058 try:
-> 2059 data = self._reader.read(nrows)
2060 except StopIteration:
2061 if self._first_chunk:
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.read()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_rows()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_column_data()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_tokens()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_with_dtype()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._string_convert()
pandas\_libs\parsers.pyx in pandas._libs.parsers._string_box_utf8()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 14: invalid start byte
The pandas read_csv() function accepts an optional encoding parameter. The revised function call would be: pd.read_csv(file, encoding='latin-1')
Also, the UnicodeDecodError
suggests there may be a byte-order mark (BOM) in the file.