I need to store a lot of messages in an HDFStore, and some of them contain emoticons or special characters like éěščřžýáí. Everything seems to work fine until I try to load the data back; then it crashes with the error below. Here is an example that reproduces the error:
# Minimal reproduction: store a one-row DataFrame holding a non-ASCII string
# in an HDF5 table, then read it back through the same open store.
import pandas as pd
# Start from an empty frame with a single column "A".
df = pd.DataFrame(columns=["A"])
# The value is a single non-ASCII character.
toAppend = {"A": "é"}
df = df.append(toAppend, ignore_index = True)
# Cast the column's values to str before writing.
df['A'] = df['A'].astype(str)
store = pd.HDFStore(r'thiswillcrash.h5')
# Write in 'table' format, asking for the strings to be encoded as UTF-8.
store.put('df', df, format='table', encoding="utf-8")
# Reading the key back is the step that raises the UnicodeDecodeError
# shown in the traceback.
# NOTE(review): the final message ("byte 0xc3 in position 0: unexpected end
# of data") looks like the stored value was cut to one byte — confirm.
d = store["df"]
And here is the error:
UnicodeDecodeError Traceback (most recent call last)
C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in _unconvert_string_array(data, nan_rep, encoding)
4407 dtype = "S{0}".format(itemsize)
-> 4408 data = data.astype(dtype, copy=False).astype(object, copy=False)
4409 except (Exception) as e:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128)
During handling of the above exception, another exception occurred:
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-8-f2a5372d5498> in <module>()
8 store = pd.HDFStore(r'iwillcrash18.h5')
9 store.put('df', df, format='table', encoding="utf-8")
---> 10 d = store["df"]
11 print(d)
C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in __getitem__(self, key)
417 def __getitem__(self, key):
--> 418 return self.get(key)
420 def __setitem__(self, key, value):
C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in get(self, key)
626 if group is None:
627 raise KeyError('No object named %s in the file' % key)
--> 628 return self._read_group(group)
630 def select(self, key, where=None, start=None, stop=None, columns=None,
C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in _read_group(self, group, **kwargs)
1274 s = self._create_storer(group)
1275 s.infer_axes()
-> 1276 return s.read(**kwargs)
C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in read(self, where, columns, **kwargs)
4006 def read(self, where=None, columns=None, **kwargs):
-> 4008 if not self.read_axes(where=where, **kwargs):
4009 return None
C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in read_axes(self, where, **kwargs)
3218 for a in self.axes:
3219 a.set_info(self.info)
-> 3220 a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding)
3222 return True
C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in convert(self, values, nan_rep, encoding)
2071 if _ensure_decoded(self.kind) == u('string'):
2072 self.data = _unconvert_string_array(
-> 2073 self.data, nan_rep=nan_rep, encoding=encoding)
2075 return self
C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in _unconvert_string_array(data, nan_rep, encoding)
4409 except (Exception) as e:
4410 f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object])
-> 4411 data = f(data)
4413 if nan_rep is None:
C:\Users\Filip\Anaconda3\lib\site-packages\numpy\lib\function_base.py in __call__(self, *args, **kwargs)
1698 vargs.extend([kwargs[_n] for _n in names])
-> 1700 return self._vectorize_call(func=func, args=vargs)
1702 def _get_ufunc_and_otypes(self, func, args):
C:\Users\Filip\Anaconda3\lib\site-packages\numpy\lib\function_base.py in _vectorize_call(self, func, args)
1767 for _a in args]
-> 1769 outputs = ufunc(*inputs)
1771 if ufunc.nout == 1:
C:\Users\Filip\Anaconda3\lib\site-packages\pandas\io\pytables.py in <lambda>(x)
4408 data = data.astype(dtype, copy=False).astype(object, copy=False)
4409 except (Exception) as e:
-> 4410 f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object])
4411 data = f(data)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 0: unexpected end of data
I am using pandas 0.16.2 and PyTables 3.2.2.
This was a bug, and it should be fixed now; see this link for more details.