I have the following code:
import pandas as pd
x = [u'string with some unicode: \x16']
df = pd.DataFrame(x)
If I try to write this dataframe as an excel file:
df.to_excel("test.xlsx")
Or, if I try to write this dataframe as an excel file, with utf-8 encoding:
ew = pd.ExcelWriter('test.xlsx',options={'encoding':'utf-8'})
df.to_excel(ew)
I get the following error:
IllegalCharacterError Traceback (most recent call last)
<ipython-input-4-62adec25ae8d> in <module>()
1 ew = pd.ExcelWriter('test.xlsx',options={'encoding':'utf-8'})
2 #df.to_excel("test.xlsx")
----> 3 df.to_excel(ew)
/usr/local/lib/python2.7/dist-packages/pandas/util/decorators.pyc in wrapper(*args, **kwargs)
86 else:
87 kwargs[new_arg_name] = new_arg_value
---> 88 return func(*args, **kwargs)
89 return wrapper
90 return _deprecate_kwarg
/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in to_excel(self, excel_writer, sheet_name, na_rep, float_format, columns, header, index, index_label, startrow, startcol, engine, merge_cells, encoding, inf_rep)
1258 formatted_cells = formatter.get_formatted_cells()
1259 excel_writer.write_cells(formatted_cells, sheet_name,
-> 1260 startrow=startrow, startcol=startcol)
1261 if need_save:
1262 excel_writer.save()
/usr/local/lib/python2.7/dist-packages/pandas/io/excel.pyc in write_cells(self, cells, sheet_name, startrow, startcol)
679 colletter = get_column_letter(startcol + cell.col + 1)
680 xcell = wks.cell("%s%s" % (colletter, startrow + cell.row + 1))
--> 681 xcell.value = _conv_value(cell.val)
682 style_kwargs = {}
683
/usr/local/lib/python2.7/dist-packages/openpyxl/cell/cell.pyc in value(self, value)
360 def value(self, value):
361 """Set the value and infer type and display options."""
--> 362 self._bind_value(value)
363
364 @property
/usr/local/lib/python2.7/dist-packages/openpyxl/cell/cell.pyc in _bind_value(self, value)
269 elif self.guess_types:
270 value = self._infer_value(value)
--> 271 self.set_explicit_value(value, self.data_type)
272
273
/usr/local/lib/python2.7/dist-packages/openpyxl/cell/cell.pyc in set_explicit_value(self, value, data_type)
235 raise ValueError('Invalid data type: %s' % data_type)
236 if isinstance(value, STRING_TYPES):
--> 237 value = self.check_string(value)
238 self._value = value
239 self.data_type = data_type
/usr/local/lib/python2.7/dist-packages/openpyxl/cell/cell.pyc in check_string(self, value)
220 value = value[:32767]
221 if next(ILLEGAL_CHARACTERS_RE.finditer(value), None):
--> 222 raise IllegalCharacterError
223 return value
224
IllegalCharacterError:
How can I write a pandas dataframe containing unicode to an excel file?
Not a Unicode issue as such... \x16
(or in Unicode strings \u0016
refers to the same character) is ASCII control code 22 (SYN). Pandas says it's invalid to have control codes (other than tab and newlines) in an Excel file, and though I don't know much about Excel files it would certainly be impossible to include them in an XML 1.0 file, which is what's inside a xlsx.
So most likely there is no way to include arbitrary character sequences (with control codes) in an Excel. You should filter them out before writing, or if you really need to preserve the original data use some form of ad hoc encoding recognised only by your application.