I got this not-so-helpful traceback after incorrectly constructing a DataFrame with a 2-dimensional index. My question: is this a bug (or feature request) that I should report to pandas as an issue, or am I doing something wrong?
What I wanted to do:
index = pd.Index(np.array([0, 1]))
df = pd.DataFrame({'A': [0, 1], 'B': [1.1, 1.2]},
index=index)
print(df)
A B
0 0 1.1
1 1 1.2
(works, no problem)
What I actually did (note the dimensions of the index array: it is 2-D, with shape (2, 1), instead of 1-D):
index = pd.Index(np.array([[0], [1]]))
df = pd.DataFrame({'A': [0, 1], 'B': [1.1, 1.2]},
index=index)
print(df)
Traceback message (very long):
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-95-af090c2ae470> in <module>
2 df = pd.DataFrame({'A': [0, 1], 'B': [1.1, 1.2]},
3 index=index)
----> 4 print(df)
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/frame.py in __repr__(self)
653 max_cols=max_cols,
654 line_width=width,
--> 655 show_dimensions=show_dimensions,
656 )
657
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/frame.py in to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, min_rows, max_cols, show_dimensions, decimal, line_width)
774 line_width=line_width,
775 )
--> 776 formatter.to_string()
777
778 if buf is None:
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in to_string(self)
686 else:
687
--> 688 strcols = self._to_str_columns()
689 if self.line_width is None: # no need to wrap around just print
690 # the whole frame
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in _to_str_columns(self)
586 # may include levels names also
587
--> 588 str_index = self._get_formatted_index(frame)
589
590 if not is_list_like(self.header) and not self.header:
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in _get_formatted_index(self, frame)
919 )
920 else:
--> 921 fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)]
922
923 fmt_index = [
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/indexes/base.py in format(self, name, formatter, **kwargs)
1106 return header + list(self.map(formatter))
1107
-> 1108 return self._format_with_header(header, **kwargs)
1109
1110 def _format_with_header(self, header, na_rep="NaN", **kwargs):
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/indexes/base.py in _format_with_header(self, header, na_rep, **kwargs)
1130
1131 else:
-> 1132 result = _trim_front(format_array(values, None, justify="left"))
1133 return header + result
1134
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space)
1031 )
1032
-> 1033 return fmt_obj.get_result()
1034
1035
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in get_result(self)
1062
1063 def get_result(self):
-> 1064 fmt_values = self._format_strings()
1065 return _make_fixed_width(fmt_values, self.justify)
1066
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in _format_strings(self)
1293 def _format_strings(self):
1294 formatter = self.formatter or (lambda x: "{x: d}".format(x=x))
-> 1295 fmt_values = [formatter(x) for x in self.values]
1296 return fmt_values
1297
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in <listcomp>(.0)
1293 def _format_strings(self):
1294 formatter = self.formatter or (lambda x: "{x: d}".format(x=x))
-> 1295 fmt_values = [formatter(x) for x in self.values]
1296 return fmt_values
1297
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/io/formats/format.py in <lambda>(x)
1292 class IntArrayFormatter(GenericArrayFormatter):
1293 def _format_strings(self):
-> 1294 formatter = self.formatter or (lambda x: "{x: d}".format(x=x))
1295 fmt_values = [formatter(x) for x in self.values]
1296 return fmt_values
TypeError: unsupported format string passed to numpy.ndarray.__format__
Note that the dataframe was constructed (it just can't be printed):
In [14]: df.shape
Out[14]: (2, 2)
In [15]: df.index
Out[15]: Int64Index([[0], [1]], dtype='int64')
In [16]: df.values
Out[16]:
array([[0. , 1.1],
[1. , 1.2]])
In [18]: df.columns
Out[18]: Index(['A', 'B'], dtype='object')
In [19]: df.index[0]
Out[19]: array([0])
In [20]: df.index.dtype
Out[20]: dtype('int64')
Also note that if you make the same mistake with the data arguments...
index = pd.Index(np.array([0, 1]))
df = pd.DataFrame({'A': np.array([[0], [1]]), 'B': np.array([[1.1], [1.2]])},
index=index)
print(df)
...you get a nice, informative error message:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-96-218c77e99705> in <module>
1 index = pd.Index(np.array([0, 1]))
2 df = pd.DataFrame({'A': np.array([[0], [1]]), 'B': np.array([[1.1], [1.2]])},
----> 3 index=index)
4 print(df)
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
409 )
410 elif isinstance(data, dict):
--> 411 mgr = init_dict(data, index, columns, dtype=dtype)
412 elif isinstance(data, ma.MaskedArray):
413 import numpy.ma.mrecords as mrecords
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/internals/construction.py in init_dict(data, index, columns, dtype)
255 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
256 ]
--> 257 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
258
259
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype)
80
81 # don't force copy because getting jammed in an ndarray anyway
---> 82 arrays = _homogenize(arrays, index, dtype)
83
84 # from BlockManager perspective
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/internals/construction.py in _homogenize(data, index, dtype)
321 val = lib.fast_multiget(val, oindex.values, default=np.nan)
322 val = sanitize_array(
--> 323 val, index, dtype=dtype, copy=False, raise_cast_failure=False
324 )
325
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/internals/construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
727 elif subarr.ndim > 1:
728 if isinstance(data, np.ndarray):
--> 729 raise Exception("Data must be 1-dimensional")
730 else:
731 subarr = com.asarray_tuplesafe(data, dtype=dtype)
Exception: Data must be 1-dimensional
Version info (pd.__version__, np.__version__):
('0.25.3', '1.17.4')
('0.24.2', '1.16.2')
(I don't like to raise issues until I'm sure it's something worth considering).
I raised this issue on the NumPy GitHub and was advised that it is an issue with pandas. However, I also checked with the latest version of pandas (1.0.0), and it seems to have been fixed there: constructing the index from a 2-D array now fails immediately with a clear error:
>>> index = pd.Index(np.array([[0], [1]]))
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-5-ce554b72776c> in <module>
----> 1 index = pd.Index(np.array([[0], [1]]))
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, tupleize_cols, **kwargs)
388 # maybe coerce to a sub-class
389 if is_signed_integer_dtype(data.dtype):
--> 390 return Int64Index(data, copy=copy, dtype=dtype, name=name)
391 elif is_unsigned_integer_dtype(data.dtype):
392 return UInt64Index(data, copy=copy, dtype=dtype, name=name)
/anaconda3/envs/torch/lib/python3.7/site-packages/pandas/core/indexes/numeric.py in __new__(cls, data, dtype, copy, name)
76 if subarr.ndim > 1:
77 # GH#13601, GH#20285, GH#27125
---> 78 raise ValueError("Index data must be 1-dimensional")
79
80 name = maybe_extract_name(name, data, cls)
ValueError: Index data must be 1-dimensional