I want to create a multiway contingency table from my pandas dataframe and store it in an xarray. It seems to me it ought to be straightforward enough using pandas.crosstab followed by DataFrame.to_xarray(), but I'm getting "TypeError: Cannot interpret 'interval[int64]' as a data type" in pandas v1.1.5. (v1.0.1 gives "ValueError: all arrays must be same length").
In [1]: import numpy as np
...: import pandas as pd
...: pd.__version__
Out[1]: '1.1.5'
In [2]: import xarray as xr
...: xr.__version__
Out[2]: '0.17.0'
In [3]: n = 100
...: np.random.seed(42)
...: x = pd.cut(np.random.uniform(low=0, high=3, size=n), range(5))
...: x
Out[3]:
[(1, 2], (2, 3], (2, 3], (1, 2], (0, 1], ..., (1, 2], (1, 2], (1, 2], (0, 1], (0, 1]]
Length: 100
Categories (4, interval[int64]): [(0, 1] < (1, 2] < (2, 3] < (3, 4]]
In [4]: x.value_counts().sort_index()
Out[4]:
(0, 1] 41
(1, 2] 28
(2, 3] 31
(3, 4] 0
dtype: int64
Note I need my table to include empty categories such as (3, 4].
In [6]: idx=pd.date_range('2001-01-01', periods=n, freq='8H')
...: df = pd.DataFrame({'x': x}, index=idx)
...: df['xlag'] = df.x.shift(1, 'D')
...: df['h'] = df.index.hour
...: xtab = pd.crosstab([df.h, df.xlag], df.x, dropna=False, normalize='index')
...: xtab
Out[6]:
x (0, 1] (1, 2] (2, 3] (3, 4]
h xlag
0 (0, 1] 0.000000 0.700000 0.300000 0.0
(1, 2] 0.470588 0.411765 0.117647 0.0
(2, 3] 0.500000 0.333333 0.166667 0.0
(3, 4] 0.000000 0.000000 0.000000 0.0
8 (0, 1] 0.588235 0.000000 0.411765 0.0
(1, 2] 1.000000 0.000000 0.000000 0.0
(2, 3] 0.428571 0.142857 0.428571 0.0
(3, 4] 0.000000 0.000000 0.000000 0.0
16 (0, 1] 0.333333 0.250000 0.416667 0.0
(1, 2] 0.444444 0.222222 0.333333 0.0
(2, 3] 0.454545 0.363636 0.181818 0.0
(3, 4] 0.000000 0.000000 0.000000 0.0
That's fine, but my actual application has more categories and more dimensions, so this seems a clear use-case for xarray, but I get an error:
In [8]: xtab.to_xarray()
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-8-aaedf730bb97> in <module>
----> 1 xtab.to_xarray()
/opt/scitools/environments/default/2021_03_18-1/lib/python3.6/site-packages/pandas/core/generic.py in to_xarray(self)
2818 return xarray.DataArray.from_series(self)
2819 else:
-> 2820 return xarray.Dataset.from_dataframe(self)
2821
2822 @Substitution(returns=fmt.return_docstring)
/opt/scitools/environments/default/2021_03_18-1/lib/python3.6/site-packages/xarray/core/dataset.py in from_dataframe(cls, dataframe, sparse)
5131 obj._set_sparse_data_from_dataframe(idx, arrays, dims)
5132 else:
-> 5133 obj._set_numpy_data_from_dataframe(idx, arrays, dims)
5134 return obj
5135
/opt/scitools/environments/default/2021_03_18-1/lib/python3.6/site-packages/xarray/core/dataset.py in _set_numpy_data_from_dataframe(self, idx, arrays, dims)
5062 data = np.zeros(shape, values.dtype)
5063 data[indexer] = values
-> 5064 self[name] = (dims, data)
5065
5066 @classmethod
/opt/scitools/environments/default/2021_03_18-1/lib/python3.6/site-packages/xarray/core/dataset.py in __setitem__(self, key, value)
1427 )
1428
-> 1429 self.update({key: value})
1430
1431 def __delitem__(self, key: Hashable) -> None:
/opt/scitools/environments/default/2021_03_18-1/lib/python3.6/site-packages/xarray/core/dataset.py in update(self, other)
3897 Dataset.assign
3898 """
-> 3899 merge_result = dataset_update_method(self, other)
3900 return self._replace(inplace=True, **merge_result._asdict())
3901
/opt/scitools/environments/default/2021_03_18-1/lib/python3.6/site-packages/xarray/core/merge.py in dataset_update_method(dataset, other)
958 priority_arg=1,
959 indexes=indexes,
--> 960 combine_attrs="override",
961 )
/opt/scitools/environments/default/2021_03_18-1/lib/python3.6/site-packages/xarray/core/merge.py in merge_core(objects, compat, join, combine_attrs, priority_arg, explicit_coords, indexes, fill_value)
609 coerced = coerce_pandas_values(objects)
610 aligned = deep_align(
--> 611 coerced, join=join, copy=False, indexes=indexes, fill_value=fill_value
612 )
613 collected = collect_variables_and_indexes(aligned)
/opt/scitools/environments/default/2021_03_18-1/lib/python3.6/site-packages/xarray/core/alignment.py in deep_align(objects, join, copy, indexes, exclude, raise_on_invalid, fill_value)
428 indexes=indexes,
429 exclude=exclude,
--> 430 fill_value=fill_value,
431 )
432
/opt/scitools/environments/default/2021_03_18-1/lib/python3.6/site-packages/xarray/core/alignment.py in align(join, copy, indexes, exclude, fill_value, *objects)
352 if not valid_indexers:
353 # fast path for no reindexing necessary
--> 354 new_obj = obj.copy(deep=copy)
355 else:
356 new_obj = obj.reindex(
/opt/scitools/environments/default/2021_03_18-1/lib/python3.6/site-packages/xarray/core/dataset.py in copy(self, deep, data)
1218 """
1219 if data is None:
-> 1220 variables = {k: v.copy(deep=deep) for k, v in self._variables.items()}
1221 elif not utils.is_dict_like(data):
1222 raise ValueError("Data must be dict-like")
/opt/scitools/environments/default/2021_03_18-1/lib/python3.6/site-packages/xarray/core/dataset.py in <dictcomp>(.0)
1218 """
1219 if data is None:
-> 1220 variables = {k: v.copy(deep=deep) for k, v in self._variables.items()}
1221 elif not utils.is_dict_like(data):
1222 raise ValueError("Data must be dict-like")
/opt/scitools/environments/default/2021_03_18-1/lib/python3.6/site-packages/xarray/core/variable.py in copy(self, deep, data)
2632 """
2633 if data is None:
-> 2634 data = self._data.copy(deep=deep)
2635 else:
2636 data = as_compatible_data(data)
/opt/scitools/environments/default/2021_03_18-1/lib/python3.6/site-packages/xarray/core/indexing.py in copy(self, deep)
1484 # 8000341
1485 array = self.array.copy(deep=True) if deep else self.array
-> 1486 return PandasIndexAdapter(array, self._dtype)
/opt/scitools/environments/default/2021_03_18-1/lib/python3.6/site-packages/xarray/core/indexing.py in __init__(self, array, dtype)
1407 dtype_ = array.dtype
1408 else:
-> 1409 dtype_ = np.dtype(dtype)
1410 self._dtype = dtype_
1411
TypeError: Cannot interpret 'interval[int64]' as a data type
I can avoid the error by converting x (and xlag) to a different dtype instead of pandas.Categorical before using pandas.crosstab, but then I lose any empty categories, which I need to keep in my real application.
The issue here is not the use of a CategoricalIndex, but that the category labels (x.categories) are an IntervalIndex, which xarray doesn't like.
To remedy this, you can simply replace the categories within your x
variable with their string representation, which coerces x.categories
to be an "object" dtype instead of an "interval[int64]" dtype:
x = (
pd.cut(np.random.uniform(low=0, high=3, size=n), range(5))
.rename_categories(str)
)
Then rebuild df from this new x (including the xlag and h columns) and calculate your crosstab as you have already done, and it should work!
To get your dataset in the coordinates you want (I think), all you need to do is stack everything into a single MultiIndex row shape (instead of the crosstab's MultiIndex row / Index column shape).
xtab = (
pd.crosstab([df.h, df.xlag], df.x, dropna=False, normalize="index")
.stack()
.reorder_levels(["x", "h", "xlag"])
.sort_index()
)
xtab.to_xarray()
If you want to shorten your code and lose some of the explicit ordering of index levels, you can also use unstack instead of stack, which gives you the correct ordering right away:
xtab = (
pd.crosstab([df.h, df.xlag], df.x, dropna=False, normalize="index")
.unstack([0, 1])
)
xtab.to_xarray()
Regardless of whether you use the stack() or the unstack([0, 1]) approach, you get this output:
<xarray.DataArray (x: 4, h: 3, xlag: 4)>
array([[[0. , 0.47058824, 0.5 , 0. ],
[0.58823529, 1. , 0.42857143, 0. ],
[0.33333333, 0.44444444, 0.45454545, 0. ]],
[[0.7 , 0.41176471, 0.33333333, 0. ],
[0. , 0. , 0.14285714, 0. ],
[0.25 , 0.22222222, 0.36363636, 0. ]],
[[0.3 , 0.11764706, 0.16666667, 0. ],
[0.41176471, 0. , 0.42857143, 0. ],
[0.41666667, 0.33333333, 0.18181818, 0. ]],
[[0. , 0. , 0. , 0. ],
[0. , 0. , 0. , 0. ],
[0. , 0. , 0. , 0. ]]])
Coordinates:
* x (x) object '(0, 1]' '(1, 2]' '(2, 3]' '(3, 4]'
* h (h) int64 0 8 16
* xlag (xlag) object '(0, 1]' '(1, 2]' '(2, 3]' '(3, 4]'