Search code examples
pythonpandasmatrix-multiplication

Python Matrix Multiplication '<' not supported between different datatypes


I am using Python, and would like to get a calculated number (price * ratio) from two data frames for each group:

Table 1: df1

Group Category price_1 price_2 price_3 price_4
a Single 20.1 19.8 19.7 19.9
a Multi 25.1 26.8 24.7 24.9
b Multi 27.1 27.8 27.7 26.9

Table 2: df2

Group Category ratio_1 ratio_2 ratio_3 ratio_4
a Single 1.0 0.8 0.7 0.5
a Multi 1.0 0.7 0.6 0.4
b Multi 1.0 0.7 0.5 0.3

Desired Output: df

Group Category value
a Single 59.68
a Multi 68.64
b Multi 68.48

Example, for Group = 'b' and Category = 'Multi', value = 27.1 * 1.0 + 27.8 * 0.7 + 27.7 * 0.5 + 26.9 * 0.3 = 68.48

Input tables: 'Group' and 'Category' in df1 and df2 are 'string' data type; the rest columns like 'ratio_1' & 'price_1' are 'float' data type.

Output table: 'Group' and 'Category' are 'string' data type and 'value' is 'float' data type.

With the following code that someone has helped in another thread:

idx_cols = ['Group', 'Category']
df1 = df1.set_index(idx_cols)
df1.columns = df1.columns.str.rsplit('_', n=1, expand=True)
df2 = df2.set_index(idx_cols)
df2.columns = df2.columns.str.rsplit('_', n=1, expand=True)

df = df1['price'].mul(df2['ratio']).sum(axis=1).reset_index(name="value")

Updates: As advised, I've changed the datatypes for df1 and df2 all as 'object' but I got the following full error messages:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/algorithms.py in safe_sort(values, codes, na_sentinel, assume_unique, verify)
   2014         try:
-> 2015             sorter = values.argsort()
   2016             ordered = values.take(sorter)

TypeError: '<' not supported between instances of 'float' and 'str'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-363-e5c47716d89d> in <module>
      8 
      9 # Compute DF3
---> 10 ltv_output = df1['price'].mul(df2['ratio']).sum(axis=1).reset_index(name="ltv_value")

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/ops/__init__.py in f(self, other, axis, level, fill_value)
    764             pass_op = pass_op if not is_logical else op
    765 
--> 766             left, right = self.align(other, join="outer", level=level, copy=False)
    767             new_data = left._combine_frame(right, pass_op, fill_value)
    768             return left._construct_result(new_data)

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in align(self, other, join, axis, level, copy, fill_value, method, limit, fill_axis, broadcast_axis)
   3822         broadcast_axis=None,
   3823     ) -> "DataFrame":
-> 3824         return super().align(
   3825             other,
   3826             join=join,

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in align(self, other, join, axis, level, copy, fill_value, method, limit, fill_axis, broadcast_axis)
   8463             axis = self._get_axis_number(axis)
   8464         if isinstance(other, ABCDataFrame):
-> 8465             return self._align_frame(
   8466                 other,
   8467                 join=join,

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in _align_frame(self, other, join, axis, level, copy, fill_value, method, limit, fill_axis)
   8510         if axis is None or axis == 0:
   8511             if not self.index.equals(other.index):
-> 8512                 join_index, ilidx, iridx = self.index.join(
   8513                     other.index, how=join, level=level, return_indexers=True
   8514                 )

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in join(self, other, how, level, return_indexers, sort)
   3308                 )
   3309             else:
-> 3310                 return self._join_non_unique(
   3311                     other, how=how, return_indexers=return_indexers
   3312                 )

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in _join_non_unique(self, other, how, return_indexers)
   3426         from pandas.core.reshape.merge import _get_join_indexers
   3427 
-> 3428         left_idx, right_idx = _get_join_indexers(
   3429             [self._ndarray_values], [other._ndarray_values], how=how, sort=True
   3430         )

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/reshape/merge.py in _get_join_indexers(left_keys, right_keys, sort, how, **kwargs)
   1309         for n in range(len(left_keys))
   1310     )
-> 1311     zipped = zip(*mapped)
   1312     llab, rlab, shape = [list(x) for x in zipped]
   1313 

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/reshape/merge.py in <genexpr>(.0)
   1306     # get left & right join labels and num. of levels at each location
   1307     mapped = (
-> 1308         _factorize_keys(left_keys[n], right_keys[n], sort=sort)
   1309         for n in range(len(left_keys))
   1310     )

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/reshape/merge.py in _factorize_keys(lk, rk, sort)
   1922     if sort:
   1923         uniques = rizer.uniques.to_array()
-> 1924         llab, rlab = _sort_labels(uniques, llab, rlab)
   1925 
   1926     # NA group

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/reshape/merge.py in _sort_labels(uniques, left, right)
   1948     labels = np.concatenate([left, right])
   1949 
-> 1950     _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
   1951     new_labels = ensure_int64(new_labels)
   1952     new_left, new_right = new_labels[:llength], new_labels[llength:]

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/algorithms.py in safe_sort(values, codes, na_sentinel, assume_unique, verify)
   2017         except TypeError:
   2018             # try this anyway
-> 2019             ordered = sort_mixed(values)
   2020 
   2021     # codes:

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/algorithms.py in sort_mixed(values)
   2000         # order ints before strings, safe in py3
   2001         str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
-> 2002         nums = np.sort(values[~str_pos])
   2003         strs = np.sort(values[str_pos])
   2004         return np.concatenate([nums, np.asarray(strs, dtype=object)])

<__array_function__ internals> in sort(*args, **kwargs)

~/opt/anaconda3/lib/python3.8/site-packages/numpy/core/fromnumeric.py in sort(a, axis, kind, order)
    987     else:
    988         a = asanyarray(a).copy(order="K")
--> 989     a.sort(axis=axis, kind=kind, order=order)
    990     return a
    991 

TypeError: '<' not supported between instances of 'float' and 'str'

Anything I've missed I imagine?


Solution

  • You don't need to use set_index() and split() method so let's start from beginning(initial phase of your both df's)

    try via merge() and join both dataframes on 'Group' and 'Category' then multiplying both 'price' and 'ratio' by selecting them using filter() method and calculate sum by using sum() method and then finally remove those columns(which are used in multiplication) by using drop() method:

    out=df1.merge(df2,on=['Group','Category'])
    out['value']=pd.DataFrame(out.filter(like='price').values*out.filter(like='ratio').values).sum(axis=1)
    out=out.drop(out.filter(regex='price|ratio').columns,1)
    

    output of out:

      Group     Category    value
    0   a       Single      59.68
    1   a       Multi       68.64
    2   b       Multi       68.48