I am using Python, and would like to get a calculated number (price * ratio) from two data frames for each group:
Table 1: df1
Group | Category | price_1 | price_2 | price_3 | price_4 |
---|---|---|---|---|---|
a | Single | 20.1 | 19.8 | 19.7 | 19.9 |
a | Multi | 25.1 | 26.8 | 24.7 | 24.9 |
b | Multi | 27.1 | 27.8 | 27.7 | 26.9 |
Table 2: df2
Group | Category | ratio_1 | ratio_2 | ratio_3 | ratio_4 |
---|---|---|---|---|---|
a | Single | 1.0 | 0.8 | 0.7 | 0.5 |
a | Multi | 1.0 | 0.7 | 0.6 | 0.4 |
b | Multi | 1.0 | 0.7 | 0.5 | 0.3 |
Desired Output: df
Group | Category | value |
---|---|---|
a | Single | 59.68 |
a | Multi | 68.64 |
b | Multi | 68.48 |
Example, for Group = 'b' and Category = 'Multi', value = 27.1 * 1.0 + 27.8 * 0.7 + 27.7 * 0.5 + 26.9 * 0.3 = 68.48
Input tables: 'Group' and 'Category' in df1
and df2
are 'string' data type; the rest columns like 'ratio_1' & 'price_1' are 'float' data type.
Output table: 'Group' and 'Category' are 'string' data type and 'value' is 'float' data type.
With the following code that someone has helped in another thread:
idx_cols = ['Group', 'Category']
df1 = df1.set_index(idx_cols)
df1.columns = df1.columns.str.rsplit('_', n=1, expand=True)
df2 = df2.set_index(idx_cols)
df2.columns = df2.columns.str.rsplit('_', n=1, expand=True)
df = df1['price'].mul(df2['ratio']).sum(axis=1).reset_index(name="value")
Updates: As advised, I've changed the datatypes for df1
and df2
all as 'object' but I got the following full error messages:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/algorithms.py in safe_sort(values, codes, na_sentinel, assume_unique, verify)
2014 try:
-> 2015 sorter = values.argsort()
2016 ordered = values.take(sorter)
TypeError: '<' not supported between instances of 'float' and 'str'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-363-e5c47716d89d> in <module>
8
9 # Compute DF3
---> 10 ltv_output = df1['price'].mul(df2['ratio']).sum(axis=1).reset_index(name="ltv_value")
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/ops/__init__.py in f(self, other, axis, level, fill_value)
764 pass_op = pass_op if not is_logical else op
765
--> 766 left, right = self.align(other, join="outer", level=level, copy=False)
767 new_data = left._combine_frame(right, pass_op, fill_value)
768 return left._construct_result(new_data)
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in align(self, other, join, axis, level, copy, fill_value, method, limit, fill_axis, broadcast_axis)
3822 broadcast_axis=None,
3823 ) -> "DataFrame":
-> 3824 return super().align(
3825 other,
3826 join=join,
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in align(self, other, join, axis, level, copy, fill_value, method, limit, fill_axis, broadcast_axis)
8463 axis = self._get_axis_number(axis)
8464 if isinstance(other, ABCDataFrame):
-> 8465 return self._align_frame(
8466 other,
8467 join=join,
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in _align_frame(self, other, join, axis, level, copy, fill_value, method, limit, fill_axis)
8510 if axis is None or axis == 0:
8511 if not self.index.equals(other.index):
-> 8512 join_index, ilidx, iridx = self.index.join(
8513 other.index, how=join, level=level, return_indexers=True
8514 )
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in join(self, other, how, level, return_indexers, sort)
3308 )
3309 else:
-> 3310 return self._join_non_unique(
3311 other, how=how, return_indexers=return_indexers
3312 )
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in _join_non_unique(self, other, how, return_indexers)
3426 from pandas.core.reshape.merge import _get_join_indexers
3427
-> 3428 left_idx, right_idx = _get_join_indexers(
3429 [self._ndarray_values], [other._ndarray_values], how=how, sort=True
3430 )
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/reshape/merge.py in _get_join_indexers(left_keys, right_keys, sort, how, **kwargs)
1309 for n in range(len(left_keys))
1310 )
-> 1311 zipped = zip(*mapped)
1312 llab, rlab, shape = [list(x) for x in zipped]
1313
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/reshape/merge.py in <genexpr>(.0)
1306 # get left & right join labels and num. of levels at each location
1307 mapped = (
-> 1308 _factorize_keys(left_keys[n], right_keys[n], sort=sort)
1309 for n in range(len(left_keys))
1310 )
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/reshape/merge.py in _factorize_keys(lk, rk, sort)
1922 if sort:
1923 uniques = rizer.uniques.to_array()
-> 1924 llab, rlab = _sort_labels(uniques, llab, rlab)
1925
1926 # NA group
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/reshape/merge.py in _sort_labels(uniques, left, right)
1948 labels = np.concatenate([left, right])
1949
-> 1950 _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
1951 new_labels = ensure_int64(new_labels)
1952 new_left, new_right = new_labels[:llength], new_labels[llength:]
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/algorithms.py in safe_sort(values, codes, na_sentinel, assume_unique, verify)
2017 except TypeError:
2018 # try this anyway
-> 2019 ordered = sort_mixed(values)
2020
2021 # codes:
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/algorithms.py in sort_mixed(values)
2000 # order ints before strings, safe in py3
2001 str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
-> 2002 nums = np.sort(values[~str_pos])
2003 strs = np.sort(values[str_pos])
2004 return np.concatenate([nums, np.asarray(strs, dtype=object)])
<__array_function__ internals> in sort(*args, **kwargs)
~/opt/anaconda3/lib/python3.8/site-packages/numpy/core/fromnumeric.py in sort(a, axis, kind, order)
987 else:
988 a = asanyarray(a).copy(order="K")
--> 989 a.sort(axis=axis, kind=kind, order=order)
990 return a
991
TypeError: '<' not supported between instances of 'float' and 'str'
Anything I've missed I imagine?
You don't need to use set_index()
and split()
method so let's start from beginning(initial phase of your both df's)
try via merge()
and join both dataframes on 'Group' and 'Category' then multiplying both 'price' and 'ratio' by selecting them using filter()
method and calculate sum by using sum()
method and then finally remove those columns(which are used in multiplication) by using drop()
method:
out=df1.merge(df2,on=['Group','Category'])
out['value']=pd.DataFrame(out.filter(like='price').values*out.filter(like='ratio').values).sum(axis=1)
out=out.drop(out.filter(regex='price|ratio').columns,1)
output of out
:
Group Category value
0 a Single 59.68
1 a Multi 68.64
2 b Multi 68.48