I need to multiply two MultiIndexed frames (say df1, df2
) that have the same highest level index, such that for each of the highest level index each row of df1
is multiplied to each row of df2
elementwise. I have implemented the following example that does what I want, however it looks pretty ugly:
a = ['alpha', 'beta']
b = ['A', 'B', 'C']
c = ['foo', 'bar']
df1 = pd.DataFrame(np.random.randn(6, 4),
index=pd.MultiIndex.from_product(
[a, b],
names=['greek', 'latin']),
columns=['C1', 'C2', 'C3', 'C4'])
df2 = pd.DataFrame(
np.array([[1, 0, 1, 0], [1, 1, 1, 1], [0, 0, 0, 0], [0, 2, 0, 4]]),
index=pd.MultiIndex.from_product([a, c], names=['greek', 'foobar']),
columns=['C1', 'C2', 'C3', 'C4'])
df3 = pd.DataFrame(
columns=['greek', 'latin', 'foobar', 'C1', 'C2', 'C3', 'C4'])
for i in df1.index.get_level_values('greek').unique():
for j in df1.loc[i].index.get_level_values('latin').unique():
for k in df2.loc[i].index.get_level_values('foobar').unique():
df3 = df3.append(pd.Series([i, j, k],
index=['greek', 'latin', 'foobar']
).append(
df1.loc[i, j] * df2.loc[i, k]), ignore_index=True)
df3.set_index(['greek', 'latin', 'foobar'], inplace=True)
As you can see the code is very manual that defines the columns etc manually multiple times, and sets the index in the end. Here is the input and the optput. They are correct and exactly what I want:
df1:
C1 C2 C3 C4
greek latin
alpha A 0.208380 0.856373 -1.041598 1.219707
B 1.547903 -0.001023 0.918973 1.153554
C 0.195868 2.772840 0.060960 0.311247
beta A 0.690405 -1.258012 0.118000 -0.346677
B 0.488327 -1.206428 0.967658 1.198287
C 0.420098 -0.165721 0.626893 -0.377909,
df2:
C1 C2 C3 C4
greek foobar
alpha foo 1 0 1 0
bar 1 1 1 1
beta foo 0 0 0 0
bar 0 2 0 4
result:
C1 C2 C3 C4
greek latin foobar
alpha A foo 0.208380 0.000000 -1.041598 0.000000
bar 0.208380 0.856373 -1.041598 1.219707
B foo 1.547903 -0.000000 0.918973 0.000000
bar 1.547903 -0.001023 0.918973 1.153554
C foo 0.195868 0.000000 0.060960 0.000000
bar 0.195868 2.772840 0.060960 0.311247
beta A foo 0.000000 -0.000000 0.000000 -0.000000
bar 0.000000 -2.516025 0.000000 -1.386708
B foo 0.000000 -0.000000 0.000000 0.000000
bar 0.000000 -2.412855 0.000000 4.793149
C foo 0.000000 -0.000000 0.000000 -0.000000
bar 0.000000 -0.331443 0.000000 -1.511638
Thanks in advance!
I created the following solution that seems to work and provide the right outcome. While Stephen's answer remains the fastest solution, this is close enough but provides a big advantage, it works for arbitrary MultiIndexed frames, as opposed to the ones where the index is a product of lists. This was the case I needed to solve for, though the example I provided did not reflect that. Thanks to Stephen for the excellent and fast solution for that case - certainly learned a few things from that code!
Code:
dft = df2.swaplevel()
dft.sortlevel(level=0,inplace=True)
df5=pd.concat([df1*dft.loc[i,:] for i in dft.index.get_level_values('foobar').unique() ], keys=dft.index.get_level_values('foobar').unique().tolist(), names=['foobar'])
df5=df5.reorder_levels(['greek', 'latin', 'foobar'],axis=0)
df5.sortlevel(0,inplace=True)
Test Data:
import pandas as pd
import numpy as np
a = ['alpha', 'beta']
b = ['A', 'B', 'C']
c = ['foo', 'bar']
data_columns = ['C1', 'C2', 'C3', 'C4']
columns = ['greek', 'latin', 'foobar'] + data_columns
df1 = pd.DataFrame(np.random.randn(len(a) * len(b), len(data_columns)),
index=pd.MultiIndex.from_product(
[a,b], names=columns[0:2]),
columns=data_columns
)
df2 = pd.DataFrame(np.array([[1, 0, 1, 0],
[1, 1, 1, 1],
[0, 0, 0, 0],
[0, 2, 0, 4],
]),
index=pd.MultiIndex.from_product(
[a, c],
names=[columns[0], columns[2]]),
columns=data_columns
)
Timing Code:
def method1():
df3 = pd.DataFrame(columns=columns)
for i in df1.index.get_level_values('greek').unique():
for j in df1.loc[i].index.get_level_values('latin').unique():
for k in df2.loc[i].index.get_level_values('foobar').unique():
df3 = df3.append(pd.Series(
[i, j, k],
index=columns[:3]).append(
df1.loc[i, j] * df2.loc[i, k]), ignore_index=True)
df3.set_index(columns[:3], inplace=True)
return df3
def method2():
# build an index from the three index columns
idx = [df1.index.get_level_values(col).unique() for col in columns[:2]
] + [df2.index.get_level_values(columns[2]).unique()]
size = [len(x) for x in idx]
index = pd.MultiIndex.from_product(idx, names=columns[:3])
# get the indices needed for df1 and df2
idx_a = np.indices((size[0] * size[1], size[2])).reshape(2, -1)
idx_b = np.indices((size[0], size[1] * size[2])).reshape(2, -1)
idx_1 = idx_a[0]
idx_2 = idx_a[1] + idx_b[0] * size[2]
# map the two frames into a multiply-able form
y1 = df1.values[idx_1, :]
y2 = df2.values[idx_2, :]
# multiply the to frames
df4 = pd.DataFrame(y1 * y2, index=index, columns=columns[3:])
return df4
def method3():
dft = df2.swaplevel()
dft.sortlevel(level=0,inplace=True)
df5=pd.concat([df1*dft.loc[i,:] for i in dft.index.get_level_values('foobar').unique() ], keys=dft.index.get_level_values('foobar').unique().tolist(), names=['foobar'])
df5=df5.reorder_levels(['greek', 'latin', 'foobar'],axis=0)
df5.sortlevel(0,inplace=True)
return df5
from timeit import timeit
print(timeit(method1, number=50))
print(timeit(method2, number=50))
print(timeit(method3, number=50))
Results:
4.089807642158121
0.12291539693251252
0.33667341712862253