Tags: python, pandas, merge, multi-index

Merging dataframes with MultiIndexes and a column value


I have two dataframes with MultiIndexes and dates as columns:

df1

import numpy as np
import pandas as pd

df1 = pd.DataFrame.from_dict({('group', ''): {0: 'A',
  1: 'A',
  2: 'A',
  3: 'A',
  4: 'A',
  5: 'A',
  6: 'A',
  7: 'A',
  8: 'B',
  9: 'B',
  10: 'B',
  11: 'B',
  12: 'B',
  13: 'B',
  14: 'B',
  15: 'B',
  16: 'C',
  17: 'C',
  18: 'C',
  19: 'C',
  20: 'C',
  21: 'C',
  22: 'C',
  23: 'C',
  24: 'D',
  25: 'D',
  26: 'D',
  27: 'D',
  28: 'D',
  29: 'D',
  30: 'D'},
 ('category', ''): {0: 'Apple',
  1: 'Amazon',
  2: 'Google',
  3: 'Netflix',
  4: 'Facebook',
  5: 'Uber',
  6: 'Tesla',
  7: 'total',
  8: 'Apple',
  9: 'Amazon',
  10: 'Google',
  11: 'Netflix',
  12: 'Facebook',
  13: 'Uber',
  14: 'Tesla',
  15: 'total',
  16: 'Apple',
  17: 'Amazon',
  18: 'Google',
  19: 'Netflix',
  20: 'Facebook',
  21: 'Uber',
  22: 'Tesla',
  23: 'total',
  24: 'Apple',
  25: 'Amazon',
  26: 'Google',
  27: 'Netflix',
  28: 'Uber',
  29: 'Tesla',
  30: 'total'},
 (pd.Timestamp('2021-06-28 00:00:00'), 'total_orders'): {0: 88.0,
  1: 66.0,
  2: 191.0,
  3: 558.0,
  4: 12.0,
  5: 4.0,
  6: 56.0,
  7: 975.0,
  8: 90.0,
  9: 26.0,
  10: 49.0,
  11: 250.0,
  12: 7.0,
  13: 2.0,
  14: 44.0,
  15: 468.0,
  16: 36.0,
  17: 52.0,
  18: 94.0,
  19: 750.0,
  20: 10.0,
  21: 0.0,
  22: 52.0,
  23: 994.0,
  24: 16.0,
  25: 22.0,
  26: 5.0,
  27: 57.0,
  28: 3.0,
  29: 33.0,
  30: 136.0},
 (pd.Timestamp('2021-06-28 00:00:00'), 'total_sales'): {0: 4603.209999999999,
  1: 2485.059999999998,
  2: 4919.39999999998,
  3: 6097.77,
  4: 31.22,
  5: 155.71,
  6: 3484.99,
  7: 17237.35999999996,
  8: 561.54,
  9: 698.75,
  10: 1290.13,
  11: 4292.68000000001,
  12: 947.65,
  13: 329.0,
  14: 2889.65,
  15: 9989.4,
  16: 330.8899999999994,
  17: 2076.26,
  18: 2982.270000000004,
  19: 11978.62000000002,
  20: 683.0,
  21: 0.0,
  22: 3812.16999999999,
  23: 20963.21000000002,
  24: 234.4900000000002,
  25: 896.1,
  26: 231.0,
  27: 893.810000000001,
  28: 129.0,
  29: 1712.329999999998,
  30: 4106.729999999996},
 (pd.Timestamp('2021-07-05 00:00:00'), 'total_orders'): {0: 109.0,
  1: 48.0,
  2: 174.0,
  3: 592.0,
  4: 13.0,
  5: 5.0,
  6: 43.0,
  7: 984.0,
  8: 62.0,
  9: 13.0,
  10: 37.0,
  11: 196.0,
  12: 8.0,
  13: 1.0,
  14: 3.0,
  15: 30.0,
  16: 76.0,
  17: 5.0,
  18: 147.0,
  19: 88.0,
  20: 8.0,
  21: 1.0,
  22: 78.0,
  23: 1248.0,
  24: 1.0,
  25: 18.0,
  26: 23.0,
  27: 83.0,
  28: 0.0,
  29: 29.0,
  30: 154.0},
 (pd.Timestamp('2021-07-05 00:00:00'), 'total_sales'): {0: 3453.02,
  1: 17868.730000000003,
  2: 44707.82999999999,
  3: 61425.99,
  4: 1261.0,
  5: 1914.6000000000001,
  6: 24146.09,
  7: 154777.25999999998,
  8: 6201.489999999999,
  9: 5513.960000000001,
  10: 9645.87,
  11: 25086.785,
  12: 663.0,
  13: 448.61,
  14: 26332.7,
  15: 73892.415,
  16: 556.749999999999,
  17: 1746.859999999997,
  18: 4103.219999999994,
  19: 15571.52000000008,
  20: 86.0,
  21: 69.0,
  22: 5882.759999999995,
  23: 26476.11000000004,
  24: 53.0,
  25: 801.220000000001,
  26: 684.56,
  27: 1232.600000000002,
  28: 0.0,
  29: 15902.1,
  30: 43943.48},
 (pd.Timestamp('2021-07-12 00:00:00'), 'total_orders'): {0: 32.0,
  1: 15.0,
  2: 89.0,
  3: 239.0,
  4: 2.0,
  5: 3.0,
  6: 20.0,
  7: 400.0,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 21.0,
  17: 14.0,
  18: 58.0,
  19: 281.0,
  20: 3.0,
  21: 3.0,
  22: 33.0,
  23: 413.0,
  24: 7.0,
  25: 6.0,
  26: 4.0,
  27: 13.0,
  28: 0.0,
  29: 18.0,
  30: 48.0},
 (pd.Timestamp('2021-07-12 00:00:00'), 'total_sales'): {0: 2147.7000000000003,
  1: 4767.3,
  2: 2399.300000000003,
  3: 3137.440000000002,
  4: 178.0,
  5: 866.61,
  6: 10639.03,
  7: 73235.38,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 220.94,
  17: 727.5199999999995,
  18: 2500.96999999999,
  19: 4414.00999999998,
  20: 15.0,
  21: 196.71,
  22: 2170.1,
  23: 9745.24999999997,
  24: 126.55,
  25: 290.2,
  26: 146.01,
  27: 233.0,
  28: 0.0,
  29: 973.18,
  30: 1658.940000000002}}).set_index(['group','category'])

df2

df2 = pd.DataFrame.from_dict({'group': {0: 'total_full',
  1: 'total_full',
  2: 'A',
  3: 'A',
  4: 'B',
  5: 'B',
  6: 'C',
  7: 'C',
  8: 'D',
  9: 'D',
  10: 'Apple_total',
  11: 'Apple_total',
  12: 'A',
  13: 'A',
  14: 'B',
  15: 'B',
  16: 'C',
  17: 'C',
  18: 'D',
  19: 'D',
  20: 'Amazon_total',
  21: 'Amazon_total',
  22: 'A',
  23: 'A',
  24: 'B',
  25: 'B',
  26: 'C',
  27: 'C',
  28: 'D',
  29: 'D',
  30: 'Google_total',
  31: 'Google_total',
  32: 'A',
  33: 'A',
  34: 'B',
  35: 'B',
  36: 'C',
  37: 'C',
  38: 'D',
  39: 'D',
  40: 'Facebook_total',
  41: 'Facebook_total',
  42: 'A',
  43: 'A',
  44: 'B',
  45: 'B',
  46: 'C',
  47: 'C',
  48: 'D',
  49: 'D',
  50: 'Netflix_total',
  51: 'Netflix_total',
  52: 'A',
  53: 'A',
  54: 'B',
  55: 'B',
  56: 'C',
  57: 'C',
  58: 'D',
  59: 'D',
  60: 'Tesla_total',
  61: 'Tesla_total',
  62: 'A',
  63: 'A',
  64: 'B',
  65: 'B',
  66: 'C',
  67: 'C',
  68: 'D',
  69: 'D',
  70: 'Uber_total',
  71: 'Uber_total',
  72: 'A',
  73: 'A',
  74: 'B',
  75: 'B',
  76: 'C',
  77: 'C',
  78: 'D',
  79: 'D'},
 'category': {0: 'total_full',
  1: 'total_full',
  2: 'group_total',
  3: 'group_total',
  4: 'group_total',
  5: 'group_total',
  6: 'group_total',
  7: 'group_total',
  8: 'group_total',
  9: 'group_total',
  10: 'Apple_total',
  11: 'Apple_total',
  12: 'Apple',
  13: 'Apple',
  14: 'Apple',
  15: 'Apple',
  16: 'Apple',
  17: 'Apple',
  18: 'Apple',
  19: 'Apple',
  20: 'Amazon_total',
  21: 'Amazon_total',
  22: 'Amazon',
  23: 'Amazon',
  24: 'Amazon',
  25: 'Amazon',
  26: 'Amazon',
  27: 'Amazon',
  28: 'Amazon',
  29: 'Amazon',
  30: 'Google_total',
  31: 'Google_total',
  32: 'Google',
  33: 'Google',
  34: 'Google',
  35: 'Google',
  36: 'Google',
  37: 'Google',
  38: 'Google',
  39: 'Google',
  40: 'Facebook_total',
  41: 'Facebook_total',
  42: 'Facebook',
  43: 'Facebook',
  44: 'Facebook',
  45: 'Facebook',
  46: 'Facebook',
  47: 'Facebook',
  48: 'Facebook',
  49: 'Facebook',
  50: 'Netflix_total',
  51: 'Netflix_total',
  52: 'Netflix',
  53: 'Netflix',
  54: 'Netflix',
  55: 'Netflix',
  56: 'Netflix',
  57: 'Netflix',
  58: 'Netflix',
  59: 'Netflix',
  60: 'Tesla_total',
  61: 'Tesla_total',
  62: 'Tesla',
  63: 'Tesla',
  64: 'Tesla',
  65: 'Tesla',
  66: 'Tesla',
  67: 'Tesla',
  68: 'Tesla',
  69: 'Tesla',
  70: 'Uber_total',
  71: 'Uber_total',
  72: 'Uber',
  73: 'Uber',
  74: 'Uber',
  75: 'Uber',
  76: 'Uber',
  77: 'Uber',
  78: 'Uber',
  79: 'Uber'},
 'type': {0: 'Sales_1',
  1: 'Sales_2',
  2: 'Sales_1',
  3: 'Sales_2',
  4: 'Sales_1',
  5: 'Sales_2',
  6: 'Sales_1',
  7: 'Sales_2',
  8: 'Sales_1',
  9: 'Sales_2',
  10: 'Sales_1',
  11: 'Sales_2',
  12: 'Sales_1',
  13: 'Sales_2',
  14: 'Sales_1',
  15: 'Sales_2',
  16: 'Sales_1',
  17: 'Sales_2',
  18: 'Sales_1',
  19: 'Sales_2',
  20: 'Sales_1',
  21: 'Sales_2',
  22: 'Sales_1',
  23: 'Sales_2',
  24: 'Sales_1',
  25: 'Sales_2',
  26: 'Sales_1',
  27: 'Sales_2',
  28: 'Sales_1',
  29: 'Sales_2',
  30: 'Sales_1',
  31: 'Sales_2',
  32: 'Sales_1',
  33: 'Sales_2',
  34: 'Sales_1',
  35: 'Sales_2',
  36: 'Sales_1',
  37: 'Sales_2',
  38: 'Sales_1',
  39: 'Sales_2',
  40: 'Sales_1',
  41: 'Sales_2',
  42: 'Sales_1',
  43: 'Sales_2',
  44: 'Sales_1',
  45: 'Sales_2',
  46: 'Sales_1',
  47: 'Sales_2',
  48: 'Sales_1',
  49: 'Sales_2',
  50: 'Sales_1',
  51: 'Sales_2',
  52: 'Sales_1',
  53: 'Sales_2',
  54: 'Sales_1',
  55: 'Sales_2',
  56: 'Sales_1',
  57: 'Sales_2',
  58: 'Sales_1',
  59: 'Sales_2',
  60: 'Sales_1',
  61: 'Sales_2',
  62: 'Sales_1',
  63: 'Sales_2',
  64: 'Sales_1',
  65: 'Sales_2',
  66: 'Sales_1',
  67: 'Sales_2',
  68: 'Sales_1',
  69: 'Sales_2',
  70: 'Sales_1',
  71: 'Sales_2',
  72: 'Sales_1',
  73: 'Sales_2',
  74: 'Sales_1',
  75: 'Sales_2',
  76: 'Sales_1',
  77: 'Sales_2',
  78: 'Sales_1',
  79: 'Sales_2'},
 '2021-06-28': {0: 67.5277641202152,
  1: 82.7854700135998,
  2: 21.50082266792856,
  3: 22.03644997199996,
  4: 64.460440147,
  5: 10.1060499896,
  6: 65.1530371974946,
  7: 50.6429700519999,
  8: 56.413464107792045,
  9: 0,
  10: 17.48074540313092,
  11: 26.8376199976,
  12: 52.172,
  13: 61.16600000040001,
  14: 20.9447844,
  15: 40.69122000000001,
  16: 83.55718929717925,
  17: 14.98039999719995,
  18: 20.806771705951697,
  19: np.nan,
  20: 18.3766353690825,
  21: 12.82565001479992,
  22: 52.425508769690694,
  23: 25.661999978399994,
  24: 17.88071596,
  25: 24.384659998799997,
  26: 91.10086982794643,
  27: 12.77899003759993,
  28: 16.969540811445366,
  29: np.nan,
  30: 18.8795397517309,
  31: 26.73017999840005,
  32: 53.52039700062155,
  33: 58.81199999639999,
  34: 12.1243325,
  35: 24.0544100028,
  36: 55.94068246571674,
  37: 133.86376999920006,
  38: 7.294127785392621,
  39: np.nan,
  40: 6.07807089184563,
  41: 7.27483001599998,
  42: 2.300470581874837,
  43: 30.71300000639998,
  44: 5.810764652,
  45: 12.333119997600003,
  46: 25.475930745418292,
  47: 64.228710012,
  48: 9.490904912552498,
  49: np.nan,
  50: 8.184780211399392,
  51: 24.59321999400001,
  52: 6.807138946302334,
  53: 12.0879999972,
  54: 0.869207661,
  55: 0.324,
  56: 0.5084336040970575,
  57: 12.181219996800007,
  58: 0,
  59: np.nan,
  60: 9.293956915067886,
  61: 11.171379993599999,
  62: 6.384936971649232,
  63: 3.657999996,
  64: 0.913782413,
  65: 1.9992000012000002,
  66: 1.5322078073061867,
  67: 5.514179996399999,
  68: 0.4630297231124678,
  69: np.nan,
  70: 36.23403557795798,
  71: 53.35258999919999,
  72: 21.890370397789923,
  73: 9.937449997200002,
  74: 5.916852561,
  75: 6.319439989199998,
  76: 7.03772344983066,
  77: 37.095700012799995,
  78: 1.3890891693374032,
  79: np.nan},
 '2021-07-05': {0: 65.4690491915759,
  1: 98.5235100112003,
  2: 21.4573181155924,
  3: 241.06741999679997,
  4: 67.481716829,
  5: 11.60325000040002,
  6: 27.5807099999998,
  7: 65.8528400140003,
  8: 58.949304246983736,
  9: 0.0,
  10: 185.65887577993723,
  11: 318.9965699964001,
  12: 54.517,
  13: 66.55265999039996,
  14: 21.92632044,
  15: 43.67116000320002,
  16: 87.47349898707688,
  17: 208.7727500028001,
  18: 21.742056352860352,
  19: np.nan,
  20: 16.6038963173654,
  21: 25.28952001920013,
  22: 54.7820864335212,
  23: 36.75802000560001,
  24: 18.71872129,
  25: 30.1634600016,
  26: 95.37075040035738,
  27: 138.3680400120001,
  28: 17.73233819348684,
  29: np.nan,
  30: 14.80302342121337,
  31: 251.83851001200003,
  32: 55.926190956481534,
  33: 72.4443400032,
  34: 12.69221484,
  35: 26.032340003999998,
  36: 58.56261169338368,
  37: 153.36183000480003,
  38: 7.622005931348156,
  39: np.nan,
  40: 72.24367956241771,
  41: 14.83083001279999,
  42: 29.5726042895728,
  43: 38.723000005199985,
  44: 6.083562133,
  45: 12.845630001599998,
  46: 26.66998281055652,
  47: 63.26220000600001,
  48: 9.917530329288393,
  49: np.nan,
  50: 8.555606693927,
  51: 23.802009994800002,
  52: 7.113126469779095,
  53: 7.206999998399999,
  54: 0.910216433,
  55: 1.4089999991999997,
  56: 0.5322637911479053,
  57: 15.186009997200001,
  58: 0.0,
  59: np.nan,
  60: 9.716385738295367,
  61: 14.7327399948,
  62: 6.671946105284065,
  63: 5.691999996,
  64: 0.956574175,
  65: 1.0203399996,
  66: 1.6040220980113027,
  67: 8.020399999199999,
  68: 0.4838433599999999,
  69: np.nan,
  70: 37.88758167841983,
  71: 59.03332998119994,
  72: 22.874363860953647,
  73: 13.690399997999998,
  74: 6.194107518,
  75: 6.4613199911999954,
  76: 7.367580219466185,
  77: 38.881609991999944,
  78: 1.4515300799999995,
  79: np.nan},
 '2021-07-12': {0: 607.2971827405001,
  1: 88.9671100664001,
  2: 21.26749278974862,
  3: 17.1524199804,
  4: 64.471138092,
  5: 89.84481002279999,
  6: 26.2044999999998,
  7: 51.9698800632001,
  8: 5.354051858751745,
  9: 0.0,
  10: 177.42361595891452,
  11: 287.5395700032,
  12: 52.117,
  13: 47.388199995600004,
  14: 20.94835038,
  15: 41.4250800048,
  16: 83.57340667555117,
  17: 198.72629000280003,
  18: 20.784858903363354,
  19: np.nan,
  20: 178.323907459086,
  21: 185.83897002839998,
  22: 52.37029646474982,
  23: 27.87144997800001,
  24: 17.88339044,
  25: 23.645340010799984,
  26: 91.11855133792106,
  27: 134.3221800396,
  28: 16.95166921641509,
  29: np.nan,
  30: 128.82813286243115,
  31: 192.6867300156,
  32: 53.46403160619618,
  33: 41.412320006399995,
  34: 12.1261155,
  35: 11.840830002000002,
  36: 55.95153983444301,
  37: 139.43358000720002,
  38: 7.286445921791947,
  39: np.nan,
  40: 69.04410667683521,
  41: 93.877410018,
  42: 28.270665735943805,
  43: 27.512680004399986,
  44: 5.811656147,
  45: 5.2319800032,
  46: 25.480875296710053,
  47: 61.132750010400024,
  48: 9.480909497181356,
  49: np.nan,
  50: 8.178601399067174,
  51: 17.6743199976,
  52: 6.7999699585309585,
  53: 6.131999998799999,
  54: 0.870099156,
  55: 0.6185600004,
  56: 0.5085322845362154,
  57: 10.923759998400003,
  58: 0.0,
  59: np.nan,
  60: 9.287042311133577,
  61: 19.966500000000007,
  62: 6.378212628950804,
  63: 6.524999997600001,
  64: 0.913782413,
  65: 1.9303400016,
  66: 1.5325051891827732,
  67: 11.511160000800006,
  68: 0.4625420799999998,
  69: np.nan,
  70: 36.21177607303267,
  71: 51.3836100036,
  72: 21.86731639537707,
  73: 10.310769999600003,
  74: 5.917744056,
  75: 5.152679999999999,
  76: 7.039089381655591,
  77: 35.920160003999996,
  78: 1.3876262399999995,
  79: np.nan}}).set_index(['group','category','type'])

I am trying to merge df2 into df1 by group, category and date (the date is a column, not part of the index), so that my output would look like this:

In the desired output below I omitted the sales_1 & sales_2 values that come from df2, but those columns should be filled with the corresponding values for each group and category.

                    2021-06-28                                                               2021-07-05                                                      2021-07-12
                    total_orders    total_sales         sales_1        sales_2                  total_orders    total_sales         sales_1        sales_2            total_orders    total_sales         sales_1        sales_2
group    category
A        Apple        88.000          4,603.210
         Amazon       66.000          2,485.060
         Google      191.000          4,919.400
         Netflix     558.000          6,097.770
         Facebook     12.000             31.220
         Uber          4.000            155.710
         Tesla        56.000          3,484.990
         total       975.000         17,237.360
B        Apple        90.000            561.540
         Amazon       26.000            698.750
         Google       49.000          1,290.130
         Netflix     250.000          4,292.680
         Facebook      7.000            947.650
         Uber          2.000            329.000
         Tesla        44.000          2,889.650
         total       468.000          9,989.400
C        Apple        36.000            330.890
         Amazon       52.000          2,076.260
         Google       94.000          2,982.270
         Netflix     750.000         11,978.620
         Facebook     10.000            683.000
         Uber          0.000              0.000
         Tesla        52.000          3,812.170
         total       994.000         20,963.210
D        Apple        16.000            234.490
         Amazon       22.000            896.100
         Google        5.000            231.000
         Netflix      57.000            893.810
         Uber          3.000            129.000
         Tesla        33.000          1,712.330
         total       136.000          4,106.730

So sales_1 & sales_2 are merged on group & category and sit under the same date column as total_orders and total_sales. The total_x rows from df2 can be ignored, since they can be recalculated from the other fields; only the detail rows after each total are used in the merge.
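For example, assuming the total rows in df2 are exactly those whose category label contains "total" (total_full, group_total, Apple_total, ...), a minimal sketch to drop them before merging:

# assumes the total rows are identifiable by "total" in the category level
mask = ~df2.index.get_level_values('category').str.contains('total')
df2_detail = df2[mask]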


What I've tried:

df1.reset_index().merge(df2.reset_index(), left_on=['group', 'category'], right_on=['group', 'category'])

Which throws a warning:

UserWarning: merging between different levels can give an unintended result (2 levels on the left, 1 on the right)

And it does not merge the way I expect. How could I achieve my desired output?
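The warning seems to come from the column levels: after set_index, df1 keeps a two-level column index (date, metric), while df2.reset_index() has a single flat level. A quick check, just for illustration:

# df1 has two column levels, df2.reset_index() only one, hence the warning
print(df1.columns.nlevels)                # 2
print(df2.reset_index().columns.nlevels)  # 1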

Using

df = df1.merge(df2.unstack(), left_index=True, right_index=True)

Produces a merged frame, but the columns are not grouped under each date as in my desired output.

Is it then just a matter of reordering the columns so that each date has its four columns under it? Or could the problem be that one set of dates has 00:00:00 appended to it?
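One thing to check (illustrative sketch): the date level in df1's columns holds Timestamps, while df2's column labels are plain strings, so '2021-06-28 00:00:00' and '2021-06-28' end up as different column labels after the merge:

# the same week appears under two labels: a Timestamp in df1, a string in df2
print(df1.columns.get_level_values(0).unique())
print(df2.columns.tolist())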


Solution

  • Convert df2's columns to a DatetimeIndex first (so they match the Timestamps in df1's column level), then unstack and merge by the MultiIndexes:

    # convert df2's string column labels to Timestamps so they line up with
    # the Timestamp level in df1's columns, then move 'type' into the columns
    f = lambda x: pd.to_datetime(x)
    df = (df1.merge(df2.rename(columns=f).unstack(), left_index=True, right_index=True)
            .sort_index(axis=1))

    print(df.head())
    
                   2021-06-28                                  2021-07-05  \
                      Sales_1 Sales_2 total_orders total_sales    Sales_1   
    group category                                                          
    A     Apple     52.172000  61.166         88.0     4603.21  54.517000   
          Amazon    52.425509  25.662         66.0     2485.06  54.782086   
          Google    53.520397  58.812        191.0     4919.40  55.926191   
          Netflix    6.807139  12.088        558.0     6097.77   7.113126   
          Facebook   2.300471  30.713         12.0       31.22  29.572604   
    
                                                      2021-07-12            \
                     Sales_2 total_orders total_sales    Sales_1   Sales_2   
    group category                                                           
    A     Apple     66.55266        109.0     3453.02  52.117000  47.38820   
          Amazon    36.75802         48.0    17868.73  52.370296  27.87145   
          Google    72.44434        174.0    44707.83  53.464032  41.41232   
          Netflix    7.20700        592.0    61425.99   6.799970   6.13200   
          Facebook  38.72300         13.0     1261.00  28.270666  27.51268   
    
                                             
                   total_orders total_sales  
    group category                           
    A     Apple            32.0     2147.70  
          Amazon           15.0     4767.30  
          Google           89.0     2399.30  
          Netflix         239.0     3137.44  
          Facebook          2.0      178.00
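
    sort_index puts Sales_1/Sales_2 ahead of total_orders/total_sales because an uppercase "S" sorts before a lowercase "t". If the columns should follow the order from the desired output, one possible follow-up (assuming that order is wanted) is to rebuild the column index explicitly:

    # put total_orders / total_sales before Sales_1 / Sales_2 under each date
    metrics = ['total_orders', 'total_sales', 'Sales_1', 'Sales_2']
    dates = df.columns.get_level_values(0).unique()
    df = df.reindex(columns=pd.MultiIndex.from_product([dates, metrics]))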