Search code examples
pandasdataframegroup-by

Pandas Apply function on Groupby behaviour


I am trying to create a column that computes the ratio of consumption on a monthly basis. I have written the function, but when I run it pandas raises a TypeError. Below are the function and the error stack.

The consumption ratio function is:

def consumption_ratio(row):
    """Compute a group's consumption ratio against the previous month.

    The first record of ``row`` supplies the current consumption, month,
    year and house id.  The previous month's records for the same house are
    looked up in the global ``water_df``; for January the lookup targets
    December of the previous year.

    Returns the integer 0 for February 2019.  Otherwise returns the current
    consumption divided by the ``consumption`` column of the matching
    previous records -- a Series, not a scalar -- or 0 if that division
    raises ZeroDivisionError.
    """
    current = row["consumption"].iloc[0]
    month = row["month"].iloc[0]
    year = row["year"].iloc[0]
    house = row["houseid-meterid"].iloc[0]

    # February 2019 is special-cased to a ratio of 0.
    if month == 2 and year == 2019:
        return 0

    # house, year, prevyear and prevmonth are read by the query strings
    # through the "@name" syntax, so those local names must stay as they are.
    if month == 1:
        prevyear = year - 1
        prevmonth = 12
        condition = "`houseid-meterid` == @house and year == @prevyear and month == @prevmonth"
    else:
        prevmonth = month - 1
        condition = "`houseid-meterid` == @house and year == @year and month == @prevmonth"

    previous = water_df.query(condition)
    try:
        return current / previous["consumption"]
    except ZeroDivisionError:
        return 0

The code executes here:

# Apply consumption_ratio to every (Datetime, houseid-meterid) group and
# assign the result of the apply as a new column of water_df.
water_df["consumption_ratio"] = water_df.groupby(['Datetime', 'houseid-meterid']).apply(consumption_ratio)

The error stack looks like this:

ValueError                                Traceback (most recent call last)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:12017, in _reindex_for_setitem(value, index)
  12016 try:
> 12017     reindexed_value = value.reindex(index)._values
  12018 except ValueError as err:
  12019     # raised in MultiIndex.from_tuples, see test_insert_error_msmgs

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\series.py:5094, in Series.reindex(self, *args, **kwargs)
   5093     kwargs.update({"index": index})
-> 5094 return super().reindex(**kwargs)

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\generic.py:5289, in NDFrame.reindex(self, *args, **kwargs)
   5288 # perform the reindex on the axes
-> 5289 return self._reindex_axes(
   5290     axes, level, limit, tolerance, method, fill_value, copy
   5291 ).__finalize__(self, method="reindex")

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\generic.py:5304, in NDFrame._reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   5303 ax = self._get_axis(a)
-> 5304 new_index, indexer = ax.reindex(
   5305     labels, level=level, limit=limit, tolerance=tolerance, method=method
   5306 )
   5308 axis = self._get_axis_number(a)

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\indexes\base.py:4477, in Index.reindex(self, target, method, level, limit, tolerance)
   4470             warnings.warn(
   4471                 "reindexing with a non-unique Index is deprecated and "
   4472                 "will raise in a future version.",
   4473                 FutureWarning,
   4474                 stacklevel=find_stack_level(),
   4475             )
-> 4477 target = self._wrap_reindex_result(target, indexer, preserve_names)
   4478 return target, indexer

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\indexes\multi.py:2556, in MultiIndex._wrap_reindex_result(self, target, indexer, preserve_names)
   2555 try:
-> 2556     target = MultiIndex.from_tuples(target)
   2557 except TypeError:
   2558     # not all tuples, see test_constructor_dict_multiindex_reindex_flat

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\indexes\multi.py:205, in names_compat.<locals>.new_meth(self_or_cls, *args, **kwargs)
    203     kwargs["names"] = kwargs.pop("name")
--> 205 return meth(self_or_cls, *args, **kwargs)

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\indexes\multi.py:573, in MultiIndex.from_tuples(cls, tuples, sortorder, names)
    571         tuples = np.asarray(tuples._values)
--> 573     arrays = list(lib.tuples_to_object_array(tuples).T)
    574 elif isinstance(tuples, list):

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\_libs\lib.pyx:2978, in pandas._libs.lib.tuples_to_object_array()

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long long'

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
Cell In[34], line 1
----> 1 water_df["consumption_ratio"] = water_df.groupby(['Datetime', 'houseid-meterid']).apply(consumption_ratio)

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:3978, in DataFrame.__setitem__(self, key, value)
   3975     self._setitem_array([key], value)
   3976 else:
   3977     # set column
-> 3978     self._set_item(key, value)

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:4172, in DataFrame._set_item(self, key, value)
   4162 def _set_item(self, key, value) -> None:
   4163     """
   4164     Add series to DataFrame in specified column.
   4165 
   (...)
   4170     ensure homogeneity.
   4171     """
-> 4172     value = self._sanitize_column(value)
   4174     if (
   4175         key in self.columns
   4176         and value.ndim == 1
   4177         and not is_extension_array_dtype(value)
   4178     ):
   4179         # broadcast across multiple columns if necessary
   4180         if not self.columns.is_unique or isinstance(self.columns, MultiIndex):

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:4909, in DataFrame._sanitize_column(self, value)
   4907     return _reindex_for_setitem(value, self.index)
   4908 elif is_dict_like(value):
-> 4909     return _reindex_for_setitem(Series(value), self.index)
   4911 if is_list_like(value):
   4912     com.require_length_match(value, self.index)

File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\pandas\core\frame.py:12024, in _reindex_for_setitem(value, index)
  12020     if not value.index.is_unique:
  12021         # duplicate axis
  12022         raise err
> 12024     raise TypeError(
  12025         "incompatible index of inserted column with frame index"
  12026     ) from err
  12027 return reindexed_value

TypeError: incompatible index of inserted column with frame index

The dataset is of the form

year    month   houseid-meterid     Datetime    cleaned_quantity
2019    2      m5             2019-02-01              7.0
2019    3      m5             2019-03-01              23.0
2019    4      m5             2019-04-01              14.0
2019    5      m5             2019-05-01              22.0

The expected output should be

year    month   houseid-meterid     Datetime     consumption  consumption-ratio
2019    2      m5             2019-02-01              7.0           0
2019    3      m5             2019-03-01              23.0          3.285
2019    4      m5             2019-04-01              14.0          0.608
2019    5      m5             2019-05-01              22.0          1.571

What am I doing wrong?


Solution

  • Change your function to use next with iter to take the first previous consumption value (falling back to 0 if none exists). Then write the ratio to a consumption_ratio column and return row instead of returning the ratio or 0. Finally, remove the assignment to water_df["consumption_ratio"] = from the groupby call on the last line of the code:

    def consumption_ratio(row):
        """Attach a month-over-month consumption ratio to one group.

        ``row`` is the sub-frame handed over by ``groupby(...).apply``.  Its
        first record supplies the current consumption, month, year and house
        id.  The previous month's record for the same house is looked up in
        the global ``water_df``; for January that is December of the
        previous year.

        The ratio is ``current / previous``.  It is 0 when the group is
        February 2019 (special-cased), when no previous record exists, or
        when the previous consumption is zero.

        Returns a copy of ``row`` with an added ``consumption_ratio`` column;
        the frame passed in is not modified.
        """
        c_consumption = row["consumption"].iloc[0]
        month = row["month"].iloc[0]
        year = row["year"].iloc[0]
        house = row["houseid-meterid"].iloc[0]

        ratio = 0
        if not (month == 2 and year == 2019):
            # January rolls back to December of the previous year.
            if month == 1:
                prev_year, prev_month = year - 1, 12
            else:
                prev_year, prev_month = year, month - 1

            is_previous = (
                (water_df["houseid-meterid"] == house)
                & (water_df["year"] == prev_year)
                & (water_df["month"] == prev_month)
            )
            prev_values = water_df.loc[is_previous, "consumption"]

            if not prev_values.empty:
                prev_consumption = prev_values.iloc[0]
                # Values read from a DataFrame are NumPy scalars: dividing
                # them by zero yields inf instead of raising
                # ZeroDivisionError, so the zero case is tested explicitly.
                if prev_consumption != 0:
                    ratio = c_consumption / prev_consumption

        # assign() returns a new frame, leaving the group untouched.
        return row.assign(consumption_ratio=ratio)
    

    # group_keys=False keeps the original row index on the result; without
    # it, pandas versions that default group_keys to True prepend the group
    # keys (Datetime, houseid-meterid) as extra index levels.
    df = water_df.groupby(['Datetime', 'houseid-meterid'], group_keys=False).apply(consumption_ratio)