Search code examples
pythonpandasdataframedata-wrangling

(ERROR) Select one object and all float & int in pandas groupby


I have this dataframe.

import pandas as pd

# Sample records. Every value below -- including the scores -- is written as a
# string literal, not as a number.
x = {
  "year": ["2012", "2012", "2013", "2014", "2012", "2014", "2013", "2013", "2012", "2013", "2012", "2014", "2014", "2013", "2012", "2014"],
  "class": ["A", "B", "C", "A", "C", "B", "B", "C", "A", "C", "B", "C", "A", "C", "B", "A"],
  "gender": ["M", "F", "F", "M", "F", "M", "M", "F", "F", "F", "M", "M", "F", "M", "F", "F"],
  "score1": ["6", "6", "8", "10", "6", "7", "6", "7", "8", "7", "10", "9", "9", "9", "8", "9"],
  "score2": ["5", "9", "10", "5", "10", "9", "5", "7", "8", "9", "8", "8", "5", "5", "8", "5"],
  "score3": ["5", "9", "9", "7", "8", "5", "9", "5", "7", "6", "5", "10", "8", "8", "6", "8"],
  "score4": ["10", "8", "8", "10", "9", "8", "10", "9", "7", "8", "10", "9", "7", "7", "10", "7"]
}

# Build the DataFrame from the dict of equal-length lists (one column per key).
data = pd.DataFrame(x)

enter image description here

I want to find the median of every column whose dtype is 'int64', grouped by the class column. So I call groupby on the class column of my df and take the median.

data.groupby('class').median()  # raises TypeError: could not convert string to float: 'M'

But it shows an error on it.

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1490, in GroupBy._cython_agg_general..array_func(values)
   1489 try:
-> 1490     result = self.grouper._cython_operation(
   1491         "aggregate",
   1492         values,
   1493         how,
   1494         axis=data.ndim - 1,
   1495         min_count=min_count,
   1496         **kwargs,
   1497     )
   1498 except NotImplementedError:
   1499     # generally if we have numeric_only=False
   1500     # and non-applicable functions
   1501     # try to python agg
   1502     # TODO: shouldn't min_count matter?

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:959, in BaseGrouper._cython_operation(self, kind, values, how, axis, min_count, **kwargs)
    958 ngroups = self.ngroups
--> 959 return cy_op.cython_operation(
    960     values=values,
    961     axis=axis,
    962     min_count=min_count,
    963     comp_ids=ids,
    964     ngroups=ngroups,
    965     **kwargs,
    966 )

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:657, in WrappedCythonOp.cython_operation(self, values, axis, min_count, comp_ids, ngroups, **kwargs)
    649     return self._ea_wrap_cython_operation(
    650         values,
    651         min_count=min_count,
   (...)
    654         **kwargs,
    655     )
--> 657 return self._cython_op_ndim_compat(
    658     values,
    659     min_count=min_count,
    660     ngroups=ngroups,
    661     comp_ids=comp_ids,
    662     mask=None,
    663     **kwargs,
    664 )

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:497, in WrappedCythonOp._cython_op_ndim_compat(self, values, min_count, ngroups, comp_ids, mask, result_mask, **kwargs)
    495     return res.T
--> 497 return self._call_cython_op(
    498     values,
    499     min_count=min_count,
    500     ngroups=ngroups,
    501     comp_ids=comp_ids,
    502     mask=mask,
    503     result_mask=result_mask,
    504     **kwargs,
    505 )

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:541, in WrappedCythonOp._call_cython_op(self, values, min_count, ngroups, comp_ids, mask, result_mask, **kwargs)
    540 out_shape = self._get_output_shape(ngroups, values)
--> 541 func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric)
    542 values = self._get_cython_vals(values)

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:167, in WrappedCythonOp._get_cython_function(cls, kind, how, dtype, is_numeric)
    165 if how in ["median", "cumprod"]:
    166     # no fused types -> no __signatures__
--> 167     raise NotImplementedError(
    168         f"function is not implemented for this dtype: "
    169         f"[how->{how},dtype->{dtype_str}]"
    170     )
    171 if "object" not in f.__signatures__:
    172     # raise NotImplementedError here rather than TypeError later

NotImplementedError: function is not implemented for this dtype: [how->median,dtype->object]

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\nanops.py:786, in nanmedian(values, axis, skipna, mask)
    785 try:
--> 786     values = values.astype("f8")
    787 except ValueError as err:
    788     # e.g. "could not convert string to float: 'a'"

ValueError: could not convert string to float: 'M'

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
Cell In[135], line 1
----> 1 data.groupby('class').median()

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1883, in GroupBy.median(self, numeric_only)
   1862 @final
   1863 def median(self, numeric_only: bool = False):
   1864     """
   1865     Compute median of groups, excluding missing values.
   1866 
   (...)
   1881         Median of values within each group.
   1882     """
-> 1883     result = self._cython_agg_general(
   1884         "median",
   1885         alt=lambda x: Series(x).median(numeric_only=numeric_only),
   1886         numeric_only=numeric_only,
   1887     )
   1888     return result.__finalize__(self.obj, method="groupby")

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1507, in GroupBy._cython_agg_general(self, how, alt, numeric_only, min_count, **kwargs)
   1503         result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
   1505     return result
-> 1507 new_mgr = data.grouped_reduce(array_func)
   1508 res = self._wrap_agged_manager(new_mgr)
   1509 out = self._wrap_aggregated_output(res)

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\internals\managers.py:1503, in BlockManager.grouped_reduce(self, func)
   1499 if blk.is_object:
   1500     # split on object-dtype blocks bc some columns may raise
   1501     #  while others do not.
   1502     for sb in blk._split():
-> 1503         applied = sb.apply(func)
   1504         result_blocks = extend_blocks(applied, result_blocks)
   1505 else:

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\internals\blocks.py:329, in Block.apply(self, func, **kwargs)
    323 @final
    324 def apply(self, func, **kwargs) -> list[Block]:
    325     """
    326     apply the function to my values; return a block if we are not
    327     one
    328     """
--> 329     result = func(self.values, **kwargs)
    331     return self._split_op_result(result)

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1503, in GroupBy._cython_agg_general..array_func(values)
   1490     result = self.grouper._cython_operation(
   1491         "aggregate",
   1492         values,
   (...)
   1496         **kwargs,
   1497     )
   1498 except NotImplementedError:
   1499     # generally if we have numeric_only=False
   1500     # and non-applicable functions
   1501     # try to python agg
   1502     # TODO: shouldn't min_count matter?
-> 1503     result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
   1505 return result

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1457, in GroupBy._agg_py_fallback(self, values, ndim, alt)
   1452     ser = df.iloc[:, 0]
   1454 # We do not get here with UDFs, so we know that our dtype
   1455 #  should always be preserved by the implemented aggregations
   1456 # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
-> 1457 res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
   1459 if isinstance(values, Categorical):
   1460     # Because we only get here with known dtype-preserving
   1461     #  reductions, we cast back to Categorical.
   1462     # TODO: if we ever get "rank" working, exclude it here.
   1463     res_values = type(values)._from_sequence(res_values, dtype=values.dtype)

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:994, in BaseGrouper.agg_series(self, obj, func, preserve_dtype)
    987 if len(obj) > 0 and not isinstance(obj._values, np.ndarray):
    988     # we can preserve a little bit more aggressively with EA dtype
    989     #  because maybe_cast_pointwise_result will do a try/except
    990     #  with _from_sequence.  NB we are assuming here that _from_sequence
    991     #  is sufficiently strict that it casts appropriately.
    992     preserve_dtype = True
--> 994 result = self._aggregate_series_pure_python(obj, func)
    996 npvalues = lib.maybe_convert_objects(result, try_float=False)
    997 if preserve_dtype:

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:1015, in BaseGrouper._aggregate_series_pure_python(self, obj, func)
   1012 splitter = self._get_splitter(obj, axis=0)
   1014 for i, group in enumerate(splitter):
-> 1015     res = func(group)
   1016     res = libreduction.extract_result(res)
   1018     if not initialized:
   1019         # We only do this validation on the first iteration

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1885, in GroupBy.median..(x)
   1862 @final
   1863 def median(self, numeric_only: bool = False):
   1864     """
   1865     Compute median of groups, excluding missing values.
   1866 
   (...)
   1881         Median of values within each group.
   1882     """
   1883     result = self._cython_agg_general(
   1884         "median",
-> 1885         alt=lambda x: Series(x).median(numeric_only=numeric_only),
   1886         numeric_only=numeric_only,
   1887     )
   1888     return result.__finalize__(self.obj, method="groupby")

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py:11623, in NDFrame._add_numeric_operations..median(self, axis, skipna, numeric_only, **kwargs)
  11606 @doc(
  11607     _num_doc,
  11608     desc="Return the median of the values over the requested axis.",
   (...)
  11621     **kwargs,
  11622 ):
> 11623     return NDFrame.median(self, axis, skipna, numeric_only, **kwargs)

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py:11212, in NDFrame.median(self, axis, skipna, numeric_only, **kwargs)
  11205 def median(
  11206     self,
  11207     axis: Axis | None = 0,
   (...)
  11210     **kwargs,
  11211 ) -> Series | float:
> 11212     return self._stat_function(
  11213         "median", nanops.nanmedian, axis, skipna, numeric_only, **kwargs
  11214     )

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py:11158, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)
  11154     nv.validate_stat_func((), kwargs, fname=name)
  11156 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
> 11158 return self._reduce(
  11159     func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
  11160 )

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\series.py:4670, in Series._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   4665     raise TypeError(
   4666         f"Series.{name} does not allow {kwd_name}={numeric_only} "
   4667         "with non-numeric dtypes."
   4668     )
   4669 with np.errstate(all="ignore"):
-> 4670     return op(delegate, skipna=skipna, **kwds)

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\nanops.py:158, in bottleneck_switch.__call__..f(values, axis, skipna, **kwds)
    156         result = alt(values, axis=axis, skipna=skipna, **kwds)
    157 else:
--> 158     result = alt(values, axis=axis, skipna=skipna, **kwds)
    160 return result

File c:\ProgramData\anaconda3\Lib\site-packages\pandas\core\nanops.py:789, in nanmedian(values, axis, skipna, mask)
    786         values = values.astype("f8")
    787     except ValueError as err:
    788         # e.g. "could not convert string to float: 'a'"
--> 789         raise TypeError(str(err)) from err
    790 if mask is not None:
    791     values[mask] = np.nan

TypeError: could not convert string to float: 'M'

From the traceback above, it looks like groupby is also trying to aggregate the gender column. But when I watched someone on YouTube do this with the same dataframe and the same code, it all worked fine and showed no error.

So the question is:

  • Why is this happening? Is it because I am running newer Python/pandas versions? (I am on Python 3.11.5 and pandas 2.0.3, while the YouTube video I watched was posted 2 years ago.)
  • Am I missing something on the groupby?

Solution

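  • The issue is that the columns score1, score2, score3, and score4 in your DataFrame are stored as strings, not as numeric types, so pandas treats them (like gender) as object columns. In addition, since pandas 2.0 the default of numeric_only in groupby aggregations such as median() is False, so non-numeric columns are no longer dropped silently and raise an error instead; this is most likely why the same code ran without error in the older video. Convert the score columns to numbers and take the median of the numeric columns only. Do this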

    import pandas as pd
    
    x = {
      "year": ["2012", "2012", "2013", "2014", "2012", "2014", "2013", "2013", "2012", "2013", "2012", "2014", "2014", "2013", "2012", "2014"],
      "class": ["A", "B", "C", "A", "C", "B", "B", "C", "A", "C", "B", "C", "A", "C", "B", "A"],
      "gender": ["M", "F", "F", "M", "F", "M", "M", "F", "F", "F", "M", "M", "F", "M", "F", "F"],
      "score1": ["6", "6", "8", "10", "6", "7", "6", "7", "8", "7", "10", "9", "9", "9", "8", "9"],
      "score2": ["5", "9", "10", "5", "10", "9", "5", "7", "8", "9", "8", "8", "5", "5", "8", "5"],
      "score3": ["5", "9", "9", "7", "8", "5", "9", "5", "7", "6", "5", "10", "8", "8", "6", "8"],
      "score4": ["10", "8", "8", "10", "9", "8", "10", "9", "7", "8", "10", "9", "7", "7", "10", "7"]
    }
    
    data = pd.DataFrame(x)
    
    # The scores are string literals, so pandas stores them as object dtype.
    # Convert them to real numbers before aggregating.
    score_cols = ["score1", "score2", "score3", "score4"]
    data[score_cols] = data[score_cols].apply(pd.to_numeric)
    
    # numeric_only=True makes the aggregation skip the columns that are still
    # non-numeric (year, gender) instead of raising on them. The default is
    # False in pandas 2.x, which is what triggers the TypeError.
    result = data.groupby("class").median(numeric_only=True)
    print(result)
    
    

    which gives

          score1  score2  score3  score4
    class                                
    A         9.0     5.0     7.0     7.0
    B         7.0     8.0     6.0    10.0
    C         7.5     8.5     8.0     8.5