| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532 |
- import re
- import numpy as np
- import pytest
- from pandas._libs import lib
- import pandas as pd
- from pandas import (
- DataFrame,
- Index,
- Series,
- Timestamp,
- date_range,
- )
- import pandas._testing as tm
- from pandas.tests.groupby import get_groupby_method_args
class TestNumericOnly:
    """
    Column-retention checks for groupby methods on a frame of mixed dtypes.

    Every test groups the ``df`` fixture by its ``"group"`` column and passes
    the columns it expects to ``_check``, which calls the method once with its
    default arguments and once with ``numeric_only=False``.
    """

    # make sure that we are passing thru kwargs to our agg functions

    @pytest.fixture
    def df(self):
        """Return a three-row frame with one column per dtype under test."""
        # GH3668
        # GH5724
        column_order = [
            "group",
            "int",
            "float",
            "string",
            "object",
            "category_string",
            "category_int",
            "datetime",
            "datetimetz",
            "timedelta",
        ]
        data = {
            "group": [1, 1, 2],
            "int": [1, 2, 3],
            "float": [4.0, 5.0, 6.0],
            "string": Series(["a", "b", "c"], dtype="str"),
            "object": Series(["a", "b", "c"], dtype=object),
            "category_string": Series(list("abc")).astype("category"),
            "category_int": [7, 8, 9],
            "datetime": date_range("20130101", periods=3),
            "datetimetz": date_range("20130101", periods=3, tz="US/Eastern"),
            "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
        }
        return DataFrame(data, columns=column_order)

    @pytest.mark.parametrize("method", ["mean", "median"])
    def test_averages(self, df, method):
        """Compare ``numeric_only=True`` averages against a hand-built frame."""
        # mean / median
        numeric_cols = Index(["int", "float", "category_int"])

        per_group = {
            "category_int": [7.5, 9],
            "float": [4.5, 6.0],
            "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
            "int": [1.5, 3],
            "datetime": [
                Timestamp("2013-01-01 12:00:00"),
                Timestamp("2013-01-03 00:00:00"),
            ],
            "datetimetz": [
                Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
                Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
            ],
        }
        # ``columns`` names only the int/float/category_int entries.
        expected = DataFrame(
            per_group,
            index=Index([1, 2], name="group"),
            columns=[
                "int",
                "float",
                "category_int",
            ],
        )

        grouped = df.groupby("group")
        result = getattr(grouped, method)(numeric_only=True)
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        self._check(df, method, expected.columns, numeric_cols)

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_extrema(self, df, method):
        """Run ``_check`` for ``min`` and ``max``."""
        # TODO: min, max *should* handle
        # categorical (ordered) dtype
        columns = Index(
            [
                "int",
                "float",
                "string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )
        # The same Index serves as both the default and the explicit
        # numeric_only=False expectation.
        self._check(df, method, columns, columns)

    @pytest.mark.parametrize("method", ["first", "last"])
    def test_first_last(self, df, method):
        """Run ``_check`` for ``first``/``last``, expecting every value column."""
        columns = Index(
            [
                "int",
                "float",
                "string",
                "object",
                "category_string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )
        self._check(df, method, columns, columns)

    @pytest.mark.parametrize("method", ["sum", "cumsum"])
    def test_sum_cumsum(self, df, method):
        """Run ``_check`` for ``sum`` and ``cumsum``."""
        numeric_cols = Index(["int", "float", "category_int"])
        if method == "cumsum":
            # cumsum loses string
            all_cols = Index(["int", "float", "category_int", "timedelta"])
        else:
            all_cols = Index(["int", "float", "string", "category_int", "timedelta"])
        self._check(df, method, all_cols, numeric_cols)

    @pytest.mark.parametrize("method", ["prod", "cumprod"])
    def test_prod_cumprod(self, df, method):
        """Run ``_check`` for ``prod`` and ``cumprod``."""
        columns = Index(["int", "float", "category_int"])
        self._check(df, method, columns, columns)

    @pytest.mark.parametrize("method", ["cummin", "cummax"])
    def test_cummin_cummax(self, df, method):
        """Run ``_check`` for ``cummin`` and ``cummax``."""
        # like min, max, but don't include strings
        columns = Index(
            ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
        )
        # GH#15561: numeric_only=False set by default like min/max
        self._check(df, method, columns, columns)

    def _check(self, df, method, expected_columns, expected_columns_numeric):
        """
        Call ``method`` on ``df.groupby("group")`` twice and verify each outcome.

        Parameters
        ----------
        df : DataFrame
            Frame to group by its ``"group"`` column.
        method : str
            Name of the groupby method to look up with ``getattr``.
        expected_columns : Index
            Columns required from the ``numeric_only=False`` call; only
            compared for ``first``/``last``, every other method must raise.
        expected_columns_numeric : Index
            Columns required from the default call; only compared when the
            method is in neither of the two "must raise" lists below.
        """
        grouped = df.groupby("group")

        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        if method.startswith("cum"):
            exception = (NotImplementedError, TypeError)
        else:
            exception = TypeError

        object_agg_failed = re.escape(
            f"agg function failed [how->{method},dtype->object]"
        )
        string_agg_failed = re.escape(
            f"agg function failed [how->{method},dtype->string]"
        )
        str_unsupported = f"dtype 'str' does not support operation '{method}'"
        unordered_categorical = f"Cannot perform {method} with non-ordered Categorical"

        # Decide which message pattern, if any, the default call must raise.
        if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
            # The methods default to numeric_only=False and raise TypeError
            default_error = "|".join(
                [
                    "Categorical is not ordered",
                    unordered_categorical,
                    object_agg_failed,
                    # cumsum/cummin/cummax/cumprod
                    "function is not implemented for this dtype",
                    str_unsupported,
                ]
            )
        elif method in ("sum", "mean", "median", "prod"):
            default_error = "|".join(
                [
                    "category type does not support sum operations",
                    object_agg_failed,
                    string_agg_failed,
                    str_unsupported,
                ]
            )
        else:
            default_error = None

        if default_error is None:
            result = getattr(grouped, method)()
            tm.assert_index_equal(result.columns, expected_columns_numeric)
        else:
            with pytest.raises(exception, match=default_error):
                getattr(grouped, method)()

        # Explicit numeric_only=False: only first/last are expected to succeed.
        if method in ("first", "last"):
            result = getattr(grouped, method)(numeric_only=False)
            tm.assert_index_equal(result.columns, expected_columns)
        else:
            explicit_error = "|".join(
                [
                    "Categorical is not ordered",
                    "category type does not support",
                    "function is not implemented for this dtype",
                    unordered_categorical,
                    object_agg_failed,
                    string_agg_failed,
                    str_unsupported,
                ]
            )
            with pytest.raises(exception, match=explicit_error):
                getattr(grouped, method)(numeric_only=False)
@pytest.mark.parametrize("numeric_only", [True, False, None])
def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string):
    """
    Call each groupby method with ``axis=1`` and, optionally, ``numeric_only``.

    Depending on the method and arguments the call must either raise a
    ``TypeError`` for an unexpected keyword, raise on the string column "E",
    or succeed with a ``FutureWarning`` and match the same method applied to
    the transposed frame.

    Parameters
    ----------
    request
        pytest request object; used to attach an xfail marker.
    groupby_func : str
        Name of the groupby method to look up on the grouped frame.
    numeric_only : bool or None
        Value forwarded as ``numeric_only``; ``None`` means the keyword is
        not passed at all.
    using_infer_string
        When truthy, pyarrow's ``ArrowNotImplementedError`` is accepted in
        addition to ``TypeError`` for the failing-on-strings case.
    """
    if groupby_func in ("idxmax", "idxmin"):
        pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1")
    if groupby_func in ("corrwith", "skew"):
        msg = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1"
        request.applymarker(pytest.mark.xfail(reason=msg))

    values = np.random.default_rng(2).standard_normal((10, 4))
    df = DataFrame(values, columns=["A", "B", "C", "D"])
    df["E"] = "x"
    grouped = df.groupby([1, 2, 3, 1, 2, 3, 1, 2, 3, 4])
    method = getattr(grouped, groupby_func)
    args = get_groupby_method_args(groupby_func, df)

    # when numeric_only is None we don't pass any argument
    kwargs = {"axis": 1}
    if numeric_only is not None:
        kwargs["numeric_only"] = numeric_only

    # Functions expected to reject an explicit numeric_only keyword
    no_args = ("cumprod", "cumsum", "diff", "fillna", "pct_change", "rank", "shift")
    # Functions with axis args
    has_axis = (
        "cumprod",
        "cumsum",
        "diff",
        "pct_change",
        "rank",
        "shift",
        "cummax",
        "cummin",
        "idxmin",
        "idxmax",
        "fillna",
    )
    warn_msg = f"DataFrameGroupBy.{groupby_func} with axis=1 is deprecated"

    if numeric_only is not None and groupby_func in no_args:
        msg = "got an unexpected keyword argument 'numeric_only'"
        if groupby_func not in ["cumprod", "cumsum"]:
            with pytest.raises(TypeError, match=msg):
                method(*args, **kwargs)
        else:
            # cumprod/cumsum are additionally required to emit the axis=1
            # deprecation warning along with the TypeError.
            with pytest.raises(TypeError, match=msg):
                with tm.assert_produces_warning(FutureWarning, match=warn_msg):
                    method(*args, **kwargs)
        return

    if groupby_func not in has_axis:
        msg = "got an unexpected keyword argument 'axis'"
        with pytest.raises(TypeError, match=msg):
            method(*args, **kwargs)
        return

    # fillna and shift are successful even on object dtypes
    if not numeric_only and groupby_func not in ("fillna", "shift"):
        msgs = (
            # cummax, cummin, rank
            "not supported between instances of",
            # cumprod
            "can't multiply sequence by non-int of type 'float'",
            # cumsum, diff, pct_change
            "unsupported operand type",
            "has no kernel",
            "operation 'sub' not supported for dtype 'str' with dtype 'float64'",
        )
        if using_infer_string:
            pa = pytest.importorskip("pyarrow")
            errs = (TypeError, pa.lib.ArrowNotImplementedError)
        else:
            errs = TypeError
        pattern = "(" + "|".join(msgs) + ")"
        with pytest.raises(errs, match=pattern):
            with tm.assert_produces_warning(FutureWarning, match=warn_msg):
                method(*args, **kwargs)
        return

    with tm.assert_produces_warning(FutureWarning, match=warn_msg):
        result = method(*args, **kwargs)

    # Build the expectation by applying the same-named DataFrame method to the
    # transposed frame, then transposing back.
    if numeric_only:
        transposed = df.drop(columns="E").T
    else:
        transposed = df.T
    expected = getattr(transposed, groupby_func)(*args).T
    if groupby_func == "shift" and not numeric_only:
        # shift with axis=1 leaves the leftmost column as numeric
        # but transposing for expected gives us object dtype
        expected = expected.astype(float)

    tm.assert_equal(result, expected)
@pytest.mark.parametrize(
    "kernel, has_arg",
    [
        ("all", False),
        ("any", False),
        ("bfill", False),
        ("corr", True),
        ("corrwith", True),
        ("cov", True),
        ("cummax", True),
        ("cummin", True),
        ("cumprod", True),
        ("cumsum", True),
        ("diff", False),
        ("ffill", False),
        ("fillna", False),
        ("first", True),
        ("idxmax", True),
        ("idxmin", True),
        ("last", True),
        ("max", True),
        ("mean", True),
        ("median", True),
        ("min", True),
        ("nth", False),
        ("nunique", False),
        ("pct_change", False),
        ("prod", True),
        ("quantile", True),
        ("sem", True),
        ("skew", True),
        ("std", True),
        ("sum", True),
        ("var", True),
    ],
)
@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_numeric_only(kernel, has_arg, numeric_only, keys):
    """
    Check how each groupby kernel treats the object column ``"b"``.

    Parameters
    ----------
    kernel : str
        Name of the groupby method under test.
    has_arg : bool
        Whether the op has a numeric_only arg.
    numeric_only : bool or lib.no_default
        Value passed as ``numeric_only``; ``lib.no_default`` means the keyword
        is omitted from the call.
    keys : list of str
        Columns to group by.
    """
    # GH#46072
    df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": [object, object]})

    args = get_groupby_method_args(kernel, df)
    flag_omitted = numeric_only is lib.no_default
    if flag_omitted:
        kwargs = {}
    else:
        kwargs = {"numeric_only": numeric_only}

    grouped = df.groupby(keys)
    method = getattr(grouped, kernel)

    # kernels that work on any dtype and don't have numeric_only arg
    any_dtype_no_arg = ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique")
    # first/last work on any dtype and do have a numeric_only arg
    keeps_object_column = kernel in ("first", "last") or (
        kernel in any_dtype_no_arg and flag_omitted
    )

    if has_arg and numeric_only is True:
        # Cases where b does not appear in the result
        result = method(*args, **kwargs)
        assert "b" not in result.columns
    elif keeps_object_column:
        if kernel == "fillna":
            warn = FutureWarning
        else:
            warn = None
        msg = "DataFrameGroupBy.fillna is deprecated"
        with tm.assert_produces_warning(warn, match=msg):
            result = method(*args, **kwargs)
        assert "b" in result.columns
    elif has_arg:
        assert numeric_only is not True
        # kernels that are successful on any dtype were above; this will fail

        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        if kernel.startswith("cum"):
            exception = NotImplementedError
        else:
            exception = TypeError

        generic_msg = "|".join(
            [
                "not allowed for this dtype",
                "cannot be performed against 'object' dtypes",
                # On PY39 message is "a number"; on PY310 and after is "a real number"
                "must be a string or a.* number",
                "unsupported operand type",
                "function is not implemented for this dtype",
                re.escape(f"agg function failed [how->{kernel},dtype->object]"),
            ]
        )
        # A few kernels are matched against one dedicated message instead.
        kernel_specific_msg = {
            "quantile": "dtype 'object' does not support operation 'quantile'",
            "idxmin": "'<' not supported between instances of 'type' and 'type'",
            "idxmax": "'>' not supported between instances of 'type' and 'type'",
        }
        msg = kernel_specific_msg.get(kernel, generic_msg)
        with pytest.raises(exception, match=msg):
            method(*args, **kwargs)
    elif not has_arg and not flag_omitted:
        with pytest.raises(
            TypeError, match="got an unexpected keyword argument 'numeric_only'"
        ):
            method(*args, **kwargs)
    else:
        assert kernel in ("diff", "pct_change")
        assert numeric_only is lib.no_default
        # Doesn't have numeric_only argument and fails on nuisance columns
        with pytest.raises(TypeError, match=r"unsupported operand type"):
            method(*args, **kwargs)
@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")
@pytest.mark.parametrize("dtype", [bool, int, float, object])
def test_deprecate_numeric_only_series(dtype, groupby_func, request):
    """
    Check ``SeriesGroupBy`` methods by default and with ``numeric_only=True``.

    The first half calls the method with default arguments on object dtype
    (either expecting a ``TypeError`` or comparing with the result for a
    default-dtype Series). The second half passes ``numeric_only=True`` and
    expects a ``TypeError`` when the keyword is unsupported or the dtype is
    object, otherwise the same result as ``numeric_only=False``.

    Parameters
    ----------
    dtype : type
        dtype used to build the Series under test.
    groupby_func : str
        Name of the groupby method under test.
    request
        pytest request object; accepted but not used in this test.
    """
    # GH#46560
    grouper = [0, 0, 1]

    ser = Series([1, 0, 0], dtype=dtype)
    gb = ser.groupby(grouper)

    if groupby_func == "corrwith":
        # corrwith is not implemented on SeriesGroupBy
        assert not hasattr(gb, groupby_func)
        return

    method = getattr(gb, groupby_func)

    # Reference result computed on the same values without an explicit dtype.
    expected_ser = Series([1, 0, 0])
    expected_gb = expected_ser.groupby(grouper)
    expected_method = getattr(expected_gb, groupby_func)

    args = get_groupby_method_args(groupby_func, ser)

    fails_on_numeric_object = (
        "corr",
        "cov",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
        "quantile",
    )
    # ops that give an object result on object input
    obj_result = (
        "first",
        "last",
        "nth",
        "bfill",
        "ffill",
        "shift",
        "sum",
        "diff",
        "pct_change",
        "var",
        "mean",
        "median",
        "min",
        "max",
        "prod",
        "skew",
    )

    # Test default behavior; kernels that fail may be enabled in the future but kernels
    # that succeed should not be allowed to fail (without deprecation, at least)
    if groupby_func in fails_on_numeric_object and dtype is object:
        if groupby_func == "quantile":
            msg = "dtype 'object' does not support operation 'quantile'"
        else:
            msg = "is not supported for object dtype"
        # "fillna" is not in fails_on_numeric_object, so the expected warning
        # on this path is always None.
        with tm.assert_produces_warning(None):
            with pytest.raises(TypeError, match=msg):
                method(*args)
    elif dtype is object:
        warn = FutureWarning if groupby_func == "fillna" else None
        warn_msg = "SeriesGroupBy.fillna is deprecated"
        with tm.assert_produces_warning(warn, match=warn_msg):
            result = method(*args)
        with tm.assert_produces_warning(warn, match=warn_msg):
            expected = expected_method(*args)
        if groupby_func in obj_result:
            expected = expected.astype(object)
        tm.assert_series_equal(result, expected)

    has_numeric_only = (
        "first",
        "last",
        "max",
        "mean",
        "median",
        "min",
        "prod",
        "quantile",
        "sem",
        "skew",
        "std",
        "sum",
        "var",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
    )
    if groupby_func not in has_numeric_only:
        msg = "got an unexpected keyword argument 'numeric_only'"
        with pytest.raises(TypeError, match=msg):
            method(*args, numeric_only=True)
    elif dtype is object:
        msg = "|".join(
            [
                "SeriesGroupBy.sem called with numeric_only=True and dtype object",
                "Series.skew does not allow numeric_only=True with non-numeric",
                "cum(sum|prod|min|max) is not supported for object dtype",
                r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            method(*args, numeric_only=True)
    elif dtype is bool and groupby_func == "quantile":
        msg = "Allowing bool dtype in SeriesGroupBy.quantile"
        # Both calls are kept inside the warning context.
        with tm.assert_produces_warning(FutureWarning, match=msg):
            # GH#51424
            result = method(*args, numeric_only=True)
            expected = method(*args, numeric_only=False)
        tm.assert_series_equal(result, expected)
    else:
        result = method(*args, numeric_only=True)
        expected = method(*args, numeric_only=False)
        tm.assert_series_equal(result, expected)
|