test_stat_reductions.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. """
Tests for statistical reductions: mean, sum, prod, median, min, max, var, std, sem, skew, kurt.
  3. """
  4. import inspect
  5. import numpy as np
  6. import pytest
  7. import pandas as pd
  8. from pandas import (
  9. DataFrame,
  10. Series,
  11. date_range,
  12. )
  13. import pandas._testing as tm
  14. class TestDatetimeLikeStatReductions:
  15. @pytest.mark.parametrize("box", [Series, pd.Index, pd.array])
  16. def test_dt64_mean(self, tz_naive_fixture, box):
  17. tz = tz_naive_fixture
  18. dti = date_range("2001-01-01", periods=11, tz=tz)
  19. # shuffle so that we are not just working with monotone-increasing
  20. dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])
  21. dtarr = dti._data
  22. obj = box(dtarr)
  23. assert obj.mean() == pd.Timestamp("2001-01-06", tz=tz)
  24. assert obj.mean(skipna=False) == pd.Timestamp("2001-01-06", tz=tz)
  25. # dtarr[-2] will be the first date 2001-01-1
  26. dtarr[-2] = pd.NaT
  27. obj = box(dtarr)
  28. assert obj.mean() == pd.Timestamp("2001-01-06 07:12:00", tz=tz)
  29. assert obj.mean(skipna=False) is pd.NaT
  30. @pytest.mark.parametrize("box", [Series, pd.Index, pd.array])
  31. @pytest.mark.parametrize("freq", ["s", "h", "D", "W", "B"])
  32. def test_period_mean(self, box, freq):
  33. # GH#24757
  34. dti = date_range("2001-01-01", periods=11)
  35. # shuffle so that we are not just working with monotone-increasing
  36. dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])
  37. warn = FutureWarning if freq == "B" else None
  38. msg = r"PeriodDtype\[B\] is deprecated"
  39. with tm.assert_produces_warning(warn, match=msg):
  40. parr = dti._data.to_period(freq)
  41. obj = box(parr)
  42. with pytest.raises(TypeError, match="ambiguous"):
  43. obj.mean()
  44. with pytest.raises(TypeError, match="ambiguous"):
  45. obj.mean(skipna=True)
  46. # parr[-2] will be the first date 2001-01-1
  47. parr[-2] = pd.NaT
  48. with pytest.raises(TypeError, match="ambiguous"):
  49. obj.mean()
  50. with pytest.raises(TypeError, match="ambiguous"):
  51. obj.mean(skipna=True)
  52. @pytest.mark.parametrize("box", [Series, pd.Index, pd.array])
  53. def test_td64_mean(self, box):
  54. m8values = np.array([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], "m8[D]")
  55. tdi = pd.TimedeltaIndex(m8values).as_unit("ns")
  56. tdarr = tdi._data
  57. obj = box(tdarr, copy=False)
  58. result = obj.mean()
  59. expected = np.array(tdarr).mean()
  60. assert result == expected
  61. tdarr[0] = pd.NaT
  62. assert obj.mean(skipna=False) is pd.NaT
  63. result2 = obj.mean(skipna=True)
  64. assert result2 == tdi[1:].mean()
  65. # exact equality fails by 1 nanosecond
  66. assert result2.round("us") == (result * 11.0 / 10).round("us")
  67. class TestSeriesStatReductions:
  68. # Note: the name TestSeriesStatReductions indicates these tests
  69. # were moved from a series-specific test file, _not_ that these tests are
  70. # intended long-term to be series-specific
  71. def _check_stat_op(
  72. self, name, alternate, string_series_, check_objects=False, check_allna=False
  73. ):
  74. with pd.option_context("use_bottleneck", False):
  75. f = getattr(Series, name)
  76. # add some NaNs
  77. string_series_[5:15] = np.nan
  78. # mean, idxmax, idxmin, min, and max are valid for dates
  79. if name not in ["max", "min", "mean", "median", "std"]:
  80. ds = Series(date_range("1/1/2001", periods=10))
  81. msg = f"does not support reduction '{name}'"
  82. with pytest.raises(TypeError, match=msg):
  83. f(ds)
  84. # skipna or no
  85. assert pd.notna(f(string_series_))
  86. assert pd.isna(f(string_series_, skipna=False))
  87. # check the result is correct
  88. nona = string_series_.dropna()
  89. tm.assert_almost_equal(f(nona), alternate(nona.values))
  90. tm.assert_almost_equal(f(string_series_), alternate(nona.values))
  91. allna = string_series_ * np.nan
  92. if check_allna:
  93. assert np.isnan(f(allna))
  94. # dtype=object with None, it works!
  95. s = Series([1, 2, 3, None, 5])
  96. f(s)
  97. # GH#2888
  98. items = [0]
  99. items.extend(range(2**40, 2**40 + 1000))
  100. s = Series(items, dtype="int64")
  101. tm.assert_almost_equal(float(f(s)), float(alternate(s.values)))
  102. # check date range
  103. if check_objects:
  104. s = Series(pd.bdate_range("1/1/2000", periods=10))
  105. res = f(s)
  106. exp = alternate(s)
  107. assert res == exp
  108. # check on string data
  109. if name not in ["sum", "min", "max"]:
  110. with pytest.raises(TypeError, match=None):
  111. f(Series(list("abc")))
  112. # Invalid axis.
  113. msg = "No axis named 1 for object type Series"
  114. with pytest.raises(ValueError, match=msg):
  115. f(string_series_, axis=1)
  116. if "numeric_only" in inspect.getfullargspec(f).args:
  117. # only the index is string; dtype is float
  118. f(string_series_, numeric_only=True)
  119. def test_sum(self):
  120. string_series = Series(range(20), dtype=np.float64, name="series")
  121. self._check_stat_op("sum", np.sum, string_series, check_allna=False)
  122. def test_mean(self):
  123. string_series = Series(range(20), dtype=np.float64, name="series")
  124. self._check_stat_op("mean", np.mean, string_series)
  125. def test_median(self):
  126. string_series = Series(range(20), dtype=np.float64, name="series")
  127. self._check_stat_op("median", np.median, string_series)
  128. # test with integers, test failure
  129. int_ts = Series(np.ones(10, dtype=int), index=range(10))
  130. tm.assert_almost_equal(np.median(int_ts), int_ts.median())
  131. def test_prod(self):
  132. string_series = Series(range(20), dtype=np.float64, name="series")
  133. self._check_stat_op("prod", np.prod, string_series)
  134. def test_min(self):
  135. string_series = Series(range(20), dtype=np.float64, name="series")
  136. self._check_stat_op("min", np.min, string_series, check_objects=True)
  137. def test_max(self):
  138. string_series = Series(range(20), dtype=np.float64, name="series")
  139. self._check_stat_op("max", np.max, string_series, check_objects=True)
  140. def test_var_std(self):
  141. string_series = Series(range(20), dtype=np.float64, name="series")
  142. datetime_series = Series(
  143. np.arange(10, dtype=np.float64),
  144. index=date_range("2020-01-01", periods=10),
  145. name="ts",
  146. )
  147. alt = lambda x: np.std(x, ddof=1)
  148. self._check_stat_op("std", alt, string_series)
  149. alt = lambda x: np.var(x, ddof=1)
  150. self._check_stat_op("var", alt, string_series)
  151. result = datetime_series.std(ddof=4)
  152. expected = np.std(datetime_series.values, ddof=4)
  153. tm.assert_almost_equal(result, expected)
  154. result = datetime_series.var(ddof=4)
  155. expected = np.var(datetime_series.values, ddof=4)
  156. tm.assert_almost_equal(result, expected)
  157. # 1 - element series with ddof=1
  158. s = datetime_series.iloc[[0]]
  159. result = s.var(ddof=1)
  160. assert pd.isna(result)
  161. result = s.std(ddof=1)
  162. assert pd.isna(result)
  163. def test_sem(self):
  164. string_series = Series(range(20), dtype=np.float64, name="series")
  165. datetime_series = Series(
  166. np.arange(10, dtype=np.float64),
  167. index=date_range("2020-01-01", periods=10),
  168. name="ts",
  169. )
  170. alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
  171. self._check_stat_op("sem", alt, string_series)
  172. result = datetime_series.sem(ddof=4)
  173. expected = np.std(datetime_series.values, ddof=4) / np.sqrt(
  174. len(datetime_series.values)
  175. )
  176. tm.assert_almost_equal(result, expected)
  177. # 1 - element series with ddof=1
  178. s = datetime_series.iloc[[0]]
  179. result = s.sem(ddof=1)
  180. assert pd.isna(result)
  181. def test_skew(self):
  182. sp_stats = pytest.importorskip("scipy.stats")
  183. string_series = Series(range(20), dtype=np.float64, name="series")
  184. alt = lambda x: sp_stats.skew(x, bias=False)
  185. self._check_stat_op("skew", alt, string_series)
  186. # test corner cases, skew() returns NaN unless there's at least 3
  187. # values
  188. min_N = 3
  189. for i in range(1, min_N + 1):
  190. s = Series(np.ones(i))
  191. df = DataFrame(np.ones((i, i)))
  192. if i < min_N:
  193. assert np.isnan(s.skew())
  194. assert np.isnan(df.skew()).all()
  195. else:
  196. assert 0 == s.skew()
  197. assert isinstance(s.skew(), np.float64) # GH53482
  198. assert (df.skew() == 0).all()
  199. def test_kurt(self):
  200. sp_stats = pytest.importorskip("scipy.stats")
  201. string_series = Series(range(20), dtype=np.float64, name="series")
  202. alt = lambda x: sp_stats.kurtosis(x, bias=False)
  203. self._check_stat_op("kurt", alt, string_series)
  204. def test_kurt_corner(self):
  205. # test corner cases, kurt() returns NaN unless there's at least 4
  206. # values
  207. min_N = 4
  208. for i in range(1, min_N + 1):
  209. s = Series(np.ones(i))
  210. df = DataFrame(np.ones((i, i)))
  211. if i < min_N:
  212. assert np.isnan(s.kurt())
  213. assert np.isnan(df.kurt()).all()
  214. else:
  215. assert 0 == s.kurt()
  216. assert isinstance(s.kurt(), np.float64) # GH53482
  217. assert (df.kurt() == 0).all()