test_value_counts.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. import collections
  2. from datetime import timedelta
  3. import numpy as np
  4. import pytest
  5. import pandas as pd
  6. from pandas import (
  7. DatetimeIndex,
  8. Index,
  9. Interval,
  10. IntervalIndex,
  11. MultiIndex,
  12. Series,
  13. Timedelta,
  14. TimedeltaIndex,
  15. array,
  16. )
  17. import pandas._testing as tm
  18. from pandas.tests.base.common import allow_na_ops
  19. @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
  20. def test_value_counts(index_or_series_obj):
  21. obj = index_or_series_obj
  22. obj = np.repeat(obj, range(1, len(obj) + 1))
  23. result = obj.value_counts()
  24. counter = collections.Counter(obj)
  25. expected = Series(dict(counter.most_common()), dtype=np.int64, name="count")
  26. if obj.dtype != np.float16:
  27. expected.index = expected.index.astype(obj.dtype)
  28. else:
  29. with pytest.raises(NotImplementedError, match="float16 indexes are not "):
  30. expected.index.astype(obj.dtype)
  31. return
  32. if isinstance(expected.index, MultiIndex):
  33. expected.index.names = obj.names
  34. else:
  35. expected.index.name = obj.name
  36. if not isinstance(result.dtype, np.dtype):
  37. if getattr(obj.dtype, "storage", "") == "pyarrow":
  38. expected = expected.astype("int64[pyarrow]")
  39. else:
  40. # i.e IntegerDtype
  41. expected = expected.astype("Int64")
  42. # TODO(GH#32514): Order of entries with the same count is inconsistent
  43. # on CI (gh-32449)
  44. if obj.duplicated().any():
  45. result = result.sort_index()
  46. expected = expected.sort_index()
  47. tm.assert_series_equal(result, expected)
  48. @pytest.mark.parametrize("null_obj", [np.nan, None])
  49. @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
  50. def test_value_counts_null(null_obj, index_or_series_obj):
  51. orig = index_or_series_obj
  52. obj = orig.copy()
  53. if not allow_na_ops(obj):
  54. pytest.skip("type doesn't allow for NA operations")
  55. elif len(obj) < 1:
  56. pytest.skip("Test doesn't make sense on empty data")
  57. elif isinstance(orig, MultiIndex):
  58. pytest.skip(f"MultiIndex can't hold '{null_obj}'")
  59. values = obj._values
  60. values[0:2] = null_obj
  61. klass = type(obj)
  62. repeated_values = np.repeat(values, range(1, len(values) + 1))
  63. obj = klass(repeated_values, dtype=obj.dtype)
  64. # because np.nan == np.nan is False, but None == None is True
  65. # np.nan would be duplicated, whereas None wouldn't
  66. counter = collections.Counter(obj.dropna())
  67. expected = Series(dict(counter.most_common()), dtype=np.int64, name="count")
  68. if obj.dtype != np.float16:
  69. expected.index = expected.index.astype(obj.dtype)
  70. else:
  71. with pytest.raises(NotImplementedError, match="float16 indexes are not "):
  72. expected.index.astype(obj.dtype)
  73. return
  74. expected.index.name = obj.name
  75. result = obj.value_counts()
  76. if obj.duplicated().any():
  77. # TODO(GH#32514):
  78. # Order of entries with the same count is inconsistent on CI (gh-32449)
  79. expected = expected.sort_index()
  80. result = result.sort_index()
  81. if not isinstance(result.dtype, np.dtype):
  82. if getattr(obj.dtype, "storage", "") == "pyarrow":
  83. expected = expected.astype("int64[pyarrow]")
  84. else:
  85. # i.e IntegerDtype
  86. expected = expected.astype("Int64")
  87. tm.assert_series_equal(result, expected)
  88. expected[null_obj] = 3
  89. result = obj.value_counts(dropna=False)
  90. if obj.duplicated().any():
  91. # TODO(GH#32514):
  92. # Order of entries with the same count is inconsistent on CI (gh-32449)
  93. expected = expected.sort_index()
  94. result = result.sort_index()
  95. tm.assert_series_equal(result, expected)
  96. def test_value_counts_inferred(index_or_series, using_infer_string):
  97. klass = index_or_series
  98. s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
  99. s = klass(s_values)
  100. expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"], name="count")
  101. tm.assert_series_equal(s.value_counts(), expected)
  102. if isinstance(s, Index):
  103. exp = Index(np.unique(np.array(s_values, dtype=np.object_)))
  104. tm.assert_index_equal(s.unique(), exp)
  105. else:
  106. exp = np.unique(np.array(s_values, dtype=np.object_))
  107. if using_infer_string:
  108. exp = array(exp, dtype="str")
  109. tm.assert_equal(s.unique(), exp)
  110. assert s.nunique() == 4
  111. # don't sort, have to sort after the fact as not sorting is
  112. # platform-dep
  113. hist = s.value_counts(sort=False).sort_values()
  114. expected = Series([3, 1, 4, 2], index=list("acbd"), name="count").sort_values()
  115. tm.assert_series_equal(hist, expected)
  116. # sort ascending
  117. hist = s.value_counts(ascending=True)
  118. expected = Series([1, 2, 3, 4], index=list("cdab"), name="count")
  119. tm.assert_series_equal(hist, expected)
  120. # relative histogram.
  121. hist = s.value_counts(normalize=True)
  122. expected = Series(
  123. [0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"], name="proportion"
  124. )
  125. tm.assert_series_equal(hist, expected)
  126. def test_value_counts_bins(index_or_series, using_infer_string):
  127. klass = index_or_series
  128. s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
  129. s = klass(s_values)
  130. # bins
  131. msg = "bins argument only works with numeric data"
  132. with pytest.raises(TypeError, match=msg):
  133. s.value_counts(bins=1)
  134. s1 = Series([1, 1, 2, 3])
  135. res1 = s1.value_counts(bins=1)
  136. exp1 = Series({Interval(0.997, 3.0): 4}, name="count")
  137. tm.assert_series_equal(res1, exp1)
  138. res1n = s1.value_counts(bins=1, normalize=True)
  139. exp1n = Series({Interval(0.997, 3.0): 1.0}, name="proportion")
  140. tm.assert_series_equal(res1n, exp1n)
  141. if isinstance(s1, Index):
  142. tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
  143. else:
  144. exp = np.array([1, 2, 3], dtype=np.int64)
  145. tm.assert_numpy_array_equal(s1.unique(), exp)
  146. assert s1.nunique() == 3
  147. # these return the same
  148. res4 = s1.value_counts(bins=4, dropna=True)
  149. intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
  150. exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]), name="count")
  151. tm.assert_series_equal(res4, exp4)
  152. res4 = s1.value_counts(bins=4, dropna=False)
  153. intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
  154. exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]), name="count")
  155. tm.assert_series_equal(res4, exp4)
  156. res4n = s1.value_counts(bins=4, normalize=True)
  157. exp4n = Series(
  158. [0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]), name="proportion"
  159. )
  160. tm.assert_series_equal(res4n, exp4n)
  161. # handle NA's properly
  162. s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"]
  163. s = klass(s_values)
  164. expected = Series([4, 3, 2], index=["b", "a", "d"], name="count")
  165. tm.assert_series_equal(s.value_counts(), expected)
  166. if isinstance(s, Index):
  167. exp = Index(["a", "b", np.nan, "d"])
  168. tm.assert_index_equal(s.unique(), exp)
  169. else:
  170. exp = np.array(["a", "b", np.nan, "d"], dtype=object)
  171. if using_infer_string:
  172. exp = array(exp, dtype="str")
  173. tm.assert_equal(s.unique(), exp)
  174. assert s.nunique() == 3
  175. s = klass({}) if klass is dict else klass({}, dtype=object)
  176. expected = Series([], dtype=np.int64, name="count")
  177. tm.assert_series_equal(s.value_counts(), expected, check_index_type=False)
  178. # returned dtype differs depending on original
  179. if isinstance(s, Index):
  180. tm.assert_index_equal(s.unique(), Index([]), exact=False)
  181. else:
  182. tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False)
  183. assert s.nunique() == 0
  184. def test_value_counts_datetime64(index_or_series, unit):
  185. klass = index_or_series
  186. # GH 3002, datetime64[ns]
  187. # don't test names though
  188. df = pd.DataFrame(
  189. {
  190. "person_id": ["xxyyzz", "xxyyzz", "xxyyzz", "xxyyww", "foofoo", "foofoo"],
  191. "dt": pd.to_datetime(
  192. [
  193. "2010-01-01",
  194. "2010-01-01",
  195. "2010-01-01",
  196. "2009-01-01",
  197. "2008-09-09",
  198. "2008-09-09",
  199. ]
  200. ).as_unit(unit),
  201. "food": ["PIE", "GUM", "EGG", "EGG", "PIE", "GUM"],
  202. }
  203. )
  204. s = klass(df["dt"].copy())
  205. s.name = None
  206. idx = pd.to_datetime(
  207. ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"]
  208. ).as_unit(unit)
  209. expected_s = Series([3, 2, 1], index=idx, name="count")
  210. tm.assert_series_equal(s.value_counts(), expected_s)
  211. expected = array(
  212. np.array(
  213. ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"],
  214. dtype=f"datetime64[{unit}]",
  215. )
  216. )
  217. result = s.unique()
  218. if isinstance(s, Index):
  219. tm.assert_index_equal(result, DatetimeIndex(expected))
  220. else:
  221. tm.assert_extension_array_equal(result, expected)
  222. assert s.nunique() == 3
  223. # with NaT
  224. s = df["dt"].copy()
  225. s = klass(list(s.values) + [pd.NaT] * 4)
  226. if klass is Series:
  227. s = s.dt.as_unit(unit)
  228. else:
  229. s = s.as_unit(unit)
  230. result = s.value_counts()
  231. assert result.index.dtype == f"datetime64[{unit}]"
  232. tm.assert_series_equal(result, expected_s)
  233. result = s.value_counts(dropna=False)
  234. expected_s = pd.concat(
  235. [
  236. Series([4], index=DatetimeIndex([pd.NaT]).as_unit(unit), name="count"),
  237. expected_s,
  238. ]
  239. )
  240. tm.assert_series_equal(result, expected_s)
  241. assert s.dtype == f"datetime64[{unit}]"
  242. unique = s.unique()
  243. assert unique.dtype == f"datetime64[{unit}]"
  244. # numpy_array_equal cannot compare pd.NaT
  245. if isinstance(s, Index):
  246. exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]).as_unit(unit)
  247. tm.assert_index_equal(unique, exp_idx)
  248. else:
  249. tm.assert_extension_array_equal(unique[:3], expected)
  250. assert pd.isna(unique[3])
  251. assert s.nunique() == 3
  252. assert s.nunique(dropna=False) == 4
  253. def test_value_counts_timedelta64(index_or_series, unit):
  254. # timedelta64[ns]
  255. klass = index_or_series
  256. day = Timedelta(timedelta(1)).as_unit(unit)
  257. tdi = TimedeltaIndex([day], name="dt").as_unit(unit)
  258. tdvals = np.zeros(6, dtype=f"m8[{unit}]") + day
  259. td = klass(tdvals, name="dt")
  260. result = td.value_counts()
  261. expected_s = Series([6], index=tdi, name="count")
  262. tm.assert_series_equal(result, expected_s)
  263. expected = tdi
  264. result = td.unique()
  265. if isinstance(td, Index):
  266. tm.assert_index_equal(result, expected)
  267. else:
  268. tm.assert_extension_array_equal(result, expected._values)
  269. td2 = day + np.zeros(6, dtype=f"m8[{unit}]")
  270. td2 = klass(td2, name="dt")
  271. result2 = td2.value_counts()
  272. tm.assert_series_equal(result2, expected_s)
  273. @pytest.mark.parametrize("dropna", [True, False])
  274. def test_value_counts_with_nan(dropna, index_or_series):
  275. # GH31944
  276. klass = index_or_series
  277. values = [True, pd.NA, np.nan]
  278. obj = klass(values)
  279. res = obj.value_counts(dropna=dropna)
  280. if dropna is True:
  281. expected = Series([1], index=Index([True], dtype=obj.dtype), name="count")
  282. else:
  283. expected = Series([1, 1, 1], index=[True, pd.NA, np.nan], name="count")
  284. tm.assert_series_equal(res, expected)
  285. def test_value_counts_object_inference_deprecated():
  286. # GH#56161
  287. dti = pd.date_range("2016-01-01", periods=3, tz="UTC")
  288. idx = dti.astype(object)
  289. msg = "The behavior of value_counts with object-dtype is deprecated"
  290. with tm.assert_produces_warning(FutureWarning, match=msg):
  291. res = idx.value_counts()
  292. exp = dti.value_counts()
  293. tm.assert_series_equal(res, exp)