# test_qcut.py (8.3 KB)
import os

import numpy as np
import pytest

import pandas as pd
from pandas import (
    Categorical,
    DatetimeIndex,
    Interval,
    IntervalIndex,
    NaT,
    Series,
    Timedelta,
    TimedeltaIndex,
    Timestamp,
    cut,
    date_range,
    isna,
    qcut,
    timedelta_range,
)
import pandas._testing as tm
from pandas.api.types import CategoricalDtype

from pandas.tseries.offsets import Day
  24. def test_qcut():
  25. arr = np.random.default_rng(2).standard_normal(1000)
  26. # We store the bins as Index that have been
  27. # rounded to comparisons are a bit tricky.
  28. labels, _ = qcut(arr, 4, retbins=True)
  29. ex_bins = np.quantile(arr, [0, 0.25, 0.5, 0.75, 1.0])
  30. result = labels.categories.left.values
  31. assert np.allclose(result, ex_bins[:-1], atol=1e-2)
  32. result = labels.categories.right.values
  33. assert np.allclose(result, ex_bins[1:], atol=1e-2)
  34. ex_levels = cut(arr, ex_bins, include_lowest=True)
  35. tm.assert_categorical_equal(labels, ex_levels)
  36. def test_qcut_bounds():
  37. arr = np.random.default_rng(2).standard_normal(1000)
  38. factor = qcut(arr, 10, labels=False)
  39. assert len(np.unique(factor)) == 10
  40. def test_qcut_specify_quantiles():
  41. arr = np.random.default_rng(2).standard_normal(100)
  42. factor = qcut(arr, [0, 0.25, 0.5, 0.75, 1.0])
  43. expected = qcut(arr, 4)
  44. tm.assert_categorical_equal(factor, expected)
  45. def test_qcut_all_bins_same():
  46. with pytest.raises(ValueError, match="edges.*unique"):
  47. qcut([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3)
  48. def test_qcut_include_lowest():
  49. values = np.arange(10)
  50. ii = qcut(values, 4)
  51. ex_levels = IntervalIndex(
  52. [
  53. Interval(-0.001, 2.25),
  54. Interval(2.25, 4.5),
  55. Interval(4.5, 6.75),
  56. Interval(6.75, 9),
  57. ]
  58. )
  59. tm.assert_index_equal(ii.categories, ex_levels)
  60. def test_qcut_nas():
  61. arr = np.random.default_rng(2).standard_normal(100)
  62. arr[:20] = np.nan
  63. result = qcut(arr, 4)
  64. assert isna(result[:20]).all()
  65. def test_qcut_index():
  66. result = qcut([0, 2], 2)
  67. intervals = [Interval(-0.001, 1), Interval(1, 2)]
  68. expected = Categorical(intervals, ordered=True)
  69. tm.assert_categorical_equal(result, expected)
  70. def test_qcut_binning_issues(datapath):
  71. # see gh-1978, gh-1979
  72. cut_file = datapath(os.path.join("reshape", "data", "cut_data.csv"))
  73. arr = np.loadtxt(cut_file)
  74. result = qcut(arr, 20)
  75. starts = []
  76. ends = []
  77. for lev in np.unique(result):
  78. s = lev.left
  79. e = lev.right
  80. assert s != e
  81. starts.append(float(s))
  82. ends.append(float(e))
  83. for (sp, sn), (ep, en) in zip(
  84. zip(starts[:-1], starts[1:]), zip(ends[:-1], ends[1:])
  85. ):
  86. assert sp < sn
  87. assert ep < en
  88. assert ep <= sn
  89. def test_qcut_return_intervals():
  90. ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
  91. res = qcut(ser, [0, 0.333, 0.666, 1])
  92. exp_levels = np.array(
  93. [Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)]
  94. )
  95. exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(
  96. CategoricalDtype(ordered=True)
  97. )
  98. tm.assert_series_equal(res, exp)
  99. @pytest.mark.parametrize("labels", ["foo", 1, True])
  100. def test_qcut_incorrect_labels(labels):
  101. # GH 13318
  102. values = range(5)
  103. msg = "Bin labels must either be False, None or passed in as a list-like argument"
  104. with pytest.raises(ValueError, match=msg):
  105. qcut(values, 4, labels=labels)
  106. @pytest.mark.parametrize("labels", [["a", "b", "c"], list(range(3))])
  107. def test_qcut_wrong_length_labels(labels):
  108. # GH 13318
  109. values = range(10)
  110. msg = "Bin labels must be one fewer than the number of bin edges"
  111. with pytest.raises(ValueError, match=msg):
  112. qcut(values, 4, labels=labels)
  113. @pytest.mark.parametrize(
  114. "labels, expected",
  115. [
  116. (["a", "b", "c"], Categorical(["a", "b", "c"], ordered=True)),
  117. (list(range(3)), Categorical([0, 1, 2], ordered=True)),
  118. ],
  119. )
  120. def test_qcut_list_like_labels(labels, expected):
  121. # GH 13318
  122. values = range(3)
  123. result = qcut(values, 3, labels=labels)
  124. tm.assert_categorical_equal(result, expected)
  125. @pytest.mark.parametrize(
  126. "kwargs,msg",
  127. [
  128. ({"duplicates": "drop"}, None),
  129. ({}, "Bin edges must be unique"),
  130. ({"duplicates": "raise"}, "Bin edges must be unique"),
  131. ({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"),
  132. ],
  133. )
  134. def test_qcut_duplicates_bin(kwargs, msg):
  135. # see gh-7751
  136. values = [0, 0, 0, 0, 1, 2, 3]
  137. if msg is not None:
  138. with pytest.raises(ValueError, match=msg):
  139. qcut(values, 3, **kwargs)
  140. else:
  141. result = qcut(values, 3, **kwargs)
  142. expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
  143. tm.assert_index_equal(result.categories, expected)
  144. @pytest.mark.parametrize(
  145. "data,start,end", [(9.0, 8.999, 9.0), (0.0, -0.001, 0.0), (-9.0, -9.001, -9.0)]
  146. )
  147. @pytest.mark.parametrize("length", [1, 2])
  148. @pytest.mark.parametrize("labels", [None, False])
  149. def test_single_quantile(data, start, end, length, labels):
  150. # see gh-15431
  151. ser = Series([data] * length)
  152. result = qcut(ser, 1, labels=labels)
  153. if labels is None:
  154. intervals = IntervalIndex([Interval(start, end)] * length, closed="right")
  155. expected = Series(intervals).astype(CategoricalDtype(ordered=True))
  156. else:
  157. expected = Series([0] * length, dtype=np.intp)
  158. tm.assert_series_equal(result, expected)
  159. @pytest.mark.parametrize(
  160. "ser",
  161. [
  162. Series(DatetimeIndex(["20180101", NaT, "20180103"])),
  163. Series(TimedeltaIndex(["0 days", NaT, "2 days"])),
  164. ],
  165. ids=lambda x: str(x.dtype),
  166. )
  167. def test_qcut_nat(ser, unit):
  168. # see gh-19768
  169. ser = ser.dt.as_unit(unit)
  170. td = Timedelta(1, unit=unit).as_unit(unit)
  171. left = Series([ser[0] - td, np.nan, ser[2] - Day()], dtype=ser.dtype)
  172. right = Series([ser[2] - Day(), np.nan, ser[2]], dtype=ser.dtype)
  173. intervals = IntervalIndex.from_arrays(left, right)
  174. expected = Series(Categorical(intervals, ordered=True))
  175. result = qcut(ser, 2)
  176. tm.assert_series_equal(result, expected)
  177. @pytest.mark.parametrize("bins", [3, np.linspace(0, 1, 4)])
  178. def test_datetime_tz_qcut(bins):
  179. # see gh-19872
  180. tz = "US/Eastern"
  181. ser = Series(date_range("20130101", periods=3, tz=tz))
  182. result = qcut(ser, bins)
  183. expected = Series(
  184. IntervalIndex(
  185. [
  186. Interval(
  187. Timestamp("2012-12-31 23:59:59.999999999", tz=tz),
  188. Timestamp("2013-01-01 16:00:00", tz=tz),
  189. ),
  190. Interval(
  191. Timestamp("2013-01-01 16:00:00", tz=tz),
  192. Timestamp("2013-01-02 08:00:00", tz=tz),
  193. ),
  194. Interval(
  195. Timestamp("2013-01-02 08:00:00", tz=tz),
  196. Timestamp("2013-01-03 00:00:00", tz=tz),
  197. ),
  198. ]
  199. )
  200. ).astype(CategoricalDtype(ordered=True))
  201. tm.assert_series_equal(result, expected)
  202. @pytest.mark.parametrize(
  203. "arg,expected_bins",
  204. [
  205. [
  206. timedelta_range("1day", periods=3),
  207. TimedeltaIndex(["1 days", "2 days", "3 days"]),
  208. ],
  209. [
  210. date_range("20180101", periods=3),
  211. DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"]),
  212. ],
  213. ],
  214. )
  215. def test_date_like_qcut_bins(arg, expected_bins):
  216. # see gh-19891
  217. ser = Series(arg)
  218. result, result_bins = qcut(ser, 2, retbins=True)
  219. tm.assert_index_equal(result_bins, expected_bins)
  220. @pytest.mark.parametrize("bins", [6, 7])
  221. @pytest.mark.parametrize(
  222. "box, compare",
  223. [
  224. (Series, tm.assert_series_equal),
  225. (np.array, tm.assert_categorical_equal),
  226. (list, tm.assert_equal),
  227. ],
  228. )
  229. def test_qcut_bool_coercion_to_int(bins, box, compare):
  230. # issue 20303
  231. data_expected = box([0, 1, 1, 0, 1] * 10)
  232. data_result = box([False, True, True, False, True] * 10)
  233. expected = qcut(data_expected, bins, duplicates="drop")
  234. result = qcut(data_result, bins, duplicates="drop")
  235. compare(result, expected)
  236. @pytest.mark.parametrize("q", [2, 5, 10])
  237. def test_qcut_nullable_integer(q, any_numeric_ea_dtype):
  238. arr = pd.array(np.arange(100), dtype=any_numeric_ea_dtype)
  239. arr[::2] = pd.NA
  240. result = qcut(arr, q)
  241. expected = qcut(arr.astype(float), q)
  242. tm.assert_categorical_equal(result, expected)