test_cut.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. Categorical,
  6. DataFrame,
  7. DatetimeIndex,
  8. Index,
  9. Interval,
  10. IntervalIndex,
  11. Series,
  12. TimedeltaIndex,
  13. Timestamp,
  14. cut,
  15. date_range,
  16. interval_range,
  17. isna,
  18. qcut,
  19. timedelta_range,
  20. to_datetime,
  21. )
  22. import pandas._testing as tm
  23. from pandas.api.types import CategoricalDtype
  24. import pandas.core.reshape.tile as tmod
  25. def test_simple():
  26. data = np.ones(5, dtype="int64")
  27. result = cut(data, 4, labels=False)
  28. expected = np.array([1, 1, 1, 1, 1])
  29. tm.assert_numpy_array_equal(result, expected, check_dtype=False)
  30. @pytest.mark.parametrize("func", [list, np.array])
  31. def test_bins(func):
  32. data = func([0.2, 1.4, 2.5, 6.2, 9.7, 2.1])
  33. result, bins = cut(data, 3, retbins=True)
  34. intervals = IntervalIndex.from_breaks(bins.round(3))
  35. intervals = intervals.take([0, 0, 0, 1, 2, 0])
  36. expected = Categorical(intervals, ordered=True)
  37. tm.assert_categorical_equal(result, expected)
  38. tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7]))
  39. def test_right():
  40. data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
  41. result, bins = cut(data, 4, right=True, retbins=True)
  42. intervals = IntervalIndex.from_breaks(bins.round(3))
  43. expected = Categorical(intervals, ordered=True)
  44. expected = expected.take([0, 0, 0, 2, 3, 0, 0])
  45. tm.assert_categorical_equal(result, expected)
  46. tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, 7.325, 9.7]))
  47. def test_no_right():
  48. data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
  49. result, bins = cut(data, 4, right=False, retbins=True)
  50. intervals = IntervalIndex.from_breaks(bins.round(3), closed="left")
  51. intervals = intervals.take([0, 0, 0, 2, 3, 0, 1])
  52. expected = Categorical(intervals, ordered=True)
  53. tm.assert_categorical_equal(result, expected)
  54. tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95, 7.325, 9.7095]))
  55. def test_bins_from_interval_index():
  56. c = cut(range(5), 3)
  57. expected = c
  58. result = cut(range(5), bins=expected.categories)
  59. tm.assert_categorical_equal(result, expected)
  60. expected = Categorical.from_codes(
  61. np.append(c.codes, -1), categories=c.categories, ordered=True
  62. )
  63. result = cut(range(6), bins=expected.categories)
  64. tm.assert_categorical_equal(result, expected)
  65. def test_bins_from_interval_index_doc_example():
  66. # Make sure we preserve the bins.
  67. ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
  68. c = cut(ages, bins=[0, 18, 35, 70])
  69. expected = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)])
  70. tm.assert_index_equal(c.categories, expected)
  71. result = cut([25, 20, 50], bins=c.categories)
  72. tm.assert_index_equal(result.categories, expected)
  73. tm.assert_numpy_array_equal(result.codes, np.array([1, 1, 2], dtype="int8"))
  74. def test_bins_not_overlapping_from_interval_index():
  75. # see gh-23980
  76. msg = "Overlapping IntervalIndex is not accepted"
  77. ii = IntervalIndex.from_tuples([(0, 10), (2, 12), (4, 14)])
  78. with pytest.raises(ValueError, match=msg):
  79. cut([5, 6], bins=ii)
  80. def test_bins_not_monotonic():
  81. msg = "bins must increase monotonically"
  82. data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
  83. with pytest.raises(ValueError, match=msg):
  84. cut(data, [0.1, 1.5, 1, 10])
  85. @pytest.mark.parametrize(
  86. "x, bins, expected",
  87. [
  88. (
  89. date_range("2017-12-31", periods=3),
  90. [Timestamp.min, Timestamp("2018-01-01"), Timestamp.max],
  91. IntervalIndex.from_tuples(
  92. [
  93. (Timestamp.min, Timestamp("2018-01-01")),
  94. (Timestamp("2018-01-01"), Timestamp.max),
  95. ]
  96. ),
  97. ),
  98. (
  99. [-1, 0, 1],
  100. np.array(
  101. [np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], dtype="int64"
  102. ),
  103. IntervalIndex.from_tuples(
  104. [(np.iinfo(np.int64).min, 0), (0, np.iinfo(np.int64).max)]
  105. ),
  106. ),
  107. (
  108. [
  109. np.timedelta64(-1, "ns"),
  110. np.timedelta64(0, "ns"),
  111. np.timedelta64(1, "ns"),
  112. ],
  113. np.array(
  114. [
  115. np.timedelta64(-np.iinfo(np.int64).max, "ns"),
  116. np.timedelta64(0, "ns"),
  117. np.timedelta64(np.iinfo(np.int64).max, "ns"),
  118. ]
  119. ),
  120. IntervalIndex.from_tuples(
  121. [
  122. (
  123. np.timedelta64(-np.iinfo(np.int64).max, "ns"),
  124. np.timedelta64(0, "ns"),
  125. ),
  126. (
  127. np.timedelta64(0, "ns"),
  128. np.timedelta64(np.iinfo(np.int64).max, "ns"),
  129. ),
  130. ]
  131. ),
  132. ),
  133. ],
  134. )
  135. def test_bins_monotonic_not_overflowing(x, bins, expected):
  136. # GH 26045
  137. result = cut(x, bins)
  138. tm.assert_index_equal(result.categories, expected)
  139. def test_wrong_num_labels():
  140. msg = "Bin labels must be one fewer than the number of bin edges"
  141. data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
  142. with pytest.raises(ValueError, match=msg):
  143. cut(data, [0, 1, 10], labels=["foo", "bar", "baz"])
  144. @pytest.mark.parametrize(
  145. "x,bins,msg",
  146. [
  147. ([], 2, "Cannot cut empty array"),
  148. ([1, 2, 3], 0.5, "`bins` should be a positive integer"),
  149. ],
  150. )
  151. def test_cut_corner(x, bins, msg):
  152. with pytest.raises(ValueError, match=msg):
  153. cut(x, bins)
  154. @pytest.mark.parametrize("arg", [2, np.eye(2), DataFrame(np.eye(2))])
  155. @pytest.mark.parametrize("cut_func", [cut, qcut])
  156. def test_cut_not_1d_arg(arg, cut_func):
  157. msg = "Input array must be 1 dimensional"
  158. with pytest.raises(ValueError, match=msg):
  159. cut_func(arg, 2)
  160. @pytest.mark.parametrize(
  161. "data",
  162. [
  163. [0, 1, 2, 3, 4, np.inf],
  164. [-np.inf, 0, 1, 2, 3, 4],
  165. [-np.inf, 0, 1, 2, 3, 4, np.inf],
  166. ],
  167. )
  168. def test_int_bins_with_inf(data):
  169. # GH 24314
  170. msg = "cannot specify integer `bins` when input data contains infinity"
  171. with pytest.raises(ValueError, match=msg):
  172. cut(data, bins=3)
  173. def test_cut_out_of_range_more():
  174. # see gh-1511
  175. name = "x"
  176. ser = Series([0, -1, 0, 1, -3], name=name)
  177. ind = cut(ser, [0, 1], labels=False)
  178. exp = Series([np.nan, np.nan, np.nan, 0, np.nan], name=name)
  179. tm.assert_series_equal(ind, exp)
  180. @pytest.mark.parametrize(
  181. "right,breaks,closed",
  182. [
  183. (True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"),
  184. (False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left"),
  185. ],
  186. )
  187. def test_labels(right, breaks, closed):
  188. arr = np.tile(np.arange(0, 1.01, 0.1), 4)
  189. result, bins = cut(arr, 4, retbins=True, right=right)
  190. ex_levels = IntervalIndex.from_breaks(breaks, closed=closed)
  191. tm.assert_index_equal(result.categories, ex_levels)
  192. def test_cut_pass_series_name_to_factor():
  193. name = "foo"
  194. ser = Series(np.random.default_rng(2).standard_normal(100), name=name)
  195. factor = cut(ser, 4)
  196. assert factor.name == name
  197. def test_label_precision():
  198. arr = np.arange(0, 0.73, 0.01)
  199. result = cut(arr, 4, precision=2)
  200. ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, 0.54, 0.72])
  201. tm.assert_index_equal(result.categories, ex_levels)
  202. @pytest.mark.parametrize("labels", [None, False])
  203. def test_na_handling(labels):
  204. arr = np.arange(0, 0.75, 0.01)
  205. arr[::3] = np.nan
  206. result = cut(arr, 4, labels=labels)
  207. result = np.asarray(result)
  208. expected = np.where(isna(arr), np.nan, result)
  209. tm.assert_almost_equal(result, expected)
  210. def test_inf_handling():
  211. data = np.arange(6)
  212. data_ser = Series(data, dtype="int64")
  213. bins = [-np.inf, 2, 4, np.inf]
  214. result = cut(data, bins)
  215. result_ser = cut(data_ser, bins)
  216. ex_uniques = IntervalIndex.from_breaks(bins)
  217. tm.assert_index_equal(result.categories, ex_uniques)
  218. assert result[5] == Interval(4, np.inf)
  219. assert result[0] == Interval(-np.inf, 2)
  220. assert result_ser[5] == Interval(4, np.inf)
  221. assert result_ser[0] == Interval(-np.inf, 2)
  222. def test_cut_out_of_bounds():
  223. arr = np.random.default_rng(2).standard_normal(100)
  224. result = cut(arr, [-1, 0, 1])
  225. mask = isna(result)
  226. ex_mask = (arr < -1) | (arr > 1)
  227. tm.assert_numpy_array_equal(mask, ex_mask)
  228. @pytest.mark.parametrize(
  229. "get_labels,get_expected",
  230. [
  231. (
  232. lambda labels: labels,
  233. lambda labels: Categorical(
  234. ["Medium"] + 4 * ["Small"] + ["Medium", "Large"],
  235. categories=labels,
  236. ordered=True,
  237. ),
  238. ),
  239. (
  240. lambda labels: Categorical.from_codes([0, 1, 2], labels),
  241. lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels),
  242. ),
  243. ],
  244. )
  245. def test_cut_pass_labels(get_labels, get_expected):
  246. bins = [0, 25, 50, 100]
  247. arr = [50, 5, 10, 15, 20, 30, 70]
  248. labels = ["Small", "Medium", "Large"]
  249. result = cut(arr, bins, labels=get_labels(labels))
  250. tm.assert_categorical_equal(result, get_expected(labels))
  251. def test_cut_pass_labels_compat():
  252. # see gh-16459
  253. arr = [50, 5, 10, 15, 20, 30, 70]
  254. labels = ["Good", "Medium", "Bad"]
  255. result = cut(arr, 3, labels=labels)
  256. exp = cut(arr, 3, labels=Categorical(labels, categories=labels, ordered=True))
  257. tm.assert_categorical_equal(result, exp)
  258. @pytest.mark.parametrize("x", [np.arange(11.0), np.arange(11.0) / 1e10])
  259. def test_round_frac_just_works(x):
  260. # It works.
  261. cut(x, 2)
  262. @pytest.mark.parametrize(
  263. "val,precision,expected",
  264. [
  265. (-117.9998, 3, -118),
  266. (117.9998, 3, 118),
  267. (117.9998, 2, 118),
  268. (0.000123456, 2, 0.00012),
  269. ],
  270. )
  271. def test_round_frac(val, precision, expected):
  272. # see gh-1979
  273. result = tmod._round_frac(val, precision=precision)
  274. assert result == expected
  275. def test_cut_return_intervals():
  276. ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
  277. result = cut(ser, 3)
  278. exp_bins = np.linspace(0, 8, num=4).round(3)
  279. exp_bins[0] -= 0.008
  280. expected = Series(
  281. IntervalIndex.from_breaks(exp_bins, closed="right").take(
  282. [0, 0, 0, 1, 1, 1, 2, 2, 2]
  283. )
  284. ).astype(CategoricalDtype(ordered=True))
  285. tm.assert_series_equal(result, expected)
  286. def test_series_ret_bins():
  287. # see gh-8589
  288. ser = Series(np.arange(4))
  289. result, bins = cut(ser, 2, retbins=True)
  290. expected = Series(
  291. IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2)
  292. ).astype(CategoricalDtype(ordered=True))
  293. tm.assert_series_equal(result, expected)
  294. @pytest.mark.parametrize(
  295. "kwargs,msg",
  296. [
  297. ({"duplicates": "drop"}, None),
  298. ({}, "Bin edges must be unique"),
  299. ({"duplicates": "raise"}, "Bin edges must be unique"),
  300. ({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"),
  301. ],
  302. )
  303. def test_cut_duplicates_bin(kwargs, msg):
  304. # see gh-20947
  305. bins = [0, 2, 4, 6, 10, 10]
  306. values = Series(np.array([1, 3, 5, 7, 9]), index=["a", "b", "c", "d", "e"])
  307. if msg is not None:
  308. with pytest.raises(ValueError, match=msg):
  309. cut(values, bins, **kwargs)
  310. else:
  311. result = cut(values, bins, **kwargs)
  312. expected = cut(values, pd.unique(np.asarray(bins)))
  313. tm.assert_series_equal(result, expected)
  314. @pytest.mark.parametrize("data", [9.0, -9.0, 0.0])
  315. @pytest.mark.parametrize("length", [1, 2])
  316. def test_single_bin(data, length):
  317. # see gh-14652, gh-15428
  318. ser = Series([data] * length)
  319. result = cut(ser, 1, labels=False)
  320. expected = Series([0] * length, dtype=np.intp)
  321. tm.assert_series_equal(result, expected)
  322. @pytest.mark.parametrize(
  323. "array_1_writeable,array_2_writeable", [(True, True), (True, False), (False, False)]
  324. )
  325. def test_cut_read_only(array_1_writeable, array_2_writeable):
  326. # issue 18773
  327. array_1 = np.arange(0, 100, 10)
  328. array_1.flags.writeable = array_1_writeable
  329. array_2 = np.arange(0, 100, 10)
  330. array_2.flags.writeable = array_2_writeable
  331. hundred_elements = np.arange(100)
  332. tm.assert_categorical_equal(
  333. cut(hundred_elements, array_1), cut(hundred_elements, array_2)
  334. )
  335. @pytest.mark.parametrize(
  336. "conv",
  337. [
  338. lambda v: Timestamp(v),
  339. lambda v: to_datetime(v),
  340. lambda v: np.datetime64(v),
  341. lambda v: Timestamp(v).to_pydatetime(),
  342. ],
  343. )
  344. def test_datetime_bin(conv):
  345. data = [np.datetime64("2012-12-13"), np.datetime64("2012-12-15")]
  346. bin_data = ["2012-12-12", "2012-12-14", "2012-12-16"]
  347. expected = Series(
  348. IntervalIndex(
  349. [
  350. Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])),
  351. Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])),
  352. ]
  353. )
  354. ).astype(CategoricalDtype(ordered=True))
  355. bins = [conv(v) for v in bin_data]
  356. result = Series(cut(data, bins=bins))
  357. tm.assert_series_equal(result, expected)
  358. @pytest.mark.parametrize("box", [Series, Index, np.array, list])
  359. def test_datetime_cut(unit, box):
  360. # see gh-14714
  361. #
  362. # Testing time data when it comes in various collection types.
  363. data = to_datetime(["2013-01-01", "2013-01-02", "2013-01-03"]).astype(f"M8[{unit}]")
  364. data = box(data)
  365. result, _ = cut(data, 3, retbins=True)
  366. if box is list:
  367. # We don't (yet) do inference on these, so get nanos
  368. unit = "ns"
  369. if unit == "s":
  370. # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
  371. # for why we round to 8 seconds instead of 7
  372. left = DatetimeIndex(
  373. ["2012-12-31 23:57:08", "2013-01-01 16:00:00", "2013-01-02 08:00:00"],
  374. dtype=f"M8[{unit}]",
  375. )
  376. else:
  377. left = DatetimeIndex(
  378. [
  379. "2012-12-31 23:57:07.200000",
  380. "2013-01-01 16:00:00",
  381. "2013-01-02 08:00:00",
  382. ],
  383. dtype=f"M8[{unit}]",
  384. )
  385. right = DatetimeIndex(
  386. ["2013-01-01 16:00:00", "2013-01-02 08:00:00", "2013-01-03 00:00:00"],
  387. dtype=f"M8[{unit}]",
  388. )
  389. exp_intervals = IntervalIndex.from_arrays(left, right)
  390. expected = Series(exp_intervals).astype(CategoricalDtype(ordered=True))
  391. tm.assert_series_equal(Series(result), expected)
  392. @pytest.mark.parametrize("box", [list, np.array, Index, Series])
  393. def test_datetime_tz_cut_mismatched_tzawareness(box):
  394. # GH#54964
  395. bins = box(
  396. [
  397. Timestamp("2013-01-01 04:57:07.200000"),
  398. Timestamp("2013-01-01 21:00:00"),
  399. Timestamp("2013-01-02 13:00:00"),
  400. Timestamp("2013-01-03 05:00:00"),
  401. ]
  402. )
  403. ser = Series(date_range("20130101", periods=3, tz="US/Eastern"))
  404. msg = "Cannot use timezone-naive bins with timezone-aware values"
  405. with pytest.raises(ValueError, match=msg):
  406. cut(ser, bins)
  407. @pytest.mark.parametrize(
  408. "bins",
  409. [
  410. 3,
  411. [
  412. Timestamp("2013-01-01 04:57:07.200000", tz="UTC").tz_convert("US/Eastern"),
  413. Timestamp("2013-01-01 21:00:00", tz="UTC").tz_convert("US/Eastern"),
  414. Timestamp("2013-01-02 13:00:00", tz="UTC").tz_convert("US/Eastern"),
  415. Timestamp("2013-01-03 05:00:00", tz="UTC").tz_convert("US/Eastern"),
  416. ],
  417. ],
  418. )
  419. @pytest.mark.parametrize("box", [list, np.array, Index, Series])
  420. def test_datetime_tz_cut(bins, box):
  421. # see gh-19872
  422. tz = "US/Eastern"
  423. ser = Series(date_range("20130101", periods=3, tz=tz))
  424. if not isinstance(bins, int):
  425. bins = box(bins)
  426. result = cut(ser, bins)
  427. expected = Series(
  428. IntervalIndex(
  429. [
  430. Interval(
  431. Timestamp("2012-12-31 23:57:07.200000", tz=tz),
  432. Timestamp("2013-01-01 16:00:00", tz=tz),
  433. ),
  434. Interval(
  435. Timestamp("2013-01-01 16:00:00", tz=tz),
  436. Timestamp("2013-01-02 08:00:00", tz=tz),
  437. ),
  438. Interval(
  439. Timestamp("2013-01-02 08:00:00", tz=tz),
  440. Timestamp("2013-01-03 00:00:00", tz=tz),
  441. ),
  442. ]
  443. )
  444. ).astype(CategoricalDtype(ordered=True))
  445. tm.assert_series_equal(result, expected)
  446. def test_datetime_nan_error():
  447. msg = "bins must be of datetime64 dtype"
  448. with pytest.raises(ValueError, match=msg):
  449. cut(date_range("20130101", periods=3), bins=[0, 2, 4])
  450. def test_datetime_nan_mask():
  451. result = cut(
  452. date_range("20130102", periods=5), bins=date_range("20130101", periods=2)
  453. )
  454. mask = result.categories.isna()
  455. tm.assert_numpy_array_equal(mask, np.array([False]))
  456. mask = result.isna()
  457. tm.assert_numpy_array_equal(mask, np.array([False, True, True, True, True]))
  458. @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
  459. def test_datetime_cut_roundtrip(tz, unit):
  460. # see gh-19891
  461. ser = Series(date_range("20180101", periods=3, tz=tz, unit=unit))
  462. result, result_bins = cut(ser, 2, retbins=True)
  463. expected = cut(ser, result_bins)
  464. tm.assert_series_equal(result, expected)
  465. if unit == "s":
  466. # TODO: constructing DatetimeIndex with dtype="M8[s]" without truncating
  467. # the first entry here raises in array_to_datetime. Should truncate
  468. # instead of raising?
  469. # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
  470. # for why we round to 8 seconds instead of 7
  471. expected_bins = DatetimeIndex(
  472. ["2017-12-31 23:57:08", "2018-01-02 00:00:00", "2018-01-03 00:00:00"],
  473. dtype=f"M8[{unit}]",
  474. )
  475. else:
  476. expected_bins = DatetimeIndex(
  477. [
  478. "2017-12-31 23:57:07.200000",
  479. "2018-01-02 00:00:00",
  480. "2018-01-03 00:00:00",
  481. ],
  482. dtype=f"M8[{unit}]",
  483. )
  484. expected_bins = expected_bins.tz_localize(tz)
  485. tm.assert_index_equal(result_bins, expected_bins)
  486. def test_timedelta_cut_roundtrip():
  487. # see gh-19891
  488. ser = Series(timedelta_range("1day", periods=3))
  489. result, result_bins = cut(ser, 2, retbins=True)
  490. expected = cut(ser, result_bins)
  491. tm.assert_series_equal(result, expected)
  492. expected_bins = TimedeltaIndex(
  493. ["0 days 23:57:07.200000", "2 days 00:00:00", "3 days 00:00:00"]
  494. )
  495. tm.assert_index_equal(result_bins, expected_bins)
  496. @pytest.mark.parametrize("bins", [6, 7])
  497. @pytest.mark.parametrize(
  498. "box, compare",
  499. [
  500. (Series, tm.assert_series_equal),
  501. (np.array, tm.assert_categorical_equal),
  502. (list, tm.assert_equal),
  503. ],
  504. )
  505. def test_cut_bool_coercion_to_int(bins, box, compare):
  506. # issue 20303
  507. data_expected = box([0, 1, 1, 0, 1] * 10)
  508. data_result = box([False, True, True, False, True] * 10)
  509. expected = cut(data_expected, bins, duplicates="drop")
  510. result = cut(data_result, bins, duplicates="drop")
  511. compare(result, expected)
  512. @pytest.mark.parametrize("labels", ["foo", 1, True])
  513. def test_cut_incorrect_labels(labels):
  514. # GH 13318
  515. values = range(5)
  516. msg = "Bin labels must either be False, None or passed in as a list-like argument"
  517. with pytest.raises(ValueError, match=msg):
  518. cut(values, 4, labels=labels)
  519. @pytest.mark.parametrize("bins", [3, [0, 5, 15]])
  520. @pytest.mark.parametrize("right", [True, False])
  521. @pytest.mark.parametrize("include_lowest", [True, False])
  522. def test_cut_nullable_integer(bins, right, include_lowest):
  523. a = np.random.default_rng(2).integers(0, 10, size=50).astype(float)
  524. a[::2] = np.nan
  525. result = cut(
  526. pd.array(a, dtype="Int64"), bins, right=right, include_lowest=include_lowest
  527. )
  528. expected = cut(a, bins, right=right, include_lowest=include_lowest)
  529. tm.assert_categorical_equal(result, expected)
  530. @pytest.mark.parametrize(
  531. "data, bins, labels, expected_codes, expected_labels",
  532. [
  533. ([15, 17, 19], [14, 16, 18, 20], ["A", "B", "A"], [0, 1, 0], ["A", "B"]),
  534. ([1, 3, 5], [0, 2, 4, 6, 8], [2, 0, 1, 2], [2, 0, 1], [0, 1, 2]),
  535. ],
  536. )
  537. def test_cut_non_unique_labels(data, bins, labels, expected_codes, expected_labels):
  538. # GH 33141
  539. result = cut(data, bins=bins, labels=labels, ordered=False)
  540. expected = Categorical.from_codes(
  541. expected_codes, categories=expected_labels, ordered=False
  542. )
  543. tm.assert_categorical_equal(result, expected)
  544. @pytest.mark.parametrize(
  545. "data, bins, labels, expected_codes, expected_labels",
  546. [
  547. ([15, 17, 19], [14, 16, 18, 20], ["C", "B", "A"], [0, 1, 2], ["C", "B", "A"]),
  548. ([1, 3, 5], [0, 2, 4, 6, 8], [3, 0, 1, 2], [0, 1, 2], [3, 0, 1, 2]),
  549. ],
  550. )
  551. def test_cut_unordered_labels(data, bins, labels, expected_codes, expected_labels):
  552. # GH 33141
  553. result = cut(data, bins=bins, labels=labels, ordered=False)
  554. expected = Categorical.from_codes(
  555. expected_codes, categories=expected_labels, ordered=False
  556. )
  557. tm.assert_categorical_equal(result, expected)
  558. def test_cut_unordered_with_missing_labels_raises_error():
  559. # GH 33141
  560. msg = "'labels' must be provided if 'ordered = False'"
  561. with pytest.raises(ValueError, match=msg):
  562. cut([0.5, 3], bins=[0, 1, 2], ordered=False)
  563. def test_cut_unordered_with_series_labels():
  564. # https://github.com/pandas-dev/pandas/issues/36603
  565. ser = Series([1, 2, 3, 4, 5])
  566. bins = Series([0, 2, 4, 6])
  567. labels = Series(["a", "b", "c"])
  568. result = cut(ser, bins=bins, labels=labels, ordered=False)
  569. expected = Series(["a", "a", "b", "b", "c"], dtype="category")
  570. tm.assert_series_equal(result, expected)
  571. def test_cut_no_warnings():
  572. df = DataFrame({"value": np.random.default_rng(2).integers(0, 100, 20)})
  573. labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)]
  574. with tm.assert_produces_warning(False):
  575. df["group"] = cut(df.value, range(0, 105, 10), right=False, labels=labels)
  576. def test_cut_with_duplicated_index_lowest_included():
  577. # GH 42185
  578. expected = Series(
  579. [Interval(-0.001, 2, closed="right")] * 3
  580. + [Interval(2, 4, closed="right"), Interval(-0.001, 2, closed="right")],
  581. index=[0, 1, 2, 3, 0],
  582. dtype="category",
  583. ).cat.as_ordered()
  584. ser = Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0])
  585. result = cut(ser, bins=[0, 2, 4], include_lowest=True)
  586. tm.assert_series_equal(result, expected)
  587. @pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning")
  588. def test_cut_with_nonexact_categorical_indices():
  589. # GH 42424
  590. ser = Series(range(100))
  591. ser1 = cut(ser, 10).value_counts().head(5)
  592. ser2 = cut(ser, 10).value_counts().tail(5)
  593. result = DataFrame({"1": ser1, "2": ser2})
  594. index = pd.CategoricalIndex(
  595. [
  596. Interval(-0.099, 9.9, closed="right"),
  597. Interval(9.9, 19.8, closed="right"),
  598. Interval(19.8, 29.7, closed="right"),
  599. Interval(29.7, 39.6, closed="right"),
  600. Interval(39.6, 49.5, closed="right"),
  601. Interval(49.5, 59.4, closed="right"),
  602. Interval(59.4, 69.3, closed="right"),
  603. Interval(69.3, 79.2, closed="right"),
  604. Interval(79.2, 89.1, closed="right"),
  605. Interval(89.1, 99, closed="right"),
  606. ],
  607. ordered=True,
  608. )
  609. expected = DataFrame(
  610. {"1": [10] * 5 + [np.nan] * 5, "2": [np.nan] * 5 + [10] * 5}, index=index
  611. )
  612. tm.assert_frame_equal(expected, result)
  613. def test_cut_with_timestamp_tuple_labels():
  614. # GH 40661
  615. labels = [(Timestamp(10),), (Timestamp(20),), (Timestamp(30),)]
  616. result = cut([2, 4, 6], bins=[1, 3, 5, 7], labels=labels)
  617. expected = Categorical.from_codes([0, 1, 2], labels, ordered=True)
  618. tm.assert_categorical_equal(result, expected)
  619. def test_cut_bins_datetime_intervalindex():
  620. # https://github.com/pandas-dev/pandas/issues/46218
  621. bins = interval_range(Timestamp("2022-02-25"), Timestamp("2022-02-27"), freq="1D")
  622. # passing Series instead of list is important to trigger bug
  623. result = cut(Series([Timestamp("2022-02-26")]).astype("M8[ns]"), bins=bins)
  624. expected = Categorical.from_codes([0], bins, ordered=True)
  625. tm.assert_categorical_equal(result.array, expected)
  626. def test_cut_with_nullable_int64():
  627. # GH 30787
  628. series = Series([0, 1, 2, 3, 4, pd.NA, 6, 7], dtype="Int64")
  629. bins = [0, 2, 4, 6, 8]
  630. intervals = IntervalIndex.from_breaks(bins)
  631. expected = Series(
  632. Categorical.from_codes([-1, 0, 0, 1, 1, -1, 2, 3], intervals, ordered=True)
  633. )
  634. result = cut(series, bins=bins)
  635. tm.assert_series_equal(result, expected)