test_quantile.py 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977
  1. import numpy as np
  2. import pytest
  3. from pandas._config import using_string_dtype
  4. import pandas as pd
  5. from pandas import (
  6. DataFrame,
  7. Index,
  8. Series,
  9. Timestamp,
  10. )
  11. import pandas._testing as tm
  12. @pytest.fixture(
  13. params=[["linear", "single"], ["nearest", "table"]], ids=lambda x: "-".join(x)
  14. )
  15. def interp_method(request):
  16. """(interpolation, method) arguments for quantile"""
  17. return request.param
  18. class TestDataFrameQuantile:
  19. @pytest.mark.parametrize(
  20. "df,expected",
  21. [
  22. [
  23. DataFrame(
  24. {
  25. 0: Series(pd.arrays.SparseArray([1, 2])),
  26. 1: Series(pd.arrays.SparseArray([3, 4])),
  27. }
  28. ),
  29. Series([1.5, 3.5], name=0.5),
  30. ],
  31. [
  32. DataFrame(Series([0.0, None, 1.0, 2.0], dtype="Sparse[float]")),
  33. Series([1.0], name=0.5),
  34. ],
  35. ],
  36. )
  37. def test_quantile_sparse(self, df, expected):
  38. # GH#17198
  39. # GH#24600
  40. result = df.quantile()
  41. expected = expected.astype("Sparse[float]")
  42. tm.assert_series_equal(result, expected)
  43. def test_quantile(
  44. self, datetime_frame, interp_method, using_array_manager, request
  45. ):
  46. interpolation, method = interp_method
  47. df = datetime_frame
  48. result = df.quantile(
  49. 0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method
  50. )
  51. expected = Series(
  52. [np.percentile(df[col], 10) for col in df.columns],
  53. index=df.columns,
  54. name=0.1,
  55. )
  56. if interpolation == "linear":
  57. # np.percentile values only comparable to linear interpolation
  58. tm.assert_series_equal(result, expected)
  59. else:
  60. tm.assert_index_equal(result.index, expected.index)
  61. request.applymarker(
  62. pytest.mark.xfail(
  63. using_array_manager, reason="Name set incorrectly for arraymanager"
  64. )
  65. )
  66. assert result.name == expected.name
  67. result = df.quantile(
  68. 0.9, axis=1, numeric_only=True, interpolation=interpolation, method=method
  69. )
  70. expected = Series(
  71. [np.percentile(df.loc[date], 90) for date in df.index],
  72. index=df.index,
  73. name=0.9,
  74. )
  75. if interpolation == "linear":
  76. # np.percentile values only comparable to linear interpolation
  77. tm.assert_series_equal(result, expected)
  78. else:
  79. tm.assert_index_equal(result.index, expected.index)
  80. request.applymarker(
  81. pytest.mark.xfail(
  82. using_array_manager, reason="Name set incorrectly for arraymanager"
  83. )
  84. )
  85. assert result.name == expected.name
  86. def test_empty(self, interp_method):
  87. interpolation, method = interp_method
  88. q = DataFrame({"x": [], "y": []}).quantile(
  89. 0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method
  90. )
  91. assert np.isnan(q["x"]) and np.isnan(q["y"])
  92. def test_non_numeric_exclusion(self, interp_method, request, using_array_manager):
  93. interpolation, method = interp_method
  94. df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]})
  95. rs = df.quantile(
  96. 0.5, numeric_only=True, interpolation=interpolation, method=method
  97. )
  98. xp = df.median(numeric_only=True).rename(0.5)
  99. if interpolation == "nearest":
  100. xp = (xp + 0.5).astype(np.int64)
  101. if method == "table" and using_array_manager:
  102. request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
  103. tm.assert_series_equal(rs, xp)
  104. def test_axis(self, interp_method, request, using_array_manager):
  105. # axis
  106. interpolation, method = interp_method
  107. df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
  108. result = df.quantile(0.5, axis=1, interpolation=interpolation, method=method)
  109. expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
  110. if interpolation == "nearest":
  111. expected = expected.astype(np.int64)
  112. if method == "table" and using_array_manager:
  113. request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
  114. tm.assert_series_equal(result, expected)
  115. result = df.quantile(
  116. [0.5, 0.75], axis=1, interpolation=interpolation, method=method
  117. )
  118. expected = DataFrame(
  119. {1: [1.5, 1.75], 2: [2.5, 2.75], 3: [3.5, 3.75]}, index=[0.5, 0.75]
  120. )
  121. if interpolation == "nearest":
  122. expected.iloc[0, :] -= 0.5
  123. expected.iloc[1, :] += 0.25
  124. expected = expected.astype(np.int64)
  125. tm.assert_frame_equal(result, expected, check_index_type=True)
  126. def test_axis_numeric_only_true(self, interp_method, request, using_array_manager):
  127. # We may want to break API in the future to change this
  128. # so that we exclude non-numeric along the same axis
  129. # See GH #7312
  130. interpolation, method = interp_method
  131. df = DataFrame([[1, 2, 3], ["a", "b", 4]])
  132. result = df.quantile(
  133. 0.5, axis=1, numeric_only=True, interpolation=interpolation, method=method
  134. )
  135. expected = Series([3.0, 4.0], index=[0, 1], name=0.5)
  136. if interpolation == "nearest":
  137. expected = expected.astype(np.int64)
  138. if method == "table" and using_array_manager:
  139. request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
  140. tm.assert_series_equal(result, expected)
  141. def test_quantile_date_range(self, interp_method, request, using_array_manager):
  142. # GH 2460
  143. interpolation, method = interp_method
  144. dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
  145. ser = Series(dti)
  146. df = DataFrame(ser)
  147. result = df.quantile(
  148. numeric_only=False, interpolation=interpolation, method=method
  149. )
  150. expected = Series(
  151. ["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]"
  152. )
  153. if method == "table" and using_array_manager:
  154. request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
  155. tm.assert_series_equal(result, expected)
  156. def test_quantile_axis_mixed(self, interp_method, request, using_array_manager):
  157. # mixed on axis=1
  158. interpolation, method = interp_method
  159. df = DataFrame(
  160. {
  161. "A": [1, 2, 3],
  162. "B": [2.0, 3.0, 4.0],
  163. "C": pd.date_range("20130101", periods=3),
  164. "D": ["foo", "bar", "baz"],
  165. }
  166. )
  167. result = df.quantile(
  168. 0.5, axis=1, numeric_only=True, interpolation=interpolation, method=method
  169. )
  170. expected = Series([1.5, 2.5, 3.5], name=0.5)
  171. if interpolation == "nearest":
  172. expected -= 0.5
  173. if method == "table" and using_array_manager:
  174. request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
  175. tm.assert_series_equal(result, expected)
  176. # must raise
  177. msg = "'<' not supported between instances of 'Timestamp' and 'float'"
  178. with pytest.raises(TypeError, match=msg):
  179. df.quantile(0.5, axis=1, numeric_only=False)
  180. def test_quantile_axis_parameter(self, interp_method, request, using_array_manager):
  181. # GH 9543/9544
  182. interpolation, method = interp_method
  183. if method == "table" and using_array_manager:
  184. request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
  185. df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
  186. result = df.quantile(0.5, axis=0, interpolation=interpolation, method=method)
  187. expected = Series([2.0, 3.0], index=["A", "B"], name=0.5)
  188. if interpolation == "nearest":
  189. expected = expected.astype(np.int64)
  190. tm.assert_series_equal(result, expected)
  191. expected = df.quantile(
  192. 0.5, axis="index", interpolation=interpolation, method=method
  193. )
  194. if interpolation == "nearest":
  195. expected = expected.astype(np.int64)
  196. tm.assert_series_equal(result, expected)
  197. result = df.quantile(0.5, axis=1, interpolation=interpolation, method=method)
  198. expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
  199. if interpolation == "nearest":
  200. expected = expected.astype(np.int64)
  201. tm.assert_series_equal(result, expected)
  202. result = df.quantile(
  203. 0.5, axis="columns", interpolation=interpolation, method=method
  204. )
  205. tm.assert_series_equal(result, expected)
  206. msg = "No axis named -1 for object type DataFrame"
  207. with pytest.raises(ValueError, match=msg):
  208. df.quantile(0.1, axis=-1, interpolation=interpolation, method=method)
  209. msg = "No axis named column for object type DataFrame"
  210. with pytest.raises(ValueError, match=msg):
  211. df.quantile(0.1, axis="column")
  212. def test_quantile_interpolation(self):
  213. # see gh-10174
  214. # interpolation method other than default linear
  215. df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
  216. result = df.quantile(0.5, axis=1, interpolation="nearest")
  217. expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
  218. tm.assert_series_equal(result, expected)
  219. # cross-check interpolation=nearest results in original dtype
  220. exp = np.percentile(
  221. np.array([[1, 2, 3], [2, 3, 4]]),
  222. 0.5,
  223. axis=0,
  224. method="nearest",
  225. )
  226. expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="int64")
  227. tm.assert_series_equal(result, expected)
  228. # float
  229. df = DataFrame({"A": [1.0, 2.0, 3.0], "B": [2.0, 3.0, 4.0]}, index=[1, 2, 3])
  230. result = df.quantile(0.5, axis=1, interpolation="nearest")
  231. expected = Series([1.0, 2.0, 3.0], index=[1, 2, 3], name=0.5)
  232. tm.assert_series_equal(result, expected)
  233. exp = np.percentile(
  234. np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]]),
  235. 0.5,
  236. axis=0,
  237. method="nearest",
  238. )
  239. expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="float64")
  240. tm.assert_series_equal(result, expected)
  241. # axis
  242. result = df.quantile([0.5, 0.75], axis=1, interpolation="lower")
  243. expected = DataFrame(
  244. {1: [1.0, 1.0], 2: [2.0, 2.0], 3: [3.0, 3.0]}, index=[0.5, 0.75]
  245. )
  246. tm.assert_frame_equal(result, expected)
  247. # test degenerate case
  248. df = DataFrame({"x": [], "y": []})
  249. q = df.quantile(0.1, axis=0, interpolation="higher")
  250. assert np.isnan(q["x"]) and np.isnan(q["y"])
  251. # multi
  252. df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
  253. result = df.quantile([0.25, 0.5], interpolation="midpoint")
  254. # https://github.com/numpy/numpy/issues/7163
  255. expected = DataFrame(
  256. [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
  257. index=[0.25, 0.5],
  258. columns=["a", "b", "c"],
  259. )
  260. tm.assert_frame_equal(result, expected)
  261. def test_quantile_interpolation_datetime(self, datetime_frame):
  262. # see gh-10174
  263. # interpolation = linear (default case)
  264. df = datetime_frame
  265. q = df.quantile(0.1, axis=0, numeric_only=True, interpolation="linear")
  266. assert q["A"] == np.percentile(df["A"], 10)
  267. def test_quantile_interpolation_int(self, int_frame):
  268. # see gh-10174
  269. df = int_frame
  270. # interpolation = linear (default case)
  271. q = df.quantile(0.1)
  272. assert q["A"] == np.percentile(df["A"], 10)
  273. # test with and without interpolation keyword
  274. q1 = df.quantile(0.1, axis=0, interpolation="linear")
  275. assert q1["A"] == np.percentile(df["A"], 10)
  276. tm.assert_series_equal(q, q1)
  277. def test_quantile_multi(self, interp_method, request, using_array_manager):
  278. interpolation, method = interp_method
  279. df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
  280. result = df.quantile([0.25, 0.5], interpolation=interpolation, method=method)
  281. expected = DataFrame(
  282. [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
  283. index=[0.25, 0.5],
  284. columns=["a", "b", "c"],
  285. )
  286. if interpolation == "nearest":
  287. expected = expected.astype(np.int64)
  288. if method == "table" and using_array_manager:
  289. request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
  290. tm.assert_frame_equal(result, expected)
  291. def test_quantile_multi_axis_1(self, interp_method, request, using_array_manager):
  292. interpolation, method = interp_method
  293. df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"])
  294. result = df.quantile(
  295. [0.25, 0.5], axis=1, interpolation=interpolation, method=method
  296. )
  297. expected = DataFrame(
  298. [[1.0, 2.0, 3.0]] * 2, index=[0.25, 0.5], columns=[0, 1, 2]
  299. )
  300. if interpolation == "nearest":
  301. expected = expected.astype(np.int64)
  302. if method == "table" and using_array_manager:
  303. request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
  304. tm.assert_frame_equal(result, expected)
  305. def test_quantile_multi_empty(self, interp_method):
  306. interpolation, method = interp_method
  307. result = DataFrame({"x": [], "y": []}).quantile(
  308. [0.1, 0.9], axis=0, interpolation=interpolation, method=method
  309. )
  310. expected = DataFrame(
  311. {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9]
  312. )
  313. tm.assert_frame_equal(result, expected)
  314. def test_quantile_datetime(self, unit):
  315. dti = pd.to_datetime(["2010", "2011"]).as_unit(unit)
  316. df = DataFrame({"a": dti, "b": [0, 5]})
  317. # exclude datetime
  318. result = df.quantile(0.5, numeric_only=True)
  319. expected = Series([2.5], index=["b"], name=0.5)
  320. tm.assert_series_equal(result, expected)
  321. # datetime
  322. result = df.quantile(0.5, numeric_only=False)
  323. expected = Series(
  324. [Timestamp("2010-07-02 12:00:00"), 2.5], index=["a", "b"], name=0.5
  325. )
  326. tm.assert_series_equal(result, expected)
  327. # datetime w/ multi
  328. result = df.quantile([0.5], numeric_only=False)
  329. expected = DataFrame(
  330. {"a": Timestamp("2010-07-02 12:00:00").as_unit(unit), "b": 2.5},
  331. index=[0.5],
  332. )
  333. tm.assert_frame_equal(result, expected)
  334. # axis = 1
  335. df["c"] = pd.to_datetime(["2011", "2012"]).as_unit(unit)
  336. result = df[["a", "c"]].quantile(0.5, axis=1, numeric_only=False)
  337. expected = Series(
  338. [Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")],
  339. index=[0, 1],
  340. name=0.5,
  341. dtype=f"M8[{unit}]",
  342. )
  343. tm.assert_series_equal(result, expected)
  344. result = df[["a", "c"]].quantile([0.5], axis=1, numeric_only=False)
  345. expected = DataFrame(
  346. [[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")]],
  347. index=[0.5],
  348. columns=[0, 1],
  349. dtype=f"M8[{unit}]",
  350. )
  351. tm.assert_frame_equal(result, expected)
  352. # empty when numeric_only=True
  353. result = df[["a", "c"]].quantile(0.5, numeric_only=True)
  354. expected = Series([], index=[], dtype=np.float64, name=0.5)
  355. tm.assert_series_equal(result, expected)
  356. result = df[["a", "c"]].quantile([0.5], numeric_only=True)
  357. expected = DataFrame(index=[0.5], columns=[])
  358. tm.assert_frame_equal(result, expected)
  359. @pytest.mark.parametrize(
  360. "dtype",
  361. [
  362. "datetime64[ns]",
  363. "datetime64[ns, US/Pacific]",
  364. "timedelta64[ns]",
  365. "Period[D]",
  366. ],
  367. )
  368. def test_quantile_dt64_empty(self, dtype, interp_method):
  369. # GH#41544
  370. interpolation, method = interp_method
  371. df = DataFrame(columns=["a", "b"], dtype=dtype)
  372. res = df.quantile(
  373. 0.5, axis=1, numeric_only=False, interpolation=interpolation, method=method
  374. )
  375. expected = Series([], index=[], name=0.5, dtype=dtype)
  376. tm.assert_series_equal(res, expected)
  377. # no columns in result, so no dtype preservation
  378. res = df.quantile(
  379. [0.5],
  380. axis=1,
  381. numeric_only=False,
  382. interpolation=interpolation,
  383. method=method,
  384. )
  385. expected = DataFrame(index=[0.5], columns=[])
  386. tm.assert_frame_equal(res, expected)
  387. @pytest.mark.parametrize("invalid", [-1, 2, [0.5, -1], [0.5, 2]])
  388. def test_quantile_invalid(self, invalid, datetime_frame, interp_method):
  389. msg = "percentiles should all be in the interval \\[0, 1\\]"
  390. interpolation, method = interp_method
  391. with pytest.raises(ValueError, match=msg):
  392. datetime_frame.quantile(invalid, interpolation=interpolation, method=method)
  393. def test_quantile_box(self, interp_method, request, using_array_manager):
  394. interpolation, method = interp_method
  395. if method == "table" and using_array_manager:
  396. request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
  397. df = DataFrame(
  398. {
  399. "A": [
  400. Timestamp("2011-01-01"),
  401. Timestamp("2011-01-02"),
  402. Timestamp("2011-01-03"),
  403. ],
  404. "B": [
  405. Timestamp("2011-01-01", tz="US/Eastern"),
  406. Timestamp("2011-01-02", tz="US/Eastern"),
  407. Timestamp("2011-01-03", tz="US/Eastern"),
  408. ],
  409. "C": [
  410. pd.Timedelta("1 days"),
  411. pd.Timedelta("2 days"),
  412. pd.Timedelta("3 days"),
  413. ],
  414. }
  415. )
  416. res = df.quantile(
  417. 0.5, numeric_only=False, interpolation=interpolation, method=method
  418. )
  419. exp = Series(
  420. [
  421. Timestamp("2011-01-02"),
  422. Timestamp("2011-01-02", tz="US/Eastern"),
  423. pd.Timedelta("2 days"),
  424. ],
  425. name=0.5,
  426. index=["A", "B", "C"],
  427. )
  428. tm.assert_series_equal(res, exp)
  429. res = df.quantile(
  430. [0.5], numeric_only=False, interpolation=interpolation, method=method
  431. )
  432. exp = DataFrame(
  433. [
  434. [
  435. Timestamp("2011-01-02"),
  436. Timestamp("2011-01-02", tz="US/Eastern"),
  437. pd.Timedelta("2 days"),
  438. ]
  439. ],
  440. index=[0.5],
  441. columns=["A", "B", "C"],
  442. )
  443. tm.assert_frame_equal(res, exp)
  444. def test_quantile_box_nat(self):
  445. # DatetimeLikeBlock may be consolidated and contain NaT in different loc
  446. df = DataFrame(
  447. {
  448. "A": [
  449. Timestamp("2011-01-01"),
  450. pd.NaT,
  451. Timestamp("2011-01-02"),
  452. Timestamp("2011-01-03"),
  453. ],
  454. "a": [
  455. Timestamp("2011-01-01"),
  456. Timestamp("2011-01-02"),
  457. pd.NaT,
  458. Timestamp("2011-01-03"),
  459. ],
  460. "B": [
  461. Timestamp("2011-01-01", tz="US/Eastern"),
  462. pd.NaT,
  463. Timestamp("2011-01-02", tz="US/Eastern"),
  464. Timestamp("2011-01-03", tz="US/Eastern"),
  465. ],
  466. "b": [
  467. Timestamp("2011-01-01", tz="US/Eastern"),
  468. Timestamp("2011-01-02", tz="US/Eastern"),
  469. pd.NaT,
  470. Timestamp("2011-01-03", tz="US/Eastern"),
  471. ],
  472. "C": [
  473. pd.Timedelta("1 days"),
  474. pd.Timedelta("2 days"),
  475. pd.Timedelta("3 days"),
  476. pd.NaT,
  477. ],
  478. "c": [
  479. pd.NaT,
  480. pd.Timedelta("1 days"),
  481. pd.Timedelta("2 days"),
  482. pd.Timedelta("3 days"),
  483. ],
  484. },
  485. columns=list("AaBbCc"),
  486. )
  487. res = df.quantile(0.5, numeric_only=False)
  488. exp = Series(
  489. [
  490. Timestamp("2011-01-02"),
  491. Timestamp("2011-01-02"),
  492. Timestamp("2011-01-02", tz="US/Eastern"),
  493. Timestamp("2011-01-02", tz="US/Eastern"),
  494. pd.Timedelta("2 days"),
  495. pd.Timedelta("2 days"),
  496. ],
  497. name=0.5,
  498. index=list("AaBbCc"),
  499. )
  500. tm.assert_series_equal(res, exp)
  501. res = df.quantile([0.5], numeric_only=False)
  502. exp = DataFrame(
  503. [
  504. [
  505. Timestamp("2011-01-02"),
  506. Timestamp("2011-01-02"),
  507. Timestamp("2011-01-02", tz="US/Eastern"),
  508. Timestamp("2011-01-02", tz="US/Eastern"),
  509. pd.Timedelta("2 days"),
  510. pd.Timedelta("2 days"),
  511. ]
  512. ],
  513. index=[0.5],
  514. columns=list("AaBbCc"),
  515. )
  516. tm.assert_frame_equal(res, exp)
  517. def test_quantile_nan(self, interp_method, request, using_array_manager):
  518. interpolation, method = interp_method
  519. if method == "table" and using_array_manager:
  520. request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
  521. # GH 14357 - float block where some cols have missing values
  522. df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)})
  523. df.iloc[-1, 1] = np.nan
  524. res = df.quantile(0.5, interpolation=interpolation, method=method)
  525. exp = Series(
  526. [3.0, 2.5 if interpolation == "linear" else 3.0], index=["a", "b"], name=0.5
  527. )
  528. tm.assert_series_equal(res, exp)
  529. res = df.quantile([0.5, 0.75], interpolation=interpolation, method=method)
  530. exp = DataFrame(
  531. {
  532. "a": [3.0, 4.0],
  533. "b": [2.5, 3.25] if interpolation == "linear" else [3.0, 4.0],
  534. },
  535. index=[0.5, 0.75],
  536. )
  537. tm.assert_frame_equal(res, exp)
  538. res = df.quantile(0.5, axis=1, interpolation=interpolation, method=method)
  539. exp = Series(np.arange(1.0, 6.0), name=0.5)
  540. tm.assert_series_equal(res, exp)
  541. res = df.quantile(
  542. [0.5, 0.75], axis=1, interpolation=interpolation, method=method
  543. )
  544. exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
  545. if interpolation == "nearest":
  546. exp.iloc[1, -1] = np.nan
  547. tm.assert_frame_equal(res, exp)
  548. # full-nan column
  549. df["b"] = np.nan
  550. res = df.quantile(0.5, interpolation=interpolation, method=method)
  551. exp = Series([3.0, np.nan], index=["a", "b"], name=0.5)
  552. tm.assert_series_equal(res, exp)
  553. res = df.quantile([0.5, 0.75], interpolation=interpolation, method=method)
  554. exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75])
  555. tm.assert_frame_equal(res, exp)
  556. def test_quantile_nat(self, interp_method, request, using_array_manager, unit):
  557. interpolation, method = interp_method
  558. if method == "table" and using_array_manager:
  559. request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set."))
  560. # full NaT column
  561. df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}, dtype=f"M8[{unit}]")
  562. res = df.quantile(
  563. 0.5, numeric_only=False, interpolation=interpolation, method=method
  564. )
  565. exp = Series([pd.NaT], index=["a"], name=0.5, dtype=f"M8[{unit}]")
  566. tm.assert_series_equal(res, exp)
  567. res = df.quantile(
  568. [0.5], numeric_only=False, interpolation=interpolation, method=method
  569. )
  570. exp = DataFrame({"a": [pd.NaT]}, index=[0.5], dtype=f"M8[{unit}]")
  571. tm.assert_frame_equal(res, exp)
  572. # mixed non-null / full null column
  573. df = DataFrame(
  574. {
  575. "a": [
  576. Timestamp("2012-01-01"),
  577. Timestamp("2012-01-02"),
  578. Timestamp("2012-01-03"),
  579. ],
  580. "b": [pd.NaT, pd.NaT, pd.NaT],
  581. },
  582. dtype=f"M8[{unit}]",
  583. )
  584. res = df.quantile(
  585. 0.5, numeric_only=False, interpolation=interpolation, method=method
  586. )
  587. exp = Series(
  588. [Timestamp("2012-01-02"), pd.NaT],
  589. index=["a", "b"],
  590. name=0.5,
  591. dtype=f"M8[{unit}]",
  592. )
  593. tm.assert_series_equal(res, exp)
  594. res = df.quantile(
  595. [0.5], numeric_only=False, interpolation=interpolation, method=method
  596. )
  597. exp = DataFrame(
  598. [[Timestamp("2012-01-02"), pd.NaT]],
  599. index=[0.5],
  600. columns=["a", "b"],
  601. dtype=f"M8[{unit}]",
  602. )
  603. tm.assert_frame_equal(res, exp)
  604. def test_quantile_empty_no_rows_floats(self, interp_method):
  605. interpolation, method = interp_method
  606. df = DataFrame(columns=["a", "b"], dtype="float64")
  607. res = df.quantile(0.5, interpolation=interpolation, method=method)
  608. exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5)
  609. tm.assert_series_equal(res, exp)
  610. res = df.quantile([0.5], interpolation=interpolation, method=method)
  611. exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5])
  612. tm.assert_frame_equal(res, exp)
  613. res = df.quantile(0.5, axis=1, interpolation=interpolation, method=method)
  614. exp = Series([], index=[], dtype="float64", name=0.5)
  615. tm.assert_series_equal(res, exp)
  616. res = df.quantile([0.5], axis=1, interpolation=interpolation, method=method)
  617. exp = DataFrame(columns=[], index=[0.5])
  618. tm.assert_frame_equal(res, exp)
  619. def test_quantile_empty_no_rows_ints(self, interp_method):
  620. interpolation, method = interp_method
  621. df = DataFrame(columns=["a", "b"], dtype="int64")
  622. res = df.quantile(0.5, interpolation=interpolation, method=method)
  623. exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5)
  624. tm.assert_series_equal(res, exp)
  625. def test_quantile_empty_no_rows_dt64(self, interp_method):
  626. interpolation, method = interp_method
  627. # datetimes
  628. df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]")
  629. res = df.quantile(
  630. 0.5, numeric_only=False, interpolation=interpolation, method=method
  631. )
  632. exp = Series(
  633. [pd.NaT, pd.NaT], index=["a", "b"], dtype="datetime64[ns]", name=0.5
  634. )
  635. tm.assert_series_equal(res, exp)
  636. # Mixed dt64/dt64tz
  637. df["a"] = df["a"].dt.tz_localize("US/Central")
  638. res = df.quantile(
  639. 0.5, numeric_only=False, interpolation=interpolation, method=method
  640. )
  641. exp = exp.astype(object)
  642. if interpolation == "nearest":
  643. # GH#18463 TODO: would we prefer NaTs here?
  644. msg = "The 'downcast' keyword in fillna is deprecated"
  645. with tm.assert_produces_warning(FutureWarning, match=msg):
  646. exp = exp.fillna(np.nan, downcast=False)
  647. tm.assert_series_equal(res, exp)
  648. # both dt64tz
  649. df["b"] = df["b"].dt.tz_localize("US/Central")
  650. res = df.quantile(
  651. 0.5, numeric_only=False, interpolation=interpolation, method=method
  652. )
  653. exp = exp.astype(df["b"].dtype)
  654. tm.assert_series_equal(res, exp)
  655. def test_quantile_empty_no_columns(self, interp_method):
  656. # GH#23925 _get_numeric_data may drop all columns
  657. interpolation, method = interp_method
  658. df = DataFrame(pd.date_range("1/1/18", periods=5))
  659. df.columns.name = "captain tightpants"
  660. result = df.quantile(
  661. 0.5, numeric_only=True, interpolation=interpolation, method=method
  662. )
  663. expected = Series([], index=[], name=0.5, dtype=np.float64)
  664. expected.index.name = "captain tightpants"
  665. tm.assert_series_equal(result, expected)
  666. result = df.quantile(
  667. [0.5], numeric_only=True, interpolation=interpolation, method=method
  668. )
  669. expected = DataFrame([], index=[0.5], columns=[])
  670. expected.columns.name = "captain tightpants"
  671. tm.assert_frame_equal(result, expected)
  672. def test_quantile_item_cache(
  673. self, using_array_manager, interp_method, using_copy_on_write
  674. ):
  675. # previous behavior incorrect retained an invalid _item_cache entry
  676. interpolation, method = interp_method
  677. df = DataFrame(
  678. np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"]
  679. )
  680. df["D"] = df["A"] * 2
  681. ser = df["A"]
  682. if not using_array_manager:
  683. assert len(df._mgr.blocks) == 2
  684. df.quantile(numeric_only=False, interpolation=interpolation, method=method)
  685. if using_copy_on_write:
  686. ser.iloc[0] = 99
  687. assert df.iloc[0, 0] == df["A"][0]
  688. assert df.iloc[0, 0] != 99
  689. else:
  690. ser.values[0] = 99
  691. assert df.iloc[0, 0] == df["A"][0]
  692. assert df.iloc[0, 0] == 99
  693. def test_invalid_method(self):
  694. with pytest.raises(ValueError, match="Invalid method: foo"):
  695. DataFrame(range(1)).quantile(0.5, method="foo")
  696. def test_table_invalid_interpolation(self):
  697. with pytest.raises(ValueError, match="Invalid interpolation: foo"):
  698. DataFrame(range(1)).quantile(0.5, method="table", interpolation="foo")
  699. class TestQuantileExtensionDtype:
  700. # TODO: tests for axis=1?
  701. # TODO: empty case?
  702. @pytest.fixture(
  703. params=[
  704. pytest.param(
  705. pd.IntervalIndex.from_breaks(range(10)),
  706. marks=pytest.mark.xfail(reason="raises when trying to add Intervals"),
  707. ),
  708. pd.period_range("2016-01-01", periods=9, freq="D"),
  709. pd.date_range("2016-01-01", periods=9, tz="US/Pacific"),
  710. pd.timedelta_range("1 Day", periods=9),
  711. pd.array(np.arange(9), dtype="Int64"),
  712. pd.array(np.arange(9), dtype="Float64"),
  713. ],
  714. ids=lambda x: str(x.dtype),
  715. )
  716. def index(self, request):
  717. # NB: not actually an Index object
  718. idx = request.param
  719. idx.name = "A"
  720. return idx
  721. @pytest.fixture
  722. def obj(self, index, frame_or_series):
  723. # bc index is not always an Index (yet), we need to re-patch .name
  724. obj = frame_or_series(index).copy()
  725. if frame_or_series is Series:
  726. obj.name = "A"
  727. else:
  728. obj.columns = ["A"]
  729. return obj
  730. def compute_quantile(self, obj, qs):
  731. if isinstance(obj, Series):
  732. result = obj.quantile(qs)
  733. else:
  734. result = obj.quantile(qs, numeric_only=False)
  735. return result
  def test_quantile_ea(self, request, obj, index):
      """Median, minimum and maximum of a shuffled extension-typed object."""
      # result should be invariant to shuffling
      order = np.arange(len(index), dtype=np.intp)
      np.random.default_rng(2).shuffle(order)
      shuffled = obj.iloc[order]

      qs = [0.5, 0, 1]
      result = self.compute_quantile(shuffled, qs)

      # match non-nullable casting behavior
      exp_dtype = "Float64" if index.dtype == "Int64" else index.dtype

      # expected here assumes len(index) == 9
      exp_values = [index[4], index[0], index[-1]]
      expected = Series(exp_values, dtype=exp_dtype, index=qs, name="A")
      expected = type(shuffled)(expected)
      tm.assert_equal(result, expected)
  def test_quantile_ea_with_na(self, obj, index):
      """Quantiles are computed over the remaining values when both ends are NA."""
      obj.iloc[0] = index._na_value
      obj.iloc[-1] = index._na_value

      # result should be invariant to shuffling
      order = np.arange(len(index), dtype=np.intp)
      np.random.default_rng(2).shuffle(order)
      shuffled = obj.iloc[order]

      qs = [0.5, 0, 1]
      result = self.compute_quantile(shuffled, qs)

      # expected here assumes len(index) == 9
      exp_values = [index[4], index[1], index[-2]]
      expected = Series(exp_values, dtype=index.dtype, index=qs, name="A")
      expected = type(shuffled)(expected)
      tm.assert_equal(result, expected)
  def test_quantile_ea_all_na(self, request, obj, index):
      """Every quantile of an all-NA object is NA, with the dtype unchanged."""
      na_value = index._na_value
      obj.iloc[:] = na_value

      # Check dtypes were preserved; this was once a problem see GH#39763
      assert np.all(obj.dtypes == index.dtype)

      # result should be invariant to shuffling
      order = np.arange(len(index), dtype=np.intp)
      np.random.default_rng(2).shuffle(order)
      shuffled = obj.iloc[order]

      qs = [0.5, 0, 1]
      result = self.compute_quantile(shuffled, qs)

      # three filled slots, one per requested quantile
      all_na = index.take([-1, -1, -1], allow_fill=True, fill_value=index._na_value)
      expected = Series(all_na, index=qs, name="A")
      expected = type(shuffled)(expected)
      tm.assert_equal(result, expected)
  def test_quantile_ea_scalar(self, request, obj, index):
      """A scalar ``qs`` gives a scalar for a Series and a Series otherwise."""
      # scalar qs

      # result should be invariant to shuffling
      order = np.arange(len(index), dtype=np.intp)
      np.random.default_rng(2).shuffle(order)
      shuffled = obj.iloc[order]

      qs = 0.5
      result = self.compute_quantile(shuffled, qs)

      exp_dtype = "Float64" if index.dtype == "Int64" else index.dtype

      expected = Series({"A": index[4]}, dtype=exp_dtype, name=0.5)
      if not isinstance(shuffled, Series):
          tm.assert_series_equal(result, expected)
          return

      # Series input: compare against the single element of ``expected``
      expected = expected["A"]
      assert result == expected
  @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
  @pytest.mark.parametrize(
      "dtype, expected_data, expected_index, axis",
      [
          ("float64", [], [], 1),
          ("int64", [], [], 1),
          ("float64", [np.nan, np.nan], ["a", "b"], 0),
          ("int64", [np.nan, np.nan], ["a", "b"], 0),
      ],
  )
  def test_empty_numeric(self, dtype, expected_data, expected_index, axis):
      """Median of a row-less numeric frame is a float64 Series named 0.5."""
      # GH 14564
      empty = DataFrame(columns=["a", "b"], dtype=dtype)
      result = empty.quantile(0.5, axis=axis)

      exp_index = Index(expected_index)
      expected = Series(expected_data, name=0.5, index=exp_index, dtype="float64")
      tm.assert_series_equal(result, expected)
  @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
  @pytest.mark.parametrize(
      "dtype, expected_data, expected_index, axis, expected_dtype",
      [
          ("datetime64[ns]", [], [], 1, "datetime64[ns]"),
          ("datetime64[ns]", [pd.NaT, pd.NaT], ["a", "b"], 0, "datetime64[ns]"),
      ],
  )
  def test_empty_datelike(
      self, dtype, expected_data, expected_index, axis, expected_dtype
  ):
      """Median of a row-less datetime frame has the parametrized dtype."""
      # GH 14564
      empty = DataFrame(columns=["a", "b"], dtype=dtype)
      result = empty.quantile(0.5, axis=axis, numeric_only=False)

      exp_index = Index(expected_index)
      expected = Series(
          expected_data, name=0.5, index=exp_index, dtype=expected_dtype
      )
      tm.assert_series_equal(result, expected)
  @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
  @pytest.mark.parametrize(
      "expected_data, expected_index, axis",
      [
          ([np.nan, np.nan], range(2), 1),
          ([], [], 0),
      ],
  )
  def test_datelike_numeric_only(self, expected_data, expected_index, axis):
      """``numeric_only=True`` on datetime-only columns gives a float64 result."""
      # GH 14564
      frame = DataFrame(
          {
              "a": pd.to_datetime(["2010", "2011"]),
              "b": [0, 5],
              "c": pd.to_datetime(["2011", "2012"]),
          }
      )
      # keep only the two datetime columns
      datelike = frame[["a", "c"]]
      result = datelike.quantile(0.5, axis=axis, numeric_only=True)

      exp_index = Index(expected_index)
      expected = Series(expected_data, name=0.5, index=exp_index, dtype=np.float64)
      tm.assert_series_equal(result, expected)