test_rank.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507
  1. from datetime import (
  2. datetime,
  3. timedelta,
  4. )
  5. import numpy as np
  6. import pytest
  7. from pandas._libs.algos import (
  8. Infinity,
  9. NegInfinity,
  10. )
  11. from pandas import (
  12. DataFrame,
  13. Index,
  14. Series,
  15. )
  16. import pandas._testing as tm
  17. class TestRank:
  18. s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])
  19. df = DataFrame({"A": s, "B": s})
  20. results = {
  21. "average": np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]),
  22. "min": np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]),
  23. "max": np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]),
  24. "first": np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]),
  25. "dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]),
  26. }
  27. @pytest.fixture(params=["average", "min", "max", "first", "dense"])
  28. def method(self, request):
  29. """
  30. Fixture for trying all rank methods
  31. """
  32. return request.param
  33. def test_rank(self, float_frame):
  34. sp_stats = pytest.importorskip("scipy.stats")
  35. float_frame.loc[::2, "A"] = np.nan
  36. float_frame.loc[::3, "B"] = np.nan
  37. float_frame.loc[::4, "C"] = np.nan
  38. float_frame.loc[::5, "D"] = np.nan
  39. ranks0 = float_frame.rank()
  40. ranks1 = float_frame.rank(1)
  41. mask = np.isnan(float_frame.values)
  42. fvals = float_frame.fillna(np.inf).values
  43. exp0 = np.apply_along_axis(sp_stats.rankdata, 0, fvals)
  44. exp0[mask] = np.nan
  45. exp1 = np.apply_along_axis(sp_stats.rankdata, 1, fvals)
  46. exp1[mask] = np.nan
  47. tm.assert_almost_equal(ranks0.values, exp0)
  48. tm.assert_almost_equal(ranks1.values, exp1)
  49. # integers
  50. df = DataFrame(
  51. np.random.default_rng(2).integers(0, 5, size=40).reshape((10, 4))
  52. )
  53. result = df.rank()
  54. exp = df.astype(float).rank()
  55. tm.assert_frame_equal(result, exp)
  56. result = df.rank(1)
  57. exp = df.astype(float).rank(1)
  58. tm.assert_frame_equal(result, exp)
  59. def test_rank2(self):
  60. df = DataFrame([[1, 3, 2], [1, 2, 3]])
  61. expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0
  62. result = df.rank(1, pct=True)
  63. tm.assert_frame_equal(result, expected)
  64. df = DataFrame([[1, 3, 2], [1, 2, 3]])
  65. expected = df.rank(0) / 2.0
  66. result = df.rank(0, pct=True)
  67. tm.assert_frame_equal(result, expected)
  68. df = DataFrame([["b", "c", "a"], ["a", "c", "b"]])
  69. expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]])
  70. result = df.rank(1, numeric_only=False)
  71. tm.assert_frame_equal(result, expected)
  72. expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]])
  73. result = df.rank(0, numeric_only=False)
  74. tm.assert_frame_equal(result, expected)
  75. df = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]])
  76. expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]])
  77. result = df.rank(1, numeric_only=False)
  78. tm.assert_frame_equal(result, expected)
  79. expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 1.0, 2.0]])
  80. result = df.rank(0, numeric_only=False)
  81. tm.assert_frame_equal(result, expected)
  82. # f7u12, this does not work without extensive workaround
  83. data = [
  84. [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
  85. [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)],
  86. ]
  87. df = DataFrame(data)
  88. # check the rank
  89. expected = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]])
  90. result = df.rank(1, numeric_only=False, ascending=True)
  91. tm.assert_frame_equal(result, expected)
  92. expected = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]])
  93. result = df.rank(1, numeric_only=False, ascending=False)
  94. tm.assert_frame_equal(result, expected)
  95. df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]})
  96. exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]})
  97. tm.assert_frame_equal(df.rank(), exp)
  98. def test_rank_does_not_mutate(self):
  99. # GH#18521
  100. # Check rank does not mutate DataFrame
  101. df = DataFrame(
  102. np.random.default_rng(2).standard_normal((10, 3)), dtype="float64"
  103. )
  104. expected = df.copy()
  105. df.rank()
  106. result = df
  107. tm.assert_frame_equal(result, expected)
  108. def test_rank_mixed_frame(self, float_string_frame):
  109. float_string_frame["datetime"] = datetime.now()
  110. float_string_frame["timedelta"] = timedelta(days=1, seconds=1)
  111. float_string_frame.rank(numeric_only=False)
  112. with pytest.raises(TypeError, match="not supported between instances of"):
  113. float_string_frame.rank(axis=1)
  114. def test_rank_na_option(self, float_frame):
  115. sp_stats = pytest.importorskip("scipy.stats")
  116. float_frame.loc[::2, "A"] = np.nan
  117. float_frame.loc[::3, "B"] = np.nan
  118. float_frame.loc[::4, "C"] = np.nan
  119. float_frame.loc[::5, "D"] = np.nan
  120. # bottom
  121. ranks0 = float_frame.rank(na_option="bottom")
  122. ranks1 = float_frame.rank(1, na_option="bottom")
  123. fvals = float_frame.fillna(np.inf).values
  124. exp0 = np.apply_along_axis(sp_stats.rankdata, 0, fvals)
  125. exp1 = np.apply_along_axis(sp_stats.rankdata, 1, fvals)
  126. tm.assert_almost_equal(ranks0.values, exp0)
  127. tm.assert_almost_equal(ranks1.values, exp1)
  128. # top
  129. ranks0 = float_frame.rank(na_option="top")
  130. ranks1 = float_frame.rank(1, na_option="top")
  131. fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values
  132. fval1 = float_frame.T
  133. fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
  134. fval1 = fval1.fillna(np.inf).values
  135. exp0 = np.apply_along_axis(sp_stats.rankdata, 0, fval0)
  136. exp1 = np.apply_along_axis(sp_stats.rankdata, 1, fval1)
  137. tm.assert_almost_equal(ranks0.values, exp0)
  138. tm.assert_almost_equal(ranks1.values, exp1)
  139. # descending
  140. # bottom
  141. ranks0 = float_frame.rank(na_option="top", ascending=False)
  142. ranks1 = float_frame.rank(1, na_option="top", ascending=False)
  143. fvals = float_frame.fillna(np.inf).values
  144. exp0 = np.apply_along_axis(sp_stats.rankdata, 0, -fvals)
  145. exp1 = np.apply_along_axis(sp_stats.rankdata, 1, -fvals)
  146. tm.assert_almost_equal(ranks0.values, exp0)
  147. tm.assert_almost_equal(ranks1.values, exp1)
  148. # descending
  149. # top
  150. ranks0 = float_frame.rank(na_option="bottom", ascending=False)
  151. ranks1 = float_frame.rank(1, na_option="bottom", ascending=False)
  152. fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values
  153. fval1 = float_frame.T
  154. fval1 = fval1.fillna((fval1.min() - 1).to_dict()).T
  155. fval1 = fval1.fillna(np.inf).values
  156. exp0 = np.apply_along_axis(sp_stats.rankdata, 0, -fval0)
  157. exp1 = np.apply_along_axis(sp_stats.rankdata, 1, -fval1)
  158. tm.assert_numpy_array_equal(ranks0.values, exp0)
  159. tm.assert_numpy_array_equal(ranks1.values, exp1)
  160. # bad values throw error
  161. msg = "na_option must be one of 'keep', 'top', or 'bottom'"
  162. with pytest.raises(ValueError, match=msg):
  163. float_frame.rank(na_option="bad", ascending=False)
  164. # invalid type
  165. with pytest.raises(ValueError, match=msg):
  166. float_frame.rank(na_option=True, ascending=False)
  167. def test_rank_axis(self):
  168. # check if using axes' names gives the same result
  169. df = DataFrame([[2, 1], [4, 3]])
  170. tm.assert_frame_equal(df.rank(axis=0), df.rank(axis="index"))
  171. tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns"))
  172. @pytest.mark.parametrize("ax", [0, 1])
  173. @pytest.mark.parametrize("m", ["average", "min", "max", "first", "dense"])
  174. def test_rank_methods_frame(self, ax, m):
  175. sp_stats = pytest.importorskip("scipy.stats")
  176. xs = np.random.default_rng(2).integers(0, 21, (100, 26))
  177. xs = (xs - 10.0) / 10.0
  178. cols = [chr(ord("z") - i) for i in range(xs.shape[1])]
  179. for vals in [xs, xs + 1e6, xs * 1e-6]:
  180. df = DataFrame(vals, columns=cols)
  181. result = df.rank(axis=ax, method=m)
  182. sprank = np.apply_along_axis(
  183. sp_stats.rankdata, ax, vals, m if m != "first" else "ordinal"
  184. )
  185. sprank = sprank.astype(np.float64)
  186. expected = DataFrame(sprank, columns=cols).astype("float64")
  187. tm.assert_frame_equal(result, expected)
  188. @pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
  189. def test_rank_descending(self, method, dtype):
  190. if "i" in dtype:
  191. df = self.df.dropna().astype(dtype)
  192. else:
  193. df = self.df.astype(dtype)
  194. res = df.rank(ascending=False)
  195. expected = (df.max() - df).rank()
  196. tm.assert_frame_equal(res, expected)
  197. expected = (df.max() - df).rank(method=method)
  198. if dtype != "O":
  199. res2 = df.rank(method=method, ascending=False, numeric_only=True)
  200. tm.assert_frame_equal(res2, expected)
  201. res3 = df.rank(method=method, ascending=False, numeric_only=False)
  202. tm.assert_frame_equal(res3, expected)
  203. @pytest.mark.parametrize("axis", [0, 1])
  204. @pytest.mark.parametrize("dtype", [None, object])
  205. def test_rank_2d_tie_methods(self, method, axis, dtype):
  206. df = self.df
  207. def _check2d(df, expected, method="average", axis=0):
  208. exp_df = DataFrame({"A": expected, "B": expected})
  209. if axis == 1:
  210. df = df.T
  211. exp_df = exp_df.T
  212. result = df.rank(method=method, axis=axis)
  213. tm.assert_frame_equal(result, exp_df)
  214. frame = df if dtype is None else df.astype(dtype)
  215. _check2d(frame, self.results[method], method=method, axis=axis)
  216. @pytest.mark.parametrize(
  217. "method,exp",
  218. [
  219. ("dense", [[1.0, 1.0, 1.0], [1.0, 0.5, 2.0 / 3], [1.0, 0.5, 1.0 / 3]]),
  220. (
  221. "min",
  222. [
  223. [1.0 / 3, 1.0, 1.0],
  224. [1.0 / 3, 1.0 / 3, 2.0 / 3],
  225. [1.0 / 3, 1.0 / 3, 1.0 / 3],
  226. ],
  227. ),
  228. (
  229. "max",
  230. [[1.0, 1.0, 1.0], [1.0, 2.0 / 3, 2.0 / 3], [1.0, 2.0 / 3, 1.0 / 3]],
  231. ),
  232. (
  233. "average",
  234. [[2.0 / 3, 1.0, 1.0], [2.0 / 3, 0.5, 2.0 / 3], [2.0 / 3, 0.5, 1.0 / 3]],
  235. ),
  236. (
  237. "first",
  238. [
  239. [1.0 / 3, 1.0, 1.0],
  240. [2.0 / 3, 1.0 / 3, 2.0 / 3],
  241. [3.0 / 3, 2.0 / 3, 1.0 / 3],
  242. ],
  243. ),
  244. ],
  245. )
  246. def test_rank_pct_true(self, method, exp):
  247. # see gh-15630.
  248. df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]])
  249. result = df.rank(method=method, pct=True)
  250. expected = DataFrame(exp)
  251. tm.assert_frame_equal(result, expected)
  252. @pytest.mark.single_cpu
  253. def test_pct_max_many_rows(self):
  254. # GH 18271
  255. df = DataFrame(
  256. {"A": np.arange(2**24 + 1), "B": np.arange(2**24 + 1, 0, -1)}
  257. )
  258. result = df.rank(pct=True).max()
  259. assert (result == 1).all()
  260. @pytest.mark.parametrize(
  261. "contents,dtype",
  262. [
  263. (
  264. [
  265. -np.inf,
  266. -50,
  267. -1,
  268. -1e-20,
  269. -1e-25,
  270. -1e-50,
  271. 0,
  272. 1e-40,
  273. 1e-20,
  274. 1e-10,
  275. 2,
  276. 40,
  277. np.inf,
  278. ],
  279. "float64",
  280. ),
  281. (
  282. [
  283. -np.inf,
  284. -50,
  285. -1,
  286. -1e-20,
  287. -1e-25,
  288. -1e-45,
  289. 0,
  290. 1e-40,
  291. 1e-20,
  292. 1e-10,
  293. 2,
  294. 40,
  295. np.inf,
  296. ],
  297. "float32",
  298. ),
  299. ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"),
  300. (
  301. [
  302. np.iinfo(np.int64).min,
  303. -100,
  304. 0,
  305. 1,
  306. 9999,
  307. 100000,
  308. 1e10,
  309. np.iinfo(np.int64).max,
  310. ],
  311. "int64",
  312. ),
  313. ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"),
  314. (
  315. [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 5)],
  316. "datetime64",
  317. ),
  318. ],
  319. )
  320. def test_rank_inf_and_nan(self, contents, dtype, frame_or_series):
  321. dtype_na_map = {
  322. "float64": np.nan,
  323. "float32": np.nan,
  324. "object": None,
  325. "datetime64": np.datetime64("nat"),
  326. }
  327. # Insert nans at random positions if underlying dtype has missing
  328. # value. Then adjust the expected order by adding nans accordingly
  329. # This is for testing whether rank calculation is affected
  330. # when values are interwined with nan values.
  331. values = np.array(contents, dtype=dtype)
  332. exp_order = np.array(range(len(values)), dtype="float64") + 1.0
  333. if dtype in dtype_na_map:
  334. na_value = dtype_na_map[dtype]
  335. nan_indices = np.random.default_rng(2).choice(range(len(values)), 5)
  336. values = np.insert(values, nan_indices, na_value)
  337. exp_order = np.insert(exp_order, nan_indices, np.nan)
  338. # Shuffle the testing array and expected results in the same way
  339. random_order = np.random.default_rng(2).permutation(len(values))
  340. obj = frame_or_series(values[random_order])
  341. expected = frame_or_series(exp_order[random_order], dtype="float64")
  342. result = obj.rank()
  343. tm.assert_equal(result, expected)
  344. def test_df_series_inf_nan_consistency(self):
  345. # GH#32593
  346. index = [5, 4, 3, 2, 1, 6, 7, 8, 9, 10]
  347. col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6]
  348. col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf]
  349. df = DataFrame(
  350. data={
  351. "col1": col1,
  352. "col2": col2,
  353. },
  354. index=index,
  355. dtype="f8",
  356. )
  357. df_result = df.rank()
  358. series_result = df.copy()
  359. series_result["col1"] = df["col1"].rank()
  360. series_result["col2"] = df["col2"].rank()
  361. tm.assert_frame_equal(df_result, series_result)
  362. def test_rank_both_inf(self):
  363. # GH#32593
  364. df = DataFrame({"a": [-np.inf, 0, np.inf]})
  365. expected = DataFrame({"a": [1.0, 2.0, 3.0]})
  366. result = df.rank()
  367. tm.assert_frame_equal(result, expected)
  368. @pytest.mark.parametrize(
  369. "na_option,ascending,expected",
  370. [
  371. ("top", True, [3.0, 1.0, 2.0]),
  372. ("top", False, [2.0, 1.0, 3.0]),
  373. ("bottom", True, [2.0, 3.0, 1.0]),
  374. ("bottom", False, [1.0, 3.0, 2.0]),
  375. ],
  376. )
  377. def test_rank_inf_nans_na_option(
  378. self, frame_or_series, method, na_option, ascending, expected
  379. ):
  380. obj = frame_or_series([np.inf, np.nan, -np.inf])
  381. result = obj.rank(method=method, na_option=na_option, ascending=ascending)
  382. expected = frame_or_series(expected)
  383. tm.assert_equal(result, expected)
  384. @pytest.mark.parametrize(
  385. "na_option,ascending,expected",
  386. [
  387. ("bottom", True, [1.0, 2.0, 4.0, 3.0]),
  388. ("bottom", False, [1.0, 2.0, 4.0, 3.0]),
  389. ("top", True, [2.0, 3.0, 1.0, 4.0]),
  390. ("top", False, [2.0, 3.0, 1.0, 4.0]),
  391. ],
  392. )
  393. def test_rank_object_first(self, frame_or_series, na_option, ascending, expected):
  394. obj = frame_or_series(["foo", "foo", None, "foo"])
  395. result = obj.rank(method="first", na_option=na_option, ascending=ascending)
  396. expected = frame_or_series(expected)
  397. tm.assert_equal(result, expected)
  398. @pytest.mark.parametrize(
  399. "data,expected",
  400. [
  401. (
  402. {"a": [1, 2, "a"], "b": [4, 5, 6]},
  403. DataFrame({"b": [1.0, 2.0, 3.0]}, columns=Index(["b"], dtype=object)),
  404. ),
  405. ({"a": [1, 2, "a"]}, DataFrame(index=range(3), columns=[])),
  406. ],
  407. )
  408. def test_rank_mixed_axis_zero(self, data, expected):
  409. df = DataFrame(data, columns=Index(list(data.keys()), dtype=object))
  410. with pytest.raises(TypeError, match="'<' not supported between instances of"):
  411. df.rank()
  412. result = df.rank(numeric_only=True)
  413. tm.assert_frame_equal(result, expected)
  414. def test_rank_string_dtype(self, string_dtype_no_object):
  415. # GH#55362
  416. obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object)
  417. result = obj.rank(method="first")
  418. exp_dtype = (
  419. "Float64" if string_dtype_no_object == "string[pyarrow]" else "float64"
  420. )
  421. if string_dtype_no_object.storage == "python":
  422. # TODO nullable string[python] should also return nullable Int64
  423. exp_dtype = "float64"
  424. expected = Series([1, 2, None, 3], dtype=exp_dtype)
  425. tm.assert_series_equal(result, expected)