# test_quantile.py (8.1 KB)
  1. import numpy as np
  2. import pytest
  3. from pandas.core.dtypes.common import is_integer
  4. import pandas as pd
  5. from pandas import (
  6. Index,
  7. Series,
  8. )
  9. import pandas._testing as tm
  10. from pandas.core.indexes.datetimes import Timestamp
  11. class TestSeriesQuantile:
  12. def test_quantile(self, datetime_series):
  13. q = datetime_series.quantile(0.1)
  14. assert q == np.percentile(datetime_series.dropna(), 10)
  15. q = datetime_series.quantile(0.9)
  16. assert q == np.percentile(datetime_series.dropna(), 90)
  17. # object dtype
  18. q = Series(datetime_series, dtype=object).quantile(0.9)
  19. assert q == np.percentile(datetime_series.dropna(), 90)
  20. # datetime64[ns] dtype
  21. dts = datetime_series.index.to_series()
  22. q = dts.quantile(0.2)
  23. assert q == Timestamp("2000-01-10 19:12:00")
  24. # timedelta64[ns] dtype
  25. tds = dts.diff()
  26. q = tds.quantile(0.25)
  27. assert q == pd.to_timedelta("24:00:00")
  28. # GH7661
  29. result = Series([np.timedelta64("NaT")]).sum()
  30. assert result == pd.Timedelta(0)
  31. msg = "percentiles should all be in the interval \\[0, 1\\]"
  32. for invalid in [-1, 2, [0.5, -1], [0.5, 2]]:
  33. with pytest.raises(ValueError, match=msg):
  34. datetime_series.quantile(invalid)
  35. s = Series(np.random.default_rng(2).standard_normal(100))
  36. percentile_array = [-0.5, 0.25, 1.5]
  37. with pytest.raises(ValueError, match=msg):
  38. s.quantile(percentile_array)
  39. def test_quantile_multi(self, datetime_series, unit):
  40. datetime_series.index = datetime_series.index.as_unit(unit)
  41. qs = [0.1, 0.9]
  42. result = datetime_series.quantile(qs)
  43. expected = Series(
  44. [
  45. np.percentile(datetime_series.dropna(), 10),
  46. np.percentile(datetime_series.dropna(), 90),
  47. ],
  48. index=qs,
  49. name=datetime_series.name,
  50. )
  51. tm.assert_series_equal(result, expected)
  52. dts = datetime_series.index.to_series()
  53. dts.name = "xxx"
  54. result = dts.quantile((0.2, 0.2))
  55. expected = Series(
  56. [Timestamp("2000-01-10 19:12:00"), Timestamp("2000-01-10 19:12:00")],
  57. index=[0.2, 0.2],
  58. name="xxx",
  59. dtype=f"M8[{unit}]",
  60. )
  61. tm.assert_series_equal(result, expected)
  62. result = datetime_series.quantile([])
  63. expected = Series(
  64. [], name=datetime_series.name, index=Index([], dtype=float), dtype="float64"
  65. )
  66. tm.assert_series_equal(result, expected)
  67. def test_quantile_interpolation(self, datetime_series):
  68. # see gh-10174
  69. # interpolation = linear (default case)
  70. q = datetime_series.quantile(0.1, interpolation="linear")
  71. assert q == np.percentile(datetime_series.dropna(), 10)
  72. q1 = datetime_series.quantile(0.1)
  73. assert q1 == np.percentile(datetime_series.dropna(), 10)
  74. # test with and without interpolation keyword
  75. assert q == q1
  76. def test_quantile_interpolation_dtype(self):
  77. # GH #10174
  78. # interpolation = linear (default case)
  79. q = Series([1, 3, 4]).quantile(0.5, interpolation="lower")
  80. assert q == np.percentile(np.array([1, 3, 4]), 50)
  81. assert is_integer(q)
  82. q = Series([1, 3, 4]).quantile(0.5, interpolation="higher")
  83. assert q == np.percentile(np.array([1, 3, 4]), 50)
  84. assert is_integer(q)
  85. def test_quantile_nan(self):
  86. # GH 13098
  87. ser = Series([1, 2, 3, 4, np.nan])
  88. result = ser.quantile(0.5)
  89. expected = 2.5
  90. assert result == expected
  91. # all nan/empty
  92. s1 = Series([], dtype=object)
  93. cases = [s1, Series([np.nan, np.nan])]
  94. for ser in cases:
  95. res = ser.quantile(0.5)
  96. assert np.isnan(res)
  97. res = ser.quantile([0.5])
  98. tm.assert_series_equal(res, Series([np.nan], index=[0.5]))
  99. res = ser.quantile([0.2, 0.3])
  100. tm.assert_series_equal(res, Series([np.nan, np.nan], index=[0.2, 0.3]))
  101. @pytest.mark.parametrize(
  102. "case",
  103. [
  104. [
  105. Timestamp("2011-01-01"),
  106. Timestamp("2011-01-02"),
  107. Timestamp("2011-01-03"),
  108. ],
  109. [
  110. Timestamp("2011-01-01", tz="US/Eastern"),
  111. Timestamp("2011-01-02", tz="US/Eastern"),
  112. Timestamp("2011-01-03", tz="US/Eastern"),
  113. ],
  114. [pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Timedelta("3 days")],
  115. # NaT
  116. [
  117. Timestamp("2011-01-01"),
  118. Timestamp("2011-01-02"),
  119. Timestamp("2011-01-03"),
  120. pd.NaT,
  121. ],
  122. [
  123. Timestamp("2011-01-01", tz="US/Eastern"),
  124. Timestamp("2011-01-02", tz="US/Eastern"),
  125. Timestamp("2011-01-03", tz="US/Eastern"),
  126. pd.NaT,
  127. ],
  128. [
  129. pd.Timedelta("1 days"),
  130. pd.Timedelta("2 days"),
  131. pd.Timedelta("3 days"),
  132. pd.NaT,
  133. ],
  134. ],
  135. )
  136. def test_quantile_box(self, case):
  137. ser = Series(case, name="XXX")
  138. res = ser.quantile(0.5)
  139. assert res == case[1]
  140. res = ser.quantile([0.5])
  141. exp = Series([case[1]], index=[0.5], name="XXX")
  142. tm.assert_series_equal(res, exp)
  143. def test_datetime_timedelta_quantiles(self):
  144. # covers #9694
  145. assert pd.isna(Series([], dtype="M8[ns]").quantile(0.5))
  146. assert pd.isna(Series([], dtype="m8[ns]").quantile(0.5))
  147. def test_quantile_nat(self):
  148. res = Series([pd.NaT, pd.NaT]).quantile(0.5)
  149. assert res is pd.NaT
  150. res = Series([pd.NaT, pd.NaT]).quantile([0.5])
  151. tm.assert_series_equal(res, Series([pd.NaT], index=[0.5]))
  152. @pytest.mark.parametrize(
  153. "values, dtype",
  154. [([0, 0, 0, 1, 2, 3], "Sparse[int]"), ([0.0, None, 1.0, 2.0], "Sparse[float]")],
  155. )
  156. def test_quantile_sparse(self, values, dtype):
  157. ser = Series(values, dtype=dtype)
  158. result = ser.quantile([0.5])
  159. expected = Series(np.asarray(ser)).quantile([0.5]).astype("Sparse[float]")
  160. tm.assert_series_equal(result, expected)
  161. def test_quantile_empty_float64(self):
  162. # floats
  163. ser = Series([], dtype="float64")
  164. res = ser.quantile(0.5)
  165. assert np.isnan(res)
  166. res = ser.quantile([0.5])
  167. exp = Series([np.nan], index=[0.5])
  168. tm.assert_series_equal(res, exp)
  169. def test_quantile_empty_int64(self):
  170. # int
  171. ser = Series([], dtype="int64")
  172. res = ser.quantile(0.5)
  173. assert np.isnan(res)
  174. res = ser.quantile([0.5])
  175. exp = Series([np.nan], index=[0.5])
  176. tm.assert_series_equal(res, exp)
  177. def test_quantile_empty_dt64(self):
  178. # datetime
  179. ser = Series([], dtype="datetime64[ns]")
  180. res = ser.quantile(0.5)
  181. assert res is pd.NaT
  182. res = ser.quantile([0.5])
  183. exp = Series([pd.NaT], index=[0.5], dtype=ser.dtype)
  184. tm.assert_series_equal(res, exp)
  185. @pytest.mark.parametrize("dtype", [int, float, "Int64"])
  186. def test_quantile_dtypes(self, dtype):
  187. result = Series([1, 2, 3], dtype=dtype).quantile(np.arange(0, 1, 0.25))
  188. expected = Series(np.arange(1, 3, 0.5), index=np.arange(0, 1, 0.25))
  189. if dtype == "Int64":
  190. expected = expected.astype("Float64")
  191. tm.assert_series_equal(result, expected)
  192. def test_quantile_all_na(self, any_int_ea_dtype):
  193. # GH#50681
  194. ser = Series([pd.NA, pd.NA], dtype=any_int_ea_dtype)
  195. with tm.assert_produces_warning(None):
  196. result = ser.quantile([0.1, 0.5])
  197. expected = Series([pd.NA, pd.NA], dtype=any_int_ea_dtype, index=[0.1, 0.5])
  198. tm.assert_series_equal(result, expected)
  199. def test_quantile_dtype_size(self, any_int_ea_dtype):
  200. # GH#50681
  201. ser = Series([pd.NA, pd.NA, 1], dtype=any_int_ea_dtype)
  202. result = ser.quantile([0.1, 0.5])
  203. expected = Series([1, 1], dtype=any_int_ea_dtype, index=[0.1, 0.5])
  204. tm.assert_series_equal(result, expected)