test_reductions.py 75 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133
  1. from datetime import timedelta
  2. from decimal import Decimal
  3. import re
  4. from dateutil.tz import tzlocal
  5. import numpy as np
  6. import pytest
  7. from pandas.compat import (
  8. IS64,
  9. is_platform_windows,
  10. )
  11. from pandas.compat.numpy import np_version_gt2
  12. import pandas.util._test_decorators as td
  13. import pandas as pd
  14. from pandas import (
  15. Categorical,
  16. CategoricalDtype,
  17. DataFrame,
  18. DatetimeIndex,
  19. Index,
  20. PeriodIndex,
  21. RangeIndex,
  22. Series,
  23. Timestamp,
  24. date_range,
  25. isna,
  26. notna,
  27. to_datetime,
  28. to_timedelta,
  29. )
  30. import pandas._testing as tm
  31. from pandas.core import (
  32. algorithms,
  33. nanops,
  34. )
  35. is_windows_np2_or_is32 = (is_platform_windows() and not np_version_gt2) or not IS64
  36. is_windows_or_is32 = is_platform_windows() or not IS64
  37. def make_skipna_wrapper(alternative, skipna_alternative=None):
  38. """
  39. Create a function for calling on an array.
  40. Parameters
  41. ----------
  42. alternative : function
  43. The function to be called on the array with no NaNs.
  44. Only used when 'skipna_alternative' is None.
  45. skipna_alternative : function
  46. The function to be called on the original array
  47. Returns
  48. -------
  49. function
  50. """
  51. if skipna_alternative:
  52. def skipna_wrapper(x):
  53. return skipna_alternative(x.values)
  54. else:
  55. def skipna_wrapper(x):
  56. nona = x.dropna()
  57. if len(nona) == 0:
  58. return np.nan
  59. return alternative(nona)
  60. return skipna_wrapper
  61. def assert_stat_op_calc(
  62. opname,
  63. alternative,
  64. frame,
  65. has_skipna=True,
  66. check_dtype=True,
  67. check_dates=False,
  68. rtol=1e-5,
  69. atol=1e-8,
  70. skipna_alternative=None,
  71. ):
  72. """
  73. Check that operator opname works as advertised on frame
  74. Parameters
  75. ----------
  76. opname : str
  77. Name of the operator to test on frame
  78. alternative : function
  79. Function that opname is tested against; i.e. "frame.opname()" should
  80. equal "alternative(frame)".
  81. frame : DataFrame
  82. The object that the tests are executed on
  83. has_skipna : bool, default True
  84. Whether the method "opname" has the kwarg "skip_na"
  85. check_dtype : bool, default True
  86. Whether the dtypes of the result of "frame.opname()" and
  87. "alternative(frame)" should be checked.
  88. check_dates : bool, default false
  89. Whether opname should be tested on a Datetime Series
  90. rtol : float, default 1e-5
  91. Relative tolerance.
  92. atol : float, default 1e-8
  93. Absolute tolerance.
  94. skipna_alternative : function, default None
  95. NaN-safe version of alternative
  96. """
  97. f = getattr(frame, opname)
  98. if check_dates:
  99. df = DataFrame({"b": date_range("1/1/2001", periods=2)})
  100. with tm.assert_produces_warning(None):
  101. result = getattr(df, opname)()
  102. assert isinstance(result, Series)
  103. df["a"] = range(len(df))
  104. with tm.assert_produces_warning(None):
  105. result = getattr(df, opname)()
  106. assert isinstance(result, Series)
  107. assert len(result)
  108. if has_skipna:
  109. def wrapper(x):
  110. return alternative(x.values)
  111. skipna_wrapper = make_skipna_wrapper(alternative, skipna_alternative)
  112. result0 = f(axis=0, skipna=False)
  113. result1 = f(axis=1, skipna=False)
  114. tm.assert_series_equal(
  115. result0, frame.apply(wrapper), check_dtype=check_dtype, rtol=rtol, atol=atol
  116. )
  117. tm.assert_series_equal(
  118. result1,
  119. frame.apply(wrapper, axis=1),
  120. rtol=rtol,
  121. atol=atol,
  122. )
  123. else:
  124. skipna_wrapper = alternative
  125. result0 = f(axis=0)
  126. result1 = f(axis=1)
  127. tm.assert_series_equal(
  128. result0,
  129. frame.apply(skipna_wrapper),
  130. check_dtype=check_dtype,
  131. rtol=rtol,
  132. atol=atol,
  133. )
  134. if opname in ["sum", "prod"]:
  135. expected = frame.apply(skipna_wrapper, axis=1)
  136. tm.assert_series_equal(
  137. result1, expected, check_dtype=False, rtol=rtol, atol=atol
  138. )
  139. # check dtypes
  140. if check_dtype:
  141. lcd_dtype = frame.values.dtype
  142. assert lcd_dtype == result0.dtype
  143. assert lcd_dtype == result1.dtype
  144. # bad axis
  145. with pytest.raises(ValueError, match="No axis named 2"):
  146. f(axis=2)
  147. # all NA case
  148. if has_skipna:
  149. all_na = frame * np.nan
  150. r0 = getattr(all_na, opname)(axis=0)
  151. r1 = getattr(all_na, opname)(axis=1)
  152. if opname in ["sum", "prod"]:
  153. unit = 1 if opname == "prod" else 0 # result for empty sum/prod
  154. expected = Series(unit, index=r0.index, dtype=r0.dtype)
  155. tm.assert_series_equal(r0, expected)
  156. expected = Series(unit, index=r1.index, dtype=r1.dtype)
  157. tm.assert_series_equal(r1, expected)
  158. @pytest.fixture
  159. def bool_frame_with_na():
  160. """
  161. Fixture for DataFrame of booleans with index of unique strings
  162. Columns are ['A', 'B', 'C', 'D']; some entries are missing
  163. """
  164. df = DataFrame(
  165. np.concatenate(
  166. [np.ones((15, 4), dtype=bool), np.zeros((15, 4), dtype=bool)], axis=0
  167. ),
  168. index=Index([f"foo_{i}" for i in range(30)], dtype=object),
  169. columns=Index(list("ABCD"), dtype=object),
  170. dtype=object,
  171. )
  172. # set some NAs
  173. df.iloc[5:10] = np.nan
  174. df.iloc[15:20, -2:] = np.nan
  175. return df
  176. @pytest.fixture
  177. def float_frame_with_na():
  178. """
  179. Fixture for DataFrame of floats with index of unique strings
  180. Columns are ['A', 'B', 'C', 'D']; some entries are missing
  181. """
  182. df = DataFrame(
  183. np.random.default_rng(2).standard_normal((30, 4)),
  184. index=Index([f"foo_{i}" for i in range(30)], dtype=object),
  185. columns=Index(list("ABCD"), dtype=object),
  186. )
  187. # set some NAs
  188. df.iloc[5:10] = np.nan
  189. df.iloc[15:20, -2:] = np.nan
  190. return df
  191. class TestDataFrameAnalytics:
  192. # ---------------------------------------------------------------------
  193. # Reductions
  194. @pytest.mark.parametrize("axis", [0, 1])
  195. @pytest.mark.parametrize(
  196. "opname",
  197. [
  198. "count",
  199. "sum",
  200. "mean",
  201. "product",
  202. "median",
  203. "min",
  204. "max",
  205. "nunique",
  206. "var",
  207. "std",
  208. "sem",
  209. pytest.param("skew", marks=td.skip_if_no("scipy")),
  210. pytest.param("kurt", marks=td.skip_if_no("scipy")),
  211. ],
  212. )
  213. def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname):
  214. if (opname in ("sum", "min", "max") and axis == 0) or opname in (
  215. "count",
  216. "nunique",
  217. ):
  218. getattr(float_string_frame, opname)(axis=axis)
  219. else:
  220. if opname in ["var", "std", "sem", "skew", "kurt"]:
  221. msg = "could not convert string to float: 'bar'"
  222. elif opname == "product":
  223. if axis == 1:
  224. msg = "can't multiply sequence by non-int of type 'float'"
  225. else:
  226. msg = "can't multiply sequence by non-int of type 'str'"
  227. elif opname == "sum":
  228. msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'"
  229. elif opname == "mean":
  230. if axis == 0:
  231. # different message on different builds
  232. msg = "|".join(
  233. [
  234. r"Could not convert \['.*'\] to numeric",
  235. "Could not convert string '(bar){30}' to numeric",
  236. ]
  237. )
  238. else:
  239. msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'"
  240. elif opname in ["min", "max"]:
  241. msg = "'[><]=' not supported between instances of 'float' and 'str'"
  242. elif opname == "median":
  243. msg = re.compile(
  244. r"Cannot convert \[.*\] to numeric|does not support|Cannot perform",
  245. flags=re.S,
  246. )
  247. if not isinstance(msg, re.Pattern):
  248. msg = msg + "|does not support|Cannot perform reduction"
  249. with pytest.raises(TypeError, match=msg):
  250. getattr(float_string_frame, opname)(axis=axis)
  251. if opname != "nunique":
  252. getattr(float_string_frame, opname)(axis=axis, numeric_only=True)
  253. @pytest.mark.parametrize("axis", [0, 1])
  254. @pytest.mark.parametrize(
  255. "opname",
  256. [
  257. "count",
  258. "sum",
  259. "mean",
  260. "product",
  261. "median",
  262. "min",
  263. "max",
  264. "var",
  265. "std",
  266. "sem",
  267. pytest.param("skew", marks=td.skip_if_no("scipy")),
  268. pytest.param("kurt", marks=td.skip_if_no("scipy")),
  269. ],
  270. )
  271. def test_stat_op_api_float_frame(self, float_frame, axis, opname):
  272. getattr(float_frame, opname)(axis=axis, numeric_only=False)
  273. def test_stat_op_calc(self, float_frame_with_na, mixed_float_frame):
  274. def count(s):
  275. return notna(s).sum()
  276. def nunique(s):
  277. return len(algorithms.unique1d(s.dropna()))
  278. def var(x):
  279. return np.var(x, ddof=1)
  280. def std(x):
  281. return np.std(x, ddof=1)
  282. def sem(x):
  283. return np.std(x, ddof=1) / np.sqrt(len(x))
  284. assert_stat_op_calc(
  285. "nunique",
  286. nunique,
  287. float_frame_with_na,
  288. has_skipna=False,
  289. check_dtype=False,
  290. check_dates=True,
  291. )
  292. # GH#32571: rol needed for flaky CI builds
  293. # mixed types (with upcasting happening)
  294. assert_stat_op_calc(
  295. "sum",
  296. np.sum,
  297. mixed_float_frame.astype("float32"),
  298. check_dtype=False,
  299. rtol=1e-3,
  300. )
  301. assert_stat_op_calc(
  302. "sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum
  303. )
  304. assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True)
  305. assert_stat_op_calc(
  306. "product", np.prod, float_frame_with_na, skipna_alternative=np.nanprod
  307. )
  308. assert_stat_op_calc("var", var, float_frame_with_na)
  309. assert_stat_op_calc("std", std, float_frame_with_na)
  310. assert_stat_op_calc("sem", sem, float_frame_with_na)
  311. assert_stat_op_calc(
  312. "count",
  313. count,
  314. float_frame_with_na,
  315. has_skipna=False,
  316. check_dtype=False,
  317. check_dates=True,
  318. )
  319. def test_stat_op_calc_skew_kurtosis(self, float_frame_with_na):
  320. sp_stats = pytest.importorskip("scipy.stats")
  321. def skewness(x):
  322. if len(x) < 3:
  323. return np.nan
  324. return sp_stats.skew(x, bias=False)
  325. def kurt(x):
  326. if len(x) < 4:
  327. return np.nan
  328. return sp_stats.kurtosis(x, bias=False)
  329. assert_stat_op_calc("skew", skewness, float_frame_with_na)
  330. assert_stat_op_calc("kurt", kurt, float_frame_with_na)
  331. def test_median(self, float_frame_with_na, int_frame):
  332. def wrapper(x):
  333. if isna(x).any():
  334. return np.nan
  335. return np.median(x)
  336. assert_stat_op_calc("median", wrapper, float_frame_with_na, check_dates=True)
  337. assert_stat_op_calc(
  338. "median", wrapper, int_frame, check_dtype=False, check_dates=True
  339. )
  340. @pytest.mark.parametrize(
  341. "method", ["sum", "mean", "prod", "var", "std", "skew", "min", "max"]
  342. )
  343. @pytest.mark.parametrize(
  344. "df",
  345. [
  346. DataFrame(
  347. {
  348. "a": [
  349. -0.00049987540199591344,
  350. -0.0016467257772919831,
  351. 0.00067695870775883013,
  352. ],
  353. "b": [-0, -0, 0.0],
  354. "c": [
  355. 0.00031111847529610595,
  356. 0.0014902627951905339,
  357. -0.00094099200035979691,
  358. ],
  359. },
  360. index=["foo", "bar", "baz"],
  361. dtype="O",
  362. ),
  363. DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object),
  364. ],
  365. )
  366. @pytest.mark.filterwarnings("ignore:Mismatched null-like values:FutureWarning")
  367. def test_stat_operators_attempt_obj_array(self, method, df, axis):
  368. # GH#676
  369. assert df.values.dtype == np.object_
  370. result = getattr(df, method)(axis=axis)
  371. expected = getattr(df.astype("f8"), method)(axis=axis).astype(object)
  372. if axis in [1, "columns"] and method in ["min", "max"]:
  373. expected[expected.isna()] = None
  374. tm.assert_series_equal(result, expected)
  375. @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
  376. def test_mixed_ops(self, op):
  377. # GH#16116
  378. df = DataFrame(
  379. {
  380. "int": [1, 2, 3, 4],
  381. "float": [1.0, 2.0, 3.0, 4.0],
  382. "str": ["a", "b", "c", "d"],
  383. }
  384. )
  385. msg = "|".join(
  386. [
  387. "Could not convert",
  388. "could not convert",
  389. "can't multiply sequence by non-int",
  390. "does not support",
  391. "Cannot perform",
  392. ]
  393. )
  394. with pytest.raises(TypeError, match=msg):
  395. getattr(df, op)()
  396. with pd.option_context("use_bottleneck", False):
  397. with pytest.raises(TypeError, match=msg):
  398. getattr(df, op)()
  399. def test_reduce_mixed_frame(self):
  400. # GH 6806
  401. df = DataFrame(
  402. {
  403. "bool_data": [True, True, False, False, False],
  404. "int_data": [10, 20, 30, 40, 50],
  405. "string_data": ["a", "b", "c", "d", "e"],
  406. }
  407. )
  408. df.reindex(columns=["bool_data", "int_data", "string_data"])
  409. test = df.sum(axis=0)
  410. tm.assert_numpy_array_equal(
  411. test.values, np.array([2, 150, "abcde"], dtype=object)
  412. )
  413. alt = df.T.sum(axis=1)
  414. tm.assert_series_equal(test, alt)
  415. def test_nunique(self):
  416. df = DataFrame({"A": [1, 1, 1], "B": [1, 2, 3], "C": [1, np.nan, 3]})
  417. tm.assert_series_equal(df.nunique(), Series({"A": 1, "B": 3, "C": 2}))
  418. tm.assert_series_equal(
  419. df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3})
  420. )
  421. tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2}))
  422. tm.assert_series_equal(
  423. df.nunique(axis=1, dropna=False), Series({0: 1, 1: 3, 2: 2})
  424. )
  425. @pytest.mark.parametrize("tz", [None, "UTC"])
  426. def test_mean_mixed_datetime_numeric(self, tz):
  427. # https://github.com/pandas-dev/pandas/issues/24752
  428. df = DataFrame({"A": [1, 1], "B": [Timestamp("2000", tz=tz)] * 2})
  429. result = df.mean()
  430. expected = Series([1.0, Timestamp("2000", tz=tz)], index=["A", "B"])
  431. tm.assert_series_equal(result, expected)
  432. @pytest.mark.parametrize("tz", [None, "UTC"])
  433. def test_mean_includes_datetimes(self, tz):
  434. # https://github.com/pandas-dev/pandas/issues/24752
  435. # Behavior in 0.24.0rc1 was buggy.
  436. # As of 2.0 with numeric_only=None we do *not* drop datetime columns
  437. df = DataFrame({"A": [Timestamp("2000", tz=tz)] * 2})
  438. result = df.mean()
  439. expected = Series([Timestamp("2000", tz=tz)], index=["A"])
  440. tm.assert_series_equal(result, expected)
  441. def test_mean_mixed_string_decimal(self):
  442. # GH 11670
  443. # possible bug when calculating mean of DataFrame?
  444. d = [
  445. {"A": 2, "B": None, "C": Decimal("628.00")},
  446. {"A": 1, "B": None, "C": Decimal("383.00")},
  447. {"A": 3, "B": None, "C": Decimal("651.00")},
  448. {"A": 2, "B": None, "C": Decimal("575.00")},
  449. {"A": 4, "B": None, "C": Decimal("1114.00")},
  450. {"A": 1, "B": "TEST", "C": Decimal("241.00")},
  451. {"A": 2, "B": None, "C": Decimal("572.00")},
  452. {"A": 4, "B": None, "C": Decimal("609.00")},
  453. {"A": 3, "B": None, "C": Decimal("820.00")},
  454. {"A": 5, "B": None, "C": Decimal("1223.00")},
  455. ]
  456. df = DataFrame(d)
  457. with pytest.raises(
  458. TypeError, match="unsupported operand type|does not support|Cannot perform"
  459. ):
  460. df.mean()
  461. result = df[["A", "C"]].mean()
  462. expected = Series([2.7, 681.6], index=["A", "C"], dtype=object)
  463. tm.assert_series_equal(result, expected)
  464. def test_var_std(self, datetime_frame):
  465. result = datetime_frame.std(ddof=4)
  466. expected = datetime_frame.apply(lambda x: x.std(ddof=4))
  467. tm.assert_almost_equal(result, expected)
  468. result = datetime_frame.var(ddof=4)
  469. expected = datetime_frame.apply(lambda x: x.var(ddof=4))
  470. tm.assert_almost_equal(result, expected)
  471. arr = np.repeat(np.random.default_rng(2).random((1, 1000)), 1000, 0)
  472. result = nanops.nanvar(arr, axis=0)
  473. assert not (result < 0).any()
  474. with pd.option_context("use_bottleneck", False):
  475. result = nanops.nanvar(arr, axis=0)
  476. assert not (result < 0).any()
  477. @pytest.mark.parametrize("meth", ["sem", "var", "std"])
  478. def test_numeric_only_flag(self, meth):
  479. # GH 9201
  480. df1 = DataFrame(
  481. np.random.default_rng(2).standard_normal((5, 3)),
  482. columns=["foo", "bar", "baz"],
  483. )
  484. # Cast to object to avoid implicit cast when setting entry to "100" below
  485. df1 = df1.astype({"foo": object})
  486. # set one entry to a number in str format
  487. df1.loc[0, "foo"] = "100"
  488. df2 = DataFrame(
  489. np.random.default_rng(2).standard_normal((5, 3)),
  490. columns=["foo", "bar", "baz"],
  491. )
  492. # Cast to object to avoid implicit cast when setting entry to "a" below
  493. df2 = df2.astype({"foo": object})
  494. # set one entry to a non-number str
  495. df2.loc[0, "foo"] = "a"
  496. result = getattr(df1, meth)(axis=1, numeric_only=True)
  497. expected = getattr(df1[["bar", "baz"]], meth)(axis=1)
  498. tm.assert_series_equal(expected, result)
  499. result = getattr(df2, meth)(axis=1, numeric_only=True)
  500. expected = getattr(df2[["bar", "baz"]], meth)(axis=1)
  501. tm.assert_series_equal(expected, result)
  502. # df1 has all numbers, df2 has a letter inside
  503. msg = r"unsupported operand type\(s\) for -: 'float' and 'str'"
  504. with pytest.raises(TypeError, match=msg):
  505. getattr(df1, meth)(axis=1, numeric_only=False)
  506. msg = "could not convert string to float: 'a'"
  507. with pytest.raises(TypeError, match=msg):
  508. getattr(df2, meth)(axis=1, numeric_only=False)
  509. def test_sem(self, datetime_frame):
  510. result = datetime_frame.sem(ddof=4)
  511. expected = datetime_frame.apply(lambda x: x.std(ddof=4) / np.sqrt(len(x)))
  512. tm.assert_almost_equal(result, expected)
  513. arr = np.repeat(np.random.default_rng(2).random((1, 1000)), 1000, 0)
  514. result = nanops.nansem(arr, axis=0)
  515. assert not (result < 0).any()
  516. with pd.option_context("use_bottleneck", False):
  517. result = nanops.nansem(arr, axis=0)
  518. assert not (result < 0).any()
  519. @pytest.mark.parametrize(
  520. "dropna, expected",
  521. [
  522. (
  523. True,
  524. {
  525. "A": [12],
  526. "B": [10.0],
  527. "C": [1.0],
  528. "D": ["a"],
  529. "E": Categorical(["a"], categories=["a"]),
  530. "F": DatetimeIndex(["2000-01-02"], dtype="M8[ns]"),
  531. "G": to_timedelta(["1 days"]),
  532. },
  533. ),
  534. (
  535. False,
  536. {
  537. "A": [12],
  538. "B": [10.0],
  539. "C": [np.nan],
  540. "D": Series([np.nan], dtype="str"),
  541. "E": Categorical([np.nan], categories=["a"]),
  542. "F": DatetimeIndex([pd.NaT], dtype="M8[ns]"),
  543. "G": to_timedelta([pd.NaT]),
  544. },
  545. ),
  546. (
  547. True,
  548. {
  549. "H": [8, 9, np.nan, np.nan],
  550. "I": [8, 9, np.nan, np.nan],
  551. "J": [1, np.nan, np.nan, np.nan],
  552. "K": Categorical(["a", np.nan, np.nan, np.nan], categories=["a"]),
  553. "L": DatetimeIndex(
  554. ["2000-01-02", "NaT", "NaT", "NaT"], dtype="M8[ns]"
  555. ),
  556. "M": to_timedelta(["1 days", "nan", "nan", "nan"]),
  557. "N": [0, 1, 2, 3],
  558. },
  559. ),
  560. (
  561. False,
  562. {
  563. "H": [8, 9, np.nan, np.nan],
  564. "I": [8, 9, np.nan, np.nan],
  565. "J": [1, np.nan, np.nan, np.nan],
  566. "K": Categorical([np.nan, "a", np.nan, np.nan], categories=["a"]),
  567. "L": DatetimeIndex(
  568. ["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"
  569. ),
  570. "M": to_timedelta(["nan", "1 days", "nan", "nan"]),
  571. "N": [0, 1, 2, 3],
  572. },
  573. ),
  574. ],
  575. )
  576. def test_mode_dropna(self, dropna, expected):
  577. df = DataFrame(
  578. {
  579. "A": [12, 12, 19, 11],
  580. "B": [10, 10, np.nan, 3],
  581. "C": [1, np.nan, np.nan, np.nan],
  582. "D": Series([np.nan, np.nan, "a", np.nan], dtype="str"),
  583. "E": Categorical([np.nan, np.nan, "a", np.nan]),
  584. "F": DatetimeIndex(["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"),
  585. "G": to_timedelta(["1 days", "nan", "nan", "nan"]),
  586. "H": [8, 8, 9, 9],
  587. "I": [9, 9, 8, 8],
  588. "J": [1, 1, np.nan, np.nan],
  589. "K": Categorical(["a", np.nan, "a", np.nan]),
  590. "L": DatetimeIndex(
  591. ["2000-01-02", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"
  592. ),
  593. "M": to_timedelta(["1 days", "nan", "1 days", "nan"]),
  594. "N": np.arange(4, dtype="int64"),
  595. }
  596. )
  597. result = df[sorted(expected.keys())].mode(dropna=dropna)
  598. expected = DataFrame(expected)
  599. tm.assert_frame_equal(result, expected)
  600. def test_mode_sort_with_na(self, using_infer_string):
  601. df = DataFrame({"A": [np.nan, np.nan, "a", "a"]})
  602. expected = DataFrame({"A": ["a", np.nan]})
  603. result = df.mode(dropna=False)
  604. tm.assert_frame_equal(result, expected)
  605. def test_mode_empty_df(self):
  606. df = DataFrame([], columns=["a", "b"])
  607. result = df.mode()
  608. expected = DataFrame([], columns=["a", "b"], index=Index([], dtype=np.int64))
  609. tm.assert_frame_equal(result, expected)
  610. def test_operators_timedelta64(self):
  611. df = DataFrame(
  612. {
  613. "A": date_range("2012-1-1", periods=3, freq="D"),
  614. "B": date_range("2012-1-2", periods=3, freq="D"),
  615. "C": Timestamp("20120101") - timedelta(minutes=5, seconds=5),
  616. }
  617. )
  618. diffs = DataFrame({"A": df["A"] - df["C"], "B": df["A"] - df["B"]})
  619. # min
  620. result = diffs.min()
  621. assert result.iloc[0] == diffs.loc[0, "A"]
  622. assert result.iloc[1] == diffs.loc[0, "B"]
  623. result = diffs.min(axis=1)
  624. assert (result == diffs.loc[0, "B"]).all()
  625. # max
  626. result = diffs.max()
  627. assert result.iloc[0] == diffs.loc[2, "A"]
  628. assert result.iloc[1] == diffs.loc[2, "B"]
  629. result = diffs.max(axis=1)
  630. assert (result == diffs["A"]).all()
  631. # abs
  632. result = diffs.abs()
  633. result2 = abs(diffs)
  634. expected = DataFrame({"A": df["A"] - df["C"], "B": df["B"] - df["A"]})
  635. tm.assert_frame_equal(result, expected)
  636. tm.assert_frame_equal(result2, expected)
  637. # mixed frame
  638. mixed = diffs.copy()
  639. mixed["C"] = "foo"
  640. mixed["D"] = 1
  641. mixed["E"] = 1.0
  642. mixed["F"] = Timestamp("20130101")
  643. # results in an object array
  644. result = mixed.min()
  645. expected = Series(
  646. [
  647. pd.Timedelta(timedelta(seconds=5 * 60 + 5)),
  648. pd.Timedelta(timedelta(days=-1)),
  649. "foo",
  650. 1,
  651. 1.0,
  652. Timestamp("20130101"),
  653. ],
  654. index=mixed.columns,
  655. )
  656. tm.assert_series_equal(result, expected)
  657. # excludes non-numeric
  658. result = mixed.min(axis=1, numeric_only=True)
  659. expected = Series([1, 1, 1.0], index=[0, 1, 2])
  660. tm.assert_series_equal(result, expected)
  661. # works when only those columns are selected
  662. result = mixed[["A", "B"]].min(1)
  663. expected = Series([timedelta(days=-1)] * 3)
  664. tm.assert_series_equal(result, expected)
  665. result = mixed[["A", "B"]].min()
  666. expected = Series(
  667. [timedelta(seconds=5 * 60 + 5), timedelta(days=-1)], index=["A", "B"]
  668. )
  669. tm.assert_series_equal(result, expected)
  670. # GH 3106
  671. df = DataFrame(
  672. {
  673. "time": date_range("20130102", periods=5),
  674. "time2": date_range("20130105", periods=5),
  675. }
  676. )
  677. df["off1"] = df["time2"] - df["time"]
  678. assert df["off1"].dtype == "timedelta64[ns]"
  679. df["off2"] = df["time"] - df["time2"]
  680. df._consolidate_inplace()
  681. assert df["off1"].dtype == "timedelta64[ns]"
  682. assert df["off2"].dtype == "timedelta64[ns]"
  683. def test_std_timedelta64_skipna_false(self):
  684. # GH#37392
  685. tdi = pd.timedelta_range("1 Day", periods=10)
  686. df = DataFrame({"A": tdi, "B": tdi}, copy=True)
  687. df.iloc[-2, -1] = pd.NaT
  688. result = df.std(skipna=False)
  689. expected = Series(
  690. [df["A"].std(), pd.NaT], index=["A", "B"], dtype="timedelta64[ns]"
  691. )
  692. tm.assert_series_equal(result, expected)
  693. result = df.std(axis=1, skipna=False)
  694. expected = Series([pd.Timedelta(0)] * 8 + [pd.NaT, pd.Timedelta(0)])
  695. tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize(
      "values", [["2022-01-01", "2022-01-02", pd.NaT, "2022-01-03"], 4 * [pd.NaT]]
  )
  def test_std_datetime64_with_nat(
      self, values, skipna, using_array_manager, request, unit
  ):
      """std of a datetime64 column containing NaT, for each skipna setting and unit."""
      # GH#51335
      # The result is NaT when NaT is not skipped or when nothing but NaT remains.
      expect_nat = not skipna or all(value is pd.NaT for value in values)
      if using_array_manager and expect_nat:
          request.applymarker(
              pytest.mark.xfail(
                  reason="GH#51446: Incorrect type inference on NaT in reduction result"
              )
          )

      frame = DataFrame({"a": to_datetime(values).as_unit(unit)})
      result = frame.std(skipna=skipna)

      # 86400000000000ns == 1 day
      scalar = pd.NaT if expect_nat else 86400000000000
      expected = Series({"a": scalar}, dtype=f"timedelta64[{unit}]")
      tm.assert_series_equal(result, expected)
  719. def test_sum_corner(self):
  720. empty_frame = DataFrame()
  721. axis0 = empty_frame.sum(0)
  722. axis1 = empty_frame.sum(1)
  723. assert isinstance(axis0, Series)
  724. assert isinstance(axis1, Series)
  725. assert len(axis0) == 0
  726. assert len(axis1) == 0
  @pytest.mark.parametrize(
      "index",
      [
          RangeIndex(0),
          DatetimeIndex([]),
          Index([], dtype=np.int64),
          Index([], dtype=np.float64),
          DatetimeIndex([], freq="ME"),
          PeriodIndex([], freq="D"),
      ],
  )
  def test_axis_1_empty(self, all_reductions, index):
      """Row-wise reductions over a zero-row frame keep the index and use a fixed dtype."""
      # Reductions not listed here are expected to come back as object dtype.
      dtype_by_reduction = {"any": "bool", "all": "bool", "count": "int64"}
      expected_dtype = dtype_by_reduction.get(all_reductions, "object")

      frame = DataFrame(columns=["a"], index=index)
      reducer = getattr(frame, all_reductions)
      result = reducer(axis=1)

      expected = Series([], index=index, dtype=expected_dtype)
      tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize("min_count", [0, 1])
  def test_axis_1_sum_na(self, string_dtype_no_object, skipna, min_count):
      """Row-wise sum of a single NA string cell for each skipna/min_count combination."""
      # https://github.com/pandas-dev/pandas/issues/60229
      dtype = string_dtype_no_object
      frame = DataFrame({"a": [pd.NA]}, dtype=dtype)
      result = frame.sum(axis=1, skipna=skipna, min_count=min_count)

      # Only skipping the NA with no minimum count gives the empty string.
      if skipna and min_count == 0:
          value = ""
      else:
          value = pd.NA
      tm.assert_series_equal(result, Series([value], dtype=dtype))
  @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
  @pytest.mark.parametrize("numeric_only", [None, True, False])
  def test_sum_prod_nanops(self, method, unit, numeric_only):
      """Check sum/prod against their identity element for various min_count values.

      ``unit`` is the identity of ``method`` (0 for sum, 1 for prod), so a
      column reduces to ``unit`` whenever it has enough valid entries and to
      NaN when it has fewer than ``min_count``.
      """
      idx = ["a", "b", "c"]
      df = DataFrame({"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]})
      # The default
      result = getattr(df, method)(numeric_only=numeric_only)
      expected = Series([unit, unit, unit], index=idx, dtype="float64")
      tm.assert_series_equal(result, expected)

      # min_count=1
      result = getattr(df, method)(numeric_only=numeric_only, min_count=1)
      expected = Series([unit, unit, np.nan], index=idx)
      tm.assert_series_equal(result, expected)

      # min_count=0
      result = getattr(df, method)(numeric_only=numeric_only, min_count=0)
      expected = Series([unit, unit, unit], index=idx, dtype="float64")
      tm.assert_series_equal(result, expected)

      result = getattr(df.iloc[1:], method)(numeric_only=numeric_only, min_count=1)
      expected = Series([unit, np.nan, np.nan], index=idx)
      tm.assert_series_equal(result, expected)

      # min_count > 1
      # "A" has 10 valid values, "B" has exactly 5.
      df = DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5})

      # Both columns meet min_count=5. The expectation is spelled out rather
      # than derived from ``result``, which made the assertion a tautology.
      result = getattr(df, method)(numeric_only=numeric_only, min_count=5)
      expected = Series([unit, unit], index=["A", "B"], dtype="float64")
      tm.assert_series_equal(result, expected)

      # Only "A" meets min_count=6; "B" falls short and becomes NaN.
      result = getattr(df, method)(numeric_only=numeric_only, min_count=6)
      expected = Series([unit, np.nan], index=["A", "B"])
      tm.assert_series_equal(result, expected)
  786. def test_sum_nanops_timedelta(self):
  787. # prod isn't defined on timedeltas
  788. idx = ["a", "b", "c"]
  789. df = DataFrame({"a": [0, 0], "b": [0, np.nan], "c": [np.nan, np.nan]})
  790. df2 = df.apply(to_timedelta)
  791. # 0 by default
  792. result = df2.sum()
  793. expected = Series([0, 0, 0], dtype="m8[ns]", index=idx)
  794. tm.assert_series_equal(result, expected)
  795. # min_count=0
  796. result = df2.sum(min_count=0)
  797. tm.assert_series_equal(result, expected)
  798. # min_count=1
  799. result = df2.sum(min_count=1)
  800. expected = Series([0, 0, np.nan], dtype="m8[ns]", index=idx)
  801. tm.assert_series_equal(result, expected)
  802. def test_sum_nanops_min_count(self):
  803. # https://github.com/pandas-dev/pandas/issues/39738
  804. df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
  805. result = df.sum(min_count=10)
  806. expected = Series([np.nan, np.nan], index=["x", "y"])
  807. tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize("float_type", ["float16", "float32", "float64"])
  @pytest.mark.parametrize(
      "kwargs, expected_result",
      [
          ({"axis": 1, "min_count": 2}, [3.2, 5.3, np.nan]),
          ({"axis": 1, "min_count": 3}, [np.nan, np.nan, np.nan]),
          ({"axis": 1, "skipna": False}, [3.2, 5.3, np.nan]),
      ],
  )
  def test_sum_nanops_dtype_min_count(self, float_type, kwargs, expected_result):
      """Row-wise sum keeps the frame's float dtype under min_count / skipna options."""
      # GH#46947
      data = {"a": [1.0, 2.3, 4.4], "b": [2.2, 3, np.nan]}
      frame = DataFrame(data, dtype=float_type)
      expected = Series(expected_result).astype(float_type)
      tm.assert_series_equal(frame.sum(**kwargs), expected)
  @pytest.mark.parametrize("float_type", ["float16", "float32", "float64"])
  @pytest.mark.parametrize(
      "kwargs, expected_result",
      [
          ({"axis": 1, "min_count": 2}, [2.0, 4.0, np.nan]),
          ({"axis": 1, "min_count": 3}, [np.nan, np.nan, np.nan]),
          ({"axis": 1, "skipna": False}, [2.0, 4.0, np.nan]),
      ],
  )
  def test_prod_nanops_dtype_min_count(self, float_type, kwargs, expected_result):
      """Row-wise prod keeps the frame's float dtype under min_count / skipna options."""
      # GH#46947
      data = {"a": [1.0, 2.0, 4.4], "b": [2.0, 2.0, np.nan]}
      frame = DataFrame(data, dtype=float_type)
      expected = Series(expected_result).astype(float_type)
      tm.assert_series_equal(frame.prod(**kwargs), expected)
  def test_sum_object(self, float_frame):
      """Smoke test: summing a frame of integers multiplied by a timedelta does not raise."""
      int_frame = DataFrame(
          float_frame.values.astype(int),
          index=float_frame.index,
          columns=float_frame.columns,
      )
      day_multiples = int_frame * timedelta(1)
      day_multiples.sum()
  def test_sum_bool(self, float_frame):
      """Smoke test: summing a boolean frame along either axis does not raise."""
      # ensure this works, bug report
      nan_mask = np.isnan(float_frame)
      for axis in (1, 0):
          nan_mask.sum(axis)
  def test_sum_mixed_datetime(self):
      """Summing a frame that contains a datetime column raises TypeError."""
      # GH#30886
      base = DataFrame({"A": date_range("2000", periods=4), "B": [1, 2, 3, 4]})
      # Label 4 does not exist in ``base``, so reindexing adds a missing row.
      reindexed = base.reindex([2, 3, 4])
      with pytest.raises(TypeError, match="does not support reduction 'sum'"):
          reindexed.sum()
  def test_mean_corner(self, float_frame, float_string_frame):
      """mean raises on mixed float/string data and averages a boolean column."""
      # unit test when have object data
      column_wise_msg = "Could not convert|does not support|Cannot perform"
      with pytest.raises(TypeError, match=column_wise_msg):
          float_string_frame.mean(axis=0)

      # xs sum mixed type, just want to know it works...
      with pytest.raises(TypeError, match="unsupported operand type"):
          float_string_frame.mean(axis=1)

      # take mean of boolean column
      float_frame["bool"] = float_frame["A"] > 0
      column_means = float_frame.mean(0)
      bool_mean = float_frame["bool"].values.mean()
      assert column_means["bool"] == bool_mean
  def test_mean_datetimelike(self):
      """mean with numeric_only=True keeps only "A"; the default raises on the Period column."""
      # GH#24757 check that datetimelike are excluded by default, handled
      # correctly with numeric_only=True
      # As of 2.0, datetimelike are *not* excluded with numeric_only=None
      columns = {
          "A": np.arange(3),
          "B": date_range("2016-01-01", periods=3),
          "C": pd.timedelta_range("1D", periods=3),
          "D": pd.period_range("2016", periods=3, freq="Y"),
      }
      frame = DataFrame(columns)

      numeric_mean = frame.mean(numeric_only=True)
      tm.assert_series_equal(numeric_mean, Series({"A": 1.0}))

      with pytest.raises(TypeError, match="mean is not implemented for PeriodArray"):
          frame.mean()
  def test_mean_datetimelike_numeric_only_false(self):
      """mean(numeric_only=False) handles datetime and timedelta but rejects Period."""
      frame = DataFrame(
          {
              "A": np.arange(3),
              "B": date_range("2016-01-01", periods=3),
              "C": pd.timedelta_range("1D", periods=3),
          }
      )

      # datetime(tz) and timedelta work
      result = frame.mean(numeric_only=False)
      # Each column is an evenly spaced run of three, so its mean is the middle row.
      middle_b = frame.loc[1, "B"]
      middle_c = frame.loc[1, "C"]
      expected = Series({"A": 1, "B": middle_b, "C": middle_c})
      tm.assert_series_equal(result, expected)

      # mean of period is not allowed
      frame["D"] = pd.period_range("2016", periods=3, freq="Y")
      with pytest.raises(TypeError, match="mean is not implemented for Period"):
          frame.mean(numeric_only=False)
  902. def test_mean_extensionarray_numeric_only_true(self):
  903. # https://github.com/pandas-dev/pandas/issues/33256
  904. arr = np.random.default_rng(2).integers(1000, size=(10, 5))
  905. df = DataFrame(arr, dtype="Int64")
  906. result = df.mean(numeric_only=True)
  907. expected = DataFrame(arr).mean().astype("Float64")
  908. tm.assert_series_equal(result, expected)
  def test_stats_mixed_type(self, float_string_frame):
      """Row-wise std/var/mean/skew on mixed float/string data raise TypeError."""
      # (reduction name, expected error-message pattern), checked in this order
      cases = [
          ("std", "could not convert"),
          ("var", "could not convert"),
          ("mean", "unsupported operand type"),
          ("skew", "could not convert"),
      ]
      for reduction, pattern in cases:
          with pytest.raises(TypeError, match=pattern):
              getattr(float_string_frame, reduction)(1)
  918. def test_sum_bools(self):
  919. df = DataFrame(index=range(1), columns=range(10))
  920. bools = isna(df)
  921. assert bools.sum(axis=1)[0] == 10
  922. # ----------------------------------------------------------------------
  923. # Index of max / min
  @pytest.mark.parametrize("skipna", [True, False])
  @pytest.mark.parametrize("axis", [0, 1])
  def test_idxmin(self, float_frame, int_frame, skipna, axis):
      """DataFrame.idxmin agrees with applying Series.idxmin along the same axis."""
      frame = float_frame
      frame.iloc[5:10] = np.nan
      frame.iloc[15:20, -2:] = np.nan

      frame_msg = "The behavior of DataFrame.idxmin with all-NA values"
      series_msg = "The behavior of Series.idxmin"
      for df in (frame, int_frame):
          # A FutureWarning is only expected for the float frame (the one
          # given NaNs above), and only when skipna is False or axis is 1.
          needs_warning = (skipna is False or axis == 1) and df is not int_frame
          warn = FutureWarning if needs_warning else None

          with tm.assert_produces_warning(warn, match=frame_msg):
              result = df.idxmin(axis=axis, skipna=skipna)

          with tm.assert_produces_warning(warn, match=series_msg):
              expected = df.apply(Series.idxmin, axis=axis, skipna=skipna)
          expected = expected.astype(df.index.dtype)
          tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize("axis", [0, 1])
  @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
  def test_idxmin_empty(self, index, skipna, axis):
      """idxmin on a frame with no data along the other axis gives an empty Series."""
      # GH53265
      # Put ``index`` on the axis being searched; the other axis stays empty.
      frame = DataFrame(index=index) if axis == 0 else DataFrame(columns=index)
      result = frame.idxmin(axis=axis, skipna=skipna)
      tm.assert_series_equal(result, Series(dtype=index.dtype))
  @pytest.mark.parametrize("numeric_only", [True, False])
  def test_idxmin_numeric_only(self, numeric_only):
      """idxmin drops the string column only when numeric_only=True."""
      frame = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
      result = frame.idxmin(numeric_only=numeric_only)

      positions, labels = [2, 1], ["a", "b"]
      if not numeric_only:
          # The string column "c" takes part too.
          positions, labels = [2, 1, 0], ["a", "b", "c"]
      tm.assert_series_equal(result, Series(positions, index=labels))
  def test_idxmin_axis_2(self, float_frame):
      """idxmin rejects an axis number a DataFrame does not have."""
      with pytest.raises(
          ValueError, match="No axis named 2 for object type DataFrame"
      ):
          float_frame.idxmin(axis=2)
  @pytest.mark.parametrize("axis", [0, 1])
  def test_idxmax(self, float_frame, int_frame, skipna, axis):
      """DataFrame.idxmax agrees with applying Series.idxmax along the same axis."""
      frame = float_frame
      frame.iloc[5:10] = np.nan
      frame.iloc[15:20, -2:] = np.nan

      frame_msg = "The behavior of DataFrame.idxmax with all-NA values"
      series_msg = "The behavior of Series.idxmax"
      for df in (frame, int_frame):
          # A FutureWarning is only expected for the float frame (the one
          # given NaNs above), and only when skipna is False or axis is 1.
          needs_warning = (skipna is False or axis == 1) and df is not int_frame
          warn = FutureWarning if needs_warning else None

          with tm.assert_produces_warning(warn, match=frame_msg):
              result = df.idxmax(axis=axis, skipna=skipna)

          with tm.assert_produces_warning(warn, match=series_msg):
              expected = df.apply(Series.idxmax, axis=axis, skipna=skipna)
          expected = expected.astype(df.index.dtype)
          tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize("axis", [0, 1])
  @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
  def test_idxmax_empty(self, index, skipna, axis):
      """idxmax on a frame with no data along the other axis gives an empty Series."""
      # GH53265
      # Put ``index`` on the axis being searched; the other axis stays empty.
      frame = DataFrame(index=index) if axis == 0 else DataFrame(columns=index)
      result = frame.idxmax(axis=axis, skipna=skipna)
      tm.assert_series_equal(result, Series(dtype=index.dtype))
  @pytest.mark.parametrize("numeric_only", [True, False])
  def test_idxmax_numeric_only(self, numeric_only):
      """idxmax drops the string column only when numeric_only=True."""
      frame = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
      result = frame.idxmax(numeric_only=numeric_only)

      positions, labels = [1, 0], ["a", "b"]
      if not numeric_only:
          # The string column "c" takes part too.
          positions, labels = [1, 0, 1], ["a", "b", "c"]
      tm.assert_series_equal(result, Series(positions, index=labels))
  def test_idxmax_arrow_types(self):
      """idxmax/idxmin work on pyarrow-backed integer and string columns."""
      # GH#55368
      pytest.importorskip("pyarrow")

      ints = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1]}, dtype="int64[pyarrow]")
      tm.assert_series_equal(ints.idxmax(), Series([1, 0], index=["a", "b"]))
      tm.assert_series_equal(ints.idxmin(), Series([2, 1], index=["a", "b"]))

      strings = DataFrame({"a": ["b", "c", "a"]}, dtype="string[pyarrow]")
      largest = strings.idxmax(numeric_only=False)
      tm.assert_series_equal(largest, Series([1], index=["a"]))
      smallest = strings.idxmin(numeric_only=False)
      tm.assert_series_equal(smallest, Series([2], index=["a"]))
  def test_idxmax_axis_2(self, float_frame):
      """idxmax rejects an axis number a DataFrame does not have."""
      with pytest.raises(
          ValueError, match="No axis named 2 for object type DataFrame"
      ):
          float_frame.idxmax(axis=2)
  1026. def test_idxmax_mixed_dtype(self):
  1027. # don't cast to object, which would raise in nanops
  1028. dti = date_range("2016-01-01", periods=3)
  1029. # Copying dti is needed for ArrayManager otherwise when we set
  1030. # df.loc[0, 3] = pd.NaT below it edits dti
  1031. df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti.copy(deep=True)})
  1032. result = df.idxmax()
  1033. expected = Series([1, 0, 2], index=[1, 2, 3])
  1034. tm.assert_series_equal(result, expected)
  1035. result = df.idxmin()
  1036. expected = Series([0, 2, 0], index=[1, 2, 3])
  1037. tm.assert_series_equal(result, expected)
  1038. # with NaTs
  1039. df.loc[0, 3] = pd.NaT
  1040. result = df.idxmax()
  1041. expected = Series([1, 0, 2], index=[1, 2, 3])
  1042. tm.assert_series_equal(result, expected)
  1043. result = df.idxmin()
  1044. expected = Series([0, 2, 1], index=[1, 2, 3])
  1045. tm.assert_series_equal(result, expected)
  1046. # with multi-column dt64 block
  1047. df[4] = dti[::-1]
  1048. df._consolidate_inplace()
  1049. result = df.idxmax()
  1050. expected = Series([1, 0, 2, 0], index=[1, 2, 3, 4])
  1051. tm.assert_series_equal(result, expected)
  1052. result = df.idxmin()
  1053. expected = Series([0, 2, 1, 2], index=[1, 2, 3, 4])
  1054. tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize(
      "op, expected_value",
      [("idxmax", [0, 4]), ("idxmin", [0, 5])],
  )
  def test_idxmax_idxmin_convert_dtypes(self, op, expected_value):
      """Grouped idxmax/idxmin on Int64 data return row positions per group."""
      # GH 40346
      frame = DataFrame(
          {
              "ID": [100, 100, 100, 200, 200, 200],
              "value": [0, 0, 0, 1, 2, 0],
          },
          dtype="Int64",
      )
      grouped = frame.groupby("ID")
      result = getattr(grouped, op)()

      group_index = Index([100, 200], name="ID", dtype="Int64")
      expected = DataFrame({"value": expected_value}, index=group_index)
      tm.assert_frame_equal(result, expected)
  1075. def test_idxmax_dt64_multicolumn_axis1(self):
  1076. dti = date_range("2016-01-01", periods=3)
  1077. df = DataFrame({3: dti, 4: dti[::-1]}, copy=True)
  1078. df.iloc[0, 0] = pd.NaT
  1079. df._consolidate_inplace()
  1080. result = df.idxmax(axis=1)
  1081. expected = Series([4, 3, 3])
  1082. tm.assert_series_equal(result, expected)
  1083. result = df.idxmin(axis=1)
  1084. expected = Series([4, 3, 4])
  1085. tm.assert_series_equal(result, expected)
  1086. # ----------------------------------------------------------------------
  1087. # Logical reductions
  @pytest.mark.parametrize("opname", ["any", "all"])
  @pytest.mark.parametrize("axis", [0, 1])
  @pytest.mark.parametrize("bool_only", [False, True])
  def test_any_all_mixed_float(self, opname, axis, bool_only, float_string_frame):
      """Smoke test: any/all run on a mixed-type frame with an added boolean column."""
      # make sure op works on mixed-type frame
      mixed = float_string_frame
      draws = np.random.default_rng(2).standard_normal(len(mixed))
      mixed["_bool_"] = draws > 0.5
      reduction = getattr(mixed, opname)
      reduction(axis=axis, bool_only=bool_only)
  @pytest.mark.parametrize("opname", ["any", "all"])
  @pytest.mark.parametrize("axis", [0, 1])
  def test_any_all_bool_with_na(self, opname, axis, bool_frame_with_na):
      """Smoke test: any/all with bool_only=False run on the ``bool_frame_with_na`` fixture."""
      reduction = getattr(bool_frame_with_na, opname)
      reduction(axis=axis, bool_only=False)
  @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")
  @pytest.mark.parametrize("opname", ["any", "all"])
  def test_any_all_bool_frame(self, opname, bool_frame_with_na):
      """Compare DataFrame.any/all with the numpy function of the same name."""
      # GH#12863: numpy gives back non-boolean data for object type
      # so fill NaNs to compare with pandas behavior
      frame = bool_frame_with_na.fillna(True)
      np_func = getattr(np, opname)
      reduction = getattr(frame, opname)

      def with_na_dropped(col):
          # Reference for the default call: reduce only the non-missing values.
          return np_func(col.dropna().values)

      def on_raw_values(col):
          # Reference for skipna=False: reduce every value as-is.
          return np_func(col.values)

      by_column = reduction(axis=0, skipna=False)
      by_row = reduction(axis=1, skipna=False)
      tm.assert_series_equal(by_column, frame.apply(on_raw_values))
      tm.assert_series_equal(by_row, frame.apply(on_raw_values, axis=1))

      by_column = reduction(axis=0)
      by_row = reduction(axis=1)
      tm.assert_series_equal(by_column, frame.apply(with_na_dropped))
      tm.assert_series_equal(
          by_row, frame.apply(with_na_dropped, axis=1), check_dtype=False
      )

      # bad axis
      with pytest.raises(ValueError, match="No axis named 2"):
          reduction(axis=2)

      # all NA case
      all_na = frame * np.nan
      na_by_column = getattr(all_na, opname)(axis=0)
      na_by_row = getattr(all_na, opname)(axis=1)
      if opname == "any":
          assert not na_by_column.any()
          assert not na_by_row.any()
      else:
          assert na_by_column.all()
          assert na_by_row.all()
  1136. def test_any_all_extra(self):
  1137. df = DataFrame(
  1138. {
  1139. "A": [True, False, False],
  1140. "B": [True, True, False],
  1141. "C": [True, True, True],
  1142. },
  1143. index=["a", "b", "c"],
  1144. )
  1145. result = df[["A", "B"]].any(axis=1)
  1146. expected = Series([True, True, False], index=["a", "b", "c"])
  1147. tm.assert_series_equal(result, expected)
  1148. result = df[["A", "B"]].any(axis=1, bool_only=True)
  1149. tm.assert_series_equal(result, expected)
  1150. result = df.all(1)
  1151. expected = Series([True, False, False], index=["a", "b", "c"])
  1152. tm.assert_series_equal(result, expected)
  1153. result = df.all(1, bool_only=True)
  1154. tm.assert_series_equal(result, expected)
  1155. # Axis is None
  1156. result = df.all(axis=None).item()
  1157. assert result is False
  1158. result = df.any(axis=None).item()
  1159. assert result is True
  1160. result = df[["C"]].all(axis=None).item()
  1161. assert result is True
  @pytest.mark.parametrize("axis", [0, 1])
  @pytest.mark.parametrize("bool_agg_func", ["any", "all"])
  @pytest.mark.parametrize("skipna", [True, False])
  def test_any_all_object_dtype(self, axis, bool_agg_func, skipna):
      """any/all over a frame mixing NaN, numbers, strings and bools are all True."""
      # GH#35450
      rows = [
          [1, np.nan, np.nan, True],
          [np.nan, 2, np.nan, True],
          [np.nan, np.nan, np.nan, True],
          [np.nan, np.nan, "5", np.nan],
      ]
      frame = DataFrame(data=rows)
      reduction = getattr(frame, bool_agg_func)
      result = reduction(axis=axis, skipna=skipna)
      tm.assert_series_equal(result, Series([True, True, True, True]))
  # GH#50947 deprecates this but it is not emitting a warning in some builds.
  @pytest.mark.filterwarnings(
      "ignore:'any' with datetime64 dtypes is deprecated.*:FutureWarning"
  )
  def test_any_datetime(self):
      """Row-wise any over a float column and a datetime column with missing values."""
      # GH 23070
      floats = [1, np.nan, 3, np.nan]
      stamps = [
          Timestamp("1960-02-15"),
          Timestamp("1960-02-16"),
          pd.NaT,
          pd.NaT,
      ]
      frame = DataFrame({"A": floats, "B": stamps})

      row_any = frame.any(axis=1)
      # Only the last row is missing in both columns.
      tm.assert_series_equal(row_any, Series([True, True, True, False]))
  1195. def test_any_all_bool_only(self):
  1196. # GH 25101
  1197. df = DataFrame(
  1198. {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]},
  1199. columns=Index(["col1", "col2", "col3"], dtype=object),
  1200. )
  1201. result = df.all(bool_only=True)
  1202. expected = Series(dtype=np.bool_, index=[])
  1203. tm.assert_series_equal(result, expected)
  1204. df = DataFrame(
  1205. {
  1206. "col1": [1, 2, 3],
  1207. "col2": [4, 5, 6],
  1208. "col3": [None, None, None],
  1209. "col4": [False, False, True],
  1210. }
  1211. )
  1212. result = df.all(bool_only=True)
  1213. expected = Series({"col4": False})
  1214. tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize(
      "func, data, expected",
      [
          (np.any, {}, False),
          (np.all, {}, True),
          (np.any, {"A": []}, False),
          (np.all, {"A": []}, True),
          (np.any, {"A": [False, False]}, False),
          (np.all, {"A": [False, False]}, False),
          (np.any, {"A": [True, False]}, True),
          (np.all, {"A": [True, False]}, False),
          (np.any, {"A": [True, True]}, True),
          (np.all, {"A": [True, True]}, True),
          (np.any, {"A": [False], "B": [False]}, False),
          (np.all, {"A": [False], "B": [False]}, False),
          (np.any, {"A": [False, False], "B": [False, True]}, True),
          (np.all, {"A": [False, False], "B": [False, True]}, False),
          # other types
          (np.all, {"A": Series([0.0, 1.0], dtype="float")}, False),
          (np.any, {"A": Series([0.0, 1.0], dtype="float")}, True),
          (np.all, {"A": Series([0, 1], dtype=int)}, False),
          (np.any, {"A": Series([0, 1], dtype=int)}, True),
          pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns]")}, False),
          pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, False),
          pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns]")}, True),
          pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, True),
          pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns]")}, True),
          pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True),
          pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns]")}, True),
          pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True),
          pytest.param(np.all, {"A": Series([0, 1], dtype="m8[ns]")}, False),
          pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True),
          pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True),
          pytest.param(np.any, {"A": Series([1, 2], dtype="m8[ns]")}, True),
          # Categorical columns: the test body expects a TypeError for these,
          # so the ``expected`` value of these cases is never consulted.
          (np.all, {"A": Series([0, 1], dtype="category")}, True),
          (np.any, {"A": Series([0, 1], dtype="category")}, False),
          (np.all, {"A": Series([1, 2], dtype="category")}, True),
          (np.any, {"A": Series([1, 2], dtype="category")}, False),
          # Mix GH#21484
          pytest.param(
              np.all,
              {
                  "A": Series([10, 20], dtype="M8[ns]"),
                  "B": Series([10, 20], dtype="m8[ns]"),
              },
              True,
          ),
      ],
  )
  def test_any_all_np_func(self, func, data, expected):
      """np.any/np.all on a DataFrame and the axis=None method give the same scalar."""
      # GH 19976
      data = DataFrame(data)
      method_name = func.__name__

      if any(isinstance(dtype, CategoricalDtype) for dtype in data.dtypes):
          category_msg = "dtype category does not support reduction"
          with pytest.raises(TypeError, match=category_msg):
              func(data)

          # method version
          with pytest.raises(TypeError, match=category_msg):
              getattr(DataFrame(data), method_name)(axis=None)
          return

      msg = "'(any|all)' with datetime64 dtypes is deprecated"
      # A deprecation warning is expected only when a datetime64 column is present.
      has_datetime = any(dtype.kind == "M" for dtype in data.dtypes)
      warn = FutureWarning if has_datetime else None

      with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
          # GH#34479
          result = func(data)
      assert isinstance(result, np.bool_)
      assert result.item() is expected

      # method version
      with tm.assert_produces_warning(warn, match=msg):
          # GH#34479
          result = getattr(DataFrame(data), method_name)(axis=None)
      assert isinstance(result, np.bool_)
      assert result.item() is expected
  1296. def test_any_all_object(self):
  1297. # GH 19976
  1298. result = np.all(DataFrame(columns=["a", "b"])).item()
  1299. assert result is True
  1300. result = np.any(DataFrame(columns=["a", "b"])).item()
  1301. assert result is False
  1302. def test_any_all_object_bool_only(self):
  1303. df = DataFrame({"A": ["foo", 2], "B": [True, False]}).astype(object)
  1304. df._consolidate_inplace()
  1305. df["C"] = Series([True, True])
  1306. # Categorical of bools is _not_ considered booly
  1307. df["D"] = df["C"].astype("category")
  1308. # The underlying bug is in DataFrame._get_bool_data, so we check
  1309. # that while we're here
  1310. res = df._get_bool_data()
  1311. expected = df[["C"]]
  1312. tm.assert_frame_equal(res, expected)
  1313. res = df.all(bool_only=True, axis=0)
  1314. expected = Series([True], index=["C"])
  1315. tm.assert_series_equal(res, expected)
  1316. # operating on a subset of columns should not produce a _larger_ Series
  1317. res = df[["B", "C"]].all(bool_only=True, axis=0)
  1318. tm.assert_series_equal(res, expected)
  1319. assert df.all(bool_only=True, axis=None)
  1320. res = df.any(bool_only=True, axis=0)
  1321. expected = Series([True], index=["C"])
  1322. tm.assert_series_equal(res, expected)
  1323. # operating on a subset of columns should not produce a _larger_ Series
  1324. res = df[["C"]].any(bool_only=True, axis=0)
  1325. tm.assert_series_equal(res, expected)
  1326. assert df.any(bool_only=True, axis=None)
  1327. # ---------------------------------------------------------------------
  1328. # Unsorted
  1329. def test_series_broadcasting(self):
  1330. # smoke test for numpy warnings
  1331. # GH 16378, GH 16306
  1332. df = DataFrame([1.0, 1.0, 1.0])
  1333. df_nan = DataFrame({"A": [np.nan, 2.0, np.nan]})
  1334. s = Series([1, 1, 1])
  1335. s_nan = Series([np.nan, np.nan, 1])
  1336. with tm.assert_produces_warning(None):
  1337. df_nan.clip(lower=s, axis=0)
  1338. for op in ["lt", "le", "gt", "ge", "eq", "ne"]:
  1339. getattr(df, op)(s_nan, axis=0)
  1340. class TestDataFrameReductions:
  1341. def test_min_max_dt64_with_NaT(self):
  1342. # Both NaT and Timestamp are in DataFrame.
  1343. df = DataFrame({"foo": [pd.NaT, pd.NaT, Timestamp("2012-05-01")]})
  1344. res = df.min()
  1345. exp = Series([Timestamp("2012-05-01")], index=["foo"])
  1346. tm.assert_series_equal(res, exp)
  1347. res = df.max()
  1348. exp = Series([Timestamp("2012-05-01")], index=["foo"])
  1349. tm.assert_series_equal(res, exp)
  1350. # GH12941, only NaTs are in DataFrame.
  1351. df = DataFrame({"foo": [pd.NaT, pd.NaT]})
  1352. res = df.min()
  1353. exp = Series([pd.NaT], index=["foo"])
  1354. tm.assert_series_equal(res, exp)
  1355. res = df.max()
  1356. exp = Series([pd.NaT], index=["foo"])
  1357. tm.assert_series_equal(res, exp)
  def test_min_max_dt64_with_NaT_skipna_false(self, request, tz_naive_fixture):
      """Row-wise min/max with skipna=False give NaT for a row containing NaT."""
      # GH#36907
      tz = tz_naive_fixture
      if isinstance(tz, tzlocal) and is_platform_windows():
          pytest.skip(
              "GH#37659 OSError raised within tzlocal bc Windows "
              "chokes in times before 1970-01-01"
          )

      col_a = [
          Timestamp("2020-01-01 08:00:00", tz=tz),
          Timestamp("1920-02-01 09:00:00", tz=tz),
      ]
      col_b = [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT]
      frame = DataFrame({"a": col_a, "b": col_b})

      # Row 0: "a" holds the earlier value, "b" the later one. Row 1 has a NaT.
      row_min = frame.min(axis=1, skipna=False)
      min_expected = Series([frame.loc[0, "a"], pd.NaT])
      assert min_expected.dtype == frame["a"].dtype
      tm.assert_series_equal(row_min, min_expected)

      row_max = frame.max(axis=1, skipna=False)
      max_expected = Series([frame.loc[0, "b"], pd.NaT])
      assert max_expected.dtype == frame["a"].dtype
      tm.assert_series_equal(row_max, max_expected)
  1383. def test_min_max_dt64_api_consistency_with_NaT(self):
  1384. # Calling the following sum functions returned an error for dataframes but
  1385. # returned NaT for series. These tests check that the API is consistent in
  1386. # min/max calls on empty Series/DataFrames. See GH:33704 for more
  1387. # information
  1388. df = DataFrame({"x": to_datetime([])})
  1389. expected_dt_series = Series(to_datetime([]))
  1390. # check axis 0
  1391. assert (df.min(axis=0).x is pd.NaT) == (expected_dt_series.min() is pd.NaT)
  1392. assert (df.max(axis=0).x is pd.NaT) == (expected_dt_series.max() is pd.NaT)
  1393. # check axis 1
  1394. tm.assert_series_equal(df.min(axis=1), expected_dt_series)
  1395. tm.assert_series_equal(df.max(axis=1), expected_dt_series)
  1396. def test_min_max_dt64_api_consistency_empty_df(self):
  1397. # check DataFrame/Series api consistency when calling min/max on an empty
  1398. # DataFrame/Series.
  1399. df = DataFrame({"x": []})
  1400. expected_float_series = Series([], dtype=float)
  1401. # check axis 0
  1402. assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min())
  1403. assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max())
  1404. # check axis 1
  1405. tm.assert_series_equal(df.min(axis=1), expected_float_series)
  1406. tm.assert_series_equal(df.min(axis=1), expected_float_series)
  @pytest.mark.parametrize(
      "initial",
      ["2018-10-08 13:36:45+00:00", "2018-10-08 13:36:45+03:00"],  # Non-UTC timezone
  )
  @pytest.mark.parametrize("method", ["min", "max"])
  def test_preserve_timezone(self, initial: str, method):
      """Row-wise min/max of a single tz-aware value return it unchanged."""
      # GH 28552
      stamp = to_datetime(initial)
      expected = Series([stamp])
      # A one-row frame built from the expected Series itself.
      frame = DataFrame([expected])
      reduction = getattr(frame, method)
      tm.assert_series_equal(reduction(axis=1), expected)
  @pytest.mark.parametrize("method", ["min", "max"])
  def test_minmax_tzaware_skipna_axis_1(self, method, skipna):
      """Row-wise min/max of tz-aware columns honour skipna."""
      # GH#51242
      val = to_datetime("1900-01-01", utc=True)
      frame = DataFrame(
          {"a": Series([pd.NaT, pd.NaT, val]), "b": Series([pd.NaT, val, val])}
      )
      result = getattr(frame, method)(axis=1, skipna=skipna)

      # Only the middle row (one NaT, one value) depends on skipna.
      middle = val if skipna else pd.NaT
      expected = Series([pd.NaT, middle, val])
      tm.assert_series_equal(result, expected)
  1433. def test_frame_any_with_timedelta(self):
  1434. # GH#17667
  1435. df = DataFrame(
  1436. {
  1437. "a": Series([0, 0]),
  1438. "t": Series([to_timedelta(0, "s"), to_timedelta(1, "ms")]),
  1439. }
  1440. )
  1441. result = df.any(axis=0)
  1442. expected = Series(data=[False, True], index=["a", "t"])
  1443. tm.assert_series_equal(result, expected)
  1444. result = df.any(axis=1)
  1445. expected = Series(data=[False, True])
  1446. tm.assert_series_equal(result, expected)
  1447. def test_reductions_skipna_none_raises(
  1448. self, request, frame_or_series, all_reductions
  1449. ):
  1450. if all_reductions == "count":
  1451. request.applymarker(
  1452. pytest.mark.xfail(reason="Count does not accept skipna")
  1453. )
  1454. obj = frame_or_series([1, 2, 3])
  1455. msg = 'For argument "skipna" expected type bool, received type NoneType.'
  1456. with pytest.raises(ValueError, match=msg):
  1457. getattr(obj, all_reductions)(skipna=None)
  1458. @td.skip_array_manager_invalid_test
  1459. def test_reduction_timestamp_smallest_unit(self):
  1460. # GH#52524
  1461. df = DataFrame(
  1462. {
  1463. "a": Series([Timestamp("2019-12-31")], dtype="datetime64[s]"),
  1464. "b": Series(
  1465. [Timestamp("2019-12-31 00:00:00.123")], dtype="datetime64[ms]"
  1466. ),
  1467. }
  1468. )
  1469. result = df.max()
  1470. expected = Series(
  1471. [Timestamp("2019-12-31"), Timestamp("2019-12-31 00:00:00.123")],
  1472. dtype="datetime64[ms]",
  1473. index=["a", "b"],
  1474. )
  1475. tm.assert_series_equal(result, expected)
  1476. @td.skip_array_manager_not_yet_implemented
  1477. def test_reduction_timedelta_smallest_unit(self):
  1478. # GH#52524
  1479. df = DataFrame(
  1480. {
  1481. "a": Series([pd.Timedelta("1 days")], dtype="timedelta64[s]"),
  1482. "b": Series([pd.Timedelta("1 days")], dtype="timedelta64[ms]"),
  1483. }
  1484. )
  1485. result = df.max()
  1486. expected = Series(
  1487. [pd.Timedelta("1 days"), pd.Timedelta("1 days")],
  1488. dtype="timedelta64[ms]",
  1489. index=["a", "b"],
  1490. )
  1491. tm.assert_series_equal(result, expected)
  1492. class TestNuisanceColumns:
  1493. @pytest.mark.parametrize("method", ["any", "all"])
  1494. def test_any_all_categorical_dtype_nuisance_column(self, method):
  1495. # GH#36076 DataFrame should match Series behavior
  1496. ser = Series([0, 1], dtype="category", name="A")
  1497. df = ser.to_frame()
  1498. # Double-check the Series behavior is to raise
  1499. with pytest.raises(TypeError, match="does not support reduction"):
  1500. getattr(ser, method)()
  1501. with pytest.raises(TypeError, match="does not support reduction"):
  1502. getattr(np, method)(ser)
  1503. with pytest.raises(TypeError, match="does not support reduction"):
  1504. getattr(df, method)(bool_only=False)
  1505. with pytest.raises(TypeError, match="does not support reduction"):
  1506. getattr(df, method)(bool_only=None)
  1507. with pytest.raises(TypeError, match="does not support reduction"):
  1508. getattr(np, method)(df, axis=0)
  1509. def test_median_categorical_dtype_nuisance_column(self):
  1510. # GH#21020 DataFrame.median should match Series.median
  1511. df = DataFrame({"A": Categorical([1, 2, 2, 2, 3])})
  1512. ser = df["A"]
  1513. # Double-check the Series behavior is to raise
  1514. with pytest.raises(TypeError, match="does not support reduction"):
  1515. ser.median()
  1516. with pytest.raises(TypeError, match="does not support reduction"):
  1517. df.median(numeric_only=False)
  1518. with pytest.raises(TypeError, match="does not support reduction"):
  1519. df.median()
  1520. # same thing, but with an additional non-categorical column
  1521. df["B"] = df["A"].astype(int)
  1522. with pytest.raises(TypeError, match="does not support reduction"):
  1523. df.median(numeric_only=False)
  1524. with pytest.raises(TypeError, match="does not support reduction"):
  1525. df.median()
  1526. # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead
  1527. # of expected.values
  1528. @pytest.mark.parametrize("method", ["min", "max"])
  1529. def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
  1530. # GH#28949 DataFrame.min should behave like Series.min
  1531. cat = Categorical(["a", "b", "c", "b"], ordered=False)
  1532. ser = Series(cat)
  1533. df = ser.to_frame("A")
  1534. # Double-check the Series behavior
  1535. with pytest.raises(TypeError, match="is not ordered for operation"):
  1536. getattr(ser, method)()
  1537. with pytest.raises(TypeError, match="is not ordered for operation"):
  1538. getattr(np, method)(ser)
  1539. with pytest.raises(TypeError, match="is not ordered for operation"):
  1540. getattr(df, method)(numeric_only=False)
  1541. with pytest.raises(TypeError, match="is not ordered for operation"):
  1542. getattr(df, method)()
  1543. with pytest.raises(TypeError, match="is not ordered for operation"):
  1544. getattr(np, method)(df, axis=0)
  1545. # same thing, but with an additional non-categorical column
  1546. df["B"] = df["A"].astype(object)
  1547. with pytest.raises(TypeError, match="is not ordered for operation"):
  1548. getattr(df, method)()
  1549. with pytest.raises(TypeError, match="is not ordered for operation"):
  1550. getattr(np, method)(df, axis=0)
  1551. class TestEmptyDataFrameReductions:
  1552. @pytest.mark.parametrize(
  1553. "opname, dtype, exp_value, exp_dtype",
  1554. [
  1555. ("sum", np.int8, 0, np.int64),
  1556. ("prod", np.int8, 1, np.int_),
  1557. ("sum", np.int64, 0, np.int64),
  1558. ("prod", np.int64, 1, np.int64),
  1559. ("sum", np.uint8, 0, np.uint64),
  1560. ("prod", np.uint8, 1, np.uint),
  1561. ("sum", np.uint64, 0, np.uint64),
  1562. ("prod", np.uint64, 1, np.uint64),
  1563. ("sum", np.float32, 0, np.float32),
  1564. ("prod", np.float32, 1, np.float32),
  1565. ("sum", np.float64, 0, np.float64),
  1566. ],
  1567. )
  1568. def test_df_empty_min_count_0(self, opname, dtype, exp_value, exp_dtype):
  1569. df = DataFrame({0: [], 1: []}, dtype=dtype)
  1570. result = getattr(df, opname)(min_count=0)
  1571. expected = Series([exp_value, exp_value], dtype=exp_dtype)
  1572. tm.assert_series_equal(result, expected)
  1573. @pytest.mark.parametrize(
  1574. "opname, dtype, exp_dtype",
  1575. [
  1576. ("sum", np.int8, np.float64),
  1577. ("prod", np.int8, np.float64),
  1578. ("sum", np.int64, np.float64),
  1579. ("prod", np.int64, np.float64),
  1580. ("sum", np.uint8, np.float64),
  1581. ("prod", np.uint8, np.float64),
  1582. ("sum", np.uint64, np.float64),
  1583. ("prod", np.uint64, np.float64),
  1584. ("sum", np.float32, np.float32),
  1585. ("prod", np.float32, np.float32),
  1586. ("sum", np.float64, np.float64),
  1587. ],
  1588. )
  1589. def test_df_empty_min_count_1(self, opname, dtype, exp_dtype):
  1590. df = DataFrame({0: [], 1: []}, dtype=dtype)
  1591. result = getattr(df, opname)(min_count=1)
  1592. expected = Series([np.nan, np.nan], dtype=exp_dtype)
  1593. tm.assert_series_equal(result, expected)
  1594. @pytest.mark.parametrize(
  1595. "opname, dtype, exp_value, exp_dtype",
  1596. [
  1597. ("sum", "Int8", 0, ("Int32" if is_windows_np2_or_is32 else "Int64")),
  1598. ("prod", "Int8", 1, ("Int32" if is_windows_np2_or_is32 else "Int64")),
  1599. ("prod", "Int8", 1, ("Int32" if is_windows_np2_or_is32 else "Int64")),
  1600. ("sum", "Int64", 0, "Int64"),
  1601. ("prod", "Int64", 1, "Int64"),
  1602. ("sum", "UInt8", 0, ("UInt32" if is_windows_np2_or_is32 else "UInt64")),
  1603. ("prod", "UInt8", 1, ("UInt32" if is_windows_np2_or_is32 else "UInt64")),
  1604. ("sum", "UInt64", 0, "UInt64"),
  1605. ("prod", "UInt64", 1, "UInt64"),
  1606. ("sum", "Float32", 0, "Float32"),
  1607. ("prod", "Float32", 1, "Float32"),
  1608. ("sum", "Float64", 0, "Float64"),
  1609. ],
  1610. )
  1611. def test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype):
  1612. df = DataFrame({0: [], 1: []}, dtype=dtype)
  1613. result = getattr(df, opname)(min_count=0)
  1614. expected = Series([exp_value, exp_value], dtype=exp_dtype)
  1615. tm.assert_series_equal(result, expected)
  1616. # TODO: why does min_count=1 impact the resulting Windows dtype
  1617. # differently than min_count=0?
  1618. @pytest.mark.parametrize(
  1619. "opname, dtype, exp_dtype",
  1620. [
  1621. ("sum", "Int8", ("Int32" if is_windows_or_is32 else "Int64")),
  1622. ("prod", "Int8", ("Int32" if is_windows_or_is32 else "Int64")),
  1623. ("sum", "Int64", "Int64"),
  1624. ("prod", "Int64", "Int64"),
  1625. ("sum", "UInt8", ("UInt32" if is_windows_or_is32 else "UInt64")),
  1626. ("prod", "UInt8", ("UInt32" if is_windows_or_is32 else "UInt64")),
  1627. ("sum", "UInt64", "UInt64"),
  1628. ("prod", "UInt64", "UInt64"),
  1629. ("sum", "Float32", "Float32"),
  1630. ("prod", "Float32", "Float32"),
  1631. ("sum", "Float64", "Float64"),
  1632. ],
  1633. )
  1634. def test_df_empty_nullable_min_count_1(self, opname, dtype, exp_dtype):
  1635. df = DataFrame({0: [], 1: []}, dtype=dtype)
  1636. result = getattr(df, opname)(min_count=1)
  1637. expected = Series([pd.NA, pd.NA], dtype=exp_dtype)
  1638. tm.assert_series_equal(result, expected)
  1639. def test_sum_timedelta64_skipna_false(using_array_manager, request):
  1640. # GH#17235
  1641. if using_array_manager:
  1642. mark = pytest.mark.xfail(
  1643. reason="Incorrect type inference on NaT in reduction result"
  1644. )
  1645. request.applymarker(mark)
  1646. arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2)
  1647. arr[-1, -1] = "Nat"
  1648. df = DataFrame(arr)
  1649. assert (df.dtypes == arr.dtype).all()
  1650. result = df.sum(skipna=False)
  1651. expected = Series([pd.Timedelta(seconds=12), pd.NaT], dtype="m8[s]")
  1652. tm.assert_series_equal(result, expected)
  1653. result = df.sum(axis=0, skipna=False)
  1654. tm.assert_series_equal(result, expected)
  1655. result = df.sum(axis=1, skipna=False)
  1656. expected = Series(
  1657. [
  1658. pd.Timedelta(seconds=1),
  1659. pd.Timedelta(seconds=5),
  1660. pd.Timedelta(seconds=9),
  1661. pd.NaT,
  1662. ],
  1663. dtype="m8[s]",
  1664. )
  1665. tm.assert_series_equal(result, expected)
  1666. def test_mixed_frame_with_integer_sum():
  1667. # https://github.com/pandas-dev/pandas/issues/34520
  1668. df = DataFrame([["a", 1]], columns=list("ab"))
  1669. df = df.astype({"b": "Int64"})
  1670. result = df.sum()
  1671. expected = Series(["a", 1], index=["a", "b"])
  1672. tm.assert_series_equal(result, expected)
  1673. @pytest.mark.parametrize("numeric_only", [True, False, None])
  1674. @pytest.mark.parametrize("method", ["min", "max"])
  1675. def test_minmax_extensionarray(method, numeric_only):
  1676. # https://github.com/pandas-dev/pandas/issues/32651
  1677. int64_info = np.iinfo("int64")
  1678. ser = Series([int64_info.max, None, int64_info.min], dtype=pd.Int64Dtype())
  1679. df = DataFrame({"Int64": ser})
  1680. result = getattr(df, method)(numeric_only=numeric_only)
  1681. expected = Series(
  1682. [getattr(int64_info, method)],
  1683. dtype="Int64",
  1684. index=Index(["Int64"]),
  1685. )
  1686. tm.assert_series_equal(result, expected)
  1687. @pytest.mark.parametrize("ts_value", [Timestamp("2000-01-01"), pd.NaT])
  1688. def test_frame_mixed_numeric_object_with_timestamp(ts_value):
  1689. # GH 13912
  1690. df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]})
  1691. with pytest.raises(
  1692. TypeError, match="does not support (operation|reduction)|Cannot perform"
  1693. ):
  1694. df.sum()
  1695. def test_prod_sum_min_count_mixed_object():
  1696. # https://github.com/pandas-dev/pandas/issues/41074
  1697. df = DataFrame([1, "a", True])
  1698. result = df.prod(axis=0, min_count=1, numeric_only=False)
  1699. expected = Series(["a"], dtype=object)
  1700. tm.assert_series_equal(result, expected)
  1701. msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'")
  1702. with pytest.raises(TypeError, match=msg):
  1703. df.sum(axis=0, min_count=1, numeric_only=False)
  1704. @pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"])
  1705. @pytest.mark.parametrize("numeric_only", [True, False])
  1706. @pytest.mark.parametrize("dtype", ["float64", "Float64"])
  1707. def test_reduction_axis_none_returns_scalar(method, numeric_only, dtype):
  1708. # GH#21597 As of 2.0, axis=None reduces over all axes.
  1709. df = DataFrame(np.random.default_rng(2).standard_normal((4, 4)), dtype=dtype)
  1710. result = getattr(df, method)(axis=None, numeric_only=numeric_only)
  1711. np_arr = df.to_numpy(dtype=np.float64)
  1712. if method in {"skew", "kurt"}:
  1713. comp_mod = pytest.importorskip("scipy.stats")
  1714. if method == "kurt":
  1715. method = "kurtosis"
  1716. expected = getattr(comp_mod, method)(np_arr, bias=False, axis=None)
  1717. tm.assert_almost_equal(result, expected)
  1718. else:
  1719. expected = getattr(np, method)(np_arr, axis=None)
  1720. assert result == expected
  1721. @pytest.mark.parametrize(
  1722. "kernel",
  1723. [
  1724. "corr",
  1725. "corrwith",
  1726. "cov",
  1727. "idxmax",
  1728. "idxmin",
  1729. "kurt",
  1730. "max",
  1731. "mean",
  1732. "median",
  1733. "min",
  1734. "prod",
  1735. "quantile",
  1736. "sem",
  1737. "skew",
  1738. "std",
  1739. "sum",
  1740. "var",
  1741. ],
  1742. )
  1743. def test_fails_on_non_numeric(kernel):
  1744. # GH#46852
  1745. df = DataFrame({"a": [1, 2, 3], "b": object})
  1746. args = (df,) if kernel == "corrwith" else ()
  1747. msg = "|".join(
  1748. [
  1749. "not allowed for this dtype",
  1750. "argument must be a string or a number",
  1751. "not supported between instances of",
  1752. "unsupported operand type",
  1753. "argument must be a string or a real number",
  1754. ]
  1755. )
  1756. if kernel == "median":
  1757. # slightly different message on different builds
  1758. msg1 = (
  1759. r"Cannot convert \[\[<class 'object'> <class 'object'> "
  1760. r"<class 'object'>\]\] to numeric"
  1761. )
  1762. msg2 = (
  1763. r"Cannot convert \[<class 'object'> <class 'object'> "
  1764. r"<class 'object'>\] to numeric"
  1765. )
  1766. msg = "|".join([msg1, msg2])
  1767. with pytest.raises(TypeError, match=msg):
  1768. getattr(df, kernel)(*args)
  1769. @pytest.mark.parametrize(
  1770. "method",
  1771. [
  1772. "all",
  1773. "any",
  1774. "count",
  1775. "idxmax",
  1776. "idxmin",
  1777. "kurt",
  1778. "kurtosis",
  1779. "max",
  1780. "mean",
  1781. "median",
  1782. "min",
  1783. "nunique",
  1784. "prod",
  1785. "product",
  1786. "sem",
  1787. "skew",
  1788. "std",
  1789. "sum",
  1790. "var",
  1791. ],
  1792. )
  1793. @pytest.mark.parametrize("min_count", [0, 2])
  1794. def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype):
  1795. # GH 54341
  1796. df = DataFrame(
  1797. {
  1798. "a": Series([0, 1, 2, 3], dtype=any_numeric_ea_dtype),
  1799. "b": Series([0, 1, pd.NA, 3], dtype=any_numeric_ea_dtype),
  1800. },
  1801. )
  1802. expected_df = DataFrame(
  1803. {
  1804. "a": [0.0, 1.0, 2.0, 3.0],
  1805. "b": [0.0, 1.0, np.nan, 3.0],
  1806. },
  1807. )
  1808. if method in ("count", "nunique"):
  1809. expected_dtype = "int64"
  1810. elif method in ("all", "any"):
  1811. expected_dtype = "boolean"
  1812. elif method in (
  1813. "kurt",
  1814. "kurtosis",
  1815. "mean",
  1816. "median",
  1817. "sem",
  1818. "skew",
  1819. "std",
  1820. "var",
  1821. ) and not any_numeric_ea_dtype.startswith("Float"):
  1822. expected_dtype = "Float64"
  1823. else:
  1824. expected_dtype = any_numeric_ea_dtype
  1825. kwargs = {}
  1826. if method not in ("count", "nunique", "quantile"):
  1827. kwargs["skipna"] = skipna
  1828. if method in ("prod", "product", "sum"):
  1829. kwargs["min_count"] = min_count
  1830. warn = None
  1831. msg = None
  1832. if not skipna and method in ("idxmax", "idxmin"):
  1833. warn = FutureWarning
  1834. msg = f"The behavior of DataFrame.{method} with all-NA values"
  1835. with tm.assert_produces_warning(warn, match=msg):
  1836. result = getattr(df, method)(axis=1, **kwargs)
  1837. with tm.assert_produces_warning(warn, match=msg):
  1838. expected = getattr(expected_df, method)(axis=1, **kwargs)
  1839. if method not in ("idxmax", "idxmin"):
  1840. expected = expected.astype(expected_dtype)
  1841. tm.assert_series_equal(result, expected)