nanops.py 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748
  1. from __future__ import annotations
  2. import functools
  3. import itertools
  4. from typing import (
  5. Any,
  6. Callable,
  7. cast,
  8. )
  9. import warnings
  10. import numpy as np
  11. from pandas._config import get_option
  12. from pandas._libs import (
  13. NaT,
  14. NaTType,
  15. iNaT,
  16. lib,
  17. )
  18. from pandas._typing import (
  19. ArrayLike,
  20. AxisInt,
  21. CorrelationMethod,
  22. Dtype,
  23. DtypeObj,
  24. F,
  25. Scalar,
  26. Shape,
  27. npt,
  28. )
  29. from pandas.compat._optional import import_optional_dependency
  30. from pandas.util._exceptions import find_stack_level
  31. from pandas.core.dtypes.common import (
  32. is_complex,
  33. is_float,
  34. is_float_dtype,
  35. is_integer,
  36. is_numeric_dtype,
  37. is_object_dtype,
  38. needs_i8_conversion,
  39. pandas_dtype,
  40. )
  41. from pandas.core.dtypes.missing import (
  42. isna,
  43. na_value_for_dtype,
  44. notna,
  45. )
# Optional acceleration backend.  ``bn`` is whatever the optional-import helper
# returns; it is compared against None below to decide whether it is usable.
bn = import_optional_dependency("bottleneck", errors="warn")
# True only when the helper above handed back something other than None.
_BOTTLENECK_INSTALLED = bn is not None
# Module-wide switch read by ``bottleneck_switch``; starts off and is only
# changed through ``set_use_bottleneck``.
_USE_BOTTLENECK = False
  49. def set_use_bottleneck(v: bool = True) -> None:
  50. # set/unset to use bottleneck
  51. global _USE_BOTTLENECK
  52. if _BOTTLENECK_INSTALLED:
  53. _USE_BOTTLENECK = v
# Initialise the flag at import time from the "compute.use_bottleneck" option.
set_use_bottleneck(get_option("compute.use_bottleneck"))
  55. class disallow:
  56. def __init__(self, *dtypes: Dtype) -> None:
  57. super().__init__()
  58. self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes)
  59. def check(self, obj) -> bool:
  60. return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes)
  61. def __call__(self, f: F) -> F:
  62. @functools.wraps(f)
  63. def _f(*args, **kwargs):
  64. obj_iter = itertools.chain(args, kwargs.values())
  65. if any(self.check(obj) for obj in obj_iter):
  66. f_name = f.__name__.replace("nan", "")
  67. raise TypeError(
  68. f"reduction operation '{f_name}' not allowed for this dtype"
  69. )
  70. try:
  71. return f(*args, **kwargs)
  72. except ValueError as e:
  73. # we want to transform an object array
  74. # ValueError message to the more typical TypeError
  75. # e.g. this is normally a disallowed function on
  76. # object arrays that contain strings
  77. if is_object_dtype(args[0]):
  78. raise TypeError(e) from e
  79. raise
  80. return cast(F, _f)
  81. class bottleneck_switch:
  82. def __init__(self, name=None, **kwargs) -> None:
  83. self.name = name
  84. self.kwargs = kwargs
  85. def __call__(self, alt: F) -> F:
  86. bn_name = self.name or alt.__name__
  87. try:
  88. bn_func = getattr(bn, bn_name)
  89. except (AttributeError, NameError): # pragma: no cover
  90. bn_func = None
  91. @functools.wraps(alt)
  92. def f(
  93. values: np.ndarray,
  94. *,
  95. axis: AxisInt | None = None,
  96. skipna: bool = True,
  97. **kwds,
  98. ):
  99. if len(self.kwargs) > 0:
  100. for k, v in self.kwargs.items():
  101. if k not in kwds:
  102. kwds[k] = v
  103. if values.size == 0 and kwds.get("min_count") is None:
  104. # We are empty, returning NA for our type
  105. # Only applies for the default `min_count` of None
  106. # since that affects how empty arrays are handled.
  107. # TODO(GH-18976) update all the nanops methods to
  108. # correctly handle empty inputs and remove this check.
  109. # It *may* just be `var`
  110. return _na_for_min_count(values, axis)
  111. if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name):
  112. if kwds.get("mask", None) is None:
  113. # `mask` is not recognised by bottleneck, would raise
  114. # TypeError if called
  115. kwds.pop("mask", None)
  116. result = bn_func(values, axis=axis, **kwds)
  117. # prefer to treat inf/-inf as NA, but must compute the func
  118. # twice :(
  119. if _has_infs(result):
  120. result = alt(values, axis=axis, skipna=skipna, **kwds)
  121. else:
  122. result = alt(values, axis=axis, skipna=skipna, **kwds)
  123. else:
  124. result = alt(values, axis=axis, skipna=skipna, **kwds)
  125. return result
  126. return cast(F, f)
  127. def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
  128. # Bottleneck chokes on datetime64, PeriodDtype (or and EA)
  129. if dtype != object and not needs_i8_conversion(dtype):
  130. # GH 42878
  131. # Bottleneck uses naive summation leading to O(n) loss of precision
  132. # unlike numpy which implements pairwise summation, which has O(log(n)) loss
  133. # crossref: https://github.com/pydata/bottleneck/issues/379
  134. # GH 15507
  135. # bottleneck does not properly upcast during the sum
  136. # so can overflow
  137. # GH 9422
  138. # further we also want to preserve NaN when all elements
  139. # are NaN, unlike bottleneck/numpy which consider this
  140. # to be 0
  141. return name not in ["nansum", "nanprod", "nanmean"]
  142. return False
  143. def _has_infs(result) -> bool:
  144. if isinstance(result, np.ndarray):
  145. if result.dtype in ("f8", "f4"):
  146. # Note: outside of an nanops-specific test, we always have
  147. # result.ndim == 1, so there is no risk of this ravel making a copy.
  148. return lib.has_infs(result.ravel("K"))
  149. try:
  150. return np.isinf(result).any()
  151. except (TypeError, NotImplementedError):
  152. # if it doesn't support infs, then it can't have infs
  153. return False
  154. def _get_fill_value(
  155. dtype: DtypeObj, fill_value: Scalar | None = None, fill_value_typ=None
  156. ):
  157. """return the correct fill value for the dtype of the values"""
  158. if fill_value is not None:
  159. return fill_value
  160. if _na_ok_dtype(dtype):
  161. if fill_value_typ is None:
  162. return np.nan
  163. else:
  164. if fill_value_typ == "+inf":
  165. return np.inf
  166. else:
  167. return -np.inf
  168. else:
  169. if fill_value_typ == "+inf":
  170. # need the max int here
  171. return lib.i8max
  172. else:
  173. return iNaT
  174. def _maybe_get_mask(
  175. values: np.ndarray, skipna: bool, mask: npt.NDArray[np.bool_] | None
  176. ) -> npt.NDArray[np.bool_] | None:
  177. """
  178. Compute a mask if and only if necessary.
  179. This function will compute a mask iff it is necessary. Otherwise,
  180. return the provided mask (potentially None) when a mask does not need to be
  181. computed.
  182. A mask is never necessary if the values array is of boolean or integer
  183. dtypes, as these are incapable of storing NaNs. If passing a NaN-capable
  184. dtype that is interpretable as either boolean or integer data (eg,
  185. timedelta64), a mask must be provided.
  186. If the skipna parameter is False, a new mask will not be computed.
  187. The mask is computed using isna() by default. Setting invert=True selects
  188. notna() as the masking function.
  189. Parameters
  190. ----------
  191. values : ndarray
  192. input array to potentially compute mask for
  193. skipna : bool
  194. boolean for whether NaNs should be skipped
  195. mask : Optional[ndarray]
  196. nan-mask if known
  197. Returns
  198. -------
  199. Optional[np.ndarray[bool]]
  200. """
  201. if mask is None:
  202. if values.dtype.kind in "biu":
  203. # Boolean data cannot contain nulls, so signal via mask being None
  204. return None
  205. if skipna or values.dtype.kind in "mM":
  206. mask = isna(values)
  207. return mask
  208. def _get_values(
  209. values: np.ndarray,
  210. skipna: bool,
  211. fill_value: Any = None,
  212. fill_value_typ: str | None = None,
  213. mask: npt.NDArray[np.bool_] | None = None,
  214. ) -> tuple[np.ndarray, npt.NDArray[np.bool_] | None]:
  215. """
  216. Utility to get the values view, mask, dtype, dtype_max, and fill_value.
  217. If both mask and fill_value/fill_value_typ are not None and skipna is True,
  218. the values array will be copied.
  219. For input arrays of boolean or integer dtypes, copies will only occur if a
  220. precomputed mask, a fill_value/fill_value_typ, and skipna=True are
  221. provided.
  222. Parameters
  223. ----------
  224. values : ndarray
  225. input array to potentially compute mask for
  226. skipna : bool
  227. boolean for whether NaNs should be skipped
  228. fill_value : Any
  229. value to fill NaNs with
  230. fill_value_typ : str
  231. Set to '+inf' or '-inf' to handle dtype-specific infinities
  232. mask : Optional[np.ndarray[bool]]
  233. nan-mask if known
  234. Returns
  235. -------
  236. values : ndarray
  237. Potential copy of input value array
  238. mask : Optional[ndarray[bool]]
  239. Mask for values, if deemed necessary to compute
  240. """
  241. # In _get_values is only called from within nanops, and in all cases
  242. # with scalar fill_value. This guarantee is important for the
  243. # np.where call below
  244. mask = _maybe_get_mask(values, skipna, mask)
  245. dtype = values.dtype
  246. datetimelike = False
  247. if values.dtype.kind in "mM":
  248. # changing timedelta64/datetime64 to int64 needs to happen after
  249. # finding `mask` above
  250. values = np.asarray(values.view("i8"))
  251. datetimelike = True
  252. if skipna and (mask is not None):
  253. # get our fill value (in case we need to provide an alternative
  254. # dtype for it)
  255. fill_value = _get_fill_value(
  256. dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
  257. )
  258. if fill_value is not None:
  259. if mask.any():
  260. if datetimelike or _na_ok_dtype(dtype):
  261. values = values.copy()
  262. np.putmask(values, mask, fill_value)
  263. else:
  264. # np.where will promote if needed
  265. values = np.where(~mask, values, fill_value)
  266. return values, mask
  267. def _get_dtype_max(dtype: np.dtype) -> np.dtype:
  268. # return a platform independent precision dtype
  269. dtype_max = dtype
  270. if dtype.kind in "bi":
  271. dtype_max = np.dtype(np.int64)
  272. elif dtype.kind == "u":
  273. dtype_max = np.dtype(np.uint64)
  274. elif dtype.kind == "f":
  275. dtype_max = np.dtype(np.float64)
  276. return dtype_max
  277. def _na_ok_dtype(dtype: DtypeObj) -> bool:
  278. if needs_i8_conversion(dtype):
  279. return False
  280. return not issubclass(dtype.type, np.integer)
def _wrap_results(result, dtype: np.dtype, fill_value=None):
    """
    Cast a reduction result back to datetime64/timedelta64 when required.

    Parameters
    ----------
    result : scalar or ndarray
        Raw output of a reduction.
    dtype : np.dtype
        Dtype of the values that were reduced.  Only kinds "M" and "m"
        trigger a conversion; anything else is returned unchanged.
    fill_value : scalar, optional
        Sentinel that marks a missing scalar result.  For datetime64 it
        defaults to ``iNaT`` when not given.

    Returns
    -------
    scalar or ndarray

    Raises
    ------
    ValueError
        If a scalar timedelta result is larger in magnitude than ``lib.i8max``.
    """
    if result is NaT:
        # already the missing-value singleton, nothing to convert
        pass

    elif dtype.kind == "M":
        if fill_value is None:
            # GH#24293
            fill_value = iNaT
        if not isinstance(result, np.ndarray):
            assert not isna(fill_value), "Expected non-null fill_value"
            # a scalar equal to the sentinel means "missing"
            if result == fill_value:
                result = np.nan

            if isna(result):
                result = np.datetime64("NaT", "ns").astype(dtype)
            else:
                result = np.int64(result).view(dtype)
            # retain original unit
            result = result.astype(dtype, copy=False)
        else:
            # If we have float dtype, taking a view will give the wrong result
            result = result.astype(dtype)
    elif dtype.kind == "m":
        if not isinstance(result, np.ndarray):
            if result == fill_value or np.isnan(result):
                result = np.timedelta64("NaT").astype(dtype)

            elif np.fabs(result) > lib.i8max:
                # raise if we have a timedelta64[ns] which is too large
                raise ValueError("overflow in timedelta operation")
            else:
                # return a timedelta64 with the original unit
                result = np.int64(result).astype(dtype, copy=False)

        else:
            # array results go through m8[ns] before being viewed as `dtype`
            result = result.astype("m8[ns]").view(dtype)

    return result
  315. def _datetimelike_compat(func: F) -> F:
  316. """
  317. If we have datetime64 or timedelta64 values, ensure we have a correct
  318. mask before calling the wrapped function, then cast back afterwards.
  319. """
  320. @functools.wraps(func)
  321. def new_func(
  322. values: np.ndarray,
  323. *,
  324. axis: AxisInt | None = None,
  325. skipna: bool = True,
  326. mask: npt.NDArray[np.bool_] | None = None,
  327. **kwargs,
  328. ):
  329. orig_values = values
  330. datetimelike = values.dtype.kind in "mM"
  331. if datetimelike and mask is None:
  332. mask = isna(values)
  333. result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
  334. if datetimelike:
  335. result = _wrap_results(result, orig_values.dtype, fill_value=iNaT)
  336. if not skipna:
  337. assert mask is not None # checked above
  338. result = _mask_datetimelike_result(result, axis, mask, orig_values)
  339. return result
  340. return cast(F, new_func)
  341. def _na_for_min_count(values: np.ndarray, axis: AxisInt | None) -> Scalar | np.ndarray:
  342. """
  343. Return the missing value for `values`.
  344. Parameters
  345. ----------
  346. values : ndarray
  347. axis : int or None
  348. axis for the reduction, required if values.ndim > 1.
  349. Returns
  350. -------
  351. result : scalar or ndarray
  352. For 1-D values, returns a scalar of the correct missing type.
  353. For 2-D values, returns a 1-D array where each element is missing.
  354. """
  355. # we either return np.nan or pd.NaT
  356. if values.dtype.kind in "iufcb":
  357. values = values.astype("float64")
  358. fill_value = na_value_for_dtype(values.dtype)
  359. if values.ndim == 1:
  360. return fill_value
  361. elif axis is None:
  362. return fill_value
  363. else:
  364. result_shape = values.shape[:axis] + values.shape[axis + 1 :]
  365. return np.full(result_shape, fill_value, dtype=values.dtype)
  366. def maybe_operate_rowwise(func: F) -> F:
  367. """
  368. NumPy operations on C-contiguous ndarrays with axis=1 can be
  369. very slow if axis 1 >> axis 0.
  370. Operate row-by-row and concatenate the results.
  371. """
  372. @functools.wraps(func)
  373. def newfunc(values: np.ndarray, *, axis: AxisInt | None = None, **kwargs):
  374. if (
  375. axis == 1
  376. and values.ndim == 2
  377. and values.flags["C_CONTIGUOUS"]
  378. # only takes this path for wide arrays (long dataframes), for threshold see
  379. # https://github.com/pandas-dev/pandas/pull/43311#issuecomment-974891737
  380. and (values.shape[1] / 1000) > values.shape[0]
  381. and values.dtype != object
  382. and values.dtype != bool
  383. ):
  384. arrs = list(values)
  385. if kwargs.get("mask") is not None:
  386. mask = kwargs.pop("mask")
  387. results = [
  388. func(arrs[i], mask=mask[i], **kwargs) for i in range(len(arrs))
  389. ]
  390. else:
  391. results = [func(x, **kwargs) for x in arrs]
  392. return np.array(results)
  393. return func(values, axis=axis, **kwargs)
  394. return cast(F, newfunc)
  395. def nanany(
  396. values: np.ndarray,
  397. *,
  398. axis: AxisInt | None = None,
  399. skipna: bool = True,
  400. mask: npt.NDArray[np.bool_] | None = None,
  401. ) -> bool:
  402. """
  403. Check if any elements along an axis evaluate to True.
  404. Parameters
  405. ----------
  406. values : ndarray
  407. axis : int, optional
  408. skipna : bool, default True
  409. mask : ndarray[bool], optional
  410. nan-mask if known
  411. Returns
  412. -------
  413. result : bool
  414. Examples
  415. --------
  416. >>> from pandas.core import nanops
  417. >>> s = pd.Series([1, 2])
  418. >>> nanops.nanany(s.values)
  419. True
  420. >>> from pandas.core import nanops
  421. >>> s = pd.Series([np.nan])
  422. >>> nanops.nanany(s.values)
  423. False
  424. """
  425. if values.dtype.kind in "iub" and mask is None:
  426. # GH#26032 fastpath
  427. # error: Incompatible return value type (got "Union[bool_, ndarray]",
  428. # expected "bool")
  429. return values.any(axis) # type: ignore[return-value]
  430. if values.dtype.kind == "M":
  431. # GH#34479
  432. warnings.warn(
  433. "'any' with datetime64 dtypes is deprecated and will raise in a "
  434. "future version. Use (obj != pd.Timestamp(0)).any() instead.",
  435. FutureWarning,
  436. stacklevel=find_stack_level(),
  437. )
  438. values, _ = _get_values(values, skipna, fill_value=False, mask=mask)
  439. # For object type, any won't necessarily return
  440. # boolean values (numpy/numpy#4352)
  441. if values.dtype == object:
  442. values = values.astype(bool)
  443. # error: Incompatible return value type (got "Union[bool_, ndarray]", expected
  444. # "bool")
  445. return values.any(axis) # type: ignore[return-value]
  446. def nanall(
  447. values: np.ndarray,
  448. *,
  449. axis: AxisInt | None = None,
  450. skipna: bool = True,
  451. mask: npt.NDArray[np.bool_] | None = None,
  452. ) -> bool:
  453. """
  454. Check if all elements along an axis evaluate to True.
  455. Parameters
  456. ----------
  457. values : ndarray
  458. axis : int, optional
  459. skipna : bool, default True
  460. mask : ndarray[bool], optional
  461. nan-mask if known
  462. Returns
  463. -------
  464. result : bool
  465. Examples
  466. --------
  467. >>> from pandas.core import nanops
  468. >>> s = pd.Series([1, 2, np.nan])
  469. >>> nanops.nanall(s.values)
  470. True
  471. >>> from pandas.core import nanops
  472. >>> s = pd.Series([1, 0])
  473. >>> nanops.nanall(s.values)
  474. False
  475. """
  476. if values.dtype.kind in "iub" and mask is None:
  477. # GH#26032 fastpath
  478. # error: Incompatible return value type (got "Union[bool_, ndarray]",
  479. # expected "bool")
  480. return values.all(axis) # type: ignore[return-value]
  481. if values.dtype.kind == "M":
  482. # GH#34479
  483. warnings.warn(
  484. "'all' with datetime64 dtypes is deprecated and will raise in a "
  485. "future version. Use (obj != pd.Timestamp(0)).all() instead.",
  486. FutureWarning,
  487. stacklevel=find_stack_level(),
  488. )
  489. values, _ = _get_values(values, skipna, fill_value=True, mask=mask)
  490. # For object type, all won't necessarily return
  491. # boolean values (numpy/numpy#4352)
  492. if values.dtype == object:
  493. values = values.astype(bool)
  494. # error: Incompatible return value type (got "Union[bool_, ndarray]", expected
  495. # "bool")
  496. return values.all(axis) # type: ignore[return-value]
  497. @disallow("M8")
  498. @_datetimelike_compat
  499. @maybe_operate_rowwise
  500. def nansum(
  501. values: np.ndarray,
  502. *,
  503. axis: AxisInt | None = None,
  504. skipna: bool = True,
  505. min_count: int = 0,
  506. mask: npt.NDArray[np.bool_] | None = None,
  507. ) -> float:
  508. """
  509. Sum the elements along an axis ignoring NaNs
  510. Parameters
  511. ----------
  512. values : ndarray[dtype]
  513. axis : int, optional
  514. skipna : bool, default True
  515. min_count: int, default 0
  516. mask : ndarray[bool], optional
  517. nan-mask if known
  518. Returns
  519. -------
  520. result : dtype
  521. Examples
  522. --------
  523. >>> from pandas.core import nanops
  524. >>> s = pd.Series([1, 2, np.nan])
  525. >>> nanops.nansum(s.values)
  526. 3.0
  527. """
  528. dtype = values.dtype
  529. values, mask = _get_values(values, skipna, fill_value=0, mask=mask)
  530. dtype_sum = _get_dtype_max(dtype)
  531. if dtype.kind == "f":
  532. dtype_sum = dtype
  533. elif dtype.kind == "m":
  534. dtype_sum = np.dtype(np.float64)
  535. the_sum = values.sum(axis, dtype=dtype_sum)
  536. the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)
  537. return the_sum
  538. def _mask_datetimelike_result(
  539. result: np.ndarray | np.datetime64 | np.timedelta64,
  540. axis: AxisInt | None,
  541. mask: npt.NDArray[np.bool_],
  542. orig_values: np.ndarray,
  543. ) -> np.ndarray | np.datetime64 | np.timedelta64 | NaTType:
  544. if isinstance(result, np.ndarray):
  545. # we need to apply the mask
  546. result = result.astype("i8").view(orig_values.dtype)
  547. axis_mask = mask.any(axis=axis)
  548. # error: Unsupported target for indexed assignment ("Union[ndarray[Any, Any],
  549. # datetime64, timedelta64]")
  550. result[axis_mask] = iNaT # type: ignore[index]
  551. else:
  552. if mask.any():
  553. return np.int64(iNaT).view(orig_values.dtype)
  554. return result
  555. @bottleneck_switch()
  556. @_datetimelike_compat
  557. def nanmean(
  558. values: np.ndarray,
  559. *,
  560. axis: AxisInt | None = None,
  561. skipna: bool = True,
  562. mask: npt.NDArray[np.bool_] | None = None,
  563. ) -> float:
  564. """
  565. Compute the mean of the element along an axis ignoring NaNs
  566. Parameters
  567. ----------
  568. values : ndarray
  569. axis : int, optional
  570. skipna : bool, default True
  571. mask : ndarray[bool], optional
  572. nan-mask if known
  573. Returns
  574. -------
  575. float
  576. Unless input is a float array, in which case use the same
  577. precision as the input array.
  578. Examples
  579. --------
  580. >>> from pandas.core import nanops
  581. >>> s = pd.Series([1, 2, np.nan])
  582. >>> nanops.nanmean(s.values)
  583. 1.5
  584. """
  585. dtype = values.dtype
  586. values, mask = _get_values(values, skipna, fill_value=0, mask=mask)
  587. dtype_sum = _get_dtype_max(dtype)
  588. dtype_count = np.dtype(np.float64)
  589. # not using needs_i8_conversion because that includes period
  590. if dtype.kind in "mM":
  591. dtype_sum = np.dtype(np.float64)
  592. elif dtype.kind in "iu":
  593. dtype_sum = np.dtype(np.float64)
  594. elif dtype.kind == "f":
  595. dtype_sum = dtype
  596. dtype_count = dtype
  597. count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
  598. the_sum = values.sum(axis, dtype=dtype_sum)
  599. the_sum = _ensure_numeric(the_sum)
  600. if axis is not None and getattr(the_sum, "ndim", False):
  601. count = cast(np.ndarray, count)
  602. with np.errstate(all="ignore"):
  603. # suppress division by zero warnings
  604. the_mean = the_sum / count
  605. ct_mask = count == 0
  606. if ct_mask.any():
  607. the_mean[ct_mask] = np.nan
  608. else:
  609. the_mean = the_sum / count if count > 0 else np.nan
  610. return the_mean
@bottleneck_switch()
def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask=None):
    """
    Compute the median along an axis, treating masked/NaN entries as missing.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
        When False, a slice containing any missing entry yields NaN.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Raises
    ------
    TypeError
        If object-dtype values are inferred as "string" or "mixed", or if
        the values cannot be cast to float64.

    Examples
    --------
    >>> from pandas.core import nanops
    >>> s = pd.Series([1, np.nan, 2, 2])
    >>> nanops.nanmedian(s.values)
    2.0
    """
    # for floats without mask, the data already uses NaN as missing value
    # indicator, and `mask` will be calculated from that below -> in those
    # cases we never need to set NaN to the masked values
    using_nan_sentinel = values.dtype.kind == "f" and mask is None

    def get_median(x, _mask=None):
        # `_mask` arrives as a "missing" mask; flip it so that it selects
        # the entries that are present
        if _mask is None:
            _mask = notna(x)
        else:
            _mask = ~_mask
        if not skipna and not _mask.all():
            return np.nan
        with warnings.catch_warnings():
            # Suppress RuntimeWarning about All-NaN slice
            warnings.filterwarnings(
                "ignore", "All-NaN slice encountered", RuntimeWarning
            )
            res = np.nanmedian(x[_mask])
        return res

    # remember the input dtype for the final `_wrap_results` call
    dtype = values.dtype
    values, mask = _get_values(values, skipna, mask=mask, fill_value=None)
    if values.dtype.kind != "f":
        if values.dtype == object:
            # GH#34671 avoid casting strings to numeric
            inferred = lib.infer_dtype(values)
            if inferred in ["string", "mixed"]:
                raise TypeError(f"Cannot convert {values} to numeric")
        try:
            values = values.astype("f8")
        except ValueError as err:
            # e.g. "could not convert string to float: 'a'"
            raise TypeError(str(err)) from err
    if not using_nan_sentinel and mask is not None:
        # write NaN into the masked slots; copy first if the array is read-only
        if not values.flags.writeable:
            values = values.copy()
        values[mask] = np.nan

    # used only for its truthiness: zero means the array is empty
    notempty = values.size

    # an array from a frame
    if values.ndim > 1 and axis is not None:
        # there's a non-empty array to apply over otherwise numpy raises
        if notempty:
            if not skipna:
                res = np.apply_along_axis(get_median, axis, values)

            else:
                # fastpath for the skipna case
                with warnings.catch_warnings():
                    # Suppress RuntimeWarning about All-NaN slice
                    warnings.filterwarnings(
                        "ignore", "All-NaN slice encountered", RuntimeWarning
                    )
                    if (values.shape[1] == 1 and axis == 0) or (
                        values.shape[0] == 1 and axis == 1
                    ):
                        # GH52788: fastpath when squeezable, nanmedian for 2D array slow
                        res = np.nanmedian(np.squeeze(values), keepdims=True)
                    else:
                        res = np.nanmedian(values, axis=axis)

        else:
            # must return the correct shape, but median is not defined for the
            # empty set so return nans of shape "everything but the passed axis"
            # since "axis" is where the reduction would occur if we had a nonempty
            # array
            res = _get_empty_reduction_result(values.shape, axis)

    else:
        # otherwise return a scalar value
        res = get_median(values, mask) if notempty else np.nan
    return _wrap_results(res, dtype)
  699. def _get_empty_reduction_result(
  700. shape: Shape,
  701. axis: AxisInt,
  702. ) -> np.ndarray:
  703. """
  704. The result from a reduction on an empty ndarray.
  705. Parameters
  706. ----------
  707. shape : Tuple[int, ...]
  708. axis : int
  709. Returns
  710. -------
  711. np.ndarray
  712. """
  713. shp = np.array(shape)
  714. dims = np.arange(len(shape))
  715. ret = np.empty(shp[dims != axis], dtype=np.float64)
  716. ret.fill(np.nan)
  717. return ret
  718. def _get_counts_nanvar(
  719. values_shape: Shape,
  720. mask: npt.NDArray[np.bool_] | None,
  721. axis: AxisInt | None,
  722. ddof: int,
  723. dtype: np.dtype = np.dtype(np.float64),
  724. ) -> tuple[float | np.ndarray, float | np.ndarray]:
  725. """
  726. Get the count of non-null values along an axis, accounting
  727. for degrees of freedom.
  728. Parameters
  729. ----------
  730. values_shape : Tuple[int, ...]
  731. shape tuple from values ndarray, used if mask is None
  732. mask : Optional[ndarray[bool]]
  733. locations in values that should be considered missing
  734. axis : Optional[int]
  735. axis to count along
  736. ddof : int
  737. degrees of freedom
  738. dtype : type, optional
  739. type to use for count
  740. Returns
  741. -------
  742. count : int, np.nan or np.ndarray
  743. d : int, np.nan or np.ndarray
  744. """
  745. count = _get_counts(values_shape, mask, axis, dtype=dtype)
  746. d = count - dtype.type(ddof)
  747. # always return NaN, never inf
  748. if is_float(count):
  749. if count <= ddof:
  750. # error: Incompatible types in assignment (expression has type
  751. # "float", variable has type "Union[floating[Any], ndarray[Any,
  752. # dtype[floating[Any]]]]")
  753. count = np.nan # type: ignore[assignment]
  754. d = np.nan
  755. else:
  756. # count is not narrowed by is_float check
  757. count = cast(np.ndarray, count)
  758. mask = count <= ddof
  759. if mask.any():
  760. np.putmask(d, mask, np.nan)
  761. np.putmask(count, mask, np.nan)
  762. return count, d
  763. @bottleneck_switch(ddof=1)
  764. def nanstd(
  765. values,
  766. *,
  767. axis: AxisInt | None = None,
  768. skipna: bool = True,
  769. ddof: int = 1,
  770. mask=None,
  771. ):
  772. """
  773. Compute the standard deviation along given axis while ignoring NaNs
  774. Parameters
  775. ----------
  776. values : ndarray
  777. axis : int, optional
  778. skipna : bool, default True
  779. ddof : int, default 1
  780. Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
  781. where N represents the number of elements.
  782. mask : ndarray[bool], optional
  783. nan-mask if known
  784. Returns
  785. -------
  786. result : float
  787. Unless input is a float array, in which case use the same
  788. precision as the input array.
  789. Examples
  790. --------
  791. >>> from pandas.core import nanops
  792. >>> s = pd.Series([1, np.nan, 2, 3])
  793. >>> nanops.nanstd(s.values)
  794. 1.0
  795. """
  796. if values.dtype == "M8[ns]":
  797. values = values.view("m8[ns]")
  798. orig_dtype = values.dtype
  799. values, mask = _get_values(values, skipna, mask=mask)
  800. result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask))
  801. return _wrap_results(result, orig_dtype)
  802. @disallow("M8", "m8")
  803. @bottleneck_switch(ddof=1)
  804. def nanvar(
  805. values: np.ndarray,
  806. *,
  807. axis: AxisInt | None = None,
  808. skipna: bool = True,
  809. ddof: int = 1,
  810. mask=None,
  811. ):
  812. """
  813. Compute the variance along given axis while ignoring NaNs
  814. Parameters
  815. ----------
  816. values : ndarray
  817. axis : int, optional
  818. skipna : bool, default True
  819. ddof : int, default 1
  820. Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
  821. where N represents the number of elements.
  822. mask : ndarray[bool], optional
  823. nan-mask if known
  824. Returns
  825. -------
  826. result : float
  827. Unless input is a float array, in which case use the same
  828. precision as the input array.
  829. Examples
  830. --------
  831. >>> from pandas.core import nanops
  832. >>> s = pd.Series([1, np.nan, 2, 3])
  833. >>> nanops.nanvar(s.values)
  834. 1.0
  835. """
  836. dtype = values.dtype
  837. mask = _maybe_get_mask(values, skipna, mask)
  838. if dtype.kind in "iu":
  839. values = values.astype("f8")
  840. if mask is not None:
  841. values[mask] = np.nan
  842. if values.dtype.kind == "f":
  843. count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
  844. else:
  845. count, d = _get_counts_nanvar(values.shape, mask, axis, ddof)
  846. if skipna and mask is not None:
  847. values = values.copy()
  848. np.putmask(values, mask, 0)
  849. # xref GH10242
  850. # Compute variance via two-pass algorithm, which is stable against
  851. # cancellation errors and relatively accurate for small numbers of
  852. # observations.
  853. #
  854. # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
  855. avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
  856. if axis is not None:
  857. avg = np.expand_dims(avg, axis)
  858. sqr = _ensure_numeric((avg - values) ** 2)
  859. if mask is not None:
  860. np.putmask(sqr, mask, 0)
  861. result = sqr.sum(axis=axis, dtype=np.float64) / d
  862. # Return variance as np.float64 (the datatype used in the accumulator),
  863. # unless we were dealing with a float array, in which case use the same
  864. # precision as the original values array.
  865. if dtype.kind == "f":
  866. result = result.astype(dtype, copy=False)
  867. return result
  868. @disallow("M8", "m8")
  869. def nansem(
  870. values: np.ndarray,
  871. *,
  872. axis: AxisInt | None = None,
  873. skipna: bool = True,
  874. ddof: int = 1,
  875. mask: npt.NDArray[np.bool_] | None = None,
  876. ) -> float:
  877. """
  878. Compute the standard error in the mean along given axis while ignoring NaNs
  879. Parameters
  880. ----------
  881. values : ndarray
  882. axis : int, optional
  883. skipna : bool, default True
  884. ddof : int, default 1
  885. Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
  886. where N represents the number of elements.
  887. mask : ndarray[bool], optional
  888. nan-mask if known
  889. Returns
  890. -------
  891. result : float64
  892. Unless input is a float array, in which case use the same
  893. precision as the input array.
  894. Examples
  895. --------
  896. >>> from pandas.core import nanops
  897. >>> s = pd.Series([1, np.nan, 2, 3])
  898. >>> nanops.nansem(s.values)
  899. 0.5773502691896258
  900. """
  901. # This checks if non-numeric-like data is passed with numeric_only=False
  902. # and raises a TypeError otherwise
  903. nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)
  904. mask = _maybe_get_mask(values, skipna, mask)
  905. if values.dtype.kind != "f":
  906. values = values.astype("f8")
  907. if not skipna and mask is not None and mask.any():
  908. return np.nan
  909. count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
  910. var = nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)
  911. return np.sqrt(var) / np.sqrt(count)
  912. def _nanminmax(meth, fill_value_typ):
  913. @bottleneck_switch(name=f"nan{meth}")
  914. @_datetimelike_compat
  915. def reduction(
  916. values: np.ndarray,
  917. *,
  918. axis: AxisInt | None = None,
  919. skipna: bool = True,
  920. mask: npt.NDArray[np.bool_] | None = None,
  921. ):
  922. if values.size == 0:
  923. return _na_for_min_count(values, axis)
  924. values, mask = _get_values(
  925. values, skipna, fill_value_typ=fill_value_typ, mask=mask
  926. )
  927. result = getattr(values, meth)(axis)
  928. result = _maybe_null_out(result, axis, mask, values.shape)
  929. return result
  930. return reduction
  931. nanmin = _nanminmax("min", fill_value_typ="+inf")
  932. nanmax = _nanminmax("max", fill_value_typ="-inf")
  933. def nanargmax(
  934. values: np.ndarray,
  935. *,
  936. axis: AxisInt | None = None,
  937. skipna: bool = True,
  938. mask: npt.NDArray[np.bool_] | None = None,
  939. ) -> int | np.ndarray:
  940. """
  941. Parameters
  942. ----------
  943. values : ndarray
  944. axis : int, optional
  945. skipna : bool, default True
  946. mask : ndarray[bool], optional
  947. nan-mask if known
  948. Returns
  949. -------
  950. result : int or ndarray[int]
  951. The index/indices of max value in specified axis or -1 in the NA case
  952. Examples
  953. --------
  954. >>> from pandas.core import nanops
  955. >>> arr = np.array([1, 2, 3, np.nan, 4])
  956. >>> nanops.nanargmax(arr)
  957. 4
  958. >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3)
  959. >>> arr[2:, 2] = np.nan
  960. >>> arr
  961. array([[ 0., 1., 2.],
  962. [ 3., 4., 5.],
  963. [ 6., 7., nan],
  964. [ 9., 10., nan]])
  965. >>> nanops.nanargmax(arr, axis=1)
  966. array([2, 2, 1, 1])
  967. """
  968. values, mask = _get_values(values, True, fill_value_typ="-inf", mask=mask)
  969. result = values.argmax(axis)
  970. # error: Argument 1 to "_maybe_arg_null_out" has incompatible type "Any |
  971. # signedinteger[Any]"; expected "ndarray[Any, Any]"
  972. result = _maybe_arg_null_out(result, axis, mask, skipna) # type: ignore[arg-type]
  973. return result
  974. def nanargmin(
  975. values: np.ndarray,
  976. *,
  977. axis: AxisInt | None = None,
  978. skipna: bool = True,
  979. mask: npt.NDArray[np.bool_] | None = None,
  980. ) -> int | np.ndarray:
  981. """
  982. Parameters
  983. ----------
  984. values : ndarray
  985. axis : int, optional
  986. skipna : bool, default True
  987. mask : ndarray[bool], optional
  988. nan-mask if known
  989. Returns
  990. -------
  991. result : int or ndarray[int]
  992. The index/indices of min value in specified axis or -1 in the NA case
  993. Examples
  994. --------
  995. >>> from pandas.core import nanops
  996. >>> arr = np.array([1, 2, 3, np.nan, 4])
  997. >>> nanops.nanargmin(arr)
  998. 0
  999. >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3)
  1000. >>> arr[2:, 0] = np.nan
  1001. >>> arr
  1002. array([[ 0., 1., 2.],
  1003. [ 3., 4., 5.],
  1004. [nan, 7., 8.],
  1005. [nan, 10., 11.]])
  1006. >>> nanops.nanargmin(arr, axis=1)
  1007. array([0, 0, 1, 1])
  1008. """
  1009. values, mask = _get_values(values, True, fill_value_typ="+inf", mask=mask)
  1010. result = values.argmin(axis)
  1011. # error: Argument 1 to "_maybe_arg_null_out" has incompatible type "Any |
  1012. # signedinteger[Any]"; expected "ndarray[Any, Any]"
  1013. result = _maybe_arg_null_out(result, axis, mask, skipna) # type: ignore[arg-type]
  1014. return result
  1015. @disallow("M8", "m8")
  1016. @maybe_operate_rowwise
  1017. def nanskew(
  1018. values: np.ndarray,
  1019. *,
  1020. axis: AxisInt | None = None,
  1021. skipna: bool = True,
  1022. mask: npt.NDArray[np.bool_] | None = None,
  1023. ) -> float:
  1024. """
  1025. Compute the sample skewness.
  1026. The statistic computed here is the adjusted Fisher-Pearson standardized
  1027. moment coefficient G1. The algorithm computes this coefficient directly
  1028. from the second and third central moment.
  1029. Parameters
  1030. ----------
  1031. values : ndarray
  1032. axis : int, optional
  1033. skipna : bool, default True
  1034. mask : ndarray[bool], optional
  1035. nan-mask if known
  1036. Returns
  1037. -------
  1038. result : float64
  1039. Unless input is a float array, in which case use the same
  1040. precision as the input array.
  1041. Examples
  1042. --------
  1043. >>> from pandas.core import nanops
  1044. >>> s = pd.Series([1, np.nan, 1, 2])
  1045. >>> nanops.nanskew(s.values)
  1046. 1.7320508075688787
  1047. """
  1048. mask = _maybe_get_mask(values, skipna, mask)
  1049. if values.dtype.kind != "f":
  1050. values = values.astype("f8")
  1051. count = _get_counts(values.shape, mask, axis)
  1052. else:
  1053. count = _get_counts(values.shape, mask, axis, dtype=values.dtype)
  1054. if skipna and mask is not None:
  1055. values = values.copy()
  1056. np.putmask(values, mask, 0)
  1057. elif not skipna and mask is not None and mask.any():
  1058. return np.nan
  1059. with np.errstate(invalid="ignore", divide="ignore"):
  1060. mean = values.sum(axis, dtype=np.float64) / count
  1061. if axis is not None:
  1062. mean = np.expand_dims(mean, axis)
  1063. adjusted = values - mean
  1064. if skipna and mask is not None:
  1065. np.putmask(adjusted, mask, 0)
  1066. adjusted2 = adjusted**2
  1067. adjusted3 = adjusted2 * adjusted
  1068. m2 = adjusted2.sum(axis, dtype=np.float64)
  1069. m3 = adjusted3.sum(axis, dtype=np.float64)
  1070. # floating point error
  1071. #
  1072. # #18044 in _libs/windows.pyx calc_skew follow this behavior
  1073. # to fix the fperr to treat m2 <1e-14 as zero
  1074. m2 = _zero_out_fperr(m2)
  1075. m3 = _zero_out_fperr(m3)
  1076. with np.errstate(invalid="ignore", divide="ignore"):
  1077. result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5)
  1078. dtype = values.dtype
  1079. if dtype.kind == "f":
  1080. result = result.astype(dtype, copy=False)
  1081. if isinstance(result, np.ndarray):
  1082. result = np.where(m2 == 0, 0, result)
  1083. result[count < 3] = np.nan
  1084. else:
  1085. result = dtype.type(0) if m2 == 0 else result
  1086. if count < 3:
  1087. return np.nan
  1088. return result
  1089. @disallow("M8", "m8")
  1090. @maybe_operate_rowwise
  1091. def nankurt(
  1092. values: np.ndarray,
  1093. *,
  1094. axis: AxisInt | None = None,
  1095. skipna: bool = True,
  1096. mask: npt.NDArray[np.bool_] | None = None,
  1097. ) -> float:
  1098. """
  1099. Compute the sample excess kurtosis
  1100. The statistic computed here is the adjusted Fisher-Pearson standardized
  1101. moment coefficient G2, computed directly from the second and fourth
  1102. central moment.
  1103. Parameters
  1104. ----------
  1105. values : ndarray
  1106. axis : int, optional
  1107. skipna : bool, default True
  1108. mask : ndarray[bool], optional
  1109. nan-mask if known
  1110. Returns
  1111. -------
  1112. result : float64
  1113. Unless input is a float array, in which case use the same
  1114. precision as the input array.
  1115. Examples
  1116. --------
  1117. >>> from pandas.core import nanops
  1118. >>> s = pd.Series([1, np.nan, 1, 3, 2])
  1119. >>> nanops.nankurt(s.values)
  1120. -1.2892561983471076
  1121. """
  1122. mask = _maybe_get_mask(values, skipna, mask)
  1123. if values.dtype.kind != "f":
  1124. values = values.astype("f8")
  1125. count = _get_counts(values.shape, mask, axis)
  1126. else:
  1127. count = _get_counts(values.shape, mask, axis, dtype=values.dtype)
  1128. if skipna and mask is not None:
  1129. values = values.copy()
  1130. np.putmask(values, mask, 0)
  1131. elif not skipna and mask is not None and mask.any():
  1132. return np.nan
  1133. with np.errstate(invalid="ignore", divide="ignore"):
  1134. mean = values.sum(axis, dtype=np.float64) / count
  1135. if axis is not None:
  1136. mean = np.expand_dims(mean, axis)
  1137. adjusted = values - mean
  1138. if skipna and mask is not None:
  1139. np.putmask(adjusted, mask, 0)
  1140. adjusted2 = adjusted**2
  1141. adjusted4 = adjusted2**2
  1142. m2 = adjusted2.sum(axis, dtype=np.float64)
  1143. m4 = adjusted4.sum(axis, dtype=np.float64)
  1144. with np.errstate(invalid="ignore", divide="ignore"):
  1145. adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
  1146. numerator = count * (count + 1) * (count - 1) * m4
  1147. denominator = (count - 2) * (count - 3) * m2**2
  1148. # floating point error
  1149. #
  1150. # #18044 in _libs/windows.pyx calc_kurt follow this behavior
  1151. # to fix the fperr to treat denom <1e-14 as zero
  1152. numerator = _zero_out_fperr(numerator)
  1153. denominator = _zero_out_fperr(denominator)
  1154. if not isinstance(denominator, np.ndarray):
  1155. # if ``denom`` is a scalar, check these corner cases first before
  1156. # doing division
  1157. if count < 4:
  1158. return np.nan
  1159. if denominator == 0:
  1160. return values.dtype.type(0)
  1161. with np.errstate(invalid="ignore", divide="ignore"):
  1162. result = numerator / denominator - adj
  1163. dtype = values.dtype
  1164. if dtype.kind == "f":
  1165. result = result.astype(dtype, copy=False)
  1166. if isinstance(result, np.ndarray):
  1167. result = np.where(denominator == 0, 0, result)
  1168. result[count < 4] = np.nan
  1169. return result
  1170. @disallow("M8", "m8")
  1171. @maybe_operate_rowwise
  1172. def nanprod(
  1173. values: np.ndarray,
  1174. *,
  1175. axis: AxisInt | None = None,
  1176. skipna: bool = True,
  1177. min_count: int = 0,
  1178. mask: npt.NDArray[np.bool_] | None = None,
  1179. ) -> float:
  1180. """
  1181. Parameters
  1182. ----------
  1183. values : ndarray[dtype]
  1184. axis : int, optional
  1185. skipna : bool, default True
  1186. min_count: int, default 0
  1187. mask : ndarray[bool], optional
  1188. nan-mask if known
  1189. Returns
  1190. -------
  1191. Dtype
  1192. The product of all elements on a given axis. ( NaNs are treated as 1)
  1193. Examples
  1194. --------
  1195. >>> from pandas.core import nanops
  1196. >>> s = pd.Series([1, 2, 3, np.nan])
  1197. >>> nanops.nanprod(s.values)
  1198. 6.0
  1199. """
  1200. mask = _maybe_get_mask(values, skipna, mask)
  1201. if skipna and mask is not None:
  1202. values = values.copy()
  1203. values[mask] = 1
  1204. result = values.prod(axis)
  1205. # error: Incompatible return value type (got "Union[ndarray, float]", expected
  1206. # "float")
  1207. return _maybe_null_out( # type: ignore[return-value]
  1208. result, axis, mask, values.shape, min_count=min_count
  1209. )
  1210. def _maybe_arg_null_out(
  1211. result: np.ndarray,
  1212. axis: AxisInt | None,
  1213. mask: npt.NDArray[np.bool_] | None,
  1214. skipna: bool,
  1215. ) -> np.ndarray | int:
  1216. # helper function for nanargmin/nanargmax
  1217. if mask is None:
  1218. return result
  1219. if axis is None or not getattr(result, "ndim", False):
  1220. if skipna:
  1221. if mask.all():
  1222. return -1
  1223. else:
  1224. if mask.any():
  1225. return -1
  1226. else:
  1227. if skipna:
  1228. na_mask = mask.all(axis)
  1229. else:
  1230. na_mask = mask.any(axis)
  1231. if na_mask.any():
  1232. result[na_mask] = -1
  1233. return result
  1234. def _get_counts(
  1235. values_shape: Shape,
  1236. mask: npt.NDArray[np.bool_] | None,
  1237. axis: AxisInt | None,
  1238. dtype: np.dtype[np.floating] = np.dtype(np.float64),
  1239. ) -> np.floating | npt.NDArray[np.floating]:
  1240. """
  1241. Get the count of non-null values along an axis
  1242. Parameters
  1243. ----------
  1244. values_shape : tuple of int
  1245. shape tuple from values ndarray, used if mask is None
  1246. mask : Optional[ndarray[bool]]
  1247. locations in values that should be considered missing
  1248. axis : Optional[int]
  1249. axis to count along
  1250. dtype : type, optional
  1251. type to use for count
  1252. Returns
  1253. -------
  1254. count : scalar or array
  1255. """
  1256. if axis is None:
  1257. if mask is not None:
  1258. n = mask.size - mask.sum()
  1259. else:
  1260. n = np.prod(values_shape)
  1261. return dtype.type(n)
  1262. if mask is not None:
  1263. count = mask.shape[axis] - mask.sum(axis)
  1264. else:
  1265. count = values_shape[axis]
  1266. if is_integer(count):
  1267. return dtype.type(count)
  1268. return count.astype(dtype, copy=False)
  1269. def _maybe_null_out(
  1270. result: np.ndarray | float | NaTType,
  1271. axis: AxisInt | None,
  1272. mask: npt.NDArray[np.bool_] | None,
  1273. shape: tuple[int, ...],
  1274. min_count: int = 1,
  1275. ) -> np.ndarray | float | NaTType:
  1276. """
  1277. Returns
  1278. -------
  1279. Dtype
  1280. The product of all elements on a given axis. ( NaNs are treated as 1)
  1281. """
  1282. if mask is None and min_count == 0:
  1283. # nothing to check; short-circuit
  1284. return result
  1285. if axis is not None and isinstance(result, np.ndarray):
  1286. if mask is not None:
  1287. null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
  1288. else:
  1289. # we have no nulls, kept mask=None in _maybe_get_mask
  1290. below_count = shape[axis] - min_count < 0
  1291. new_shape = shape[:axis] + shape[axis + 1 :]
  1292. null_mask = np.broadcast_to(below_count, new_shape)
  1293. if np.any(null_mask):
  1294. if is_numeric_dtype(result):
  1295. if np.iscomplexobj(result):
  1296. result = result.astype("c16")
  1297. elif not is_float_dtype(result):
  1298. result = result.astype("f8", copy=False)
  1299. result[null_mask] = np.nan
  1300. else:
  1301. # GH12941, use None to auto cast null
  1302. result[null_mask] = None
  1303. elif result is not NaT:
  1304. if check_below_min_count(shape, mask, min_count):
  1305. result_dtype = getattr(result, "dtype", None)
  1306. if is_float_dtype(result_dtype):
  1307. # error: Item "None" of "Optional[Any]" has no attribute "type"
  1308. result = result_dtype.type("nan") # type: ignore[union-attr]
  1309. else:
  1310. result = np.nan
  1311. return result
  1312. def check_below_min_count(
  1313. shape: tuple[int, ...], mask: npt.NDArray[np.bool_] | None, min_count: int
  1314. ) -> bool:
  1315. """
  1316. Check for the `min_count` keyword. Returns True if below `min_count` (when
  1317. missing value should be returned from the reduction).
  1318. Parameters
  1319. ----------
  1320. shape : tuple
  1321. The shape of the values (`values.shape`).
  1322. mask : ndarray[bool] or None
  1323. Boolean numpy array (typically of same shape as `shape`) or None.
  1324. min_count : int
  1325. Keyword passed through from sum/prod call.
  1326. Returns
  1327. -------
  1328. bool
  1329. """
  1330. if min_count > 0:
  1331. if mask is None:
  1332. # no missing values, only check size
  1333. non_nulls = np.prod(shape)
  1334. else:
  1335. non_nulls = mask.size - mask.sum()
  1336. if non_nulls < min_count:
  1337. return True
  1338. return False
  1339. def _zero_out_fperr(arg):
  1340. # #18044 reference this behavior to fix rolling skew/kurt issue
  1341. if isinstance(arg, np.ndarray):
  1342. return np.where(np.abs(arg) < 1e-14, 0, arg)
  1343. else:
  1344. return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
  1345. @disallow("M8", "m8")
  1346. def nancorr(
  1347. a: np.ndarray,
  1348. b: np.ndarray,
  1349. *,
  1350. method: CorrelationMethod = "pearson",
  1351. min_periods: int | None = None,
  1352. ) -> float:
  1353. """
  1354. a, b: ndarrays
  1355. """
  1356. if len(a) != len(b):
  1357. raise AssertionError("Operands to nancorr must have same size")
  1358. if min_periods is None:
  1359. min_periods = 1
  1360. valid = notna(a) & notna(b)
  1361. if not valid.all():
  1362. a = a[valid]
  1363. b = b[valid]
  1364. if len(a) < min_periods:
  1365. return np.nan
  1366. a = _ensure_numeric(a)
  1367. b = _ensure_numeric(b)
  1368. f = get_corr_func(method)
  1369. return f(a, b)
  1370. def get_corr_func(
  1371. method: CorrelationMethod,
  1372. ) -> Callable[[np.ndarray, np.ndarray], float]:
  1373. if method == "kendall":
  1374. from scipy.stats import kendalltau
  1375. def func(a, b):
  1376. return kendalltau(a, b)[0]
  1377. return func
  1378. elif method == "spearman":
  1379. from scipy.stats import spearmanr
  1380. def func(a, b):
  1381. return spearmanr(a, b)[0]
  1382. return func
  1383. elif method == "pearson":
  1384. def func(a, b):
  1385. return np.corrcoef(a, b)[0, 1]
  1386. return func
  1387. elif callable(method):
  1388. return method
  1389. raise ValueError(
  1390. f"Unknown method '{method}', expected one of "
  1391. "'kendall', 'spearman', 'pearson', or callable"
  1392. )
  1393. @disallow("M8", "m8")
  1394. def nancov(
  1395. a: np.ndarray,
  1396. b: np.ndarray,
  1397. *,
  1398. min_periods: int | None = None,
  1399. ddof: int | None = 1,
  1400. ) -> float:
  1401. if len(a) != len(b):
  1402. raise AssertionError("Operands to nancov must have same size")
  1403. if min_periods is None:
  1404. min_periods = 1
  1405. valid = notna(a) & notna(b)
  1406. if not valid.all():
  1407. a = a[valid]
  1408. b = b[valid]
  1409. if len(a) < min_periods:
  1410. return np.nan
  1411. a = _ensure_numeric(a)
  1412. b = _ensure_numeric(b)
  1413. return np.cov(a, b, ddof=ddof)[0, 1]
  1414. def _ensure_numeric(x):
  1415. if isinstance(x, np.ndarray):
  1416. if x.dtype.kind in "biu":
  1417. x = x.astype(np.float64)
  1418. elif x.dtype == object:
  1419. inferred = lib.infer_dtype(x)
  1420. if inferred in ["string", "mixed"]:
  1421. # GH#44008, GH#36703 avoid casting e.g. strings to numeric
  1422. raise TypeError(f"Could not convert {x} to numeric")
  1423. try:
  1424. x = x.astype(np.complex128)
  1425. except (TypeError, ValueError):
  1426. try:
  1427. x = x.astype(np.float64)
  1428. except ValueError as err:
  1429. # GH#29941 we get here with object arrays containing strs
  1430. raise TypeError(f"Could not convert {x} to numeric") from err
  1431. else:
  1432. if not np.any(np.imag(x)):
  1433. x = x.real
  1434. elif not (is_float(x) or is_integer(x) or is_complex(x)):
  1435. if isinstance(x, str):
  1436. # GH#44008, GH#36703 avoid casting e.g. strings to numeric
  1437. raise TypeError(f"Could not convert string '{x}' to numeric")
  1438. try:
  1439. x = float(x)
  1440. except (TypeError, ValueError):
  1441. # e.g. "1+1j" or "foo"
  1442. try:
  1443. x = complex(x)
  1444. except ValueError as err:
  1445. # e.g. "foo"
  1446. raise TypeError(f"Could not convert {x} to numeric") from err
  1447. return x
  1448. def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike:
  1449. """
  1450. Cumulative function with skipna support.
  1451. Parameters
  1452. ----------
  1453. values : np.ndarray or ExtensionArray
  1454. accum_func : {np.cumprod, np.maximum.accumulate, np.cumsum, np.minimum.accumulate}
  1455. skipna : bool
  1456. Returns
  1457. -------
  1458. np.ndarray or ExtensionArray
  1459. """
  1460. mask_a, mask_b = {
  1461. np.cumprod: (1.0, np.nan),
  1462. np.maximum.accumulate: (-np.inf, np.nan),
  1463. np.cumsum: (0.0, np.nan),
  1464. np.minimum.accumulate: (np.inf, np.nan),
  1465. }[accum_func]
  1466. # This should go through ea interface
  1467. assert values.dtype.kind not in "mM"
  1468. # We will be applying this function to block values
  1469. if skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)):
  1470. vals = values.copy()
  1471. mask = isna(vals)
  1472. vals[mask] = mask_a
  1473. result = accum_func(vals, axis=0)
  1474. result[mask] = mask_b
  1475. else:
  1476. result = accum_func(values, axis=0)
  1477. return result