missing.py 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158
  1. """
  2. Routines for filling missing data.
  3. """
  4. from __future__ import annotations
  5. from functools import wraps
  6. from typing import (
  7. TYPE_CHECKING,
  8. Any,
  9. Literal,
  10. cast,
  11. overload,
  12. )
  13. import numpy as np
  14. from pandas._libs import (
  15. NaT,
  16. algos,
  17. lib,
  18. )
  19. from pandas._typing import (
  20. ArrayLike,
  21. AxisInt,
  22. F,
  23. ReindexMethod,
  24. npt,
  25. )
  26. from pandas.compat._optional import import_optional_dependency
  27. from pandas.core.dtypes.cast import infer_dtype_from
  28. from pandas.core.dtypes.common import (
  29. is_array_like,
  30. is_bool_dtype,
  31. is_numeric_dtype,
  32. is_numeric_v_string_like,
  33. is_object_dtype,
  34. needs_i8_conversion,
  35. )
  36. from pandas.core.dtypes.dtypes import DatetimeTZDtype
  37. from pandas.core.dtypes.missing import (
  38. is_valid_na_for_dtype,
  39. isna,
  40. na_value_for_dtype,
  41. )
  42. if TYPE_CHECKING:
  43. from pandas import Index
  44. def check_value_size(value, mask: npt.NDArray[np.bool_], length: int):
  45. """
  46. Validate the size of the values passed to ExtensionArray.fillna.
  47. """
  48. if is_array_like(value):
  49. if len(value) != length:
  50. raise ValueError(
  51. f"Length of 'value' does not match. Got ({len(value)}) "
  52. f" expected {length}"
  53. )
  54. value = value[mask]
  55. return value
def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]:
    """
    Return a masking array of same size/shape as arr
    with entries equaling any member of values_to_mask set to True

    Parameters
    ----------
    arr : ArrayLike
        Array whose entries are compared against ``values_to_mask``.
    values_to_mask: list, tuple, or scalar
        Value(s) to look for. NA entries among them match the positions
        where ``isna(arr)`` is True.

    Returns
    -------
    np.ndarray[bool]
        Mask with the same shape as ``arr``.
    """
    # When called from Block.replace/replace_list, values_to_mask is a scalar
    #  known to be holdable by arr.
    # When called from Series._single_replace, values_to_mask is tuple or list
    dtype, values_to_mask = infer_dtype_from(values_to_mask)

    # Normalize values_to_mask to an array (numpy or extension) so that it
    # supports boolean indexing and has a dtype to inspect below.
    if isinstance(dtype, np.dtype):
        values_to_mask = np.array(values_to_mask, dtype=dtype)
    else:
        cls = dtype.construct_array_type()
        if not lib.is_list_like(values_to_mask):
            values_to_mask = [values_to_mask]
        values_to_mask = cls._from_sequence(values_to_mask, dtype=dtype, copy=False)

    potential_na = False
    if is_object_dtype(arr.dtype):
        # pre-compute mask to avoid comparison to NA
        potential_na = True
        arr_mask = ~isna(arr)

    # Split the values into NA and non-NA: non-NA values are matched with
    # ``==``, NA values are matched through isna(arr) at the end.
    na_mask = isna(values_to_mask)
    nonna = values_to_mask[~na_mask]

    # GH 21977
    mask = np.zeros(arr.shape, dtype=bool)
    if (
        is_numeric_dtype(arr.dtype)
        and not is_bool_dtype(arr.dtype)
        and is_bool_dtype(nonna.dtype)
    ):
        # Non-bool numeric arr vs. bool values: no equality matching is done.
        pass
    elif (
        is_bool_dtype(arr.dtype)
        and is_numeric_dtype(nonna.dtype)
        and not is_bool_dtype(nonna.dtype)
    ):
        # Bool arr vs. non-bool numeric values: no equality matching is done.
        pass
    else:
        for x in nonna:
            if is_numeric_v_string_like(arr, x):
                # GH#29553 prevent numpy deprecation warnings
                pass
            else:
                if potential_na:
                    # Compare only the non-NA positions of arr; NA positions
                    # stay False in new_mask.
                    new_mask = np.zeros(arr.shape, dtype=np.bool_)
                    new_mask[arr_mask] = arr[arr_mask] == x
                else:
                    new_mask = arr == x

                    if not isinstance(new_mask, np.ndarray):
                        # usually BooleanArray
                        new_mask = new_mask.to_numpy(dtype=bool, na_value=False)
                # Accumulate the matches for this value.
                mask |= new_mask

    # Any NA among the requested values selects every NA position of arr.
    if na_mask.any():
        mask |= isna(arr)

    return mask
  118. @overload
  119. def clean_fill_method(
  120. method: Literal["ffill", "pad", "bfill", "backfill"],
  121. *,
  122. allow_nearest: Literal[False] = ...,
  123. ) -> Literal["pad", "backfill"]:
  124. ...
  125. @overload
  126. def clean_fill_method(
  127. method: Literal["ffill", "pad", "bfill", "backfill", "nearest"],
  128. *,
  129. allow_nearest: Literal[True],
  130. ) -> Literal["pad", "backfill", "nearest"]:
  131. ...
  132. def clean_fill_method(
  133. method: Literal["ffill", "pad", "bfill", "backfill", "nearest"],
  134. *,
  135. allow_nearest: bool = False,
  136. ) -> Literal["pad", "backfill", "nearest"]:
  137. if isinstance(method, str):
  138. # error: Incompatible types in assignment (expression has type "str", variable
  139. # has type "Literal['ffill', 'pad', 'bfill', 'backfill', 'nearest']")
  140. method = method.lower() # type: ignore[assignment]
  141. if method == "ffill":
  142. method = "pad"
  143. elif method == "bfill":
  144. method = "backfill"
  145. valid_methods = ["pad", "backfill"]
  146. expecting = "pad (ffill) or backfill (bfill)"
  147. if allow_nearest:
  148. valid_methods.append("nearest")
  149. expecting = "pad (ffill), backfill (bfill) or nearest"
  150. if method not in valid_methods:
  151. raise ValueError(f"Invalid fill method. Expecting {expecting}. Got {method}")
  152. return method
# interpolation methods that dispatch to np.interp
# (_interpolate_1d checks ``method in NP_METHODS`` to pick the np.interp path)
NP_METHODS = ["linear", "time", "index", "values"]

# interpolation methods that dispatch to _interpolate_scipy_wrapper
# Together with NP_METHODS this is the full set of names accepted by
# clean_interp_method.
SP_METHODS = [
    "nearest",
    "zero",
    "slinear",
    "quadratic",
    "cubic",
    "barycentric",
    "krogh",
    "spline",
    "polynomial",
    "from_derivatives",
    "piecewise_polynomial",
    "pchip",
    "akima",
    "cubicspline",
]
  172. def clean_interp_method(method: str, index: Index, **kwargs) -> str:
  173. order = kwargs.get("order")
  174. if method in ("spline", "polynomial") and order is None:
  175. raise ValueError("You must specify the order of the spline or polynomial.")
  176. valid = NP_METHODS + SP_METHODS
  177. if method not in valid:
  178. raise ValueError(f"method must be one of {valid}. Got '{method}' instead.")
  179. if method in ("krogh", "piecewise_polynomial", "pchip"):
  180. if not index.is_monotonic_increasing:
  181. raise ValueError(
  182. f"{method} interpolation requires that the index be monotonic."
  183. )
  184. return method
  185. def find_valid_index(how: str, is_valid: npt.NDArray[np.bool_]) -> int | None:
  186. """
  187. Retrieves the positional index of the first valid value.
  188. Parameters
  189. ----------
  190. how : {'first', 'last'}
  191. Use this parameter to change between the first or last valid index.
  192. is_valid: np.ndarray
  193. Mask to find na_values.
  194. Returns
  195. -------
  196. int or None
  197. """
  198. assert how in ["first", "last"]
  199. if len(is_valid) == 0: # early stop
  200. return None
  201. if is_valid.ndim == 2:
  202. is_valid = is_valid.any(axis=1) # reduce axis 1
  203. if how == "first":
  204. idxpos = is_valid[::].argmax()
  205. elif how == "last":
  206. idxpos = len(is_valid) - 1 - is_valid[::-1].argmax()
  207. chk_notna = is_valid[idxpos]
  208. if not chk_notna:
  209. return None
  210. # Incompatible return value type (got "signedinteger[Any]",
  211. # expected "Optional[int]")
  212. return idxpos # type: ignore[return-value]
  213. def validate_limit_direction(
  214. limit_direction: str,
  215. ) -> Literal["forward", "backward", "both"]:
  216. valid_limit_directions = ["forward", "backward", "both"]
  217. limit_direction = limit_direction.lower()
  218. if limit_direction not in valid_limit_directions:
  219. raise ValueError(
  220. "Invalid limit_direction: expecting one of "
  221. f"{valid_limit_directions}, got '{limit_direction}'."
  222. )
  223. # error: Incompatible return value type (got "str", expected
  224. # "Literal['forward', 'backward', 'both']")
  225. return limit_direction # type: ignore[return-value]
  226. def validate_limit_area(limit_area: str | None) -> Literal["inside", "outside"] | None:
  227. if limit_area is not None:
  228. valid_limit_areas = ["inside", "outside"]
  229. limit_area = limit_area.lower()
  230. if limit_area not in valid_limit_areas:
  231. raise ValueError(
  232. f"Invalid limit_area: expecting one of {valid_limit_areas}, got "
  233. f"{limit_area}."
  234. )
  235. # error: Incompatible return value type (got "Optional[str]", expected
  236. # "Optional[Literal['inside', 'outside']]")
  237. return limit_area # type: ignore[return-value]
  238. def infer_limit_direction(
  239. limit_direction: Literal["backward", "forward", "both"] | None, method: str
  240. ) -> Literal["backward", "forward", "both"]:
  241. # Set `limit_direction` depending on `method`
  242. if limit_direction is None:
  243. if method in ("backfill", "bfill"):
  244. limit_direction = "backward"
  245. else:
  246. limit_direction = "forward"
  247. else:
  248. if method in ("pad", "ffill") and limit_direction != "forward":
  249. raise ValueError(
  250. f"`limit_direction` must be 'forward' for method `{method}`"
  251. )
  252. if method in ("backfill", "bfill") and limit_direction != "backward":
  253. raise ValueError(
  254. f"`limit_direction` must be 'backward' for method `{method}`"
  255. )
  256. return limit_direction
  257. def get_interp_index(method, index: Index) -> Index:
  258. # create/use the index
  259. if method == "linear":
  260. # prior default
  261. from pandas import Index
  262. index = Index(np.arange(len(index)))
  263. else:
  264. methods = {"index", "values", "nearest", "time"}
  265. is_numeric_or_datetime = (
  266. is_numeric_dtype(index.dtype)
  267. or isinstance(index.dtype, DatetimeTZDtype)
  268. or lib.is_np_dtype(index.dtype, "mM")
  269. )
  270. if method not in methods and not is_numeric_or_datetime:
  271. raise ValueError(
  272. "Index column must be numeric or datetime type when "
  273. f"using {method} method other than linear. "
  274. "Try setting a numeric or datetime index column before "
  275. "interpolating."
  276. )
  277. if isna(index).any():
  278. raise NotImplementedError(
  279. "Interpolation with NaNs in the index "
  280. "has not been implemented. Try filling "
  281. "those NaNs before interpolating."
  282. )
  283. return index
  284. def interpolate_2d_inplace(
  285. data: np.ndarray, # floating dtype
  286. index: Index,
  287. axis: AxisInt,
  288. method: str = "linear",
  289. limit: int | None = None,
  290. limit_direction: str = "forward",
  291. limit_area: str | None = None,
  292. fill_value: Any | None = None,
  293. mask=None,
  294. **kwargs,
  295. ) -> None:
  296. """
  297. Column-wise application of _interpolate_1d.
  298. Notes
  299. -----
  300. Alters 'data' in-place.
  301. The signature does differ from _interpolate_1d because it only
  302. includes what is needed for Block.interpolate.
  303. """
  304. # validate the interp method
  305. clean_interp_method(method, index, **kwargs)
  306. if is_valid_na_for_dtype(fill_value, data.dtype):
  307. fill_value = na_value_for_dtype(data.dtype, compat=False)
  308. if method == "time":
  309. if not needs_i8_conversion(index.dtype):
  310. raise ValueError(
  311. "time-weighted interpolation only works "
  312. "on Series or DataFrames with a "
  313. "DatetimeIndex"
  314. )
  315. method = "values"
  316. limit_direction = validate_limit_direction(limit_direction)
  317. limit_area_validated = validate_limit_area(limit_area)
  318. # default limit is unlimited GH #16282
  319. limit = algos.validate_limit(nobs=None, limit=limit)
  320. indices = _index_to_interp_indices(index, method)
  321. def func(yvalues: np.ndarray) -> None:
  322. # process 1-d slices in the axis direction
  323. _interpolate_1d(
  324. indices=indices,
  325. yvalues=yvalues,
  326. method=method,
  327. limit=limit,
  328. limit_direction=limit_direction,
  329. limit_area=limit_area_validated,
  330. fill_value=fill_value,
  331. bounds_error=False,
  332. mask=mask,
  333. **kwargs,
  334. )
  335. # error: Argument 1 to "apply_along_axis" has incompatible type
  336. # "Callable[[ndarray[Any, Any]], None]"; expected "Callable[...,
  337. # Union[_SupportsArray[dtype[<nothing>]], Sequence[_SupportsArray
  338. # [dtype[<nothing>]]], Sequence[Sequence[_SupportsArray[dtype[<nothing>]]]],
  339. # Sequence[Sequence[Sequence[_SupportsArray[dtype[<nothing>]]]]],
  340. # Sequence[Sequence[Sequence[Sequence[_SupportsArray[dtype[<nothing>]]]]]]]]"
  341. np.apply_along_axis(func, axis, data) # type: ignore[arg-type]
  342. def _index_to_interp_indices(index: Index, method: str) -> np.ndarray:
  343. """
  344. Convert Index to ndarray of indices to pass to NumPy/SciPy.
  345. """
  346. xarr = index._values
  347. if needs_i8_conversion(xarr.dtype):
  348. # GH#1646 for dt64tz
  349. xarr = xarr.view("i8")
  350. if method == "linear":
  351. inds = xarr
  352. inds = cast(np.ndarray, inds)
  353. else:
  354. inds = np.asarray(xarr)
  355. if method in ("values", "index"):
  356. if inds.dtype == np.object_:
  357. inds = lib.maybe_convert_objects(inds)
  358. return inds
def _interpolate_1d(
    indices: np.ndarray,
    yvalues: np.ndarray,
    method: str = "linear",
    limit: int | None = None,
    limit_direction: str = "forward",
    limit_area: Literal["inside", "outside"] | None = None,
    fill_value: Any | None = None,
    bounds_error: bool = False,
    order: int | None = None,
    mask=None,
    **kwargs,
) -> None:
    """
    Logic for the 1-d interpolation.  The input
    indices and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.

    Parameters
    ----------
    indices : np.ndarray
        x-coordinates, one per entry of ``yvalues``.
    yvalues : np.ndarray
        Values to interpolate; filled in place.
    method : str, default "linear"
        Methods listed in NP_METHODS use np.interp; all others are passed
        to _interpolate_scipy_wrapper.
    limit, limit_direction, limit_area
        Control which missing positions are left unfilled (see the
        ``preserve_nans`` logic below).
    fill_value, bounds_error, order, **kwargs
        Forwarded to _interpolate_scipy_wrapper for non-NumPy methods.
    mask : np.ndarray[bool], optional
        If given, used instead of ``isna(yvalues)`` to mark missing entries
        and updated in place to mark the entries left unfilled.

    Notes
    -----
    Fills 'yvalues' in-place.
    """
    if mask is not None:
        invalid = mask
    else:
        invalid = isna(yvalues)
    valid = ~invalid

    # Nothing to anchor the interpolation on.
    if not valid.any():
        return

    # Nothing to fill.
    if valid.all():
        return

    # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
    all_nans = set(np.flatnonzero(invalid))

    # Positions before the first valid value.
    first_valid_index = find_valid_index(how="first", is_valid=valid)
    if first_valid_index is None:  # no nan found in start
        first_valid_index = 0
    start_nans = set(range(first_valid_index))

    # Positions after the last valid value.
    last_valid_index = find_valid_index(how="last", is_valid=valid)
    if last_valid_index is None:  # no nan found in end
        last_valid_index = len(yvalues)
    end_nans = set(range(1 + last_valid_index, len(valid)))

    # Like the sets above, preserve_nans contains indices of invalid values,
    # but in this case, it is the final set of indices that need to be
    # preserved as NaN after the interpolation.

    # For example if limit_direction='forward' then preserve_nans will
    # contain indices of NaNs at the beginning of the series, and NaNs that
    # are more than 'limit' away from the prior non-NaN.

    # set preserve_nans based on direction using _interp_limit
    preserve_nans: list | set
    if limit_direction == "forward":
        preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
    elif limit_direction == "backward":
        preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
    else:
        # both directions... just use _interp_limit
        preserve_nans = set(_interp_limit(invalid, limit, limit))

    # if limit_area is set, add either mid or outside indices
    # to preserve_nans GH #16284
    if limit_area == "inside":
        # preserve NaNs on the outside
        preserve_nans |= start_nans | end_nans
    elif limit_area == "outside":
        # preserve NaNs on the inside
        mid_nans = all_nans - start_nans - end_nans
        preserve_nans |= mid_nans

    # sort preserve_nans and convert to list
    preserve_nans = sorted(preserve_nans)

    is_datetimelike = yvalues.dtype.kind in "mM"

    if is_datetimelike:
        # Interpolate on the int64 view of datetime64/timedelta64 values.
        yvalues = yvalues.view("i8")

    # Every invalid position is filled first; the positions in preserve_nans
    # are reset to missing afterwards.
    if method in NP_METHODS:
        # np.interp requires sorted X values, #21037

        indexer = np.argsort(indices[valid])
        yvalues[invalid] = np.interp(
            indices[invalid], indices[valid][indexer], yvalues[valid][indexer]
        )
    else:
        yvalues[invalid] = _interpolate_scipy_wrapper(
            indices[valid],
            yvalues[valid],
            indices[invalid],
            method=method,
            fill_value=fill_value,
            bounds_error=bounds_error,
            order=order,
            **kwargs,
        )

    if mask is not None:
        # Report the still-missing positions through the caller's mask
        # instead of writing a missing value into yvalues.
        mask[:] = False
        mask[preserve_nans] = True
    elif is_datetimelike:
        yvalues[preserve_nans] = NaT.value
    else:
        yvalues[preserve_nans] = np.nan
    return
  454. def _interpolate_scipy_wrapper(
  455. x: np.ndarray,
  456. y: np.ndarray,
  457. new_x: np.ndarray,
  458. method: str,
  459. fill_value=None,
  460. bounds_error: bool = False,
  461. order=None,
  462. **kwargs,
  463. ):
  464. """
  465. Passed off to scipy.interpolate.interp1d. method is scipy's kind.
  466. Returns an array interpolated at new_x. Add any new methods to
  467. the list in _clean_interp_method.
  468. """
  469. extra = f"{method} interpolation requires SciPy."
  470. import_optional_dependency("scipy", extra=extra)
  471. from scipy import interpolate
  472. new_x = np.asarray(new_x)
  473. # ignores some kwargs that could be passed along.
  474. alt_methods = {
  475. "barycentric": interpolate.barycentric_interpolate,
  476. "krogh": interpolate.krogh_interpolate,
  477. "from_derivatives": _from_derivatives,
  478. "piecewise_polynomial": _from_derivatives,
  479. "cubicspline": _cubicspline_interpolate,
  480. "akima": _akima_interpolate,
  481. "pchip": interpolate.pchip_interpolate,
  482. }
  483. interp1d_methods = [
  484. "nearest",
  485. "zero",
  486. "slinear",
  487. "quadratic",
  488. "cubic",
  489. "polynomial",
  490. ]
  491. if method in interp1d_methods:
  492. if method == "polynomial":
  493. kind = order
  494. else:
  495. kind = method
  496. terp = interpolate.interp1d(
  497. x, y, kind=kind, fill_value=fill_value, bounds_error=bounds_error
  498. )
  499. new_y = terp(new_x)
  500. elif method == "spline":
  501. # GH #10633, #24014
  502. if isna(order) or (order <= 0):
  503. raise ValueError(
  504. f"order needs to be specified and greater than 0; got order: {order}"
  505. )
  506. terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs)
  507. new_y = terp(new_x)
  508. else:
  509. # GH 7295: need to be able to write for some reason
  510. # in some circumstances: check all three
  511. if not x.flags.writeable:
  512. x = x.copy()
  513. if not y.flags.writeable:
  514. y = y.copy()
  515. if not new_x.flags.writeable:
  516. new_x = new_x.copy()
  517. terp = alt_methods[method]
  518. new_y = terp(x, y, new_x, **kwargs)
  519. return new_y
  520. def _from_derivatives(
  521. xi: np.ndarray,
  522. yi: np.ndarray,
  523. x: np.ndarray,
  524. order=None,
  525. der: int | list[int] | None = 0,
  526. extrapolate: bool = False,
  527. ):
  528. """
  529. Convenience function for interpolate.BPoly.from_derivatives.
  530. Construct a piecewise polynomial in the Bernstein basis, compatible
  531. with the specified values and derivatives at breakpoints.
  532. Parameters
  533. ----------
  534. xi : array-like
  535. sorted 1D array of x-coordinates
  536. yi : array-like or list of array-likes
  537. yi[i][j] is the j-th derivative known at xi[i]
  538. order: None or int or array-like of ints. Default: None.
  539. Specifies the degree of local polynomials. If not None, some
  540. derivatives are ignored.
  541. der : int or list
  542. How many derivatives to extract; None for all potentially nonzero
  543. derivatives (that is a number equal to the number of points), or a
  544. list of derivatives to extract. This number includes the function
  545. value as 0th derivative.
  546. extrapolate : bool, optional
  547. Whether to extrapolate to ouf-of-bounds points based on first and last
  548. intervals, or to return NaNs. Default: True.
  549. See Also
  550. --------
  551. scipy.interpolate.BPoly.from_derivatives
  552. Returns
  553. -------
  554. y : scalar or array-like
  555. The result, of length R or length M or M by R.
  556. """
  557. from scipy import interpolate
  558. # return the method for compat with scipy version & backwards compat
  559. method = interpolate.BPoly.from_derivatives
  560. m = method(xi, yi.reshape(-1, 1), orders=order, extrapolate=extrapolate)
  561. return m(x)
  562. def _akima_interpolate(
  563. xi: np.ndarray,
  564. yi: np.ndarray,
  565. x: np.ndarray,
  566. der: int | list[int] | None = 0,
  567. axis: AxisInt = 0,
  568. ):
  569. """
  570. Convenience function for akima interpolation.
  571. xi and yi are arrays of values used to approximate some function f,
  572. with ``yi = f(xi)``.
  573. See `Akima1DInterpolator` for details.
  574. Parameters
  575. ----------
  576. xi : np.ndarray
  577. A sorted list of x-coordinates, of length N.
  578. yi : np.ndarray
  579. A 1-D array of real values. `yi`'s length along the interpolation
  580. axis must be equal to the length of `xi`. If N-D array, use axis
  581. parameter to select correct axis.
  582. x : np.ndarray
  583. Of length M.
  584. der : int, optional
  585. How many derivatives to extract; None for all potentially
  586. nonzero derivatives (that is a number equal to the number
  587. of points), or a list of derivatives to extract. This number
  588. includes the function value as 0th derivative.
  589. axis : int, optional
  590. Axis in the yi array corresponding to the x-coordinate values.
  591. See Also
  592. --------
  593. scipy.interpolate.Akima1DInterpolator
  594. Returns
  595. -------
  596. y : scalar or array-like
  597. The result, of length R or length M or M by R,
  598. """
  599. from scipy import interpolate
  600. P = interpolate.Akima1DInterpolator(xi, yi, axis=axis)
  601. return P(x, nu=der)
  602. def _cubicspline_interpolate(
  603. xi: np.ndarray,
  604. yi: np.ndarray,
  605. x: np.ndarray,
  606. axis: AxisInt = 0,
  607. bc_type: str | tuple[Any, Any] = "not-a-knot",
  608. extrapolate=None,
  609. ):
  610. """
  611. Convenience function for cubic spline data interpolator.
  612. See `scipy.interpolate.CubicSpline` for details.
  613. Parameters
  614. ----------
  615. xi : np.ndarray, shape (n,)
  616. 1-d array containing values of the independent variable.
  617. Values must be real, finite and in strictly increasing order.
  618. yi : np.ndarray
  619. Array containing values of the dependent variable. It can have
  620. arbitrary number of dimensions, but the length along ``axis``
  621. (see below) must match the length of ``x``. Values must be finite.
  622. x : np.ndarray, shape (m,)
  623. axis : int, optional
  624. Axis along which `y` is assumed to be varying. Meaning that for
  625. ``x[i]`` the corresponding values are ``np.take(y, i, axis=axis)``.
  626. Default is 0.
  627. bc_type : string or 2-tuple, optional
  628. Boundary condition type. Two additional equations, given by the
  629. boundary conditions, are required to determine all coefficients of
  630. polynomials on each segment [2]_.
  631. If `bc_type` is a string, then the specified condition will be applied
  632. at both ends of a spline. Available conditions are:
  633. * 'not-a-knot' (default): The first and second segment at a curve end
  634. are the same polynomial. It is a good default when there is no
  635. information on boundary conditions.
  636. * 'periodic': The interpolated functions is assumed to be periodic
  637. of period ``x[-1] - x[0]``. The first and last value of `y` must be
  638. identical: ``y[0] == y[-1]``. This boundary condition will result in
  639. ``y'[0] == y'[-1]`` and ``y''[0] == y''[-1]``.
  640. * 'clamped': The first derivative at curves ends are zero. Assuming
  641. a 1D `y`, ``bc_type=((1, 0.0), (1, 0.0))`` is the same condition.
  642. * 'natural': The second derivative at curve ends are zero. Assuming
  643. a 1D `y`, ``bc_type=((2, 0.0), (2, 0.0))`` is the same condition.
  644. If `bc_type` is a 2-tuple, the first and the second value will be
  645. applied at the curve start and end respectively. The tuple values can
  646. be one of the previously mentioned strings (except 'periodic') or a
  647. tuple `(order, deriv_values)` allowing to specify arbitrary
  648. derivatives at curve ends:
  649. * `order`: the derivative order, 1 or 2.
  650. * `deriv_value`: array-like containing derivative values, shape must
  651. be the same as `y`, excluding ``axis`` dimension. For example, if
  652. `y` is 1D, then `deriv_value` must be a scalar. If `y` is 3D with
  653. the shape (n0, n1, n2) and axis=2, then `deriv_value` must be 2D
  654. and have the shape (n0, n1).
  655. extrapolate : {bool, 'periodic', None}, optional
  656. If bool, determines whether to extrapolate to out-of-bounds points
  657. based on first and last intervals, or to return NaNs. If 'periodic',
  658. periodic extrapolation is used. If None (default), ``extrapolate`` is
  659. set to 'periodic' for ``bc_type='periodic'`` and to True otherwise.
  660. See Also
  661. --------
  662. scipy.interpolate.CubicHermiteSpline
  663. Returns
  664. -------
  665. y : scalar or array-like
  666. The result, of shape (m,)
  667. References
  668. ----------
  669. .. [1] `Cubic Spline Interpolation
  670. <https://en.wikiversity.org/wiki/Cubic_Spline_Interpolation>`_
  671. on Wikiversity.
  672. .. [2] Carl de Boor, "A Practical Guide to Splines", Springer-Verlag, 1978.
  673. """
  674. from scipy import interpolate
  675. P = interpolate.CubicSpline(
  676. xi, yi, axis=axis, bc_type=bc_type, extrapolate=extrapolate
  677. )
  678. return P(x)
  679. def _interpolate_with_limit_area(
  680. values: np.ndarray,
  681. method: Literal["pad", "backfill"],
  682. limit: int | None,
  683. limit_area: Literal["inside", "outside"],
  684. ) -> None:
  685. """
  686. Apply interpolation and limit_area logic to values along a to-be-specified axis.
  687. Parameters
  688. ----------
  689. values: np.ndarray
  690. Input array.
  691. method: str
  692. Interpolation method. Could be "bfill" or "pad"
  693. limit: int, optional
  694. Index limit on interpolation.
  695. limit_area: {'inside', 'outside'}
  696. Limit area for interpolation.
  697. Notes
  698. -----
  699. Modifies values in-place.
  700. """
  701. invalid = isna(values)
  702. is_valid = ~invalid
  703. if not invalid.all():
  704. first = find_valid_index(how="first", is_valid=is_valid)
  705. if first is None:
  706. first = 0
  707. last = find_valid_index(how="last", is_valid=is_valid)
  708. if last is None:
  709. last = len(values)
  710. pad_or_backfill_inplace(
  711. values,
  712. method=method,
  713. limit=limit,
  714. limit_area=limit_area,
  715. )
  716. if limit_area == "inside":
  717. invalid[first : last + 1] = False
  718. elif limit_area == "outside":
  719. invalid[:first] = invalid[last + 1 :] = False
  720. else:
  721. raise ValueError("limit_area should be 'inside' or 'outside'")
  722. values[invalid] = np.nan
  723. def pad_or_backfill_inplace(
  724. values: np.ndarray,
  725. method: Literal["pad", "backfill"] = "pad",
  726. axis: AxisInt = 0,
  727. limit: int | None = None,
  728. limit_area: Literal["inside", "outside"] | None = None,
  729. ) -> None:
  730. """
  731. Perform an actual interpolation of values, values will be make 2-d if
  732. needed fills inplace, returns the result.
  733. Parameters
  734. ----------
  735. values: np.ndarray
  736. Input array.
  737. method: str, default "pad"
  738. Interpolation method. Could be "bfill" or "pad"
  739. axis: 0 or 1
  740. Interpolation axis
  741. limit: int, optional
  742. Index limit on interpolation.
  743. limit_area: str, optional
  744. Limit area for interpolation. Can be "inside" or "outside"
  745. Notes
  746. -----
  747. Modifies values in-place.
  748. """
  749. transf = (lambda x: x) if axis == 0 else (lambda x: x.T)
  750. # reshape a 1 dim if needed
  751. if values.ndim == 1:
  752. if axis != 0: # pragma: no cover
  753. raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0")
  754. values = values.reshape(tuple((1,) + values.shape))
  755. method = clean_fill_method(method)
  756. tvalues = transf(values)
  757. func = get_fill_func(method, ndim=2)
  758. # _pad_2d and _backfill_2d both modify tvalues inplace
  759. func(tvalues, limit=limit, limit_area=limit_area)
  760. def _fillna_prep(
  761. values, mask: npt.NDArray[np.bool_] | None = None
  762. ) -> npt.NDArray[np.bool_]:
  763. # boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d
  764. if mask is None:
  765. mask = isna(values)
  766. return mask
  767. def _datetimelike_compat(func: F) -> F:
  768. """
  769. Wrapper to handle datetime64 and timedelta64 dtypes.
  770. """
  771. @wraps(func)
  772. def new_func(
  773. values,
  774. limit: int | None = None,
  775. limit_area: Literal["inside", "outside"] | None = None,
  776. mask=None,
  777. ):
  778. if needs_i8_conversion(values.dtype):
  779. if mask is None:
  780. # This needs to occur before casting to int64
  781. mask = isna(values)
  782. result, mask = func(
  783. values.view("i8"), limit=limit, limit_area=limit_area, mask=mask
  784. )
  785. return result.view(values.dtype), mask
  786. return func(values, limit=limit, limit_area=limit_area, mask=mask)
  787. return cast(F, new_func)
  788. @_datetimelike_compat
  789. def _pad_1d(
  790. values: np.ndarray,
  791. limit: int | None = None,
  792. limit_area: Literal["inside", "outside"] | None = None,
  793. mask: npt.NDArray[np.bool_] | None = None,
  794. ) -> tuple[np.ndarray, npt.NDArray[np.bool_]]:
  795. mask = _fillna_prep(values, mask)
  796. if limit_area is not None and not mask.all():
  797. _fill_limit_area_1d(mask, limit_area)
  798. algos.pad_inplace(values, mask, limit=limit)
  799. return values, mask
  800. @_datetimelike_compat
  801. def _backfill_1d(
  802. values: np.ndarray,
  803. limit: int | None = None,
  804. limit_area: Literal["inside", "outside"] | None = None,
  805. mask: npt.NDArray[np.bool_] | None = None,
  806. ) -> tuple[np.ndarray, npt.NDArray[np.bool_]]:
  807. mask = _fillna_prep(values, mask)
  808. if limit_area is not None and not mask.all():
  809. _fill_limit_area_1d(mask, limit_area)
  810. algos.backfill_inplace(values, mask, limit=limit)
  811. return values, mask
  812. @_datetimelike_compat
  813. def _pad_2d(
  814. values: np.ndarray,
  815. limit: int | None = None,
  816. limit_area: Literal["inside", "outside"] | None = None,
  817. mask: npt.NDArray[np.bool_] | None = None,
  818. ):
  819. mask = _fillna_prep(values, mask)
  820. if limit_area is not None:
  821. _fill_limit_area_2d(mask, limit_area)
  822. if values.size:
  823. algos.pad_2d_inplace(values, mask, limit=limit)
  824. else:
  825. # for test coverage
  826. pass
  827. return values, mask
  828. @_datetimelike_compat
  829. def _backfill_2d(
  830. values,
  831. limit: int | None = None,
  832. limit_area: Literal["inside", "outside"] | None = None,
  833. mask: npt.NDArray[np.bool_] | None = None,
  834. ):
  835. mask = _fillna_prep(values, mask)
  836. if limit_area is not None:
  837. _fill_limit_area_2d(mask, limit_area)
  838. if values.size:
  839. algos.backfill_2d_inplace(values, mask, limit=limit)
  840. else:
  841. # for test coverage
  842. pass
  843. return values, mask
  844. def _fill_limit_area_1d(
  845. mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"]
  846. ) -> None:
  847. """Prepare 1d mask for ffill/bfill with limit_area.
  848. Caller is responsible for checking at least one value of mask is False.
  849. When called, mask will no longer faithfully represent when
  850. the corresponding are NA or not.
  851. Parameters
  852. ----------
  853. mask : np.ndarray[bool, ndim=1]
  854. Mask representing NA values when filling.
  855. limit_area : { "outside", "inside" }
  856. Whether to limit filling to outside or inside the outer most non-NA value.
  857. """
  858. neg_mask = ~mask
  859. first = neg_mask.argmax()
  860. last = len(neg_mask) - neg_mask[::-1].argmax() - 1
  861. if limit_area == "inside":
  862. mask[:first] = False
  863. mask[last + 1 :] = False
  864. elif limit_area == "outside":
  865. mask[first + 1 : last] = False
  866. def _fill_limit_area_2d(
  867. mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"]
  868. ) -> None:
  869. """Prepare 2d mask for ffill/bfill with limit_area.
  870. When called, mask will no longer faithfully represent when
  871. the corresponding are NA or not.
  872. Parameters
  873. ----------
  874. mask : np.ndarray[bool, ndim=1]
  875. Mask representing NA values when filling.
  876. limit_area : { "outside", "inside" }
  877. Whether to limit filling to outside or inside the outer most non-NA value.
  878. """
  879. neg_mask = ~mask.T
  880. if limit_area == "outside":
  881. # Identify inside
  882. la_mask = (
  883. np.maximum.accumulate(neg_mask, axis=0)
  884. & np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1]
  885. )
  886. else:
  887. # Identify outside
  888. la_mask = (
  889. ~np.maximum.accumulate(neg_mask, axis=0)
  890. | ~np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1]
  891. )
  892. mask[la_mask.T] = False
  893. _fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d}
  894. def get_fill_func(method, ndim: int = 1):
  895. method = clean_fill_method(method)
  896. if ndim == 1:
  897. return _fill_methods[method]
  898. return {"pad": _pad_2d, "backfill": _backfill_2d}[method]
  899. def clean_reindex_fill_method(method) -> ReindexMethod | None:
  900. if method is None:
  901. return None
  902. return clean_fill_method(method, allow_nearest=True)
  903. def _interp_limit(
  904. invalid: npt.NDArray[np.bool_], fw_limit: int | None, bw_limit: int | None
  905. ):
  906. """
  907. Get indexers of values that won't be filled
  908. because they exceed the limits.
  909. Parameters
  910. ----------
  911. invalid : np.ndarray[bool]
  912. fw_limit : int or None
  913. forward limit to index
  914. bw_limit : int or None
  915. backward limit to index
  916. Returns
  917. -------
  918. set of indexers
  919. Notes
  920. -----
  921. This is equivalent to the more readable, but slower
  922. .. code-block:: python
  923. def _interp_limit(invalid, fw_limit, bw_limit):
  924. for x in np.where(invalid)[0]:
  925. if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
  926. yield x
  927. """
  928. # handle forward first; the backward direction is the same except
  929. # 1. operate on the reversed array
  930. # 2. subtract the returned indices from N - 1
  931. N = len(invalid)
  932. f_idx = set()
  933. b_idx = set()
  934. def inner(invalid, limit: int):
  935. limit = min(limit, N)
  936. windowed = _rolling_window(invalid, limit + 1).all(1)
  937. idx = set(np.where(windowed)[0] + limit) | set(
  938. np.where((~invalid[: limit + 1]).cumsum() == 0)[0]
  939. )
  940. return idx
  941. if fw_limit is not None:
  942. if fw_limit == 0:
  943. f_idx = set(np.where(invalid)[0])
  944. else:
  945. f_idx = inner(invalid, fw_limit)
  946. if bw_limit is not None:
  947. if bw_limit == 0:
  948. # then we don't even need to care about backwards
  949. # just use forwards
  950. return f_idx
  951. else:
  952. b_idx_inv = list(inner(invalid[::-1], bw_limit))
  953. b_idx = set(N - 1 - np.asarray(b_idx_inv))
  954. if fw_limit == 0:
  955. return b_idx
  956. return f_idx & b_idx
  957. def _rolling_window(a: npt.NDArray[np.bool_], window: int) -> npt.NDArray[np.bool_]:
  958. """
  959. [True, True, False, True, False], 2 ->
  960. [
  961. [True, True],
  962. [True, False],
  963. [False, True],
  964. [True, False],
  965. ]
  966. """
  967. # https://stackoverflow.com/a/6811241
  968. shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
  969. strides = a.strides + (a.strides[-1],)
  970. return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)