base.py 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400
  1. """
  2. Base and utility classes for pandas objects.
  3. """
  4. from __future__ import annotations
  5. import textwrap
  6. from typing import (
  7. TYPE_CHECKING,
  8. Any,
  9. Generic,
  10. Literal,
  11. cast,
  12. final,
  13. overload,
  14. )
  15. import warnings
  16. import numpy as np
  17. from pandas._config import using_copy_on_write
  18. from pandas._libs import lib
  19. from pandas._typing import (
  20. AxisInt,
  21. DtypeObj,
  22. IndexLabel,
  23. NDFrameT,
  24. Self,
  25. Shape,
  26. npt,
  27. )
  28. from pandas.compat import PYPY
  29. from pandas.compat.numpy import function as nv
  30. from pandas.errors import AbstractMethodError
  31. from pandas.util._decorators import (
  32. cache_readonly,
  33. doc,
  34. )
  35. from pandas.util._exceptions import find_stack_level
  36. from pandas.core.dtypes.cast import can_hold_element
  37. from pandas.core.dtypes.common import (
  38. is_object_dtype,
  39. is_scalar,
  40. )
  41. from pandas.core.dtypes.dtypes import ExtensionDtype
  42. from pandas.core.dtypes.generic import (
  43. ABCDataFrame,
  44. ABCIndex,
  45. ABCMultiIndex,
  46. ABCSeries,
  47. )
  48. from pandas.core.dtypes.missing import (
  49. isna,
  50. remove_na_arraylike,
  51. )
  52. from pandas.core import (
  53. algorithms,
  54. nanops,
  55. ops,
  56. )
  57. from pandas.core.accessor import DirNamesMixin
  58. from pandas.core.arraylike import OpsMixin
  59. from pandas.core.arrays import ExtensionArray
  60. from pandas.core.construction import (
  61. ensure_wrapped_if_datetimelike,
  62. extract_array,
  63. )
  64. if TYPE_CHECKING:
  65. from collections.abc import (
  66. Hashable,
  67. Iterator,
  68. )
  69. from pandas._typing import (
  70. DropKeep,
  71. NumpySorter,
  72. NumpyValueArrayLike,
  73. ScalarLike_co,
  74. )
  75. from pandas import (
  76. DataFrame,
  77. Index,
  78. Series,
  79. )
  80. _shared_docs: dict[str, str] = {}
  81. _indexops_doc_kwargs = {
  82. "klass": "IndexOpsMixin",
  83. "inplace": "",
  84. "unique": "IndexOpsMixin",
  85. "duplicated": "IndexOpsMixin",
  86. }
  87. class PandasObject(DirNamesMixin):
  88. """
  89. Baseclass for various pandas objects.
  90. """
  91. # results from calls to methods decorated with cache_readonly get added to _cache
  92. _cache: dict[str, Any]
  93. @property
  94. def _constructor(self):
  95. """
  96. Class constructor (for this class it's just `__class__`).
  97. """
  98. return type(self)
  99. def __repr__(self) -> str:
  100. """
  101. Return a string representation for a particular object.
  102. """
  103. # Should be overwritten by base classes
  104. return object.__repr__(self)
  105. def _reset_cache(self, key: str | None = None) -> None:
  106. """
  107. Reset cached properties. If ``key`` is passed, only clears that key.
  108. """
  109. if not hasattr(self, "_cache"):
  110. return
  111. if key is None:
  112. self._cache.clear()
  113. else:
  114. self._cache.pop(key, None)
  115. def __sizeof__(self) -> int:
  116. """
  117. Generates the total memory usage for an object that returns
  118. either a value or Series of values
  119. """
  120. memory_usage = getattr(self, "memory_usage", None)
  121. if memory_usage:
  122. mem = memory_usage(deep=True) # pylint: disable=not-callable
  123. return int(mem if is_scalar(mem) else mem.sum())
  124. # no memory_usage attribute, so fall back to object's 'sizeof'
  125. return super().__sizeof__()
  126. class NoNewAttributesMixin:
  127. """
  128. Mixin which prevents adding new attributes.
  129. Prevents additional attributes via xxx.attribute = "something" after a
  130. call to `self.__freeze()`. Mainly used to prevent the user from using
  131. wrong attributes on an accessor (`Series.cat/.str/.dt`).
  132. If you really want to add a new attribute at a later time, you need to use
  133. `object.__setattr__(self, key, value)`.
  134. """
  135. def _freeze(self) -> None:
  136. """
  137. Prevents setting additional attributes.
  138. """
  139. object.__setattr__(self, "__frozen", True)
  140. # prevent adding any attribute via s.xxx.new_attribute = ...
  141. def __setattr__(self, key: str, value) -> None:
  142. # _cache is used by a decorator
  143. # We need to check both 1.) cls.__dict__ and 2.) getattr(self, key)
  144. # because
  145. # 1.) getattr is false for attributes that raise errors
  146. # 2.) cls.__dict__ doesn't traverse into base classes
  147. if getattr(self, "__frozen", False) and not (
  148. key == "_cache"
  149. or key in type(self).__dict__
  150. or getattr(self, key, None) is not None
  151. ):
  152. raise AttributeError(f"You cannot add any new attribute '{key}'")
  153. object.__setattr__(self, key, value)
  154. class SelectionMixin(Generic[NDFrameT]):
  155. """
  156. mixin implementing the selection & aggregation interface on a group-like
  157. object sub-classes need to define: obj, exclusions
  158. """
  159. obj: NDFrameT
  160. _selection: IndexLabel | None = None
  161. exclusions: frozenset[Hashable]
  162. _internal_names = ["_cache", "__setstate__"]
  163. _internal_names_set = set(_internal_names)
  164. @final
  165. @property
  166. def _selection_list(self):
  167. if not isinstance(
  168. self._selection, (list, tuple, ABCSeries, ABCIndex, np.ndarray)
  169. ):
  170. return [self._selection]
  171. return self._selection
  172. @cache_readonly
  173. def _selected_obj(self):
  174. if self._selection is None or isinstance(self.obj, ABCSeries):
  175. return self.obj
  176. else:
  177. return self.obj[self._selection]
  178. @final
  179. @cache_readonly
  180. def ndim(self) -> int:
  181. return self._selected_obj.ndim
  182. @final
  183. @cache_readonly
  184. def _obj_with_exclusions(self):
  185. if isinstance(self.obj, ABCSeries):
  186. return self.obj
  187. if self._selection is not None:
  188. return self.obj._getitem_nocopy(self._selection_list)
  189. if len(self.exclusions) > 0:
  190. # equivalent to `self.obj.drop(self.exclusions, axis=1)
  191. # but this avoids consolidating and making a copy
  192. # TODO: following GH#45287 can we now use .drop directly without
  193. # making a copy?
  194. return self.obj._drop_axis(self.exclusions, axis=1, only_slice=True)
  195. else:
  196. return self.obj
  197. def __getitem__(self, key):
  198. if self._selection is not None:
  199. raise IndexError(f"Column(s) {self._selection} already selected")
  200. if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)):
  201. if len(self.obj.columns.intersection(key)) != len(set(key)):
  202. bad_keys = list(set(key).difference(self.obj.columns))
  203. raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}")
  204. return self._gotitem(list(key), ndim=2)
  205. else:
  206. if key not in self.obj:
  207. raise KeyError(f"Column not found: {key}")
  208. ndim = self.obj[key].ndim
  209. return self._gotitem(key, ndim=ndim)
  210. def _gotitem(self, key, ndim: int, subset=None):
  211. """
  212. sub-classes to define
  213. return a sliced object
  214. Parameters
  215. ----------
  216. key : str / list of selections
  217. ndim : {1, 2}
  218. requested ndim of result
  219. subset : object, default None
  220. subset to act on
  221. """
  222. raise AbstractMethodError(self)
  223. @final
  224. def _infer_selection(self, key, subset: Series | DataFrame):
  225. """
  226. Infer the `selection` to pass to our constructor in _gotitem.
  227. """
  228. # Shared by Rolling and Resample
  229. selection = None
  230. if subset.ndim == 2 and (
  231. (lib.is_scalar(key) and key in subset) or lib.is_list_like(key)
  232. ):
  233. selection = key
  234. elif subset.ndim == 1 and lib.is_scalar(key) and key == subset.name:
  235. selection = key
  236. return selection
  237. def aggregate(self, func, *args, **kwargs):
  238. raise AbstractMethodError(self)
  239. agg = aggregate
  240. class IndexOpsMixin(OpsMixin):
  241. """
  242. Common ops mixin to support a unified interface / docs for Series / Index
  243. """
  244. # ndarray compatibility
  245. __array_priority__ = 1000
  246. _hidden_attrs: frozenset[str] = frozenset(
  247. ["tolist"] # tolist is not deprecated, just suppressed in the __dir__
  248. )
  249. @property
  250. def dtype(self) -> DtypeObj:
  251. # must be defined here as a property for mypy
  252. raise AbstractMethodError(self)
  253. @property
  254. def _values(self) -> ExtensionArray | np.ndarray:
  255. # must be defined here as a property for mypy
  256. raise AbstractMethodError(self)
  257. @final
  258. def transpose(self, *args, **kwargs) -> Self:
  259. """
  260. Return the transpose, which is by definition self.
  261. Returns
  262. -------
  263. %(klass)s
  264. """
  265. nv.validate_transpose(args, kwargs)
  266. return self
  267. T = property(
  268. transpose,
  269. doc="""
  270. Return the transpose, which is by definition self.
  271. Examples
  272. --------
  273. For Series:
  274. >>> s = pd.Series(['Ant', 'Bear', 'Cow'])
  275. >>> s
  276. 0 Ant
  277. 1 Bear
  278. 2 Cow
  279. dtype: object
  280. >>> s.T
  281. 0 Ant
  282. 1 Bear
  283. 2 Cow
  284. dtype: object
  285. For Index:
  286. >>> idx = pd.Index([1, 2, 3])
  287. >>> idx.T
  288. Index([1, 2, 3], dtype='int64')
  289. """,
  290. )
  291. @property
  292. def shape(self) -> Shape:
  293. """
  294. Return a tuple of the shape of the underlying data.
  295. Examples
  296. --------
  297. >>> s = pd.Series([1, 2, 3])
  298. >>> s.shape
  299. (3,)
  300. """
  301. return self._values.shape
  302. def __len__(self) -> int:
  303. # We need this defined here for mypy
  304. raise AbstractMethodError(self)
  305. # Temporarily avoid using `-> Literal[1]:` because of an IPython (jedi) bug
  306. # https://github.com/ipython/ipython/issues/14412
  307. # https://github.com/davidhalter/jedi/issues/1990
  308. @property
  309. def ndim(self) -> int:
  310. """
  311. Number of dimensions of the underlying data, by definition 1.
  312. Examples
  313. --------
  314. >>> s = pd.Series(['Ant', 'Bear', 'Cow'])
  315. >>> s
  316. 0 Ant
  317. 1 Bear
  318. 2 Cow
  319. dtype: object
  320. >>> s.ndim
  321. 1
  322. For Index:
  323. >>> idx = pd.Index([1, 2, 3])
  324. >>> idx
  325. Index([1, 2, 3], dtype='int64')
  326. >>> idx.ndim
  327. 1
  328. """
  329. return 1
  330. @final
  331. def item(self):
  332. """
  333. Return the first element of the underlying data as a Python scalar.
  334. Returns
  335. -------
  336. scalar
  337. The first element of Series or Index.
  338. Raises
  339. ------
  340. ValueError
  341. If the data is not length = 1.
  342. Examples
  343. --------
  344. >>> s = pd.Series([1])
  345. >>> s.item()
  346. 1
  347. For an index:
  348. >>> s = pd.Series([1], index=['a'])
  349. >>> s.index.item()
  350. 'a'
  351. """
  352. if len(self) == 1:
  353. return next(iter(self))
  354. raise ValueError("can only convert an array of size 1 to a Python scalar")
  355. @property
  356. def nbytes(self) -> int:
  357. """
  358. Return the number of bytes in the underlying data.
  359. Examples
  360. --------
  361. For Series:
  362. >>> s = pd.Series(['Ant', 'Bear', 'Cow'])
  363. >>> s
  364. 0 Ant
  365. 1 Bear
  366. 2 Cow
  367. dtype: object
  368. >>> s.nbytes
  369. 24
  370. For Index:
  371. >>> idx = pd.Index([1, 2, 3])
  372. >>> idx
  373. Index([1, 2, 3], dtype='int64')
  374. >>> idx.nbytes
  375. 24
  376. """
  377. return self._values.nbytes
  378. @property
  379. def size(self) -> int:
  380. """
  381. Return the number of elements in the underlying data.
  382. Examples
  383. --------
  384. For Series:
  385. >>> s = pd.Series(['Ant', 'Bear', 'Cow'])
  386. >>> s
  387. 0 Ant
  388. 1 Bear
  389. 2 Cow
  390. dtype: object
  391. >>> s.size
  392. 3
  393. For Index:
  394. >>> idx = pd.Index([1, 2, 3])
  395. >>> idx
  396. Index([1, 2, 3], dtype='int64')
  397. >>> idx.size
  398. 3
  399. """
  400. return len(self._values)
  401. @property
  402. def array(self) -> ExtensionArray:
  403. """
  404. The ExtensionArray of the data backing this Series or Index.
  405. Returns
  406. -------
  407. ExtensionArray
  408. An ExtensionArray of the values stored within. For extension
  409. types, this is the actual array. For NumPy native types, this
  410. is a thin (no copy) wrapper around :class:`numpy.ndarray`.
  411. ``.array`` differs from ``.values``, which may require converting
  412. the data to a different form.
  413. See Also
  414. --------
  415. Index.to_numpy : Similar method that always returns a NumPy array.
  416. Series.to_numpy : Similar method that always returns a NumPy array.
  417. Notes
  418. -----
  419. This table lays out the different array types for each extension
  420. dtype within pandas.
  421. ================== =============================
  422. dtype array type
  423. ================== =============================
  424. category Categorical
  425. period PeriodArray
  426. interval IntervalArray
  427. IntegerNA IntegerArray
  428. string StringArray
  429. boolean BooleanArray
  430. datetime64[ns, tz] DatetimeArray
  431. ================== =============================
  432. For any 3rd-party extension types, the array type will be an
  433. ExtensionArray.
  434. For all remaining dtypes ``.array`` will be a
  435. :class:`arrays.NumpyExtensionArray` wrapping the actual ndarray
  436. stored within. If you absolutely need a NumPy array (possibly with
  437. copying / coercing data), then use :meth:`Series.to_numpy` instead.
  438. Examples
  439. --------
  440. For regular NumPy types like int, and float, a NumpyExtensionArray
  441. is returned.
  442. >>> pd.Series([1, 2, 3]).array
  443. <NumpyExtensionArray>
  444. [1, 2, 3]
  445. Length: 3, dtype: int64
  446. For extension types, like Categorical, the actual ExtensionArray
  447. is returned
  448. >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
  449. >>> ser.array
  450. ['a', 'b', 'a']
  451. Categories (2, object): ['a', 'b']
  452. """
  453. raise AbstractMethodError(self)
  454. @final
  455. def to_numpy(
  456. self,
  457. dtype: npt.DTypeLike | None = None,
  458. copy: bool = False,
  459. na_value: object = lib.no_default,
  460. **kwargs,
  461. ) -> np.ndarray:
  462. """
  463. A NumPy ndarray representing the values in this Series or Index.
  464. Parameters
  465. ----------
  466. dtype : str or numpy.dtype, optional
  467. The dtype to pass to :meth:`numpy.asarray`.
  468. copy : bool, default False
  469. Whether to ensure that the returned value is not a view on
  470. another array. Note that ``copy=False`` does not *ensure* that
  471. ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
  472. a copy is made, even if not strictly necessary.
  473. na_value : Any, optional
  474. The value to use for missing values. The default value depends
  475. on `dtype` and the type of the array.
  476. **kwargs
  477. Additional keywords passed through to the ``to_numpy`` method
  478. of the underlying array (for extension arrays).
  479. Returns
  480. -------
  481. numpy.ndarray
  482. See Also
  483. --------
  484. Series.array : Get the actual data stored within.
  485. Index.array : Get the actual data stored within.
  486. DataFrame.to_numpy : Similar method for DataFrame.
  487. Notes
  488. -----
  489. The returned array will be the same up to equality (values equal
  490. in `self` will be equal in the returned array; likewise for values
  491. that are not equal). When `self` contains an ExtensionArray, the
  492. dtype may be different. For example, for a category-dtype Series,
  493. ``to_numpy()`` will return a NumPy array and the categorical dtype
  494. will be lost.
  495. For NumPy dtypes, this will be a reference to the actual data stored
  496. in this Series or Index (assuming ``copy=False``). Modifying the result
  497. in place will modify the data stored in the Series or Index (not that
  498. we recommend doing that).
  499. For extension types, ``to_numpy()`` *may* require copying data and
  500. coercing the result to a NumPy type (possibly object), which may be
  501. expensive. When you need a no-copy reference to the underlying data,
  502. :attr:`Series.array` should be used instead.
  503. This table lays out the different dtypes and default return types of
  504. ``to_numpy()`` for various dtypes within pandas.
  505. ================== ================================
  506. dtype array type
  507. ================== ================================
  508. category[T] ndarray[T] (same dtype as input)
  509. period ndarray[object] (Periods)
  510. interval ndarray[object] (Intervals)
  511. IntegerNA ndarray[object]
  512. datetime64[ns] datetime64[ns]
  513. datetime64[ns, tz] ndarray[object] (Timestamps)
  514. ================== ================================
  515. Examples
  516. --------
  517. >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
  518. >>> ser.to_numpy()
  519. array(['a', 'b', 'a'], dtype=object)
  520. Specify the `dtype` to control how datetime-aware data is represented.
  521. Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
  522. objects, each with the correct ``tz``.
  523. >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
  524. >>> ser.to_numpy(dtype=object)
  525. array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
  526. Timestamp('2000-01-02 00:00:00+0100', tz='CET')],
  527. dtype=object)
  528. Or ``dtype='datetime64[ns]'`` to return an ndarray of native
  529. datetime64 values. The values are converted to UTC and the timezone
  530. info is dropped.
  531. >>> ser.to_numpy(dtype="datetime64[ns]")
  532. ... # doctest: +ELLIPSIS
  533. array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
  534. dtype='datetime64[ns]')
  535. """
  536. if isinstance(self.dtype, ExtensionDtype):
  537. return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs)
  538. elif kwargs:
  539. bad_keys = next(iter(kwargs.keys()))
  540. raise TypeError(
  541. f"to_numpy() got an unexpected keyword argument '{bad_keys}'"
  542. )
  543. fillna = (
  544. na_value is not lib.no_default
  545. # no need to fillna with np.nan if we already have a float dtype
  546. and not (na_value is np.nan and np.issubdtype(self.dtype, np.floating))
  547. )
  548. values = self._values
  549. if fillna:
  550. if not can_hold_element(values, na_value):
  551. # if we can't hold the na_value asarray either makes a copy or we
  552. # error before modifying values. The asarray later on thus won't make
  553. # another copy
  554. values = np.asarray(values, dtype=dtype)
  555. else:
  556. values = values.copy()
  557. values[np.asanyarray(isna(self))] = na_value
  558. result = np.asarray(values, dtype=dtype)
  559. if (copy and not fillna) or (not copy and using_copy_on_write()):
  560. if np.shares_memory(self._values[:2], result[:2]):
  561. # Take slices to improve performance of check
  562. if using_copy_on_write() and not copy:
  563. result = result.view()
  564. result.flags.writeable = False
  565. else:
  566. result = result.copy()
  567. return result
  568. @final
  569. @property
  570. def empty(self) -> bool:
  571. return not self.size
  572. @doc(op="max", oppose="min", value="largest")
  573. def argmax(
  574. self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs
  575. ) -> int:
  576. """
  577. Return int position of the {value} value in the Series.
  578. If the {op}imum is achieved in multiple locations,
  579. the first row position is returned.
  580. Parameters
  581. ----------
  582. axis : {{None}}
  583. Unused. Parameter needed for compatibility with DataFrame.
  584. skipna : bool, default True
  585. Exclude NA/null values when showing the result.
  586. *args, **kwargs
  587. Additional arguments and keywords for compatibility with NumPy.
  588. Returns
  589. -------
  590. int
  591. Row position of the {op}imum value.
  592. See Also
  593. --------
  594. Series.arg{op} : Return position of the {op}imum value.
  595. Series.arg{oppose} : Return position of the {oppose}imum value.
  596. numpy.ndarray.arg{op} : Equivalent method for numpy arrays.
  597. Series.idxmax : Return index label of the maximum values.
  598. Series.idxmin : Return index label of the minimum values.
  599. Examples
  600. --------
  601. Consider dataset containing cereal calories
  602. >>> s = pd.Series({{'Corn Flakes': 100.0, 'Almond Delight': 110.0,
  603. ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}})
  604. >>> s
  605. Corn Flakes 100.0
  606. Almond Delight 110.0
  607. Cinnamon Toast Crunch 120.0
  608. Cocoa Puff 110.0
  609. dtype: float64
  610. >>> s.argmax()
  611. 2
  612. >>> s.argmin()
  613. 0
  614. The maximum cereal calories is the third element and
  615. the minimum cereal calories is the first element,
  616. since series is zero-indexed.
  617. """
  618. delegate = self._values
  619. nv.validate_minmax_axis(axis)
  620. skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs)
  621. if isinstance(delegate, ExtensionArray):
  622. if not skipna and delegate.isna().any():
  623. warnings.warn(
  624. f"The behavior of {type(self).__name__}.argmax/argmin "
  625. "with skipna=False and NAs, or with all-NAs is deprecated. "
  626. "In a future version this will raise ValueError.",
  627. FutureWarning,
  628. stacklevel=find_stack_level(),
  629. )
  630. return -1
  631. else:
  632. return delegate.argmax()
  633. else:
  634. result = nanops.nanargmax(delegate, skipna=skipna)
  635. if result == -1:
  636. warnings.warn(
  637. f"The behavior of {type(self).__name__}.argmax/argmin "
  638. "with skipna=False and NAs, or with all-NAs is deprecated. "
  639. "In a future version this will raise ValueError.",
  640. FutureWarning,
  641. stacklevel=find_stack_level(),
  642. )
  643. # error: Incompatible return value type (got "Union[int, ndarray]", expected
  644. # "int")
  645. return result # type: ignore[return-value]
  646. @doc(argmax, op="min", oppose="max", value="smallest")
  647. def argmin(
  648. self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs
  649. ) -> int:
  650. delegate = self._values
  651. nv.validate_minmax_axis(axis)
  652. skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs)
  653. if isinstance(delegate, ExtensionArray):
  654. if not skipna and delegate.isna().any():
  655. warnings.warn(
  656. f"The behavior of {type(self).__name__}.argmax/argmin "
  657. "with skipna=False and NAs, or with all-NAs is deprecated. "
  658. "In a future version this will raise ValueError.",
  659. FutureWarning,
  660. stacklevel=find_stack_level(),
  661. )
  662. return -1
  663. else:
  664. return delegate.argmin()
  665. else:
  666. result = nanops.nanargmin(delegate, skipna=skipna)
  667. if result == -1:
  668. warnings.warn(
  669. f"The behavior of {type(self).__name__}.argmax/argmin "
  670. "with skipna=False and NAs, or with all-NAs is deprecated. "
  671. "In a future version this will raise ValueError.",
  672. FutureWarning,
  673. stacklevel=find_stack_level(),
  674. )
  675. # error: Incompatible return value type (got "Union[int, ndarray]", expected
  676. # "int")
  677. return result # type: ignore[return-value]
  678. def tolist(self):
  679. """
  680. Return a list of the values.
  681. These are each a scalar type, which is a Python scalar
  682. (for str, int, float) or a pandas scalar
  683. (for Timestamp/Timedelta/Interval/Period)
  684. Returns
  685. -------
  686. list
  687. See Also
  688. --------
  689. numpy.ndarray.tolist : Return the array as an a.ndim-levels deep
  690. nested list of Python scalars.
  691. Examples
  692. --------
  693. For Series
  694. >>> s = pd.Series([1, 2, 3])
  695. >>> s.to_list()
  696. [1, 2, 3]
  697. For Index:
  698. >>> idx = pd.Index([1, 2, 3])
  699. >>> idx
  700. Index([1, 2, 3], dtype='int64')
  701. >>> idx.to_list()
  702. [1, 2, 3]
  703. """
  704. return self._values.tolist()
  705. to_list = tolist
  706. def __iter__(self) -> Iterator:
  707. """
  708. Return an iterator of the values.
  709. These are each a scalar type, which is a Python scalar
  710. (for str, int, float) or a pandas scalar
  711. (for Timestamp/Timedelta/Interval/Period)
  712. Returns
  713. -------
  714. iterator
  715. Examples
  716. --------
  717. >>> s = pd.Series([1, 2, 3])
  718. >>> for x in s:
  719. ... print(x)
  720. 1
  721. 2
  722. 3
  723. """
  724. # We are explicitly making element iterators.
  725. if not isinstance(self._values, np.ndarray):
  726. # Check type instead of dtype to catch DTA/TDA
  727. return iter(self._values)
  728. else:
  729. return map(self._values.item, range(self._values.size))
  730. @cache_readonly
  731. def hasnans(self) -> bool:
  732. """
  733. Return True if there are any NaNs.
  734. Enables various performance speedups.
  735. Returns
  736. -------
  737. bool
  738. Examples
  739. --------
  740. >>> s = pd.Series([1, 2, 3, None])
  741. >>> s
  742. 0 1.0
  743. 1 2.0
  744. 2 3.0
  745. 3 NaN
  746. dtype: float64
  747. >>> s.hasnans
  748. True
  749. """
  750. # error: Item "bool" of "Union[bool, ndarray[Any, dtype[bool_]], NDFrame]"
  751. # has no attribute "any"
  752. return bool(isna(self).any()) # type: ignore[union-attr]
  753. @final
  754. def _map_values(self, mapper, na_action=None, convert: bool = True):
  755. """
  756. An internal function that maps values using the input
  757. correspondence (which can be a dict, Series, or function).
  758. Parameters
  759. ----------
  760. mapper : function, dict, or Series
  761. The input correspondence object
  762. na_action : {None, 'ignore'}
  763. If 'ignore', propagate NA values, without passing them to the
  764. mapping function
  765. convert : bool, default True
  766. Try to find better dtype for elementwise function results. If
  767. False, leave as dtype=object. Note that the dtype is always
  768. preserved for some extension array dtypes, such as Categorical.
  769. Returns
  770. -------
  771. Union[Index, MultiIndex], inferred
  772. The output of the mapping function applied to the index.
  773. If the function returns a tuple with more than one element
  774. a MultiIndex will be returned.
  775. """
  776. arr = self._values
  777. if isinstance(arr, ExtensionArray):
  778. return arr.map(mapper, na_action=na_action)
  779. return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)
  780. @final
  781. def value_counts(
  782. self,
  783. normalize: bool = False,
  784. sort: bool = True,
  785. ascending: bool = False,
  786. bins=None,
  787. dropna: bool = True,
  788. ) -> Series:
  789. """
  790. Return a Series containing counts of unique values.
  791. The resulting object will be in descending order so that the
  792. first element is the most frequently-occurring element.
  793. Excludes NA values by default.
  794. Parameters
  795. ----------
  796. normalize : bool, default False
  797. If True then the object returned will contain the relative
  798. frequencies of the unique values.
  799. sort : bool, default True
  800. Sort by frequencies when True. Preserve the order of the data when False.
  801. ascending : bool, default False
  802. Sort in ascending order.
  803. bins : int, optional
  804. Rather than count values, group them into half-open bins,
  805. a convenience for ``pd.cut``, only works with numeric data.
  806. dropna : bool, default True
  807. Don't include counts of NaN.
  808. Returns
  809. -------
  810. Series
  811. See Also
  812. --------
  813. Series.count: Number of non-NA elements in a Series.
  814. DataFrame.count: Number of non-NA elements in a DataFrame.
  815. DataFrame.value_counts: Equivalent method on DataFrames.
  816. Examples
  817. --------
  818. >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
  819. >>> index.value_counts()
  820. 3.0 2
  821. 1.0 1
  822. 2.0 1
  823. 4.0 1
  824. Name: count, dtype: int64
  825. With `normalize` set to `True`, returns the relative frequency by
  826. dividing all values by the sum of values.
  827. >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
  828. >>> s.value_counts(normalize=True)
  829. 3.0 0.4
  830. 1.0 0.2
  831. 2.0 0.2
  832. 4.0 0.2
  833. Name: proportion, dtype: float64
  834. **bins**
  835. Bins can be useful for going from a continuous variable to a
  836. categorical variable; instead of counting unique
  837. apparitions of values, divide the index in the specified
  838. number of half-open bins.
  839. >>> s.value_counts(bins=3)
  840. (0.996, 2.0] 2
  841. (2.0, 3.0] 2
  842. (3.0, 4.0] 1
  843. Name: count, dtype: int64
  844. **dropna**
  845. With `dropna` set to `False` we can also see NaN index values.
  846. >>> s.value_counts(dropna=False)
  847. 3.0 2
  848. 1.0 1
  849. 2.0 1
  850. 4.0 1
  851. NaN 1
  852. Name: count, dtype: int64
  853. """
  854. return algorithms.value_counts_internal(
  855. self,
  856. sort=sort,
  857. ascending=ascending,
  858. normalize=normalize,
  859. bins=bins,
  860. dropna=dropna,
  861. )
  862. def unique(self):
  863. values = self._values
  864. if not isinstance(values, np.ndarray):
  865. # i.e. ExtensionArray
  866. result = values.unique()
  867. else:
  868. result = algorithms.unique1d(values)
  869. return result
  870. @final
  871. def nunique(self, dropna: bool = True) -> int:
  872. """
  873. Return number of unique elements in the object.
  874. Excludes NA values by default.
  875. Parameters
  876. ----------
  877. dropna : bool, default True
  878. Don't include NaN in the count.
  879. Returns
  880. -------
  881. int
  882. See Also
  883. --------
  884. DataFrame.nunique: Method nunique for DataFrame.
  885. Series.count: Count non-NA/null observations in the Series.
  886. Examples
  887. --------
  888. >>> s = pd.Series([1, 3, 5, 7, 7])
  889. >>> s
  890. 0 1
  891. 1 3
  892. 2 5
  893. 3 7
  894. 4 7
  895. dtype: int64
  896. >>> s.nunique()
  897. 4
  898. """
  899. uniqs = self.unique()
  900. if dropna:
  901. uniqs = remove_na_arraylike(uniqs)
  902. return len(uniqs)
  903. @property
  904. def is_unique(self) -> bool:
  905. """
  906. Return boolean if values in the object are unique.
  907. Returns
  908. -------
  909. bool
  910. Examples
  911. --------
  912. >>> s = pd.Series([1, 2, 3])
  913. >>> s.is_unique
  914. True
  915. >>> s = pd.Series([1, 2, 3, 1])
  916. >>> s.is_unique
  917. False
  918. """
  919. return self.nunique(dropna=False) == len(self)
  920. @property
  921. def is_monotonic_increasing(self) -> bool:
  922. """
  923. Return boolean if values in the object are monotonically increasing.
  924. Returns
  925. -------
  926. bool
  927. Examples
  928. --------
  929. >>> s = pd.Series([1, 2, 2])
  930. >>> s.is_monotonic_increasing
  931. True
  932. >>> s = pd.Series([3, 2, 1])
  933. >>> s.is_monotonic_increasing
  934. False
  935. """
  936. from pandas import Index
  937. return Index(self).is_monotonic_increasing
  938. @property
  939. def is_monotonic_decreasing(self) -> bool:
  940. """
  941. Return boolean if values in the object are monotonically decreasing.
  942. Returns
  943. -------
  944. bool
  945. Examples
  946. --------
  947. >>> s = pd.Series([3, 2, 2, 1])
  948. >>> s.is_monotonic_decreasing
  949. True
  950. >>> s = pd.Series([1, 2, 3])
  951. >>> s.is_monotonic_decreasing
  952. False
  953. """
  954. from pandas import Index
  955. return Index(self).is_monotonic_decreasing
  956. @final
  957. def _memory_usage(self, deep: bool = False) -> int:
  958. """
  959. Memory usage of the values.
  960. Parameters
  961. ----------
  962. deep : bool, default False
  963. Introspect the data deeply, interrogate
  964. `object` dtypes for system-level memory consumption.
  965. Returns
  966. -------
  967. bytes used
  968. See Also
  969. --------
  970. numpy.ndarray.nbytes : Total bytes consumed by the elements of the
  971. array.
  972. Notes
  973. -----
  974. Memory usage does not include memory consumed by elements that
  975. are not components of the array if deep=False or if used on PyPy
  976. Examples
  977. --------
  978. >>> idx = pd.Index([1, 2, 3])
  979. >>> idx.memory_usage()
  980. 24
  981. """
  982. if hasattr(self.array, "memory_usage"):
  983. return self.array.memory_usage( # pyright: ignore[reportGeneralTypeIssues]
  984. deep=deep,
  985. )
  986. v = self.array.nbytes
  987. if deep and is_object_dtype(self.dtype) and not PYPY:
  988. values = cast(np.ndarray, self._values)
  989. v += lib.memory_usage_of_objects(values)
  990. return v
  991. @doc(
  992. algorithms.factorize,
  993. values="",
  994. order="",
  995. size_hint="",
  996. sort=textwrap.dedent(
  997. """\
  998. sort : bool, default False
  999. Sort `uniques` and shuffle `codes` to maintain the
  1000. relationship.
  1001. """
  1002. ),
  1003. )
  1004. def factorize(
  1005. self,
  1006. sort: bool = False,
  1007. use_na_sentinel: bool = True,
  1008. ) -> tuple[npt.NDArray[np.intp], Index]:
  1009. codes, uniques = algorithms.factorize(
  1010. self._values, sort=sort, use_na_sentinel=use_na_sentinel
  1011. )
  1012. if uniques.dtype == np.float16:
  1013. uniques = uniques.astype(np.float32)
  1014. if isinstance(self, ABCMultiIndex):
  1015. # preserve MultiIndex
  1016. uniques = self._constructor(uniques)
  1017. else:
  1018. from pandas import Index
  1019. try:
  1020. uniques = Index(uniques, dtype=self.dtype)
  1021. except NotImplementedError:
  1022. # not all dtypes are supported in Index that are allowed for Series
  1023. # e.g. float16 or bytes
  1024. uniques = Index(uniques)
  1025. return codes, uniques
# Documentation template for ``searchsorted``. It is passed to ``@doc``
# together with a ``klass`` value that fills the ``{klass}`` placeholders;
# literal braces in the text are therefore doubled (``{{`` / ``}}``).
_shared_docs[
    "searchsorted"
] = """
    Find indices where elements should be inserted to maintain order.

    Find the indices into a sorted {klass} `self` such that, if the
    corresponding elements in `value` were inserted before the indices,
    the order of `self` would be preserved.

    .. note::

        The {klass} *must* be monotonically sorted, otherwise
        wrong locations will likely be returned. Pandas does *not*
        check this for you.

    Parameters
    ----------
    value : array-like or scalar
        Values to insert into `self`.
    side : {{'left', 'right'}}, optional
        If 'left', the index of the first suitable location found is given.
        If 'right', return the last such index.  If there is no suitable
        index, return either 0 or N (where N is the length of `self`).
    sorter : 1-D array-like, optional
        Optional array of integer indices that sort `self` into ascending
        order. They are typically the result of ``np.argsort``.

    Returns
    -------
    int or array of int
        A scalar or array of insertion points with the
        same shape as `value`.

    See Also
    --------
    sort_values : Sort by the values along either axis.
    numpy.searchsorted : Similar method from NumPy.

    Notes
    -----
    Binary search is used to find the required insertion points.

    Examples
    --------
    >>> ser = pd.Series([1, 2, 3])
    >>> ser
    0    1
    1    2
    2    3
    dtype: int64

    >>> ser.searchsorted(4)
    3

    >>> ser.searchsorted([0, 4])
    array([0, 3])

    >>> ser.searchsorted([1, 3], side='left')
    array([0, 2])

    >>> ser.searchsorted([1, 3], side='right')
    array([1, 3])

    >>> ser = pd.Series(pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000']))
    >>> ser
    0   2000-03-11
    1   2000-03-12
    2   2000-03-13
    dtype: datetime64[ns]

    >>> ser.searchsorted('3/14/2000')
    3

    >>> ser = pd.Categorical(
    ...     ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True
    ... )
    >>> ser
    ['apple', 'bread', 'bread', 'cheese', 'milk']
    Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk']

    >>> ser.searchsorted('bread')
    1

    >>> ser.searchsorted(['bread'], side='right')
    array([3])

    If the values are not monotonically sorted, wrong locations
    may be returned:

    >>> ser = pd.Series([2, 1, 3])
    >>> ser
    0    2
    1    1
    2    3
    dtype: int64

    >>> ser.searchsorted(1)  # doctest: +SKIP
    0  # wrong result, correct would be 1
    """
  1105. # This overload is needed so that the call to searchsorted in
  1106. # pandas.core.resample.TimeGrouper._get_period_bins picks the correct result
  1107. # error: Overloaded function signatures 1 and 2 overlap with incompatible
  1108. # return types
  1109. @overload
  1110. def searchsorted( # type: ignore[overload-overlap]
  1111. self,
  1112. value: ScalarLike_co,
  1113. side: Literal["left", "right"] = ...,
  1114. sorter: NumpySorter = ...,
  1115. ) -> np.intp:
  1116. ...
  1117. @overload
  1118. def searchsorted(
  1119. self,
  1120. value: npt.ArrayLike | ExtensionArray,
  1121. side: Literal["left", "right"] = ...,
  1122. sorter: NumpySorter = ...,
  1123. ) -> npt.NDArray[np.intp]:
  1124. ...
  1125. @doc(_shared_docs["searchsorted"], klass="Index")
  1126. def searchsorted(
  1127. self,
  1128. value: NumpyValueArrayLike | ExtensionArray,
  1129. side: Literal["left", "right"] = "left",
  1130. sorter: NumpySorter | None = None,
  1131. ) -> npt.NDArray[np.intp] | np.intp:
  1132. if isinstance(value, ABCDataFrame):
  1133. msg = (
  1134. "Value must be 1-D array-like or scalar, "
  1135. f"{type(value).__name__} is not supported"
  1136. )
  1137. raise ValueError(msg)
  1138. values = self._values
  1139. if not isinstance(values, np.ndarray):
  1140. # Going through EA.searchsorted directly improves performance GH#38083
  1141. return values.searchsorted(value, side=side, sorter=sorter)
  1142. return algorithms.searchsorted(
  1143. values,
  1144. value,
  1145. side=side,
  1146. sorter=sorter,
  1147. )
  1148. def drop_duplicates(self, *, keep: DropKeep = "first"):
  1149. duplicated = self._duplicated(keep=keep)
  1150. # error: Value of type "IndexOpsMixin" is not indexable
  1151. return self[~duplicated] # type: ignore[index]
  1152. @final
  1153. def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
  1154. arr = self._values
  1155. if isinstance(arr, ExtensionArray):
  1156. return arr.duplicated(keep=keep)
  1157. return algorithms.duplicated(arr, keep=keep)
  1158. def _arith_method(self, other, op):
  1159. res_name = ops.get_op_result_name(self, other)
  1160. lvalues = self._values
  1161. rvalues = extract_array(other, extract_numpy=True, extract_range=True)
  1162. rvalues = ops.maybe_prepare_scalar_for_op(rvalues, lvalues.shape)
  1163. rvalues = ensure_wrapped_if_datetimelike(rvalues)
  1164. if isinstance(rvalues, range):
  1165. rvalues = np.arange(rvalues.start, rvalues.stop, rvalues.step)
  1166. with np.errstate(all="ignore"):
  1167. result = ops.arithmetic_op(lvalues, rvalues, op)
  1168. return self._construct_result(result, name=res_name)
  1169. def _construct_result(self, result, name):
  1170. """
  1171. Construct an appropriately-wrapped result from the ArrayLike result
  1172. of an arithmetic-like operation.
  1173. """
  1174. raise AbstractMethodError(self)