array.py 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273
"""
Test extension array for storing nested data in a pandas container.

The JSONArray stores lists of dictionaries. The storage mechanism is a list,
not an ndarray.

Note
----
We currently store lists of UserDicts. Pandas has a few places
internally that specifically check for dicts, and does non-scalar things
in that case. We *want* the dictionaries to be treated as scalars, so we
hack around pandas by using UserDicts.
"""
  12. from __future__ import annotations
  13. from collections import (
  14. UserDict,
  15. abc,
  16. )
  17. import itertools
  18. import numbers
  19. import string
  20. import sys
  21. from typing import (
  22. TYPE_CHECKING,
  23. Any,
  24. )
  25. import warnings
  26. import numpy as np
  27. from pandas.util._exceptions import find_stack_level
  28. from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
  29. from pandas.core.dtypes.common import (
  30. is_bool_dtype,
  31. is_list_like,
  32. pandas_dtype,
  33. )
  34. import pandas as pd
  35. from pandas.api.extensions import (
  36. ExtensionArray,
  37. ExtensionDtype,
  38. )
  39. from pandas.core.indexers import unpack_tuple_and_ellipses
  40. if TYPE_CHECKING:
  41. from collections.abc import Mapping
  42. from pandas._typing import type_t
  43. class JSONDtype(ExtensionDtype):
  44. type = abc.Mapping
  45. name = "json"
  46. na_value: Mapping[str, Any] = UserDict()
  47. @classmethod
  48. def construct_array_type(cls) -> type_t[JSONArray]:
  49. """
  50. Return the array type associated with this dtype.
  51. Returns
  52. -------
  53. type
  54. """
  55. return JSONArray
  56. class JSONArray(ExtensionArray):
  57. dtype = JSONDtype()
  58. __array_priority__ = 1000
  59. def __init__(self, values, dtype=None, copy=False) -> None:
  60. for val in values:
  61. if not isinstance(val, self.dtype.type):
  62. raise TypeError("All values must be of type " + str(self.dtype.type))
  63. self.data = values
  64. # Some aliases for common attribute names to ensure pandas supports
  65. # these
  66. self._items = self._data = self.data
  67. # those aliases are currently not working due to assumptions
  68. # in internal code (GH-20735)
  69. # self._values = self.values = self.data
  70. @classmethod
  71. def _from_sequence(cls, scalars, *, dtype=None, copy=False):
  72. return cls(scalars)
  73. @classmethod
  74. def _from_factorized(cls, values, original):
  75. return cls([UserDict(x) for x in values if x != ()])
  76. def __getitem__(self, item):
  77. if isinstance(item, tuple):
  78. item = unpack_tuple_and_ellipses(item)
  79. if isinstance(item, numbers.Integral):
  80. return self.data[item]
  81. elif isinstance(item, slice) and item == slice(None):
  82. # Make sure we get a view
  83. return type(self)(self.data)
  84. elif isinstance(item, slice):
  85. # slice
  86. return type(self)(self.data[item])
  87. elif not is_list_like(item):
  88. # e.g. "foo" or 2.5
  89. # exception message copied from numpy
  90. raise IndexError(
  91. r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
  92. r"(`None`) and integer or boolean arrays are valid indices"
  93. )
  94. else:
  95. item = pd.api.indexers.check_array_indexer(self, item)
  96. if is_bool_dtype(item.dtype):
  97. return type(self)._from_sequence(
  98. [x for x, m in zip(self, item) if m], dtype=self.dtype
  99. )
  100. # integer
  101. return type(self)([self.data[i] for i in item])
  102. def __setitem__(self, key, value) -> None:
  103. if isinstance(key, numbers.Integral):
  104. self.data[key] = value
  105. else:
  106. if not isinstance(value, (type(self), abc.Sequence)):
  107. # broadcast value
  108. value = itertools.cycle([value])
  109. if isinstance(key, np.ndarray) and key.dtype == "bool":
  110. # masking
  111. for i, (k, v) in enumerate(zip(key, value)):
  112. if k:
  113. assert isinstance(v, self.dtype.type)
  114. self.data[i] = v
  115. else:
  116. for k, v in zip(key, value):
  117. assert isinstance(v, self.dtype.type)
  118. self.data[k] = v
  119. def __len__(self) -> int:
  120. return len(self.data)
  121. def __eq__(self, other):
  122. return NotImplemented
  123. def __ne__(self, other):
  124. return NotImplemented
  125. def __array__(self, dtype=None, copy=None):
  126. if copy is False:
  127. warnings.warn(
  128. "Starting with NumPy 2.0, the behavior of the 'copy' keyword has "
  129. "changed and passing 'copy=False' raises an error when returning "
  130. "a zero-copy NumPy array is not possible. pandas will follow "
  131. "this behavior starting with pandas 3.0.\nThis conversion to "
  132. "NumPy requires a copy, but 'copy=False' was passed. Consider "
  133. "using 'np.asarray(..)' instead.",
  134. FutureWarning,
  135. stacklevel=find_stack_level(),
  136. )
  137. if dtype is None:
  138. dtype = object
  139. if dtype == object:
  140. # on py38 builds it looks like numpy is inferring to a non-1D array
  141. return construct_1d_object_array_from_listlike(list(self))
  142. if copy is None:
  143. # Note: branch avoids `copy=None` for NumPy 1.x support
  144. return np.asarray(self.data, dtype=dtype)
  145. return np.asarray(self.data, dtype=dtype, copy=copy)
  146. @property
  147. def nbytes(self) -> int:
  148. return sys.getsizeof(self.data)
  149. def isna(self):
  150. return np.array([x == self.dtype.na_value for x in self.data], dtype=bool)
  151. def take(self, indexer, allow_fill=False, fill_value=None):
  152. # re-implement here, since NumPy has trouble setting
  153. # sized objects like UserDicts into scalar slots of
  154. # an ndarary.
  155. indexer = np.asarray(indexer)
  156. msg = (
  157. "Index is out of bounds or cannot do a "
  158. "non-empty take from an empty array."
  159. )
  160. if allow_fill:
  161. if fill_value is None:
  162. fill_value = self.dtype.na_value
  163. # bounds check
  164. if (indexer < -1).any():
  165. raise ValueError
  166. try:
  167. output = [
  168. self.data[loc] if loc != -1 else fill_value for loc in indexer
  169. ]
  170. except IndexError as err:
  171. raise IndexError(msg) from err
  172. else:
  173. try:
  174. output = [self.data[loc] for loc in indexer]
  175. except IndexError as err:
  176. raise IndexError(msg) from err
  177. return type(self)._from_sequence(output, dtype=self.dtype)
  178. def copy(self):
  179. return type(self)(self.data[:])
  180. def astype(self, dtype, copy=True):
  181. # NumPy has issues when all the dicts are the same length.
  182. # np.array([UserDict(...), UserDict(...)]) fails,
  183. # but np.array([{...}, {...}]) works, so cast.
  184. from pandas.core.arrays.string_ import StringDtype
  185. dtype = pandas_dtype(dtype)
  186. # needed to add this check for the Series constructor
  187. if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
  188. if copy:
  189. return self.copy()
  190. return self
  191. elif isinstance(dtype, StringDtype):
  192. arr_cls = dtype.construct_array_type()
  193. return arr_cls._from_sequence(self, dtype=dtype, copy=False)
  194. elif not copy:
  195. return np.asarray([dict(x) for x in self], dtype=dtype)
  196. else:
  197. return np.array([dict(x) for x in self], dtype=dtype, copy=copy)
  198. def unique(self):
  199. # Parent method doesn't work since np.array will try to infer
  200. # a 2-dim object.
  201. return type(self)([dict(x) for x in {tuple(d.items()) for d in self.data}])
  202. @classmethod
  203. def _concat_same_type(cls, to_concat):
  204. data = list(itertools.chain.from_iterable(x.data for x in to_concat))
  205. return cls(data)
  206. def _values_for_factorize(self):
  207. frozen = self._values_for_argsort()
  208. if len(frozen) == 0:
  209. # factorize_array expects 1-d array, this is a len-0 2-d array.
  210. frozen = frozen.ravel()
  211. return frozen, ()
  212. def _values_for_argsort(self):
  213. # Bypass NumPy's shape inference to get a (N,) array of tuples.
  214. frozen = [tuple(x.items()) for x in self]
  215. return construct_1d_object_array_from_listlike(frozen)
  216. def _pad_or_backfill(self, *, method, limit=None, copy=True):
  217. # GH#56616 - test EA method without limit_area argument
  218. return super()._pad_or_backfill(method=method, limit=limit, copy=copy)
  219. def make_data():
  220. # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
  221. rng = np.random.default_rng(2)
  222. return [
  223. UserDict(
  224. [
  225. (rng.choice(list(string.ascii_letters)), rng.integers(0, 100))
  226. for _ in range(rng.integers(0, 10))
  227. ]
  228. )
  229. for _ in range(100)
  230. ]