construction.py 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821
  1. """
  2. Constructor functions intended to be shared by pd.array, Series.__init__,
  3. and Index.__new__.
  4. These should not depend on core.internals.
  5. """
  6. from __future__ import annotations
  7. from collections.abc import Sequence
  8. from typing import (
  9. TYPE_CHECKING,
  10. Optional,
  11. Union,
  12. cast,
  13. overload,
  14. )
  15. import warnings
  16. import numpy as np
  17. from numpy import ma
  18. from pandas._config import using_string_dtype
  19. from pandas._libs import lib
  20. from pandas._libs.tslibs import (
  21. Period,
  22. get_supported_dtype,
  23. is_supported_dtype,
  24. )
  25. from pandas._typing import (
  26. AnyArrayLike,
  27. ArrayLike,
  28. Dtype,
  29. DtypeObj,
  30. T,
  31. )
  32. from pandas.util._exceptions import find_stack_level
  33. from pandas.core.dtypes.base import ExtensionDtype
  34. from pandas.core.dtypes.cast import (
  35. construct_1d_arraylike_from_scalar,
  36. construct_1d_object_array_from_listlike,
  37. maybe_cast_to_datetime,
  38. maybe_cast_to_integer_array,
  39. maybe_convert_platform,
  40. maybe_infer_to_datetimelike,
  41. maybe_promote,
  42. )
  43. from pandas.core.dtypes.common import (
  44. is_list_like,
  45. is_object_dtype,
  46. is_string_dtype,
  47. pandas_dtype,
  48. )
  49. from pandas.core.dtypes.dtypes import NumpyEADtype
  50. from pandas.core.dtypes.generic import (
  51. ABCDataFrame,
  52. ABCExtensionArray,
  53. ABCIndex,
  54. ABCSeries,
  55. )
  56. from pandas.core.dtypes.missing import isna
  57. import pandas.core.common as com
  58. if TYPE_CHECKING:
  59. from pandas import (
  60. Index,
  61. Series,
  62. )
  63. from pandas.core.arrays.base import ExtensionArray
def array(
    data: Sequence[object] | AnyArrayLike,
    dtype: Dtype | None = None,
    copy: bool = True,
) -> ExtensionArray:
    """
    Create an array.

    Parameters
    ----------
    data : Sequence of objects
        The scalars inside `data` should be instances of the
        scalar type for `dtype`. It's expected that `data`
        represents a 1-dimensional array of data.

        When `data` is an Index or Series, the underlying array
        will be extracted from `data`.

    dtype : str, np.dtype, or ExtensionDtype, optional
        The dtype to use for the array. This may be a NumPy
        dtype or an extension type registered with pandas using
        :meth:`pandas.api.extensions.register_extension_dtype`.

        If not specified, there are two possibilities:

        1. When `data` is a :class:`Series`, :class:`Index`, or
           :class:`ExtensionArray`, the `dtype` will be taken
           from the data.
        2. Otherwise, pandas will attempt to infer the `dtype`
           from the data.

        Note that when `data` is a NumPy array, ``data.dtype`` is
        *not* used for inferring the array type. This is because
        NumPy cannot represent all the types of data that can be
        held in extension arrays.

        Currently, pandas will infer an extension dtype for sequences of

        ============================== =======================================
        Scalar Type                    Array Type
        ============================== =======================================
        :class:`pandas.Interval`       :class:`pandas.arrays.IntervalArray`
        :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
        :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
        :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
        :class:`int`                   :class:`pandas.arrays.IntegerArray`
        :class:`float`                 :class:`pandas.arrays.FloatingArray`
        :class:`str`                   :class:`pandas.arrays.StringArray` or
                                       :class:`pandas.arrays.ArrowStringArray`
        :class:`bool`                  :class:`pandas.arrays.BooleanArray`
        ============================== =======================================

        The ExtensionArray created when the scalar type is :class:`str` is
        determined by ``pd.options.mode.string_storage`` if the dtype is not
        explicitly given.

        For all other cases, NumPy's usual inference rules will be used.
    copy : bool, default True
        Whether to copy the data, even if not necessary. Depending
        on the type of `data`, creating the new array may require
        copying data, even if ``copy=False``.

    Returns
    -------
    ExtensionArray
        The newly created array.

    Raises
    ------
    ValueError
        When `data` is not 1-dimensional (e.g. a scalar is passed).
    TypeError
        When `data` is a DataFrame.

    See Also
    --------
    numpy.array : Construct a NumPy array.
    Series : Construct a pandas Series.
    Index : Construct a pandas Index.
    arrays.NumpyExtensionArray : ExtensionArray wrapping a NumPy array.
    Series.array : Extract the array stored within a Series.

    Notes
    -----
    Omitting the `dtype` argument means pandas will attempt to infer the
    best array type from the values in the data. As new array types are
    added by pandas and 3rd party libraries, the "best" array type may
    change. We recommend specifying `dtype` to ensure that

    1. the correct array type for the data is returned
    2. the returned array type doesn't change as new extension types
       are added by pandas and third-party libraries

    Additionally, if the underlying memory representation of the returned
    array matters, we recommend specifying the `dtype` as a concrete object
    rather than a string alias or allowing it to be inferred. For example,
    a future version of pandas or a 3rd-party library may include a
    dedicated ExtensionArray for string data. In this event, the following
    would no longer return a :class:`arrays.NumpyExtensionArray` backed by a
    NumPy array.

    >>> pd.array(['a', 'b'], dtype=str)
    <NumpyExtensionArray>
    ['a', 'b']
    Length: 2, dtype: str32

    This would instead return the new ExtensionArray dedicated for string
    data. If you really need the new array to be backed by a NumPy array,
    specify that in the dtype.

    >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
    <NumpyExtensionArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Finally, Pandas has arrays that mostly overlap with NumPy

      * :class:`arrays.DatetimeArray`
      * :class:`arrays.TimedeltaArray`

    When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
    passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray``
    rather than a ``NumpyExtensionArray``. This is for symmetry with the case of
    timezone-aware data, which NumPy does not natively support.

    >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
    <DatetimeArray>
    ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
    Length: 2, dtype: datetime64[ns]

    >>> pd.array(["1h", "2h"], dtype='timedelta64[ns]')
    <TimedeltaArray>
    ['0 days 01:00:00', '0 days 02:00:00']
    Length: 2, dtype: timedelta64[ns]

    Examples
    --------
    If a dtype is not specified, pandas will infer the best dtype from the values.
    See the description of `dtype` for the types pandas infers for.

    >>> pd.array([1, 2])
    <IntegerArray>
    [1, 2]
    Length: 2, dtype: Int64

    >>> pd.array([1, 2, np.nan])
    <IntegerArray>
    [1, 2, <NA>]
    Length: 3, dtype: Int64

    >>> pd.array([1.1, 2.2])
    <FloatingArray>
    [1.1, 2.2]
    Length: 2, dtype: Float64

    >>> pd.array(["a", None, "c"])
    <StringArray>
    ['a', <NA>, 'c']
    Length: 3, dtype: string

    >>> with pd.option_context("string_storage", "pyarrow"):
    ...     arr = pd.array(["a", None, "c"])
    ...
    >>> arr
    <ArrowStringArray>
    ['a', <NA>, 'c']
    Length: 3, dtype: string

    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
    <PeriodArray>
    ['2000-01-01', '2000-01-01']
    Length: 2, dtype: period[D]

    You can use the string alias for `dtype`

    >>> pd.array(['a', 'b', 'a'], dtype='category')
    ['a', 'b', 'a']
    Categories (2, object): ['a', 'b']

    Or specify the actual dtype

    >>> pd.array(['a', 'b', 'a'],
    ...          dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
    ['a', 'b', 'a']
    Categories (3, object): ['a' < 'b' < 'c']

    If pandas does not infer a dedicated extension type a
    :class:`arrays.NumpyExtensionArray` is returned.

    >>> pd.array([1 + 1j, 3 + 2j])
    <NumpyExtensionArray>
    [(1+1j), (3+2j)]
    Length: 2, dtype: complex128

    As mentioned in the "Notes" section, new extension types may be added
    in the future (by pandas or 3rd party libraries), causing the return
    value to no longer be a :class:`arrays.NumpyExtensionArray`. Specify the
    `dtype` as a NumPy dtype if you need to ensure there's no future change in
    behavior.

    >>> pd.array([1, 2], dtype=np.dtype("int32"))
    <NumpyExtensionArray>
    [1, 2]
    Length: 2, dtype: int32

    `data` must be 1-dimensional. A ValueError is raised when the input
    has the wrong dimensionality.

    >>> pd.array(1)
    Traceback (most recent call last):
      ...
    ValueError: Cannot pass scalar '1' to 'pandas.array'.
    """
    # Array classes are imported inside the function rather than at module
    # level (presumably to avoid a circular import -- confirm before moving).
    from pandas.core.arrays import (
        BooleanArray,
        DatetimeArray,
        ExtensionArray,
        FloatingArray,
        IntegerArray,
        IntervalArray,
        NumpyExtensionArray,
        PeriodArray,
        TimedeltaArray,
    )
    from pandas.core.arrays.string_ import StringDtype

    # Reject inputs that are not 1-dimensional containers up front.
    if lib.is_scalar(data):
        msg = f"Cannot pass scalar '{data}' to 'pandas.array'."
        raise ValueError(msg)
    elif isinstance(data, ABCDataFrame):
        raise TypeError("Cannot pass DataFrame to 'pandas.array'")

    if dtype is None and isinstance(data, (ABCSeries, ABCIndex, ExtensionArray)):
        # Note: we exclude np.ndarray here, will do type inference on it
        dtype = data.dtype

    # Unbox Series/Index to their backing array; with extract_numpy=True a
    # NumpyExtensionArray is further unwrapped to its ndarray.
    data = extract_array(data, extract_numpy=True)

    # this returns None for not-found dtypes.
    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if isinstance(data, ExtensionArray) and (dtype is None or data.dtype == dtype):
        # e.g. TimedeltaArray[s], avoid casting to NumpyExtensionArray
        if copy:
            return data.copy()
        return data

    # An explicit extension dtype decides the array class directly.
    if isinstance(dtype, ExtensionDtype):
        cls = dtype.construct_array_type()
        return cls._from_sequence(data, dtype=dtype, copy=copy)

    # No dtype requested: infer from the values. Branches are checked in
    # order and the first match returns; anything unmatched falls through
    # to the NumpyExtensionArray return at the bottom.
    if dtype is None:
        inferred_dtype = lib.infer_dtype(data, skipna=True)
        if inferred_dtype == "period":
            period_data = cast(Union[Sequence[Optional[Period]], AnyArrayLike], data)
            return PeriodArray._from_sequence(period_data, copy=copy)

        elif inferred_dtype == "interval":
            return IntervalArray(data, copy=copy)

        elif inferred_dtype.startswith("datetime"):
            # datetime, datetime64
            try:
                return DatetimeArray._from_sequence(data, copy=copy)
            except ValueError:
                # Mixture of timezones, fall back to NumpyExtensionArray
                pass

        elif inferred_dtype.startswith("timedelta"):
            # timedelta, timedelta64
            return TimedeltaArray._from_sequence(data, copy=copy)

        elif inferred_dtype == "string":
            # StringArray/ArrowStringArray depending on pd.options.mode.string_storage
            dtype = StringDtype()
            cls = dtype.construct_array_type()
            return cls._from_sequence(data, dtype=dtype, copy=copy)

        elif inferred_dtype == "integer":
            return IntegerArray._from_sequence(data, copy=copy)
        elif inferred_dtype == "empty" and not hasattr(data, "dtype") and not len(data):
            # A dtype-less empty container (e.g. an empty list) becomes a
            # FloatingArray.
            return FloatingArray._from_sequence(data, copy=copy)
        elif (
            inferred_dtype in ("floating", "mixed-integer-float")
            and getattr(data, "dtype", None) != np.float16
        ):
            # GH#44715 Exclude np.float16 bc FloatingArray does not support it;
            #  we will fall back to NumpyExtensionArray.
            return FloatingArray._from_sequence(data, copy=copy)

        elif inferred_dtype == "boolean":
            return BooleanArray._from_sequence(data, dtype="boolean", copy=copy)

    # Pandas overrides NumPy for
    #   1. datetime64[ns,us,ms,s]
    #   2. timedelta64[ns,us,ms,s]
    # so that a DatetimeArray is returned.
    if lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype):
        return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
    if lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
        return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)

    elif lib.is_np_dtype(dtype, "mM"):
        # Reached only for datetime64/timedelta64 dtypes that failed the
        # is_supported_dtype checks above: warn, then still build the array.
        warnings.warn(
            r"datetime64 and timedelta64 dtype resolutions other than "
            r"'s', 'ms', 'us', and 'ns' are deprecated. "
            r"In future releases passing unsupported resolutions will "
            r"raise an exception.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    # Fallback: wrap whatever remains in a NumpyExtensionArray.
    return NumpyExtensionArray._from_sequence(data, dtype=dtype, copy=copy)
  318. _typs = frozenset(
  319. {
  320. "index",
  321. "rangeindex",
  322. "multiindex",
  323. "datetimeindex",
  324. "timedeltaindex",
  325. "periodindex",
  326. "categoricalindex",
  327. "intervalindex",
  328. "series",
  329. }
  330. )
  331. @overload
  332. def extract_array(
  333. obj: Series | Index, extract_numpy: bool = ..., extract_range: bool = ...
  334. ) -> ArrayLike:
  335. ...
  336. @overload
  337. def extract_array(
  338. obj: T, extract_numpy: bool = ..., extract_range: bool = ...
  339. ) -> T | ArrayLike:
  340. ...
  341. def extract_array(
  342. obj: T, extract_numpy: bool = False, extract_range: bool = False
  343. ) -> T | ArrayLike:
  344. """
  345. Extract the ndarray or ExtensionArray from a Series or Index.
  346. For all other types, `obj` is just returned as is.
  347. Parameters
  348. ----------
  349. obj : object
  350. For Series / Index, the underlying ExtensionArray is unboxed.
  351. extract_numpy : bool, default False
  352. Whether to extract the ndarray from a NumpyExtensionArray.
  353. extract_range : bool, default False
  354. If we have a RangeIndex, return range._values if True
  355. (which is a materialized integer ndarray), otherwise return unchanged.
  356. Returns
  357. -------
  358. arr : object
  359. Examples
  360. --------
  361. >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category'))
  362. ['a', 'b', 'c']
  363. Categories (3, object): ['a', 'b', 'c']
  364. Other objects like lists, arrays, and DataFrames are just passed through.
  365. >>> extract_array([1, 2, 3])
  366. [1, 2, 3]
  367. For an ndarray-backed Series / Index the ndarray is returned.
  368. >>> extract_array(pd.Series([1, 2, 3]))
  369. array([1, 2, 3])
  370. To extract all the way down to the ndarray, pass ``extract_numpy=True``.
  371. >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True)
  372. array([1, 2, 3])
  373. """
  374. typ = getattr(obj, "_typ", None)
  375. if typ in _typs:
  376. # i.e. isinstance(obj, (ABCIndex, ABCSeries))
  377. if typ == "rangeindex":
  378. if extract_range:
  379. # error: "T" has no attribute "_values"
  380. return obj._values # type: ignore[attr-defined]
  381. return obj
  382. # error: "T" has no attribute "_values"
  383. return obj._values # type: ignore[attr-defined]
  384. elif extract_numpy and typ == "npy_extension":
  385. # i.e. isinstance(obj, ABCNumpyExtensionArray)
  386. # error: "T" has no attribute "to_numpy"
  387. return obj.to_numpy() # type: ignore[attr-defined]
  388. return obj
  389. def ensure_wrapped_if_datetimelike(arr):
  390. """
  391. Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray.
  392. """
  393. if isinstance(arr, np.ndarray):
  394. if arr.dtype.kind == "M":
  395. from pandas.core.arrays import DatetimeArray
  396. dtype = get_supported_dtype(arr.dtype)
  397. return DatetimeArray._from_sequence(arr, dtype=dtype)
  398. elif arr.dtype.kind == "m":
  399. from pandas.core.arrays import TimedeltaArray
  400. dtype = get_supported_dtype(arr.dtype)
  401. return TimedeltaArray._from_sequence(arr, dtype=dtype)
  402. return arr
  403. def sanitize_masked_array(data: ma.MaskedArray) -> np.ndarray:
  404. """
  405. Convert numpy MaskedArray to ensure mask is softened.
  406. """
  407. mask = ma.getmaskarray(data)
  408. if mask.any():
  409. dtype, fill_value = maybe_promote(data.dtype, np.nan)
  410. dtype = cast(np.dtype, dtype)
  411. data = ma.asarray(data.astype(dtype, copy=True))
  412. data.soften_mask() # set hardmask False if it was True
  413. data[mask] = fill_value
  414. else:
  415. data = data.copy()
  416. return data
def sanitize_array(
    data,
    index: Index | None,
    dtype: DtypeObj | None = None,
    copy: bool = False,
    *,
    allow_2d: bool = False,
) -> ArrayLike:
    """
    Sanitize input data to an ndarray or ExtensionArray, copy if specified,
    coerce to the dtype if specified.

    Parameters
    ----------
    data : Any
        Handled inputs include numpy masked arrays, Index/Series, ``range``,
        non-list-like scalars, ExtensionArrays, ndarrays, objects exposing
        ``__array__``, and other iterables (which are materialized to a list).
    index : Index or None, default None
        Required when `data` is not list-like; its length determines how
        many times the scalar is repeated.
    dtype : np.dtype, ExtensionDtype, or None, default None
    copy : bool, default False
    allow_2d : bool, default False
        If False, raise if we have a 2D Arraylike.

    Returns
    -------
    np.ndarray or ExtensionArray

    Raises
    ------
    ValueError
        If `data` is not list-like and `index` is None.
    """
    # Remember what the caller asked for; `dtype` may be rewritten below.
    original_dtype = dtype
    if isinstance(data, ma.MaskedArray):
        data = sanitize_masked_array(data)

    if isinstance(dtype, NumpyEADtype):
        # Avoid ending up with a NumpyExtensionArray
        dtype = dtype.numpy_dtype

    # Record whether the input was an object-dtype Index with no dtype
    # requested; consulted in the ndarray branch below.
    object_index = False
    if isinstance(data, ABCIndex) and data.dtype == object and dtype is None:
        object_index = True

    # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray
    data = extract_array(data, extract_numpy=True, extract_range=True)

    if isinstance(data, np.ndarray) and data.ndim == 0:
        # 0-dim ndarray: unwrap to a scalar, keeping its dtype if none given.
        if dtype is None:
            dtype = data.dtype
        data = lib.item_from_zerodim(data)
    elif isinstance(data, range):
        # GH#16804
        data = range_to_ndarray(data)
        # range_to_ndarray built a fresh array, so no further copy is needed.
        copy = False

    if not is_list_like(data):
        # Scalar input: broadcast to the length of `index`.
        if index is None:
            raise ValueError("index must be specified when data is not list-like")
        if isinstance(data, str) and using_string_dtype() and original_dtype is None:
            from pandas.core.arrays.string_ import StringDtype

            dtype = StringDtype(na_value=np.nan)
        data = construct_1d_arraylike_from_scalar(data, len(index), dtype)

        return data

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a NumpyExtensionArray
        # Until GH#49309 is fixed this check needs to come before the
        #  ExtensionDtype check
        if dtype is not None:
            subarr = data.astype(dtype, copy=copy)
        elif copy:
            subarr = data.copy()
        else:
            subarr = data

    elif isinstance(dtype, ExtensionDtype):
        # create an extension array from its dtype
        _sanitize_non_ordered(data)
        cls = dtype.construct_array_type()
        if not hasattr(data, "__array__"):
            data = list(data)
        subarr = cls._from_sequence(data, dtype=dtype, copy=copy)

    # GH#846
    elif isinstance(data, np.ndarray):
        if isinstance(data, np.matrix):
            data = data.A

        if dtype is None:
            subarr = data
            if data.dtype == object:
                subarr = maybe_infer_to_datetimelike(data)
                if object_index and using_string_dtype() and is_string_dtype(subarr):
                    # Avoid inference when string option is set
                    subarr = data
            elif data.dtype.kind == "U" and using_string_dtype():
                from pandas.core.arrays.string_ import StringDtype

                dtype = StringDtype(na_value=np.nan)
                subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)

            # Copy only when requested and either no conversion happened
            # (subarr is still the input) or the result is a python-storage
            # "str" array.
            if (
                subarr is data
                or (subarr.dtype == "str" and subarr.dtype.storage == "python")  # type: ignore[union-attr]
            ) and copy:
                subarr = subarr.copy()

        else:
            # we will try to copy by-definition here
            subarr = _try_cast(data, dtype, copy)

    elif hasattr(data, "__array__"):
        # e.g. dask array GH#38645
        if not copy:
            data = np.asarray(data)
        else:
            data = np.array(data, copy=copy)
        # Re-enter with the materialized ndarray; copy=False because any
        # requested copy was already made just above.
        return sanitize_array(
            data,
            index=index,
            dtype=dtype,
            copy=False,
            allow_2d=allow_2d,
        )

    else:
        _sanitize_non_ordered(data)
        # materialize e.g. generators, convert e.g. tuples, abc.ValueView
        data = list(data)

        if len(data) == 0 and dtype is None:
            # We default to float64, matching numpy
            subarr = np.array([], dtype=np.float64)

        elif dtype is not None:
            subarr = _try_cast(data, dtype, copy)

        else:
            subarr = maybe_convert_platform(data)
            if subarr.dtype == object:
                subarr = cast(np.ndarray, subarr)
                subarr = maybe_infer_to_datetimelike(subarr)

    # Validate/adjust dimensionality (may repeat a length-1 result to
    # match `index`, see _sanitize_ndim).
    subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)

    if isinstance(subarr, np.ndarray):
        # at this point we should have dtype be None or subarr.dtype == dtype
        dtype = cast(np.dtype, dtype)
        subarr = _sanitize_str_dtypes(subarr, data, dtype, copy)

    return subarr
  540. def range_to_ndarray(rng: range) -> np.ndarray:
  541. """
  542. Cast a range object to ndarray.
  543. """
  544. # GH#30171 perf avoid realizing range as a list in np.array
  545. try:
  546. arr = np.arange(rng.start, rng.stop, rng.step, dtype="int64")
  547. except OverflowError:
  548. # GH#30173 handling for ranges that overflow int64
  549. if (rng.start >= 0 and rng.step > 0) or (rng.step < 0 <= rng.stop):
  550. try:
  551. arr = np.arange(rng.start, rng.stop, rng.step, dtype="uint64")
  552. except OverflowError:
  553. arr = construct_1d_object_array_from_listlike(list(rng))
  554. else:
  555. arr = construct_1d_object_array_from_listlike(list(rng))
  556. return arr
  557. def _sanitize_non_ordered(data) -> None:
  558. """
  559. Raise only for unordered sets, e.g., not for dict_keys
  560. """
  561. if isinstance(data, (set, frozenset)):
  562. raise TypeError(f"'{type(data).__name__}' type is unordered")
  563. def _sanitize_ndim(
  564. result: ArrayLike,
  565. data,
  566. dtype: DtypeObj | None,
  567. index: Index | None,
  568. *,
  569. allow_2d: bool = False,
  570. ) -> ArrayLike:
  571. """
  572. Ensure we have a 1-dimensional result array.
  573. """
  574. if getattr(result, "ndim", 0) == 0:
  575. raise ValueError("result should be arraylike with ndim > 0")
  576. if result.ndim == 1:
  577. # the result that we want
  578. result = _maybe_repeat(result, index)
  579. elif result.ndim > 1:
  580. if isinstance(data, np.ndarray):
  581. if allow_2d:
  582. return result
  583. raise ValueError(
  584. f"Data must be 1-dimensional, got ndarray of shape {data.shape} instead"
  585. )
  586. if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype):
  587. # i.e. NumpyEADtype("O")
  588. result = com.asarray_tuplesafe(data, dtype=np.dtype("object"))
  589. cls = dtype.construct_array_type()
  590. result = cls._from_sequence(result, dtype=dtype)
  591. else:
  592. # error: Argument "dtype" to "asarray_tuplesafe" has incompatible type
  593. # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[str,
  594. # dtype[Any], None]"
  595. result = com.asarray_tuplesafe(data, dtype=dtype) # type: ignore[arg-type]
  596. return result
  597. def _sanitize_str_dtypes(
  598. result: np.ndarray, data, dtype: np.dtype | None, copy: bool
  599. ) -> np.ndarray:
  600. """
  601. Ensure we have a dtype that is supported by pandas.
  602. """
  603. # This is to prevent mixed-type Series getting all casted to
  604. # NumPy string type, e.g. NaN --> '-1#IND'.
  605. if issubclass(result.dtype.type, str):
  606. # GH#16605
  607. # If not empty convert the data to dtype
  608. # GH#19853: If data is a scalar, result has already the result
  609. if not lib.is_scalar(data):
  610. if not np.all(isna(data)):
  611. data = np.asarray(data, dtype=dtype)
  612. if not copy:
  613. result = np.asarray(data, dtype=object)
  614. else:
  615. result = np.array(data, dtype=object, copy=copy)
  616. return result
  617. def _maybe_repeat(arr: ArrayLike, index: Index | None) -> ArrayLike:
  618. """
  619. If we have a length-1 array and an index describing how long we expect
  620. the result to be, repeat the array.
  621. """
  622. if index is not None:
  623. if 1 == len(arr) != len(index):
  624. arr = arr.repeat(len(index))
  625. return arr
  626. def _try_cast(
  627. arr: list | np.ndarray,
  628. dtype: np.dtype,
  629. copy: bool,
  630. ) -> ArrayLike:
  631. """
  632. Convert input to numpy ndarray and optionally cast to a given dtype.
  633. Parameters
  634. ----------
  635. arr : ndarray or list
  636. Excludes: ExtensionArray, Series, Index.
  637. dtype : np.dtype
  638. copy : bool
  639. If False, don't copy the data if not needed.
  640. Returns
  641. -------
  642. np.ndarray or ExtensionArray
  643. """
  644. is_ndarray = isinstance(arr, np.ndarray)
  645. if dtype == object:
  646. if not is_ndarray:
  647. subarr = construct_1d_object_array_from_listlike(arr)
  648. return subarr
  649. return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy)
  650. elif dtype.kind == "U":
  651. # TODO: test cases with arr.dtype.kind in "mM"
  652. if is_ndarray:
  653. arr = cast(np.ndarray, arr)
  654. shape = arr.shape
  655. if arr.ndim > 1:
  656. arr = arr.ravel()
  657. else:
  658. shape = (len(arr),)
  659. return lib.ensure_string_array(arr, convert_na_value=False, copy=copy).reshape(
  660. shape
  661. )
  662. elif dtype.kind in "mM":
  663. return maybe_cast_to_datetime(arr, dtype)
  664. # GH#15832: Check if we are requesting a numeric dtype and
  665. # that we can convert the data to the requested dtype.
  666. elif dtype.kind in "iu":
  667. # this will raise if we have e.g. floats
  668. subarr = maybe_cast_to_integer_array(arr, dtype)
  669. elif not copy:
  670. subarr = np.asarray(arr, dtype=dtype)
  671. else:
  672. subarr = np.array(arr, dtype=dtype, copy=copy)
  673. return subarr