test_array.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519
  1. import datetime
  2. import decimal
  3. import re
  4. import numpy as np
  5. import pytest
  6. import pytz
  7. from pandas._config import using_string_dtype
  8. import pandas as pd
  9. import pandas._testing as tm
  10. from pandas.api.extensions import register_extension_dtype
  11. from pandas.arrays import (
  12. BooleanArray,
  13. DatetimeArray,
  14. FloatingArray,
  15. IntegerArray,
  16. IntervalArray,
  17. SparseArray,
  18. TimedeltaArray,
  19. )
  20. from pandas.core.arrays import (
  21. NumpyExtensionArray,
  22. period_array,
  23. )
  24. from pandas.tests.extension.decimal import (
  25. DecimalArray,
  26. DecimalDtype,
  27. to_decimal,
  28. )
  @pytest.mark.parametrize("dtype_unit", ["M8[h]", "M8[m]", "m8[h]", "m8[m]"])
  def test_dt64_array(dtype_unit):
      """
      Unsupported datetime64/timedelta64 resolutions emit a FutureWarning.

      Hour and minute units are exercised for both the datetime64 ("M8") and
      the timedelta64 ("m8") kinds.
      """
      # PR 53817
      dtype_var = np.dtype(dtype_unit)
      msg = (
          r"datetime64 and timedelta64 dtype resolutions other than "
          r"'s', 'ms', 'us', and 'ns' are deprecated. "
          r"In future releases passing unsupported resolutions will "
          r"raise an exception."
      )
      # The message contains regex metacharacters ('.'), so it is escaped
      # before being used as a match pattern.
      with tm.assert_produces_warning(FutureWarning, match=re.escape(msg)):
          pd.array([], dtype=dtype_var)
  @pytest.mark.parametrize(
      "data, dtype, expected",
      [
          # Basic NumPy defaults.
          ([], None, FloatingArray._from_sequence([], dtype="Float64")),
          ([1, 2], None, IntegerArray._from_sequence([1, 2], dtype="Int64")),
          ([1, 2], object, NumpyExtensionArray(np.array([1, 2], dtype=object))),
          (
              [1, 2],
              np.dtype("float32"),
              NumpyExtensionArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
          ),
          (
              np.array([], dtype=object),
              None,
              NumpyExtensionArray(np.array([], dtype=object)),
          ),
          (
              np.array([1, 2], dtype="int64"),
              None,
              IntegerArray._from_sequence([1, 2], dtype="Int64"),
          ),
          (
              np.array([1.0, 2.0], dtype="float64"),
              None,
              FloatingArray._from_sequence([1.0, 2.0], dtype="Float64"),
          ),
          # String alias passes through to NumPy
          ([1, 2], "float32", NumpyExtensionArray(np.array([1, 2], dtype="float32"))),
          ([1, 2], "int64", NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
          # GH#44715 FloatingArray does not support float16, so fall
          #  back to NumpyExtensionArray
          (
              np.array([1, 2], dtype=np.float16),
              None,
              NumpyExtensionArray(np.array([1, 2], dtype=np.float16)),
          ),
          # idempotency with e.g. pd.array(pd.array([1, 2], dtype="int64"))
          (
              NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
              None,
              NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
          ),
          # Period alias
          (
              [pd.Period("2000", "D"), pd.Period("2001", "D")],
              "Period[D]",
              period_array(["2000", "2001"], freq="D"),
          ),
          # Period dtype
          (
              [pd.Period("2000", "D")],
              pd.PeriodDtype("D"),
              period_array(["2000"], freq="D"),
          ),
          # Datetime (naive)
          (
              [1, 2],
              np.dtype("datetime64[ns]"),
              DatetimeArray._from_sequence(
                  np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
              ),
          ),
          (
              [1, 2],
              np.dtype("datetime64[s]"),
              DatetimeArray._from_sequence(
                  np.array([1, 2], dtype="M8[s]"), dtype="M8[s]"
              ),
          ),
          (
              np.array([1, 2], dtype="datetime64[ns]"),
              None,
              DatetimeArray._from_sequence(
                  np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
              ),
          ),
          (
              pd.DatetimeIndex(["2000", "2001"]),
              np.dtype("datetime64[ns]"),
              DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
          ),
          (
              pd.DatetimeIndex(["2000", "2001"]),
              None,
              DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
          ),
          (
              ["2000", "2001"],
              np.dtype("datetime64[ns]"),
              DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
          ),
          # Datetime (tz-aware)
          (
              ["2000", "2001"],
              pd.DatetimeTZDtype(tz="CET"),
              DatetimeArray._from_sequence(
                  ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
              ),
          ),
          # Timedelta
          (
              ["1h", "2h"],
              np.dtype("timedelta64[ns]"),
              TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
          ),
          (
              pd.TimedeltaIndex(["1h", "2h"]),
              np.dtype("timedelta64[ns]"),
              TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
          ),
          (
              np.array([1, 2], dtype="m8[s]"),
              np.dtype("timedelta64[s]"),
              TimedeltaArray._from_sequence(
                  np.array([1, 2], dtype="m8[s]"), dtype="m8[s]"
              ),
          ),
          (
              pd.TimedeltaIndex(["1h", "2h"]),
              None,
              TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
          ),
          (
              # preserve non-nano, i.e. don't cast to NumpyExtensionArray
              TimedeltaArray._simple_new(
                  np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
              ),
              None,
              TimedeltaArray._simple_new(
                  np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
              ),
          ),
          (
              # preserve non-nano, i.e. don't cast to NumpyExtensionArray
              TimedeltaArray._simple_new(
                  np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
              ),
              np.dtype("m8[s]"),
              TimedeltaArray._simple_new(
                  np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
              ),
          ),
          # Category
          (["a", "b"], "category", pd.Categorical(["a", "b"])),
          (
              ["a", "b"],
              pd.CategoricalDtype(None, ordered=True),
              pd.Categorical(["a", "b"], ordered=True),
          ),
          # Interval
          (
              [pd.Interval(1, 2), pd.Interval(3, 4)],
              "interval",
              IntervalArray.from_tuples([(1, 2), (3, 4)]),
          ),
          # Sparse
          ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")),
          # IntegerNA
          # The expected value is built directly from IntegerArray rather than
          # through pd.array, so the case does not compare pd.array with itself.
          ([1, None], "Int16", IntegerArray._from_sequence([1, None], dtype="Int16")),
          (
              pd.Series([1, 2]),
              None,
              NumpyExtensionArray(np.array([1, 2], dtype=np.int64)),
          ),
          # String
          (
              ["a", None],
              "string",
              pd.StringDtype()
              .construct_array_type()
              ._from_sequence(["a", None], dtype=pd.StringDtype()),
          ),
          (
              # The expected result for the "str" alias depends on whether the
              # string-dtype option is enabled.
              ["a", None],
              "str",
              pd.StringDtype(na_value=np.nan)
              .construct_array_type()
              ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan))
              if using_string_dtype()
              else NumpyExtensionArray(np.array(["a", "None"])),
          ),
          (
              ["a", None],
              pd.StringDtype(),
              pd.StringDtype()
              .construct_array_type()
              ._from_sequence(["a", None], dtype=pd.StringDtype()),
          ),
          (
              ["a", None],
              pd.StringDtype(na_value=np.nan),
              pd.StringDtype(na_value=np.nan)
              .construct_array_type()
              ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)),
          ),
          (
              # numpy array with string dtype
              np.array(["a", "b"], dtype=str),
              pd.StringDtype(),
              pd.StringDtype()
              .construct_array_type()
              ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
          ),
          (
              # numpy array with string dtype
              np.array(["a", "b"], dtype=str),
              pd.StringDtype(na_value=np.nan),
              pd.StringDtype(na_value=np.nan)
              .construct_array_type()
              ._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
          ),
          # Boolean
          (
              [True, None],
              "boolean",
              BooleanArray._from_sequence([True, None], dtype="boolean"),
          ),
          (
              [True, None],
              pd.BooleanDtype(),
              BooleanArray._from_sequence([True, None], dtype="boolean"),
          ),
          # Index
          (pd.Index([1, 2]), None, NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
          # Series[EA] returns the EA
          (
              pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])),
              None,
              pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
          ),
          # "3rd party" EAs work
          ([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal([0, 1])),
          # pass an ExtensionArray, but a different dtype
          (
              period_array(["2000", "2001"], freq="D"),
              "category",
              pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]),
          ),
      ],
  )
  def test_array(data, dtype, expected):
      """
      Check ``pd.array(data, dtype=dtype)`` against a directly built array.

      Each parametrized case supplies the input ``data``, the ``dtype``
      argument (``None`` when the dtype is left to inference) and the
      ``expected`` array, which is compared with ``tm.assert_equal``.
      """
      result = pd.array(data, dtype=dtype)
      tm.assert_equal(result, expected)
  def test_array_copy():
      """Check how the ``copy`` argument of ``pd.array`` affects memory sharing."""
      source = np.array([1, 2])

      # Leaving ``copy`` unspecified is expected to behave like ``copy=True``:
      # the result must not share memory with the input.
      default_result = pd.array(source, dtype=source.dtype)
      assert not tm.shares_memory(source, default_result)

      copied_result = pd.array(source, dtype=source.dtype, copy=True)
      assert not tm.shares_memory(source, copied_result)

      # With ``copy=False`` the result is expected to reuse the input's memory.
      shared_result = pd.array(source, dtype=source.dtype, copy=False)
      assert tm.shares_memory(source, shared_result)
  # pytz timezone object shared by the tz-aware ``datetime.datetime`` case below.
  cet = pytz.timezone("CET")


  @pytest.mark.parametrize(
      "data, expected",
      [
          # period
          (
              [pd.Period("2000", "D"), pd.Period("2001", "D")],
              period_array(["2000", "2001"], freq="D"),
          ),
          # interval
          ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2])),
          # datetime: Timestamp objects, stdlib datetimes, and M8 ndarrays
          (
              [pd.Timestamp("2000"), pd.Timestamp("2001")],
              DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
          ),
          (
              [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)],
              DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
          ),
          (
              np.array([1, 2], dtype="M8[ns]"),
              DatetimeArray._from_sequence(np.array([1, 2], dtype="M8[ns]")),
          ),
          (
              np.array([1, 2], dtype="M8[us]"),
              DatetimeArray._simple_new(
                  np.array([1, 2], dtype="M8[us]"), dtype=np.dtype("M8[us]")
              ),
          ),
          # datetimetz: timezone given by name and as a pytz object
          (
              [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")],
              DatetimeArray._from_sequence(
                  ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="ns")
              ),
          ),
          (
              [
                  datetime.datetime(2000, 1, 1, tzinfo=cet),
                  datetime.datetime(2001, 1, 1, tzinfo=cet),
              ],
              DatetimeArray._from_sequence(
                  ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="ns")
              ),
          ),
          # timedelta
          (
              [pd.Timedelta("1h"), pd.Timedelta("2h")],
              TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
          ),
          (
              np.array([1, 2], dtype="m8[ns]"),
              TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[ns]")),
          ),
          (
              np.array([1, 2], dtype="m8[us]"),
              TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[us]")),
          ),
          # integer, with each supported missing-value marker
          ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")),
          ([1, None], IntegerArray._from_sequence([1, None], dtype="Int64")),
          ([1, pd.NA], IntegerArray._from_sequence([1, pd.NA], dtype="Int64")),
          ([1, np.nan], IntegerArray._from_sequence([1, np.nan], dtype="Int64")),
          # float
          ([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2], dtype="Float64")),
          ([0.1, None], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
          ([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
          ([0.1, pd.NA], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
          # integer-like float
          ([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
          ([1.0, None], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
          ([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
          ([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
          # mixed-integer-float
          ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
          (
              [1, np.nan, 2.0],
              FloatingArray._from_sequence([1.0, None, 2.0], dtype="Float64"),
          ),
          # string
          (
              ["a", "b"],
              pd.StringDtype()
              .construct_array_type()
              ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
          ),
          (
              ["a", None],
              pd.StringDtype()
              .construct_array_type()
              ._from_sequence(["a", None], dtype=pd.StringDtype()),
          ),
          (
              # numpy array with string dtype
              np.array(["a", "b"], dtype=str),
              pd.StringDtype()
              .construct_array_type()
              ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
          ),
          # Boolean
          ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")),
          ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")),
      ],
  )
  def test_array_inference(data, expected):
      """
      Check the array type ``pd.array`` picks when no ``dtype`` is passed.

      Each case pairs input ``data`` with the ``expected`` extension array,
      compared with ``tm.assert_equal``.
      """
      inferred = pd.array(data)
      tm.assert_equal(inferred, expected)
  @pytest.mark.parametrize(
      "data",
      [
          # mix of frequencies
          [pd.Period("2000", "D"), pd.Period("2001", "Y")],
          # mix of closed
          [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")],
          # Mix of timezones
          [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")],
          # Mix of tz-aware and tz-naive
          [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")],
          np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]),
      ],
  )
  def test_array_inference_fails(data):
      """
      Heterogeneous inputs are expected to come back as object dtype.

      For each mixed input, ``pd.array`` must equal a ``NumpyExtensionArray``
      wrapping the same data as an object-dtype ndarray.
      """
      inferred = pd.array(data)
      object_fallback = NumpyExtensionArray(np.array(data, dtype=object))
      tm.assert_extension_array_equal(inferred, object_fallback)
  @pytest.mark.parametrize("data", [np.array(0)])
  def test_nd_raises(data):
      """A zero-dimensional ndarray passed to ``pd.array`` must raise ValueError."""
      expected_msg = "NumpyExtensionArray must be 1-dimensional"
      with pytest.raises(ValueError, match=expected_msg):
          pd.array(data, dtype="int64")
  def test_scalar_raises():
      """Passing a bare scalar to ``pd.array`` must raise ValueError."""
      expected_msg = "Cannot pass scalar '1'"
      with pytest.raises(ValueError, match=expected_msg):
          pd.array(1)
  def test_dataframe_raises():
      """Passing a DataFrame to ``pd.array`` must raise TypeError."""
      # GH#51167 don't accidentally cast to StringArray by doing inference on columns
      frame = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
      with pytest.raises(TypeError, match="Cannot pass DataFrame to 'pandas.array'"):
          pd.array(frame)
  def test_bounds_check():
      """A negative value requested as ``UInt16`` must raise TypeError."""
      # GH21796
      # The message names int32 or int64 as the source type, so the pattern
      # accepts either.
      unsafe_cast_msg = r"cannot safely cast non-equivalent int(32|64) to uint16"
      with pytest.raises(TypeError, match=unsafe_cast_msg):
          pd.array([-1, 2, 3], dtype="UInt16")
  # ---------------------------------------------------------------------------
  # A couple of dummy classes used to check that Series and Index objects are
  # unboxed before they reach the extension array classes.


  @register_extension_dtype
  class DecimalDtype2(DecimalDtype):
      # Registered under its own name so it can be requested as dtype="decimal2".
      name = "decimal2"

      @classmethod
      def construct_array_type(cls):
          """
          Return the array class paired with this dtype.

          Returns
          -------
          type
          """
          return DecimalArray2
  class DecimalArray2(DecimalArray):
      # DecimalArray variant whose constructor refuses Series/Index input, so a
      # caller that forgets to unbox is detected.

      @classmethod
      def _from_sequence(cls, scalars, *, dtype=None, copy=False):
          """
          Build the array from ``scalars``, rejecting boxed containers.

          Raises
          ------
          TypeError
              If ``scalars`` is a ``pd.Series`` or ``pd.Index``.
          """
          is_boxed = isinstance(scalars, (pd.Series, pd.Index))
          if not is_boxed:
              return super()._from_sequence(scalars, dtype=dtype, copy=copy)
          raise TypeError("scalars should not be of type pd.Series or pd.Index")
  def test_array_unboxes(index_or_series):
      """
      ``pd.array`` must hand the unboxed values to the extension array class.

      ``index_or_series`` is the constructor used to box the data.
      """
      boxed = index_or_series([decimal.Decimal("1"), decimal.Decimal("2")])
      decimal2 = DecimalDtype2()

      # Sanity check: the dummy array rejects boxed input when called directly,
      # so the pd.array call below can only succeed if the data was unboxed.
      rejection_msg = "scalars should not be of type pd.Series or pd.Index"
      with pytest.raises(TypeError, match=rejection_msg):
          DecimalArray2._from_sequence(boxed, dtype=decimal2)

      unboxed_result = pd.array(boxed, dtype="decimal2")
      from_values = DecimalArray2._from_sequence(boxed.values, dtype=decimal2)
      tm.assert_equal(unboxed_result, from_values)
  def test_array_to_numpy_na():
      """Check ``to_numpy`` with ``na_value=True`` and ``dtype=bool`` on a string array."""
      # GH#40638
      string_arr = pd.array([pd.NA, 1], dtype="string[python]")
      as_bool = string_arr.to_numpy(na_value=True, dtype=bool)
      # Both positions are expected to come out True.
      tm.assert_numpy_array_equal(as_bool, np.array([True, True]))