test_conversion.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596
  1. import numpy as np
  2. import pytest
  3. from pandas.compat import HAS_PYARROW
  4. from pandas.compat.numpy import np_version_gt2
  5. from pandas.core.dtypes.dtypes import DatetimeTZDtype
  6. import pandas as pd
  7. from pandas import (
  8. CategoricalIndex,
  9. Series,
  10. Timedelta,
  11. Timestamp,
  12. date_range,
  13. )
  14. import pandas._testing as tm
  15. from pandas.core.arrays import (
  16. DatetimeArray,
  17. IntervalArray,
  18. NumpyExtensionArray,
  19. PeriodArray,
  20. SparseArray,
  21. TimedeltaArray,
  22. )
  23. from pandas.core.arrays.string_ import StringArrayNumpySemantics
  24. from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
  class TestToIterable:
      """Check that iterating an Index/Series yields boxed Python/pandas scalars."""

      # (dtype, scalar type expected for each element when iterating)
      dtypes = [
          ("int8", int),
          ("int16", int),
          ("int32", int),
          ("int64", int),
          ("uint8", int),
          ("uint16", int),
          ("uint32", int),
          ("uint64", int),
          ("float16", float),
          ("float32", float),
          ("float64", float),
          ("datetime64[ns]", Timestamp),
          ("datetime64[ns, US/Eastern]", Timestamp),
          ("timedelta64[ns]", Timedelta),
      ]

      # The four spellings of "give me a list of elements" exercised by the
      # tests below; defined once so the parametrization (and its ids) cannot
      # drift apart between tests.
      _iteration_methods = pytest.mark.parametrize(
          "method",
          [
              lambda x: x.tolist(),
              lambda x: x.to_list(),
              lambda x: list(x),
              lambda x: list(x.__iter__()),
          ],
          ids=["tolist", "to_list", "list", "iter"],
      )

      @pytest.mark.parametrize("dtype, rdtype", dtypes)
      @_iteration_methods
      def test_iterable(self, index_or_series, method, dtype, rdtype):
          # gh-10904
          # gh-13258
          # coerce iteration to underlying python / pandas types
          typ = index_or_series
          if dtype == "float16" and issubclass(typ, pd.Index):
              # Constructing a float16 Index is expected to raise, so there is
              # nothing to iterate over in that combination.
              with pytest.raises(NotImplementedError, match="float16 indexes are not "):
                  typ([1], dtype=dtype)
              return
          s = typ([1], dtype=dtype)
          result = method(s)[0]
          assert isinstance(result, rdtype)

      @pytest.mark.parametrize(
          "dtype, rdtype, obj",
          [
              ("object", object, "a"),
              ("object", int, 1),
              ("category", object, "a"),
              ("category", int, 1),
          ],
      )
      @_iteration_methods
      def test_iterable_object_and_category(
          self, index_or_series, method, dtype, rdtype, obj
      ):
          # gh-10904
          # gh-13258
          # coerce iteration to underlying python / pandas types
          typ = index_or_series
          s = typ([obj], dtype=dtype)
          result = method(s)[0]
          assert isinstance(result, rdtype)

      @pytest.mark.parametrize("dtype, rdtype", dtypes)
      def test_iterable_items(self, dtype, rdtype):
          # gh-13258
          # test if items yields the correct boxed scalars
          # this only applies to series
          s = Series([1], dtype=dtype)
          # A single check is enough: the original repeated this exact
          # statement pair twice without exercising anything different.
          _, result = next(iter(s.items()))
          assert isinstance(result, rdtype)

      @pytest.mark.parametrize(
          "dtype, rdtype", dtypes + [("object", int), ("category", int)]
      )
      def test_iterable_map(self, index_or_series, dtype, rdtype):
          # gh-13236
          # coerce iteration to underlying python / pandas types
          typ = index_or_series
          if dtype == "float16" and issubclass(typ, pd.Index):
              with pytest.raises(NotImplementedError, match="float16 indexes are not "):
                  typ([1], dtype=dtype)
              return
          s = typ([1], dtype=dtype)
          # ``map(type)`` reports the type each element was boxed to.
          result = s.map(type)[0]
          if not isinstance(rdtype, tuple):
              rdtype = (rdtype,)
          assert result in rdtype

      @_iteration_methods
      def test_categorial_datetimelike(self, method):
          i = CategoricalIndex([Timestamp("1999-12-31"), Timestamp("2000-12-31")])
          result = method(i)[0]
          assert isinstance(result, Timestamp)

      def test_iter_box_dt64(self, unit):
          vals = [Timestamp("2011-01-01"), Timestamp("2011-01-02")]
          ser = Series(vals).dt.as_unit(unit)
          assert ser.dtype == f"datetime64[{unit}]"
          for res, exp in zip(ser, vals):
              assert isinstance(res, Timestamp)
              assert res.tz is None
              assert res == exp
              assert res.unit == unit

      def test_iter_box_dt64tz(self, unit):
          vals = [
              Timestamp("2011-01-01", tz="US/Eastern"),
              Timestamp("2011-01-02", tz="US/Eastern"),
          ]
          ser = Series(vals).dt.as_unit(unit)
          assert ser.dtype == f"datetime64[{unit}, US/Eastern]"
          for res, exp in zip(ser, vals):
              assert isinstance(res, Timestamp)
              assert res.tz == exp.tz
              assert res == exp
              assert res.unit == unit

      def test_iter_box_timedelta64(self, unit):
          # timedelta
          vals = [Timedelta("1 days"), Timedelta("2 days")]
          ser = Series(vals).dt.as_unit(unit)
          assert ser.dtype == f"timedelta64[{unit}]"
          for res, exp in zip(ser, vals):
              assert isinstance(res, Timedelta)
              assert res == exp
              assert res.unit == unit

      def test_iter_box_period(self):
          # period
          vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")]
          s = Series(vals)
          assert s.dtype == "Period[M]"
          for res, exp in zip(s, vals):
              assert isinstance(res, pd.Period)
              assert res.freq == "ME"
              assert res == exp
  @pytest.mark.parametrize(
      "arr, expected_type, dtype",
      [
          (np.array([0, 1], dtype=np.int64), np.ndarray, "int64"),
          (np.array(["a", "b"]), np.ndarray, "object"),
          (pd.Categorical(["a", "b"]), pd.Categorical, "category"),
          (
              pd.DatetimeIndex(["2017", "2018"], tz="US/Central"),
              DatetimeArray,
              "datetime64[ns, US/Central]",
          ),
          (
              pd.PeriodIndex([2018, 2019], freq="Y"),
              PeriodArray,
              pd.core.dtypes.dtypes.PeriodDtype("Y-DEC"),
          ),
          (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval"),
          (
              pd.DatetimeIndex(["2017", "2018"]),
              DatetimeArray,
              "datetime64[ns]",
          ),
          (
              pd.TimedeltaIndex([10**10]),
              TimedeltaArray,
              "m8[ns]",
          ),
      ],
  )
  def test_values_consistent(arr, expected_type, dtype, using_infer_string):
      """``Series._values`` and ``Index._values`` agree in type and content."""
      if using_infer_string and dtype == "object":
          # With string inference enabled the object case is expected to be
          # backed by a string array; which one depends on pyarrow availability.
          if HAS_PYARROW:
              expected_type = ArrowStringArrayNumpySemantics
          else:
              expected_type = StringArrayNumpySemantics

      from_series = Series(arr)._values
      from_index = pd.Index(arr)._values

      assert type(from_series) is expected_type
      assert type(from_series) is type(from_index)
      tm.assert_equal(from_series, from_index)
  @pytest.mark.parametrize("arr", [np.array([1, 2, 3])])
  def test_numpy_array(arr):
      """``Series.array`` of a NumPy-backed Series equals a NumpyExtensionArray."""
      wrapped = NumpyExtensionArray(arr)
      tm.assert_extension_array_equal(Series(arr).array, wrapped)
  def test_numpy_array_all_dtypes(any_numpy_dtype):
      """``Series.array`` picks the array class from the NumPy dtype kind."""
      ser = Series(dtype=any_numpy_dtype)
      result = ser.array

      # "M" is datetime64 and "m" is timedelta64; every other kind is wrapped
      # in a NumpyExtensionArray.
      special_cases = {"M": DatetimeArray, "m": TimedeltaArray}
      kind = np.dtype(any_numpy_dtype).kind
      expected_cls = special_cases.get(kind, NumpyExtensionArray)

      assert isinstance(result, expected_cls)
  @pytest.mark.parametrize(
      "arr, attr",
      [
          (pd.Categorical(["a", "b"]), "_codes"),
          (PeriodArray._from_sequence(["2000", "2001"], dtype="period[D]"), "_ndarray"),
          (pd.array([0, np.nan], dtype="Int64"), "_data"),
          (IntervalArray.from_breaks([0, 1]), "_left"),
          (SparseArray([0, 1]), "_sparse_values"),
          (
              DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")),
              "_ndarray",
          ),
          # tz-aware Datetime
          (
              DatetimeArray._from_sequence(
                  np.array(
                      ["2000-01-01T12:00:00", "2000-01-02T12:00:00"], dtype="M8[ns]"
                  ),
                  dtype=DatetimeTZDtype(tz="US/Central"),
              ),
              "_ndarray",
          ),
      ],
  )
  def test_array(arr, attr, index_or_series, request):
      """``.array`` of a box built with ``copy=False`` reuses the input's buffer.

      ``attr`` names the attribute holding the underlying data on each array
      type; identity is checked on that attribute when it is given, otherwise
      on the array object itself.
      """
      exposed = index_or_series(arr, copy=False).array
      if not attr:
          assert exposed is arr
          return

      original_data = getattr(arr, attr)
      exposed_data = getattr(exposed, attr)
      assert exposed_data is original_data
  def test_array_multiindex_raises():
      """Accessing ``.array`` on a MultiIndex raises ValueError."""
      midx = pd.MultiIndex.from_product([["A"], ["a", "b"]])
      with pytest.raises(ValueError, match="MultiIndex has no single backing array"):
          midx.array
  @pytest.mark.parametrize(
      "arr, expected, zero_copy",
      [
          (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64), True),
          (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object), False),
          (
              pd.core.arrays.period_array(["2000", "2001"], freq="D"),
              np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]),
              False,
          ),
          (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan]), False),
          (
              IntervalArray.from_breaks([0, 1, 2]),
              np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object),
              False,
          ),
          (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64), False),
          # tz-naive datetime
          (
              DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")),
              np.array(["2000", "2001"], dtype="M8[ns]"),
              True,
          ),
          # tz-aware stays tz-aware
          (
              DatetimeArray._from_sequence(
                  np.array(["2000-01-01T06:00:00", "2000-01-02T06:00:00"], dtype="M8[ns]")
              )
              .tz_localize("UTC")
              .tz_convert("US/Central"),
              np.array(
                  [
                      Timestamp("2000-01-01", tz="US/Central"),
                      Timestamp("2000-01-02", tz="US/Central"),
                  ]
              ),
              False,
          ),
          # Timedelta
          (
              TimedeltaArray._from_sequence(
                  np.array([0, 3600000000000], dtype="i8").view("m8[ns]")
              ),
              np.array([0, 3600000000000], dtype="m8[ns]"),
              True,
          ),
          # GH#26406 tz is preserved in Categorical[dt64tz]
          (
              pd.Categorical(date_range("2016-01-01", periods=2, tz="US/Pacific")),
              np.array(
                  [
                      Timestamp("2016-01-01", tz="US/Pacific"),
                      Timestamp("2016-01-02", tz="US/Pacific"),
                  ]
              ),
              False,
          ),
      ],
  )
  def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array):
      """``to_numpy``/``np.asarray`` values plus ``np.array(copy=...)`` semantics.

      ``zero_copy`` says whether the conversion to NumPy is expected to be
      possible without copying the underlying data.
      """
      # Wrapping the input in the box must not emit any warning.
      with tm.assert_produces_warning(None):
          thing = index_or_series_or_array(arr)

      tm.assert_numpy_array_equal(thing.to_numpy(), expected)
      tm.assert_numpy_array_equal(np.asarray(thing), expected)

      # Beyond the values, check the `copy=` keyword of np.array, which
      # reaches pandas through `__array__`.
      first_copy = np.array(thing, copy=True)
      second_copy = np.array(thing, copy=True)
      # Two explicit copies may never alias each other.
      assert not np.may_share_memory(first_copy, second_copy)

      if not np_version_gt2:
          # copy=False semantics are only supported in NumPy>=2.
          return

      if zero_copy:
          first_view = np.array(thing, copy=False)
          second_view = np.array(thing, copy=False)
          # With copy=False both results must be backed by the same data.
          assert np.may_share_memory(first_view, second_view)
      else:
          # A copy is unavoidable here, so copy=False is expected to warn.
          msg = "Starting with NumPy 2.0, the behavior of the 'copy' keyword has changed"
          with tm.assert_produces_warning(FutureWarning, match=msg):
              np.array(thing, copy=False)
  @pytest.mark.parametrize("as_series", [True, False])
  @pytest.mark.parametrize(
      "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)]
  )
  def test_to_numpy_copy(arr, as_series, using_infer_string):
      """``to_numpy`` shares memory by default and copies only on ``copy=True``."""
      obj = pd.Index(arr, copy=False)
      if as_series:
          obj = Series(obj.values, copy=False)

      # Strings inferred into pyarrow storage are the one case in which the
      # result is not expected to alias the input array.
      pyarrow_strings = (
          using_infer_string and arr.dtype == object and obj.dtype.storage == "pyarrow"
      )
      shares_input = not pyarrow_strings

      # no copy by default, nor with an explicit copy=False
      for kwargs in ({}, {"copy": False}):
          result = obj.to_numpy(**kwargs)
          assert np.shares_memory(arr, result) is shares_input

      # copy=True
      result = obj.to_numpy(copy=True)
      assert np.shares_memory(arr, result) is False
  @pytest.mark.parametrize("as_series", [True, False])
  def test_to_numpy_dtype(as_series, unit):
      """tz-aware data converts to object Timestamps, or to UTC-based M8[ns].

      NOTE(review): the ``unit`` fixture is requested but never used in the body.
      """
      tz = "US/Eastern"
      obj = pd.DatetimeIndex(["2000", "2001"], tz=tz)
      if as_series:
          obj = Series(obj)

      # preserve tz by default, and likewise when object dtype is requested
      boxed = np.array(
          [Timestamp("2000", tz=tz), Timestamp("2001", tz=tz)], dtype=object
      )
      tm.assert_numpy_array_equal(obj.to_numpy(), boxed)
      tm.assert_numpy_array_equal(obj.to_numpy(dtype="object"), boxed)

      # an explicit M8[ns] request yields naive values shifted by the offset
      naive = np.array(["2000-01-01T05", "2001-01-01T05"], dtype="M8[ns]")
      tm.assert_numpy_array_equal(obj.to_numpy(dtype="M8[ns]"), naive)
  @pytest.mark.parametrize(
      "values, dtype, na_value, expected",
      [
          ([1, 2, None], "float64", 0, [1.0, 2.0, 0.0]),
          (
              [Timestamp("2000"), Timestamp("2000"), pd.NaT],
              None,
              Timestamp("2000"),
              [np.datetime64("2000-01-01T00:00:00.000000000")] * 3,
          ),
      ],
  )
  def test_to_numpy_na_value_numpy_dtype(
      index_or_series, values, dtype, na_value, expected
  ):
      """``to_numpy(na_value=...)`` fills missing entries for NumPy-backed data."""
      converted = index_or_series(values).to_numpy(dtype=dtype, na_value=na_value)
      tm.assert_numpy_array_equal(converted, np.array(expected))
  @pytest.mark.parametrize(
      "data, multiindex, dtype, na_value, expected",
      [
          (
              [1, 2, None, 4],
              [(0, "a"), (0, "b"), (1, "b"), (1, "c")],
              float,
              None,
              [1.0, 2.0, np.nan, 4.0],
          ),
          (
              [1, 2, None, 4],
              [(0, "a"), (0, "b"), (1, "b"), (1, "c")],
              float,
              np.nan,
              [1.0, 2.0, np.nan, 4.0],
          ),
          (
              [1.0, 2.0, np.nan, 4.0],
              [("a", 0), ("a", 1), ("a", 2), ("b", 0)],
              int,
              0,
              [1, 2, 0, 4],
          ),
          (
              [Timestamp("2000"), Timestamp("2000"), pd.NaT],
              [(0, Timestamp("2021")), (0, Timestamp("2022")), (1, Timestamp("2000"))],
              None,
              Timestamp("2000"),
              [np.datetime64("2000-01-01T00:00:00.000000000")] * 3,
          ),
      ],
  )
  def test_to_numpy_multiindex_series_na_value(
      data, multiindex, dtype, na_value, expected
  ):
      """``Series.to_numpy(na_value=...)`` works when the index is a MultiIndex."""
      series = Series(data, index=pd.MultiIndex.from_tuples(multiindex))
      converted = series.to_numpy(dtype=dtype, na_value=na_value)
      tm.assert_numpy_array_equal(converted, np.array(expected))
  def test_to_numpy_kwargs_raises():
      """Unknown keyword arguments to ``to_numpy`` raise TypeError."""
      msg = r"to_numpy\(\) got an unexpected keyword argument 'foo'"
      # dtype=None gives the numpy-backed Series, "Int64" the extension one
      for dtype in (None, "Int64"):
          s = Series([1, 2, 3], dtype=dtype)
          with pytest.raises(TypeError, match=msg):
              s.to_numpy(foo=True)
  @pytest.mark.parametrize(
      "data",
      [
          {"a": [1, 2, 3], "b": [1, 2, None]},
          {"a": np.array([1, 2, 3]), "b": np.array([1, 2, np.nan])},
          {"a": pd.array([1, 2, 3]), "b": pd.array([1, 2, None])},
      ],
  )
  @pytest.mark.parametrize("dtype, na_value", [(float, np.nan), (object, None)])
  def test_to_numpy_dataframe_na_value(data, dtype, na_value):
      # https://github.com/pandas-dev/pandas/issues/33820
      # The single missing entry (last row of "b") must come back as na_value.
      rows = [[1, 1], [2, 2], [3, na_value]]
      converted = pd.DataFrame(data).to_numpy(dtype=dtype, na_value=na_value)
      tm.assert_numpy_array_equal(converted, np.array(rows, dtype=dtype))
  @pytest.mark.parametrize(
      "data, expected",
      [
          (
              {"a": pd.array([1, 2, None])},
              np.array([[1.0], [2.0], [np.nan]], dtype=float),
          ),
          (
              {"a": [1, 2, 3], "b": [1, 2, 3]},
              np.array([[1, 1], [2, 2], [3, 3]], dtype=float),
          ),
      ],
  )
  def test_to_numpy_dataframe_single_block(data, expected):
      # https://github.com/pandas-dev/pandas/issues/33820
      frame = pd.DataFrame(data)
      converted = frame.to_numpy(dtype=float, na_value=np.nan)
      tm.assert_numpy_array_equal(converted, expected)
  def test_to_numpy_dataframe_single_block_no_mutate():
      # https://github.com/pandas-dev/pandas/issues/33820
      # Converting with a fill value must leave the frame itself unchanged.
      raw = [1.0, 2.0, np.nan]
      frame = pd.DataFrame(np.array(raw))
      untouched = pd.DataFrame(np.array(raw))

      frame.to_numpy(na_value=0.0)

      tm.assert_frame_equal(frame, untouched)
  class TestAsArray:
      """``np.asarray`` on datetime Series, both tz-naive and tz-aware."""

      @pytest.mark.parametrize("tz", [None, "US/Central"])
      def test_asarray_object_dt64(self, tz):
          ser = Series(date_range("2000", periods=2, tz=tz))

          # Future behavior (for tzaware case) with no warning
          with tm.assert_produces_warning(None):
              result = np.asarray(ser, dtype=object)

          stamps = [Timestamp(day, tz=tz) for day in ("2000-01-01", "2000-01-02")]
          tm.assert_numpy_array_equal(result, np.array(stamps))

      def test_asarray_tz_naive(self):
          # This shouldn't produce a warning.
          ser = Series(date_range("2000", periods=2))
          result = np.asarray(ser)

          expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]")
          tm.assert_numpy_array_equal(result, expected)

      def test_asarray_tz_aware(self):
          tz = "US/Central"
          ser = Series(date_range("2000", periods=2, tz=tz))
          expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]")

          # Both spellings of the dtype are checked; the second one was
          # labelled "Old behavior with no warning" in the original test.
          for dtype in ("datetime64[ns]", "M8[ns]"):
              result = np.asarray(ser, dtype=dtype)
              tm.assert_numpy_array_equal(result, expected)
  520. # Old behavior with no warning
  521. result = np.asarray(ser, dtype="M8[ns]")
  522. tm.assert_numpy_array_equal(result, expected)