| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519 |
- import datetime
- import decimal
- import re
- import numpy as np
- import pytest
- import pytz
- from pandas._config import using_string_dtype
- import pandas as pd
- import pandas._testing as tm
- from pandas.api.extensions import register_extension_dtype
- from pandas.arrays import (
- BooleanArray,
- DatetimeArray,
- FloatingArray,
- IntegerArray,
- IntervalArray,
- SparseArray,
- TimedeltaArray,
- )
- from pandas.core.arrays import (
- NumpyExtensionArray,
- period_array,
- )
- from pandas.tests.extension.decimal import (
- DecimalArray,
- DecimalDtype,
- to_decimal,
- )
@pytest.mark.parametrize("dtype_unit", ["M8[h]", "M8[m]", "m8[h]", "m8[m]"])
def test_dt64_array(dtype_unit):
    """
    Check that pd.array warns for unsupported datetime-like resolutions.

    Parameters
    ----------
    dtype_unit : str
        NumPy dtype string for a datetime64 ("M8") or timedelta64 ("m8")
        dtype with hour or minute resolution. Both kinds are covered for
        both units.
    """
    # PR 53817
    dtype_var = np.dtype(dtype_unit)
    msg = (
        r"datetime64 and timedelta64 dtype resolutions other than "
        r"'s', 'ms', 'us', and 'ns' are deprecated. "
        r"In future releases passing unsupported resolutions will "
        r"raise an exception."
    )
    # The message contains regex metacharacters (".", "'"), so escape it
    # before handing it to ``match``.
    with tm.assert_produces_warning(FutureWarning, match=re.escape(msg)):
        pd.array([], dtype=dtype_var)
# Each case is (data, dtype, expected): ``pd.array(data, dtype=dtype)`` must
# compare equal to ``expected``.
@pytest.mark.parametrize(
    "data, dtype, expected",
    [
        # Basic NumPy defaults.
        ([], None, FloatingArray._from_sequence([], dtype="Float64")),
        ([1, 2], None, IntegerArray._from_sequence([1, 2], dtype="Int64")),
        ([1, 2], object, NumpyExtensionArray(np.array([1, 2], dtype=object))),
        (
            [1, 2],
            np.dtype("float32"),
            NumpyExtensionArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
        ),
        (
            np.array([], dtype=object),
            None,
            NumpyExtensionArray(np.array([], dtype=object)),
        ),
        (
            np.array([1, 2], dtype="int64"),
            None,
            IntegerArray._from_sequence([1, 2], dtype="Int64"),
        ),
        (
            np.array([1.0, 2.0], dtype="float64"),
            None,
            FloatingArray._from_sequence([1.0, 2.0], dtype="Float64"),
        ),
        # String alias passes through to NumPy
        ([1, 2], "float32", NumpyExtensionArray(np.array([1, 2], dtype="float32"))),
        ([1, 2], "int64", NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
        # GH#44715 FloatingArray does not support float16, so fall
        # back to NumpyExtensionArray
        (
            np.array([1, 2], dtype=np.float16),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.float16)),
        ),
        # idempotency with e.g. pd.array(pd.array([1, 2], dtype="int64"))
        (
            NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
        ),
        # Period alias
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            "Period[D]",
            period_array(["2000", "2001"], freq="D"),
        ),
        # Period dtype
        (
            [pd.Period("2000", "D")],
            pd.PeriodDtype("D"),
            period_array(["2000"], freq="D"),
        ),
        # Datetime (naive)
        (
            [1, 2],
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
            ),
        ),
        (
            [1, 2],
            np.dtype("datetime64[s]"),
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[s]"), dtype="M8[s]"
            ),
        ),
        (
            np.array([1, 2], dtype="datetime64[ns]"),
            None,
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
            ),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            None,
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            ["2000", "2001"],
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        # Datetime (tz-aware)
        (
            ["2000", "2001"],
            pd.DatetimeTZDtype(tz="CET"),
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
            ),
        ),
        # Timedelta
        (
            ["1h", "2h"],
            np.dtype("timedelta64[ns]"),
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            pd.TimedeltaIndex(["1h", "2h"]),
            np.dtype("timedelta64[ns]"),
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            np.array([1, 2], dtype="m8[s]"),
            np.dtype("timedelta64[s]"),
            TimedeltaArray._from_sequence(
                np.array([1, 2], dtype="m8[s]"), dtype="m8[s]"
            ),
        ),
        (
            pd.TimedeltaIndex(["1h", "2h"]),
            None,
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            # preserve non-nano, i.e. don't cast to NumpyExtensionArray
            # (no dtype passed)
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
            None,
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
        ),
        (
            # preserve non-nano, i.e. don't cast to NumpyExtensionArray
            # (matching m8[s] dtype passed explicitly)
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
            np.dtype("m8[s]"),
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
        ),
        # Category
        (["a", "b"], "category", pd.Categorical(["a", "b"])),
        (
            ["a", "b"],
            pd.CategoricalDtype(None, ordered=True),
            pd.Categorical(["a", "b"], ordered=True),
        ),
        # Interval
        (
            [pd.Interval(1, 2), pd.Interval(3, 4)],
            "interval",
            IntervalArray.from_tuples([(1, 2), (3, 4)]),
        ),
        # Sparse
        ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")),
        # IntegerNA
        ([1, None], "Int16", pd.array([1, None], dtype="Int16")),
        # Series input without a dtype: expected is a NumpyExtensionArray
        # wrapping int64 values
        (
            pd.Series([1, 2]),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.int64)),
        ),
        # String
        (
            ["a", None],
            "string",
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        (
            # the expected value for the "str" alias depends on
            # using_string_dtype()
            ["a", None],
            "str",
            pd.StringDtype(na_value=np.nan)
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan))
            if using_string_dtype()
            else NumpyExtensionArray(np.array(["a", "None"])),
        ),
        (
            ["a", None],
            pd.StringDtype(),
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        (
            ["a", None],
            pd.StringDtype(na_value=np.nan),
            pd.StringDtype(na_value=np.nan)
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)),
        ),
        (
            # numpy array with string dtype
            np.array(["a", "b"], dtype=str),
            pd.StringDtype(),
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
        ),
        (
            # numpy array with string dtype
            np.array(["a", "b"], dtype=str),
            pd.StringDtype(na_value=np.nan),
            pd.StringDtype(na_value=np.nan)
            .construct_array_type()
            ._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
        ),
        # Boolean
        (
            [True, None],
            "boolean",
            BooleanArray._from_sequence([True, None], dtype="boolean"),
        ),
        (
            [True, None],
            pd.BooleanDtype(),
            BooleanArray._from_sequence([True, None], dtype="boolean"),
        ),
        # Index
        (pd.Index([1, 2]), None, NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
        # Series[EA] returns the EA
        (
            pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])),
            None,
            pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
        ),
        # "3rd party" EAs work
        ([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal([0, 1])),
        # pass an ExtensionArray, but a different dtype
        (
            period_array(["2000", "2001"], freq="D"),
            "category",
            pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]),
        ),
    ],
)
def test_array(data, dtype, expected):
    """
    Check ``pd.array(data, dtype=dtype)`` against a pre-built expected array.

    Parameters
    ----------
    data : object
        Input handed to ``pd.array`` (lists, ndarrays, Index/Series objects
        and extension arrays appear in the cases above).
    dtype : object or None
        Value forwarded as the ``dtype`` keyword; ``None`` leaves the dtype
        unspecified.
    expected : object
        Array the result is compared to with ``tm.assert_equal``.
    """
    result = pd.array(data, dtype=dtype)
    tm.assert_equal(result, expected)
def test_array_copy():
    """pd.array copies an ndarray unless ``copy=False`` is passed."""
    source = np.array([1, 2])

    # Leaving ``copy`` out (the default) and passing ``copy=True`` must both
    # hand back memory that is independent of the input.
    for copy_kwargs in ({}, {"copy": True}):
        copied = pd.array(source, dtype=source.dtype, **copy_kwargs)
        assert not tm.shares_memory(source, copied)

    # ``copy=False`` is expected to reuse the input buffer.
    view = pd.array(source, dtype=source.dtype, copy=False)
    assert tm.shares_memory(source, view)
# pytz timezone object for "CET"; used below as ``tzinfo`` for the
# datetime.datetime inputs and as ``tz`` of the matching expected dtype.
cet = pytz.timezone("CET")


# Each case is (data, expected): ``pd.array(data)`` with no dtype must compare
# equal to ``expected``.
@pytest.mark.parametrize(
    "data, expected",
    [
        # period
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            period_array(["2000", "2001"], freq="D"),
        ),
        # interval
        ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2])),
        # datetime
        (
            [pd.Timestamp("2000"), pd.Timestamp("2001")],
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)],
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            np.array([1, 2], dtype="M8[ns]"),
            DatetimeArray._from_sequence(np.array([1, 2], dtype="M8[ns]")),
        ),
        (
            # microsecond-resolution input; expected keeps the M8[us] dtype
            np.array([1, 2], dtype="M8[us]"),
            DatetimeArray._simple_new(
                np.array([1, 2], dtype="M8[us]"), dtype=np.dtype("M8[us]")
            ),
        ),
        # datetimetz
        (
            [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")],
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="ns")
            ),
        ),
        (
            [
                datetime.datetime(2000, 1, 1, tzinfo=cet),
                datetime.datetime(2001, 1, 1, tzinfo=cet),
            ],
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="ns")
            ),
        ),
        # timedelta
        (
            [pd.Timedelta("1h"), pd.Timedelta("2h")],
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            np.array([1, 2], dtype="m8[ns]"),
            TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[ns]")),
        ),
        (
            np.array([1, 2], dtype="m8[us]"),
            TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[us]")),
        ),
        # integer
        ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")),
        ([1, None], IntegerArray._from_sequence([1, None], dtype="Int64")),
        ([1, pd.NA], IntegerArray._from_sequence([1, pd.NA], dtype="Int64")),
        ([1, np.nan], IntegerArray._from_sequence([1, np.nan], dtype="Int64")),
        # float
        ([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2], dtype="Float64")),
        ([0.1, None], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        ([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        ([0.1, pd.NA], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        # integer-like float
        ([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
        ([1.0, None], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        ([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        ([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        # mixed-integer-float
        ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
        (
            [1, np.nan, 2.0],
            FloatingArray._from_sequence([1.0, None, 2.0], dtype="Float64"),
        ),
        # string
        (
            ["a", "b"],
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
        ),
        (
            ["a", None],
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        (
            # numpy array with string dtype
            np.array(["a", "b"], dtype=str),
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
        ),
        # Boolean
        ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")),
        ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")),
    ],
)
def test_array_inference(data, expected):
    """
    Check the array that ``pd.array(data)`` produces when no dtype is given.

    Parameters
    ----------
    data : object
        Input handed to ``pd.array`` without a ``dtype`` argument.
    expected : object
        Array the result is compared to with ``tm.assert_equal``.
    """
    result = pd.array(data)
    tm.assert_equal(result, expected)
@pytest.mark.parametrize(
    "data",
    [
        # mix of frequencies
        [pd.Period("2000", "D"), pd.Period("2001", "Y")],
        # mix of closed
        [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")],
        # Mix of timezones
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")],
        # Mix of tz-aware and tz-naive
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")],
        np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]),
    ],
)
def test_array_inference_fails(data):
    """Heterogeneous input is expected to come back as an object-dtype array."""
    inferred = pd.array(data)
    object_values = np.array(data, dtype=object)
    tm.assert_extension_array_equal(inferred, NumpyExtensionArray(object_values))
@pytest.mark.parametrize("data", [np.array(0)])
def test_nd_raises(data):
    """Input that is not 1-dimensional must be rejected with a ValueError."""
    expected_msg = "NumpyExtensionArray must be 1-dimensional"
    with pytest.raises(ValueError, match=expected_msg):
        pd.array(data, dtype="int64")
def test_scalar_raises():
    """Passing a bare scalar to pd.array must raise a ValueError."""
    scalar_msg = "Cannot pass scalar '1'"
    with pytest.raises(ValueError, match=scalar_msg):
        pd.array(1)
def test_dataframe_raises():
    """Passing a DataFrame to pd.array must raise a TypeError."""
    # GH#51167 don't accidentally cast to StringArray by doing inference on columns
    frame = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
    with pytest.raises(TypeError, match="Cannot pass DataFrame to 'pandas.array'"):
        pd.array(frame)
def test_bounds_check():
    """A negative value must not be cast into an unsigned nullable dtype."""
    # GH21796
    # The source integer width appears in the message, so accept 32 or 64.
    cast_error = r"cannot safely cast non-equivalent int(32|64) to uint16"
    with pytest.raises(TypeError, match=cast_error):
        pd.array([-1, 2, 3], dtype="UInt16")
- # ---------------------------------------------------------------------------
- # A couple dummy classes to ensure that Series and Indexes are unboxed before
- # getting to the EA classes.
@register_extension_dtype
class DecimalDtype2(DecimalDtype):
    """DecimalDtype subclass registered as "decimal2", paired with DecimalArray2."""

    name = "decimal2"

    @classmethod
    def construct_array_type(cls):
        """
        Give the array class that belongs to this dtype.

        Returns
        -------
        type
        """
        array_type = DecimalArray2
        return array_type
class DecimalArray2(DecimalArray):
    """DecimalArray subclass whose constructor rejects Series and Index input."""

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """
        Build the array from ``scalars``, refusing boxed pandas containers.

        Raises
        ------
        TypeError
            If ``scalars`` is a ``pd.Series`` or ``pd.Index``.
        """
        boxed_types = (pd.Series, pd.Index)
        if not isinstance(scalars, boxed_types):
            return super()._from_sequence(scalars, dtype=dtype, copy=copy)
        raise TypeError("scalars should not be of type pd.Series or pd.Index")
def test_array_unboxes(index_or_series):
    """pd.array must not hand a Series/Index straight to ``_from_sequence``."""
    decimals = [decimal.Decimal("1"), decimal.Decimal("2")]
    data = index_or_series(decimals)
    dtype = DecimalDtype2()

    # First confirm the guard itself fires when the boxed object is passed
    # to the array constructor directly.
    guard_msg = "scalars should not be of type pd.Series or pd.Index"
    with pytest.raises(TypeError, match=guard_msg):
        DecimalArray2._from_sequence(data, dtype=dtype)

    # Going through pd.array must succeed and match the result built from
    # the underlying values.
    result = pd.array(data, dtype="decimal2")
    expected = DecimalArray2._from_sequence(data.values, dtype=dtype)
    tm.assert_equal(result, expected)
def test_array_to_numpy_na():
    """``to_numpy`` with ``na_value=True`` and bool dtype yields all-True here."""
    # GH#40638
    string_arr = pd.array([pd.NA, 1], dtype="string[python]")
    tm.assert_numpy_array_equal(
        string_arr.to_numpy(na_value=True, dtype=bool),
        np.array([True, True]),
    )
|