| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205 |
- import numpy as np
- import pytest
- from pandas import (
- CategoricalDtype,
- DataFrame,
- Index,
- MultiIndex,
- Series,
- _testing as tm,
- option_context,
- )
- from pandas.core.strings.accessor import StringMethods
- # subset of the full set from pandas/conftest.py
- _any_allowed_skipna_inferred_dtype = [
- ("string", ["a", np.nan, "c"]),
- ("bytes", [b"a", np.nan, b"c"]),
- ("empty", [np.nan, np.nan, np.nan]),
- ("empty", []),
- ("mixed-integer", ["a", np.nan, 2]),
- ]
- ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id
- @pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids)
- def any_allowed_skipna_inferred_dtype(request):
- """
- Fixture for all (inferred) dtypes allowed in StringMethods.__init__
- The covered (inferred) types are:
- * 'string'
- * 'empty'
- * 'bytes'
- * 'mixed'
- * 'mixed-integer'
- Returns
- -------
- inferred_dtype : str
- The string for the inferred dtype from _libs.lib.infer_dtype
- values : np.ndarray
- An array of object dtype that will be inferred to have
- `inferred_dtype`
- Examples
- --------
- >>> from pandas._libs import lib
- >>>
- >>> def test_something(any_allowed_skipna_inferred_dtype):
- ... inferred_dtype, values = any_allowed_skipna_inferred_dtype
- ... # will pass
- ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype
- ...
- ... # constructor for .str-accessor will also pass
- ... Series(values).str
- """
- inferred_dtype, values = request.param
- values = np.array(values, dtype=object) # object dtype to avoid casting
- # correctness of inference tested in tests/dtypes/test_inference.py
- return inferred_dtype, values
- def test_api(any_string_dtype):
- # GH 6106, GH 9322
- assert Series.str is StringMethods
- assert isinstance(Series([""], dtype=any_string_dtype).str, StringMethods)
- def test_api_mi_raises():
- # GH 23679
- mi = MultiIndex.from_arrays([["a", "b", "c"]])
- msg = "Can only use .str accessor with Index, not MultiIndex"
- with pytest.raises(AttributeError, match=msg):
- mi.str
- assert not hasattr(mi, "str")
- @pytest.mark.parametrize("dtype", [object, "category"])
- def test_api_per_dtype(index_or_series, dtype, any_skipna_inferred_dtype):
- # one instance of parametrized fixture
- box = index_or_series
- inferred_dtype, values = any_skipna_inferred_dtype
- t = box(values, dtype=dtype) # explicit dtype to avoid casting
- types_passing_constructor = [
- "string",
- "unicode",
- "empty",
- "bytes",
- "mixed",
- "mixed-integer",
- ]
- if inferred_dtype in types_passing_constructor:
- # GH 6106
- assert isinstance(t.str, StringMethods)
- else:
- # GH 9184, GH 23011, GH 23163
- msg = "Can only use .str accessor with string values.*"
- with pytest.raises(AttributeError, match=msg):
- t.str
- assert not hasattr(t, "str")
- @pytest.mark.parametrize("dtype", [object, "category"])
- def test_api_per_method(
- index_or_series,
- dtype,
- any_allowed_skipna_inferred_dtype,
- any_string_method,
- request,
- using_infer_string,
- ):
- # this test does not check correctness of the different methods,
- # just that the methods work on the specified (inferred) dtypes,
- # and raise on all others
- box = index_or_series
- # one instance of each parametrized fixture
- inferred_dtype, values = any_allowed_skipna_inferred_dtype
- method_name, args, kwargs = any_string_method
- reason = None
- if box is Index and values.size == 0:
- if method_name in ["partition", "rpartition"] and kwargs.get("expand", True):
- raises = TypeError
- reason = "Method cannot deal with empty Index"
- elif method_name == "split" and kwargs.get("expand", None):
- raises = TypeError
- reason = "Split fails on empty Series when expand=True"
- elif method_name == "get_dummies":
- raises = ValueError
- reason = "Need to fortify get_dummies corner cases"
- elif (
- box is Index
- and inferred_dtype == "empty"
- and dtype == object
- and method_name == "get_dummies"
- ):
- raises = ValueError
- reason = "Need to fortify get_dummies corner cases"
- if reason is not None:
- mark = pytest.mark.xfail(raises=raises, reason=reason)
- request.applymarker(mark)
- t = box(values, dtype=dtype) # explicit dtype to avoid casting
- method = getattr(t.str, method_name)
- if using_infer_string and dtype == "category":
- string_allowed = method_name not in ["decode"]
- else:
- string_allowed = True
- bytes_allowed = method_name in ["decode", "get", "len", "slice"]
- # as of v0.23.4, all methods except 'cat' are very lenient with the
- # allowed data types, just returning NaN for entries that error.
- # This could be changed with an 'errors'-kwarg to the `str`-accessor,
- # see discussion in GH 13877
- mixed_allowed = method_name not in ["cat"]
- allowed_types = (
- ["empty"]
- + ["string", "unicode"] * string_allowed
- + ["bytes"] * bytes_allowed
- + ["mixed", "mixed-integer"] * mixed_allowed
- )
- if inferred_dtype in allowed_types:
- # xref GH 23555, GH 23556
- with option_context("future.no_silent_downcasting", True):
- method(*args, **kwargs) # works!
- else:
- # GH 23011, GH 23163
- msg = (
- f"Cannot use .str.{method_name} with values of "
- f"inferred dtype {repr(inferred_dtype)}."
- "|a bytes-like object is required, not 'str'"
- )
- with pytest.raises(TypeError, match=msg):
- method(*args, **kwargs)
- def test_api_for_categorical(any_string_method, any_string_dtype):
- # https://github.com/pandas-dev/pandas/issues/10661
- s = Series(list("aabb"), dtype=any_string_dtype)
- s = s + " " + s
- c = s.astype("category")
- c = c.astype(CategoricalDtype(c.dtype.categories.astype("object")))
- assert isinstance(c.str, StringMethods)
- method_name, args, kwargs = any_string_method
- result = getattr(c.str, method_name)(*args, **kwargs)
- expected = getattr(s.astype("object").str, method_name)(*args, **kwargs)
- if isinstance(result, DataFrame):
- tm.assert_frame_equal(result, expected)
- elif isinstance(result, Series):
- tm.assert_series_equal(result, expected)
- else:
- # str.cat(others=None) returns string, for example
- assert result == expected
|