test_api.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. import numpy as np
  2. import pytest
  3. from pandas import (
  4. CategoricalDtype,
  5. DataFrame,
  6. Index,
  7. MultiIndex,
  8. Series,
  9. _testing as tm,
  10. option_context,
  11. )
  12. from pandas.core.strings.accessor import StringMethods
  # Subset of the full set from pandas/conftest.py.  Each entry pairs an
  # inferred-dtype label with sample values for that label.
  _any_allowed_skipna_inferred_dtype = [
      ("string", ["a", np.nan, "c"]),
      ("bytes", [b"a", np.nan, b"c"]),
      ("empty", [np.nan, np.nan, np.nan]),
      ("empty", []),
      ("mixed-integer", ["a", np.nan, 2]),
  ]
  # Use the inferred type as the pytest id.  The sample values that get
  # unpacked next to the labels are not needed here, hence the throwaway `_`.
  ids, _ = zip(*_any_allowed_skipna_inferred_dtype)
  @pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids)
  def any_allowed_skipna_inferred_dtype(request):
      """
      Fixture for a subset of the (inferred) dtypes allowed in
      StringMethods.__init__

      The covered (inferred) types are the ones listed in
      ``_any_allowed_skipna_inferred_dtype``:

      * 'string'
      * 'bytes'
      * 'empty' (once with all-NaN values, once with no values at all)
      * 'mixed-integer'

      Returns
      -------
      inferred_dtype : str
          The string for the inferred dtype from _libs.lib.infer_dtype
      values : np.ndarray
          An array of object dtype that will be inferred to have
          `inferred_dtype`

      Examples
      --------
      >>> from pandas import Series
      >>> from pandas._libs import lib
      >>>
      >>> def test_something(any_allowed_skipna_inferred_dtype):
      ...     inferred_dtype, values = any_allowed_skipna_inferred_dtype
      ...     # will pass
      ...     assert lib.infer_dtype(values, skipna=True) == inferred_dtype
      ...
      ...     # constructor for .str-accessor will also pass
      ...     Series(values).str
      """
      inferred_dtype, values = request.param
      values = np.array(values, dtype=object)  # object dtype to avoid casting

      # correctness of inference tested in tests/dtypes/test_inference.py
      return inferred_dtype, values
  def test_api(any_string_dtype):
      """``.str`` is StringMethods on the Series class and on an instance."""
      # GH 6106, GH 9322
      assert Series.str is StringMethods

      # The instance-level accessor must be a StringMethods object too.
      accessor = Series([""], dtype=any_string_dtype).str
      assert isinstance(accessor, StringMethods)
  def test_api_mi_raises():
      """``.str`` on a MultiIndex raises AttributeError and fails ``hasattr``."""
      # GH 23679
      multi_index = MultiIndex.from_arrays([["a", "b", "c"]])
      expected_message = "Can only use .str accessor with Index, not MultiIndex"

      with pytest.raises(AttributeError, match=expected_message):
          multi_index.str

      # Because the error is an AttributeError, hasattr must report False.
      assert not hasattr(multi_index, "str")
  @pytest.mark.parametrize("dtype", [object, "category"])
  def test_api_per_dtype(index_or_series, dtype, any_skipna_inferred_dtype):
      """
      Check which inferred dtypes allow constructing the ``.str`` accessor.

      For the inferred dtypes listed below the accessor must be a
      StringMethods instance; for every other inferred dtype, accessing
      ``.str`` must raise AttributeError and ``hasattr`` must be False.
      """
      # one instance of parametrized fixture
      inferred_dtype, values = any_skipna_inferred_dtype
      # explicit dtype to avoid casting
      obj = index_or_series(values, dtype=dtype)

      types_passing_constructor = [
          "string",
          "unicode",
          "empty",
          "bytes",
          "mixed",
          "mixed-integer",
      ]

      if inferred_dtype not in types_passing_constructor:
          # GH 9184, GH 23011, GH 23163
          expected_message = "Can only use .str accessor with string values.*"
          with pytest.raises(AttributeError, match=expected_message):
              obj.str
          assert not hasattr(obj, "str")
          return

      # GH 6106
      assert isinstance(obj.str, StringMethods)
  @pytest.mark.parametrize("dtype", [object, "category"])
  def test_api_per_method(
      index_or_series,
      dtype,
      any_allowed_skipna_inferred_dtype,
      any_string_method,
      request,
      using_infer_string,
  ):
      """
      Call every string method on every allowed (inferred) dtype.

      This test does not check correctness of the different methods, just
      that the methods work on the specified (inferred) dtypes, and raise
      TypeError on all others.
      """
      box = index_or_series

      # one instance of each parametrized fixture
      inferred_dtype, values = any_allowed_skipna_inferred_dtype
      method_name, args, kwargs = any_string_method

      # Known failures: (exception type, reason) if this combination is
      # expected to fail, otherwise None.
      known_failure = None
      if box is Index:
          if values.size == 0:
              if method_name in ("partition", "rpartition") and kwargs.get(
                  "expand", True
              ):
                  known_failure = (TypeError, "Method cannot deal with empty Index")
              elif method_name == "split" and kwargs.get("expand", None):
                  known_failure = (
                      TypeError,
                      "Split fails on empty Series when expand=True",
                  )
              elif method_name == "get_dummies":
                  known_failure = (
                      ValueError,
                      "Need to fortify get_dummies corner cases",
                  )
          elif (
              inferred_dtype == "empty"
              and dtype == object
              and method_name == "get_dummies"
          ):
              known_failure = (ValueError, "Need to fortify get_dummies corner cases")

      if known_failure is not None:
          raises, reason = known_failure
          request.applymarker(pytest.mark.xfail(raises=raises, reason=reason))

      t = box(values, dtype=dtype)  # explicit dtype to avoid casting
      method = getattr(t.str, method_name)

      # Strings are always accepted, except by 'decode' on categorical data
      # when using_infer_string is set.
      string_allowed = not (
          using_infer_string and dtype == "category" and method_name in ["decode"]
      )
      bytes_allowed = method_name in ["decode", "get", "len", "slice"]
      # as of v0.23.4, all methods except 'cat' are very lenient with the
      # allowed data types, just returning NaN for entries that error.
      # This could be changed with an 'errors'-kwarg to the `str`-accessor,
      # see discussion in GH 13877
      mixed_allowed = method_name not in ["cat"]

      allowed_types = ["empty"]
      if string_allowed:
          allowed_types.extend(["string", "unicode"])
      if bytes_allowed:
          allowed_types.append("bytes")
      if mixed_allowed:
          allowed_types.extend(["mixed", "mixed-integer"])

      if inferred_dtype not in allowed_types:
          # GH 23011, GH 23163
          msg = (
              f"Cannot use .str.{method_name} with values of "
              f"inferred dtype {inferred_dtype!r}."
              "|a bytes-like object is required, not 'str'"
          )
          with pytest.raises(TypeError, match=msg):
              method(*args, **kwargs)
          return

      # xref GH 23555, GH 23556
      with option_context("future.no_silent_downcasting", True):
          method(*args, **kwargs)  # works!
  def test_api_for_categorical(any_string_method, any_string_dtype):
      """
      String methods on a categorical Series match the object-dtype result.

      Each method is called through ``.str`` on a categorical Series and on
      the same data cast to object; the two results must be equal.
      """
      # https://github.com/pandas-dev/pandas/issues/10661
      base = Series(list("aabb"), dtype=any_string_dtype)
      strings = base + " " + base

      categorical = strings.astype("category")
      # Rebuild the categorical dtype with its categories cast to object.
      object_categories = categorical.dtype.categories.astype("object")
      categorical = categorical.astype(CategoricalDtype(object_categories))
      assert isinstance(categorical.str, StringMethods)

      method_name, args, kwargs = any_string_method

      result = getattr(categorical.str, method_name)(*args, **kwargs)
      expected = getattr(strings.astype("object").str, method_name)(*args, **kwargs)

      if isinstance(result, DataFrame):
          tm.assert_frame_equal(result, expected)
          return
      if isinstance(result, Series):
          tm.assert_series_equal(result, expected)
          return
      # str.cat(others=None) returns string, for example
      assert result == expected
  172. else:
  173. # str.cat(others=None) returns string, for example
  174. assert result == expected