test_strings.py 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846
  1. from datetime import (
  2. datetime,
  3. timedelta,
  4. )
  5. from pathlib import Path
  6. import numpy as np
  7. import pytest
  8. from pandas.compat import pa_version_under21p0
  9. from pandas import (
  10. NA,
  11. DataFrame,
  12. Index,
  13. MultiIndex,
  14. Series,
  15. StringDtype,
  16. )
  17. import pandas._testing as tm
  18. from pandas.core.strings.accessor import StringMethods
  19. from pandas.tests.strings import is_object_or_nan_string_dtype
  20. @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])])
  21. def test_startswith_endswith_non_str_patterns(pattern):
  22. # GH3485
  23. ser = Series(["foo", "bar"])
  24. msg = f"expected a string or tuple, not {type(pattern).__name__}"
  25. with pytest.raises(TypeError, match=msg):
  26. ser.str.startswith(pattern)
  27. with pytest.raises(TypeError, match=msg):
  28. ser.str.endswith(pattern)
  29. def test_iter_raises():
  30. # GH 54173
  31. ser = Series(["foo", "bar"])
  32. with pytest.raises(TypeError, match="'StringMethods' object is not iterable"):
  33. iter(ser.str)
  34. # test integer/float dtypes (inferred by constructor) and mixed
  35. def test_count(any_string_dtype):
  36. ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype)
  37. result = ser.str.count("f[o]+")
  38. expected_dtype = (
  39. np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
  40. )
  41. expected = Series([1, 2, np.nan, 4], dtype=expected_dtype)
  42. tm.assert_series_equal(result, expected)
  43. def test_count_mixed_object():
  44. ser = Series(
  45. ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
  46. dtype=object,
  47. )
  48. result = ser.str.count("a")
  49. expected = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan])
  50. tm.assert_series_equal(result, expected)
  51. def test_repeat(any_string_dtype):
  52. ser = Series(["a", "b", np.nan, "c", np.nan, "d"], dtype=any_string_dtype)
  53. result = ser.str.repeat(3)
  54. expected = Series(
  55. ["aaa", "bbb", np.nan, "ccc", np.nan, "ddd"], dtype=any_string_dtype
  56. )
  57. tm.assert_series_equal(result, expected)
  58. result = ser.str.repeat([1, 2, 3, 4, 5, 6])
  59. expected = Series(
  60. ["a", "bb", np.nan, "cccc", np.nan, "dddddd"], dtype=any_string_dtype
  61. )
  62. tm.assert_series_equal(result, expected)
  63. def test_repeat_mixed_object():
  64. ser = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0])
  65. result = ser.str.repeat(3)
  66. expected = Series(
  67. ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", None, np.nan, np.nan],
  68. dtype=object,
  69. )
  70. tm.assert_series_equal(result, expected)
  71. @pytest.mark.parametrize("arg, repeat", [[None, 4], ["b", None]])
  72. def test_repeat_with_null(any_string_dtype, arg, repeat):
  73. # GH: 31632
  74. ser = Series(["a", arg], dtype=any_string_dtype)
  75. result = ser.str.repeat([3, repeat])
  76. expected = Series(["aaa", None], dtype=any_string_dtype)
  77. tm.assert_series_equal(result, expected)
def test_empty_str_methods(any_string_dtype):
    """Check the result type of many ``.str`` methods on an empty Series.

    Each assertion compares a method's output on an empty Series with an
    empty Series/DataFrame of the dtype expected for ``any_string_dtype``.
    """
    # ``empty_str`` and ``empty`` are the same object: one name is used as the
    # expected value, the other as the input.
    empty_str = empty = Series(dtype=any_string_dtype)
    empty_inferred_str = Series(dtype="str")
    # Integer/boolean results use numpy dtypes for object/NaN-string dtypes
    # and nullable extension dtypes otherwise.
    if is_object_or_nan_string_dtype(any_string_dtype):
        empty_int = Series(dtype="int64")
        empty_bool = Series(dtype=bool)
    else:
        empty_int = Series(dtype="Int64")
        empty_bool = Series(dtype="boolean")
    empty_object = Series(dtype=object)
    empty_bytes = Series(dtype=object)
    empty_df = DataFrame()

    # GH7241
    # (extract) on empty series

    tm.assert_series_equal(empty_str, empty.str.cat(empty))
    assert "" == empty.str.cat()
    tm.assert_series_equal(empty_str, empty.str.title())
    tm.assert_series_equal(empty_int, empty.str.count("a"))
    tm.assert_series_equal(empty_bool, empty.str.contains("a"))
    tm.assert_series_equal(empty_bool, empty.str.startswith("a"))
    tm.assert_series_equal(empty_bool, empty.str.endswith("a"))
    tm.assert_series_equal(empty_str, empty.str.lower())
    tm.assert_series_equal(empty_str, empty.str.upper())
    tm.assert_series_equal(empty_str, empty.str.replace("a", "b"))
    tm.assert_series_equal(empty_str, empty.str.repeat(3))
    tm.assert_series_equal(empty_bool, empty.str.match("^a"))
    # extract: one column per capture group when a DataFrame is returned
    tm.assert_frame_equal(
        DataFrame(columns=[0], dtype=any_string_dtype),
        empty.str.extract("()", expand=True),
    )
    tm.assert_frame_equal(
        DataFrame(columns=[0, 1], dtype=any_string_dtype),
        empty.str.extract("()()", expand=True),
    )
    tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False))
    tm.assert_frame_equal(
        DataFrame(columns=[0, 1], dtype=any_string_dtype),
        empty.str.extract("()()", expand=False),
    )
    tm.assert_frame_equal(empty_df.set_axis([], axis=1), empty.str.get_dummies())
    tm.assert_series_equal(empty_str, empty_str.str.join(""))
    tm.assert_series_equal(empty_int, empty.str.len())
    tm.assert_series_equal(empty_object, empty_str.str.findall("a"))
    tm.assert_series_equal(empty_int, empty.str.find("a"))
    tm.assert_series_equal(empty_int, empty.str.rfind("a"))
    tm.assert_series_equal(empty_str, empty.str.pad(42))
    tm.assert_series_equal(empty_str, empty.str.center(42))
    tm.assert_series_equal(empty_object, empty.str.split("a"))
    tm.assert_series_equal(empty_object, empty.str.rsplit("a"))
    tm.assert_series_equal(empty_object, empty.str.partition("a", expand=False))
    tm.assert_frame_equal(empty_df, empty.str.partition("a"))
    tm.assert_series_equal(empty_object, empty.str.rpartition("a", expand=False))
    tm.assert_frame_equal(empty_df, empty.str.rpartition("a"))
    tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
    tm.assert_series_equal(empty_str, empty.str.slice(step=1))
    tm.assert_series_equal(empty_str, empty.str.strip())
    tm.assert_series_equal(empty_str, empty.str.lstrip())
    tm.assert_series_equal(empty_str, empty.str.rstrip())
    tm.assert_series_equal(empty_str, empty.str.wrap(42))
    tm.assert_series_equal(empty_str, empty.str.get(0))
    # decode is called on the object-dtype ``empty_bytes``; encode on ``empty``
    tm.assert_series_equal(empty_inferred_str, empty_bytes.str.decode("ascii"))
    tm.assert_series_equal(empty_bytes, empty.str.encode("ascii"))
    # ismethods should always return boolean (GH 29624)
    tm.assert_series_equal(empty_bool, empty.str.isalnum())
    tm.assert_series_equal(empty_bool, empty.str.isalpha())
    tm.assert_series_equal(empty_bool, empty.str.isdigit())
    tm.assert_series_equal(empty_bool, empty.str.isspace())
    tm.assert_series_equal(empty_bool, empty.str.islower())
    tm.assert_series_equal(empty_bool, empty.str.isupper())
    tm.assert_series_equal(empty_bool, empty.str.istitle())
    tm.assert_series_equal(empty_bool, empty.str.isnumeric())
    tm.assert_series_equal(empty_bool, empty.str.isdecimal())
    tm.assert_series_equal(empty_str, empty.str.capitalize())
    tm.assert_series_equal(empty_str, empty.str.swapcase())
    tm.assert_series_equal(empty_str, empty.str.normalize("NFC"))

    table = str.maketrans("a", "b")
    tm.assert_series_equal(empty_str, empty.str.translate(table))
  155. @pytest.mark.parametrize(
  156. "method, expected",
  157. [
  158. ("isalnum", [True, True, True, True, True, False, True, True, False, False]),
  159. ("isalpha", [True, True, True, False, False, False, True, False, False, False]),
  160. (
  161. "isdigit",
  162. [False, False, False, True, False, False, False, True, False, False],
  163. ),
  164. (
  165. "isnumeric",
  166. [False, False, False, True, False, False, False, True, False, False],
  167. ),
  168. (
  169. "isspace",
  170. [False, False, False, False, False, False, False, False, False, True],
  171. ),
  172. (
  173. "islower",
  174. [False, True, False, False, False, False, False, False, False, False],
  175. ),
  176. (
  177. "isupper",
  178. [True, False, False, False, True, False, True, False, False, False],
  179. ),
  180. (
  181. "istitle",
  182. [True, False, True, False, True, False, False, False, False, False],
  183. ),
  184. ],
  185. )
  186. def test_ismethods(method, expected, any_string_dtype):
  187. ser = Series(
  188. ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "], dtype=any_string_dtype
  189. )
  190. expected_dtype = (
  191. "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  192. )
  193. expected = Series(expected, dtype=expected_dtype)
  194. result = getattr(ser.str, method)()
  195. tm.assert_series_equal(result, expected)
  196. # compare with standard library
  197. expected_stdlib = [getattr(item, method)() for item in ser]
  198. assert list(result) == expected_stdlib
  199. # with missing value
  200. ser.iloc[[1, 2, 3, 4]] = np.nan
  201. result = getattr(ser.str, method)()
  202. if ser.dtype == "object":
  203. expected = expected.astype(object)
  204. expected.iloc[[1, 2, 3, 4]] = np.nan
  205. elif ser.dtype == "str":
  206. # NaN propagates as False
  207. expected.iloc[[1, 2, 3, 4]] = False
  208. else:
  209. # nullable dtypes propagate NaN
  210. expected.iloc[[1, 2, 3, 4]] = np.nan
  211. @pytest.mark.parametrize(
  212. "method, expected",
  213. [
  214. ("isnumeric", [False, True, True, True, False, True, True, False]),
  215. ("isdecimal", [False, True, False, False, False, False, True, False]),
  216. ("isdigit", [False, True, True, False, False, False, True, False]),
  217. ],
  218. )
  219. def test_isnumeric_unicode(method, expected, any_string_dtype):
  220. # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER
  221. # 0x2605: ★ not number
  222. # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
  223. # 0xFF13: 3 Em 3 # noqa: RUF003
  224. ser = Series(
  225. ["A", "3", "³", "¼", "★", "፸", "3", "four"], # noqa: RUF001
  226. dtype=any_string_dtype,
  227. )
  228. expected_dtype = (
  229. "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  230. )
  231. expected = Series(expected, dtype=expected_dtype)
  232. if (
  233. method == "isdigit"
  234. and isinstance(ser.dtype, StringDtype)
  235. and ser.dtype.storage == "pyarrow"
  236. and not pa_version_under21p0
  237. ):
  238. # known difference in behavior between python and pyarrow unicode handling
  239. # pyarrow 21+ considers ¼ and ፸ as a digit, while python does not
  240. expected.iloc[3] = True
  241. expected.iloc[5] = True
  242. result = getattr(ser.str, method)()
  243. tm.assert_series_equal(result, expected)
  244. # compare with standard library
  245. # (only for non-pyarrow storage given the above differences)
  246. if any_string_dtype == "object" or (
  247. isinstance(any_string_dtype, StringDtype)
  248. and any_string_dtype.storage == "python"
  249. ):
  250. expected = [getattr(item, method)() for item in ser]
  251. assert list(result) == expected
  252. @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")
  253. @pytest.mark.parametrize(
  254. "method, expected",
  255. [
  256. ("isnumeric", [False, np.nan, True, False, np.nan, True, False]),
  257. ("isdecimal", [False, np.nan, False, False, np.nan, True, False]),
  258. ],
  259. )
  260. def test_isnumeric_unicode_missing(method, expected, any_string_dtype):
  261. values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] # noqa: RUF001
  262. ser = Series(values, dtype=any_string_dtype)
  263. if any_string_dtype == "str":
  264. # NaN propagates as False
  265. expected = Series(expected, dtype=object).fillna(False).astype(bool)
  266. else:
  267. expected_dtype = (
  268. "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  269. )
  270. expected = Series(expected, dtype=expected_dtype)
  271. result = getattr(ser.str, method)()
  272. tm.assert_series_equal(result, expected)
  273. def test_spilt_join_roundtrip(any_string_dtype):
  274. ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
  275. result = ser.str.split("_").str.join("_")
  276. expected = ser.astype(object)
  277. tm.assert_series_equal(result, expected)
  278. def test_spilt_join_roundtrip_mixed_object():
  279. ser = Series(
  280. ["a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0]
  281. )
  282. result = ser.str.split("_").str.join("_")
  283. expected = Series(
  284. ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", None, np.nan, np.nan],
  285. dtype=object,
  286. )
  287. tm.assert_series_equal(result, expected)
  288. def test_len(any_string_dtype):
  289. ser = Series(
  290. ["foo", "fooo", "fooooo", np.nan, "fooooooo", "foo\n", "あ"],
  291. dtype=any_string_dtype,
  292. )
  293. result = ser.str.len()
  294. expected_dtype = (
  295. "float64" if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
  296. )
  297. expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype)
  298. tm.assert_series_equal(result, expected)
  299. def test_len_mixed():
  300. ser = Series(
  301. ["a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0]
  302. )
  303. result = ser.str.len()
  304. expected = Series([3, np.nan, 13, np.nan, np.nan, 3, np.nan, np.nan, np.nan])
  305. tm.assert_series_equal(result, expected)
  306. @pytest.mark.parametrize(
  307. "method,sub,start,end,expected",
  308. [
  309. ("index", "EF", None, None, [4, 3, 1, 0]),
  310. ("rindex", "EF", None, None, [4, 5, 7, 4]),
  311. ("index", "EF", 3, None, [4, 3, 7, 4]),
  312. ("rindex", "EF", 3, None, [4, 5, 7, 4]),
  313. ("index", "E", 4, 8, [4, 5, 7, 4]),
  314. ("rindex", "E", 0, 5, [4, 3, 1, 4]),
  315. ],
  316. )
  317. def test_index(method, sub, start, end, index_or_series, any_string_dtype, expected):
  318. obj = index_or_series(
  319. ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype
  320. )
  321. expected_dtype = (
  322. np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
  323. )
  324. expected = index_or_series(expected, dtype=expected_dtype)
  325. result = getattr(obj.str, method)(sub, start, end)
  326. if index_or_series is Series:
  327. tm.assert_series_equal(result, expected)
  328. else:
  329. tm.assert_index_equal(result, expected)
  330. # compare with standard library
  331. expected = [getattr(item, method)(sub, start, end) for item in obj]
  332. assert list(result) == expected
  333. def test_index_not_found_raises(index_or_series, any_string_dtype):
  334. obj = index_or_series(
  335. ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype
  336. )
  337. with pytest.raises(ValueError, match="substring not found"):
  338. obj.str.index("DE")
  339. @pytest.mark.parametrize("method", ["index", "rindex"])
  340. def test_index_wrong_type_raises(index_or_series, any_string_dtype, method):
  341. obj = index_or_series([], dtype=any_string_dtype)
  342. msg = "expected a string object, not int"
  343. with pytest.raises(TypeError, match=msg):
  344. getattr(obj.str, method)(0)
  345. @pytest.mark.parametrize(
  346. "method, exp",
  347. [
  348. ["index", [1, 1, 0]],
  349. ["rindex", [3, 1, 2]],
  350. ],
  351. )
  352. def test_index_missing(any_string_dtype, method, exp):
  353. ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype)
  354. expected_dtype = (
  355. np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
  356. )
  357. result = getattr(ser.str, method)("b")
  358. expected = Series(exp + [np.nan], dtype=expected_dtype)
  359. tm.assert_series_equal(result, expected)
  360. def test_pipe_failures(any_string_dtype):
  361. # #2119
  362. ser = Series(["A|B|C"], dtype=any_string_dtype)
  363. result = ser.str.split("|")
  364. expected = Series([["A", "B", "C"]], dtype=object)
  365. tm.assert_series_equal(result, expected)
  366. result = ser.str.replace("|", " ", regex=False)
  367. expected = Series(["A B C"], dtype=any_string_dtype)
  368. tm.assert_series_equal(result, expected)
  369. @pytest.mark.parametrize(
  370. "start, stop, step, expected",
  371. [
  372. (2, 5, None, ["foo", "bar", np.nan, "baz"]),
  373. (0, 3, -1, ["", "", np.nan, ""]),
  374. (None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]),
  375. (None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]),
  376. (3, 10, 2, ["oto", "ato", np.nan, "aqx"]),
  377. (3, 0, -1, ["ofa", "aba", np.nan, "aba"]),
  378. ],
  379. )
  380. def test_slice(start, stop, step, expected, any_string_dtype):
  381. ser = Series(["aafootwo", "aabartwo", np.nan, "aabazqux"], dtype=any_string_dtype)
  382. result = ser.str.slice(start, stop, step)
  383. expected = Series(expected, dtype=any_string_dtype)
  384. tm.assert_series_equal(result, expected)
  385. @pytest.mark.parametrize(
  386. "start, stop, step, expected",
  387. [
  388. (2, 5, None, ["foo", np.nan, "bar", np.nan, np.nan, None, np.nan, np.nan]),
  389. (4, 1, -1, ["oof", np.nan, "rab", np.nan, np.nan, None, np.nan, np.nan]),
  390. ],
  391. )
  392. def test_slice_mixed_object(start, stop, step, expected):
  393. ser = Series(["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0])
  394. result = ser.str.slice(start, stop, step)
  395. expected = Series(expected, dtype=object)
  396. tm.assert_series_equal(result, expected)
  397. @pytest.mark.parametrize(
  398. "start,stop,repl,expected",
  399. [
  400. (2, 3, None, ["shrt", "a it longer", "evnlongerthanthat", "", np.nan]),
  401. (2, 3, "z", ["shzrt", "a zit longer", "evznlongerthanthat", "z", np.nan]),
  402. (2, 2, "z", ["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]),
  403. (2, 1, "z", ["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]),
  404. (-1, None, "z", ["shorz", "a bit longez", "evenlongerthanthaz", "z", np.nan]),
  405. (None, -2, "z", ["zrt", "zer", "zat", "z", np.nan]),
  406. (6, 8, "z", ["shortz", "a bit znger", "evenlozerthanthat", "z", np.nan]),
  407. (-10, 3, "z", ["zrt", "a zit longer", "evenlongzerthanthat", "z", np.nan]),
  408. ],
  409. )
  410. def test_slice_replace(start, stop, repl, expected, any_string_dtype):
  411. ser = Series(
  412. ["short", "a bit longer", "evenlongerthanthat", "", np.nan],
  413. dtype=any_string_dtype,
  414. )
  415. expected = Series(expected, dtype=any_string_dtype)
  416. result = ser.str.slice_replace(start, stop, repl)
  417. tm.assert_series_equal(result, expected)
  418. @pytest.mark.parametrize(
  419. "method, exp",
  420. [
  421. ["strip", ["aa", "bb", np.nan, "cc"]],
  422. ["lstrip", ["aa ", "bb \n", np.nan, "cc "]],
  423. ["rstrip", [" aa", " bb", np.nan, "cc"]],
  424. ],
  425. )
  426. def test_strip_lstrip_rstrip(any_string_dtype, method, exp):
  427. ser = Series([" aa ", " bb \n", np.nan, "cc "], dtype=any_string_dtype)
  428. result = getattr(ser.str, method)()
  429. expected = Series(exp, dtype=any_string_dtype)
  430. tm.assert_series_equal(result, expected)
  431. @pytest.mark.parametrize(
  432. "method, exp",
  433. [
  434. ["strip", ["aa", np.nan, "bb"]],
  435. ["lstrip", ["aa ", np.nan, "bb \t\n"]],
  436. ["rstrip", [" aa", np.nan, " bb"]],
  437. ],
  438. )
  439. def test_strip_lstrip_rstrip_mixed_object(method, exp):
  440. ser = Series([" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0])
  441. result = getattr(ser.str, method)()
  442. expected = Series(exp + [np.nan, np.nan, None, np.nan, np.nan], dtype=object)
  443. tm.assert_series_equal(result, expected)
  444. @pytest.mark.parametrize(
  445. "method, exp",
  446. [
  447. ["strip", ["ABC", " BNSD", "LDFJH "]],
  448. ["lstrip", ["ABCxx", " BNSD", "LDFJH xx"]],
  449. ["rstrip", ["xxABC", "xx BNSD", "LDFJH "]],
  450. ],
  451. )
  452. def test_strip_lstrip_rstrip_args(any_string_dtype, method, exp):
  453. ser = Series(["xxABCxx", "xx BNSD", "LDFJH xx"], dtype=any_string_dtype)
  454. result = getattr(ser.str, method)("x")
  455. expected = Series(exp, dtype=any_string_dtype)
  456. tm.assert_series_equal(result, expected)
  457. @pytest.mark.parametrize(
  458. "prefix, expected", [("a", ["b", " b c", "bc"]), ("ab", ["", "a b c", "bc"])]
  459. )
  460. def test_removeprefix(any_string_dtype, prefix, expected):
  461. ser = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
  462. result = ser.str.removeprefix(prefix)
  463. ser_expected = Series(expected, dtype=any_string_dtype)
  464. tm.assert_series_equal(result, ser_expected)
  465. @pytest.mark.parametrize(
  466. "suffix, expected", [("c", ["ab", "a b ", "b"]), ("bc", ["ab", "a b c", ""])]
  467. )
  468. def test_removesuffix(any_string_dtype, suffix, expected):
  469. ser = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
  470. result = ser.str.removesuffix(suffix)
  471. ser_expected = Series(expected, dtype=any_string_dtype)
  472. tm.assert_series_equal(result, ser_expected)
  473. def test_string_slice_get_syntax(any_string_dtype):
  474. ser = Series(
  475. ["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", np.nan, "CYYYBYYY", "dog", "cYYYt"],
  476. dtype=any_string_dtype,
  477. )
  478. result = ser.str[0]
  479. expected = ser.str.get(0)
  480. tm.assert_series_equal(result, expected)
  481. result = ser.str[:3]
  482. expected = ser.str.slice(stop=3)
  483. tm.assert_series_equal(result, expected)
  484. result = ser.str[2::-1]
  485. expected = ser.str.slice(start=2, step=-1)
  486. tm.assert_series_equal(result, expected)
  487. def test_string_slice_out_of_bounds_nested():
  488. ser = Series([(1, 2), (1,), (3, 4, 5)])
  489. result = ser.str[1]
  490. expected = Series([2, np.nan, 4])
  491. tm.assert_series_equal(result, expected)
  492. def test_string_slice_out_of_bounds(any_string_dtype):
  493. ser = Series(["foo", "b", "ba"], dtype=any_string_dtype)
  494. result = ser.str[1]
  495. expected = Series(["o", np.nan, "a"], dtype=any_string_dtype)
  496. tm.assert_series_equal(result, expected)
  497. def test_encode_decode(any_string_dtype):
  498. ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8")
  499. result = ser.str.decode("utf-8")
  500. expected = Series(["a", "b", "a\xe4"], dtype="str")
  501. tm.assert_series_equal(result, expected)
  502. def test_encode_errors_kwarg(any_string_dtype):
  503. ser = Series(["a", "b", "a\x9d"], dtype=any_string_dtype)
  504. msg = (
  505. r"'charmap' codec can't encode character '\\x9d' in position 1: "
  506. "character maps to <undefined>"
  507. )
  508. with pytest.raises(UnicodeEncodeError, match=msg):
  509. ser.str.encode("cp1252")
  510. result = ser.str.encode("cp1252", "ignore")
  511. expected = ser.map(lambda x: x.encode("cp1252", "ignore"))
  512. tm.assert_series_equal(result, expected)
  513. def test_decode_errors_kwarg():
  514. ser = Series([b"a", b"b", b"a\x9d"])
  515. msg = (
  516. "'charmap' codec can't decode byte 0x9d in position 1: "
  517. "character maps to <undefined>"
  518. )
  519. with pytest.raises(UnicodeDecodeError, match=msg):
  520. ser.str.decode("cp1252")
  521. result = ser.str.decode("cp1252", "ignore")
  522. expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype("str")
  523. tm.assert_series_equal(result, expected)
  524. def test_decode_string_dtype(string_dtype):
  525. # https://github.com/pandas-dev/pandas/pull/60940
  526. ser = Series([b"a", b"b"])
  527. result = ser.str.decode("utf-8", dtype=string_dtype)
  528. expected = Series(["a", "b"], dtype=string_dtype)
  529. tm.assert_series_equal(result, expected)
  530. def test_decode_object_dtype(object_dtype):
  531. # https://github.com/pandas-dev/pandas/pull/60940
  532. ser = Series([b"a", rb"\ud800"])
  533. result = ser.str.decode("utf-8", dtype=object_dtype)
  534. expected = Series(["a", r"\ud800"], dtype=object_dtype)
  535. tm.assert_series_equal(result, expected)
  536. def test_decode_bad_dtype():
  537. # https://github.com/pandas-dev/pandas/pull/60940
  538. ser = Series([b"a", b"b"])
  539. msg = "dtype must be string or object, got dtype='int64'"
  540. with pytest.raises(ValueError, match=msg):
  541. ser.str.decode("utf-8", dtype="int64")
  542. @pytest.mark.parametrize(
  543. "form, expected",
  544. [
  545. ("NFKC", ["ABC", "ABC", "123", np.nan, "アイエ"]),
  546. ("NFC", ["ABC", "ABC", "123", np.nan, "アイエ"]), # noqa: RUF001
  547. ],
  548. )
  549. def test_normalize(form, expected, any_string_dtype):
  550. ser = Series(
  551. ["ABC", "ABC", "123", np.nan, "アイエ"], # noqa: RUF001
  552. index=["a", "b", "c", "d", "e"],
  553. dtype=any_string_dtype,
  554. )
  555. expected = Series(expected, index=["a", "b", "c", "d", "e"], dtype=any_string_dtype)
  556. result = ser.str.normalize(form)
  557. tm.assert_series_equal(result, expected)
  558. def test_normalize_bad_arg_raises(any_string_dtype):
  559. ser = Series(
  560. ["ABC", "ABC", "123", np.nan, "アイエ"], # noqa: RUF001
  561. index=["a", "b", "c", "d", "e"],
  562. dtype=any_string_dtype,
  563. )
  564. with pytest.raises(ValueError, match="invalid normalization form"):
  565. ser.str.normalize("xxx")
  566. def test_normalize_index():
  567. idx = Index(["ABC", "123", "アイエ"]) # noqa: RUF001
  568. expected = Index(["ABC", "123", "アイエ"])
  569. result = idx.str.normalize("NFKC")
  570. tm.assert_index_equal(result, expected)
  571. @pytest.mark.parametrize(
  572. "values,inferred_type",
  573. [
  574. (["a", "b"], "string"),
  575. (["a", "b", 1], "mixed-integer"),
  576. (["a", "b", 1.3], "mixed"),
  577. (["a", "b", 1.3, 1], "mixed-integer"),
  578. (["aa", datetime(2011, 1, 1)], "mixed"),
  579. ],
  580. )
  581. def test_index_str_accessor_visibility(values, inferred_type, index_or_series):
  582. obj = index_or_series(values)
  583. if index_or_series is Index:
  584. assert obj.inferred_type == inferred_type
  585. assert isinstance(obj.str, StringMethods)
  586. @pytest.mark.parametrize(
  587. "values,inferred_type",
  588. [
  589. ([1, np.nan], "floating"),
  590. ([datetime(2011, 1, 1)], "datetime64"),
  591. ([timedelta(1)], "timedelta64"),
  592. ],
  593. )
  594. def test_index_str_accessor_non_string_values_raises(
  595. values, inferred_type, index_or_series
  596. ):
  597. obj = index_or_series(values)
  598. if index_or_series is Index:
  599. assert obj.inferred_type == inferred_type
  600. msg = "Can only use .str accessor with string values"
  601. with pytest.raises(AttributeError, match=msg):
  602. obj.str
  603. def test_index_str_accessor_multiindex_raises():
  604. # MultiIndex has mixed dtype, but not allow to use accessor
  605. idx = MultiIndex.from_tuples([("a", "b"), ("a", "b")])
  606. assert idx.inferred_type == "mixed"
  607. msg = "Can only use .str accessor with Index, not MultiIndex"
  608. with pytest.raises(AttributeError, match=msg):
  609. idx.str
  610. def test_str_accessor_no_new_attributes(any_string_dtype):
  611. # https://github.com/pandas-dev/pandas/issues/10673
  612. ser = Series(list("aabbcde"), dtype=any_string_dtype)
  613. with pytest.raises(AttributeError, match="You cannot add any new attribute"):
  614. ser.str.xlabel = "a"
  615. def test_cat_on_bytes_raises():
  616. lhs = Series(np.array(list("abc"), "S1").astype(object))
  617. rhs = Series(np.array(list("def"), "S1").astype(object))
  618. msg = "Cannot use .str.cat with values of inferred dtype 'bytes'"
  619. with pytest.raises(TypeError, match=msg):
  620. lhs.str.cat(rhs)
  621. def test_str_accessor_in_apply_func():
  622. # https://github.com/pandas-dev/pandas/issues/38979
  623. df = DataFrame(zip("abc", "def"))
  624. expected = Series(["A/D", "B/E", "C/F"])
  625. result = df.apply(lambda f: "/".join(f.str.upper()), axis=1)
  626. tm.assert_series_equal(result, expected)
  627. def test_zfill():
  628. # https://github.com/pandas-dev/pandas/issues/20868
  629. value = Series(["-1", "1", "1000", 10, np.nan])
  630. expected = Series(["-01", "001", "1000", np.nan, np.nan], dtype=object)
  631. tm.assert_series_equal(value.str.zfill(3), expected)
  632. value = Series(["-2", "+5"])
  633. expected = Series(["-0002", "+0005"])
  634. tm.assert_series_equal(value.str.zfill(5), expected)
  635. def test_zfill_with_non_integer_argument():
  636. value = Series(["-2", "+5"])
  637. wid = "a"
  638. msg = f"width must be of integer type, not {type(wid).__name__}"
  639. with pytest.raises(TypeError, match=msg):
  640. value.str.zfill(wid)
  641. def test_zfill_with_leading_sign():
  642. value = Series(["-cat", "-1", "+dog"])
  643. expected = Series(["-0cat", "-0001", "+0dog"])
  644. tm.assert_series_equal(value.str.zfill(5), expected)
  645. def test_get_with_dict_label():
  646. # GH47911
  647. s = Series(
  648. [
  649. {"name": "Hello", "value": "World"},
  650. {"name": "Goodbye", "value": "Planet"},
  651. {"value": "Sea"},
  652. ]
  653. )
  654. result = s.str.get("name")
  655. expected = Series(["Hello", "Goodbye", None], dtype=object)
  656. tm.assert_series_equal(result, expected)
  657. result = s.str.get("value")
  658. expected = Series(["World", "Planet", "Sea"], dtype=object)
  659. tm.assert_series_equal(result, expected)
  660. def test_series_str_decode():
  661. # GH 22613
  662. result = Series([b"x", b"y"]).str.decode(encoding="UTF-8", errors="strict")
  663. expected = Series(["x", "y"], dtype="str")
  664. tm.assert_series_equal(result, expected)
  665. def test_reversed_logical_ops(any_string_dtype):
  666. # GH#60234
  667. dtype = any_string_dtype
  668. warn = None if dtype == object else DeprecationWarning
  669. left = Series([True, False, False, True])
  670. right = Series(["", "", "b", "c"], dtype=dtype)
  671. msg = "operations between boolean dtype and"
  672. with tm.assert_produces_warning(warn, match=msg):
  673. result = left | right
  674. expected = left | right.astype(bool)
  675. tm.assert_series_equal(result, expected)
  676. with tm.assert_produces_warning(warn, match=msg):
  677. result = left & right
  678. expected = left & right.astype(bool)
  679. tm.assert_series_equal(result, expected)
  680. with tm.assert_produces_warning(warn, match=msg):
  681. result = left ^ right
  682. expected = left ^ right.astype(bool)
  683. tm.assert_series_equal(result, expected)
  684. def test_pathlib_path_division(any_string_dtype, request):
  685. # GH#61940
  686. if any_string_dtype == object:
  687. mark = pytest.mark.xfail(
  688. reason="with NA present we go through _masked_arith_op which "
  689. "raises TypeError bc Path is not recognized by lib.is_scalar."
  690. )
  691. request.applymarker(mark)
  692. item = Path("/Users/Irv/")
  693. ser = Series(["A", "B", NA], dtype=any_string_dtype)
  694. result = item / ser
  695. expected = Series([item / "A", item / "B", ser.dtype.na_value], dtype=object)
  696. tm.assert_series_equal(result, expected)
  697. result = ser / item
  698. expected = Series(["A" / item, "B" / item, ser.dtype.na_value], dtype=object)
  699. tm.assert_series_equal(result, expected)