test_find_replace.py 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342
  1. from datetime import datetime
  2. import re
  3. import numpy as np
  4. import pytest
  5. import pandas.util._test_decorators as td
  6. import pandas as pd
  7. from pandas import (
  8. Series,
  9. _testing as tm,
  10. )
  11. from pandas.tests.strings import (
  12. _convert_na_value,
  13. is_object_or_nan_string_dtype,
  14. )
# --------------------------------------------------------------------------------------
# str.contains
# --------------------------------------------------------------------------------------
  18. def test_contains(any_string_dtype):
  19. values = np.array(
  20. ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_
  21. )
  22. values = Series(values, dtype=any_string_dtype)
  23. pat = "mmm[_]+"
  24. result = values.str.contains(pat)
  25. if any_string_dtype == "str":
  26. # NaN propagates as False
  27. expected = Series([False, False, True, True, False], dtype=bool)
  28. else:
  29. expected_dtype = (
  30. "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  31. )
  32. expected = Series(
  33. np.array([False, np.nan, True, True, False], dtype=np.object_),
  34. dtype=expected_dtype,
  35. )
  36. tm.assert_series_equal(result, expected)
  37. result = values.str.contains(pat, regex=False)
  38. if any_string_dtype == "str":
  39. expected = Series([False, False, False, False, True], dtype=bool)
  40. else:
  41. expected = Series(
  42. np.array([False, np.nan, False, False, True], dtype=np.object_),
  43. dtype=expected_dtype,
  44. )
  45. tm.assert_series_equal(result, expected)
  46. values = Series(
  47. np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object),
  48. dtype=any_string_dtype,
  49. )
  50. result = values.str.contains(pat)
  51. expected_dtype = (
  52. np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  53. )
  54. expected = Series(np.array([False, False, True, True]), dtype=expected_dtype)
  55. tm.assert_series_equal(result, expected)
  56. # case insensitive using regex
  57. values = Series(
  58. np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object),
  59. dtype=any_string_dtype,
  60. )
  61. result = values.str.contains("FOO|mmm", case=False)
  62. expected = Series(np.array([True, False, True, True]), dtype=expected_dtype)
  63. tm.assert_series_equal(result, expected)
  64. # case insensitive without regex
  65. result = values.str.contains("foo", regex=False, case=False)
  66. expected = Series(np.array([True, False, True, False]), dtype=expected_dtype)
  67. tm.assert_series_equal(result, expected)
  68. # unicode
  69. values = Series(
  70. np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_),
  71. dtype=any_string_dtype,
  72. )
  73. pat = "mmm[_]+"
  74. result = values.str.contains(pat)
  75. if any_string_dtype == "str":
  76. expected = Series([False, False, True, True], dtype=bool)
  77. else:
  78. expected_dtype = (
  79. "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  80. )
  81. expected = Series(
  82. np.array([False, np.nan, True, True], dtype=np.object_),
  83. dtype=expected_dtype,
  84. )
  85. tm.assert_series_equal(result, expected)
  86. result = values.str.contains(pat, na=False)
  87. expected_dtype = (
  88. np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  89. )
  90. expected = Series(np.array([False, False, True, True]), dtype=expected_dtype)
  91. tm.assert_series_equal(result, expected)
  92. values = Series(
  93. np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_),
  94. dtype=any_string_dtype,
  95. )
  96. result = values.str.contains(pat)
  97. expected = Series(np.array([False, False, True, True]), dtype=expected_dtype)
  98. tm.assert_series_equal(result, expected)
  99. def test_contains_object_mixed():
  100. mixed = Series(
  101. np.array(
  102. ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
  103. dtype=object,
  104. )
  105. )
  106. result = mixed.str.contains("o")
  107. expected = Series(
  108. np.array(
  109. [False, np.nan, False, np.nan, np.nan, True, None, np.nan, np.nan],
  110. dtype=np.object_,
  111. )
  112. )
  113. tm.assert_series_equal(result, expected)
  114. def test_contains_na_kwarg_for_object_category():
  115. # gh 22158
  116. # na for category
  117. values = Series(["a", "b", "c", "a", np.nan], dtype="category")
  118. result = values.str.contains("a", na=True)
  119. expected = Series([True, False, False, True, True])
  120. tm.assert_series_equal(result, expected)
  121. result = values.str.contains("a", na=False)
  122. expected = Series([True, False, False, True, False])
  123. tm.assert_series_equal(result, expected)
  124. # na for objects
  125. values = Series(["a", "b", "c", "a", np.nan])
  126. result = values.str.contains("a", na=True)
  127. expected = Series([True, False, False, True, True])
  128. tm.assert_series_equal(result, expected)
  129. result = values.str.contains("a", na=False)
  130. expected = Series([True, False, False, True, False])
  131. tm.assert_series_equal(result, expected)
  132. @pytest.mark.parametrize(
  133. "na, expected",
  134. [
  135. (None, pd.NA),
  136. (True, True),
  137. (False, False),
  138. (0, False),
  139. (3, True),
  140. (np.nan, pd.NA),
  141. ],
  142. )
  143. @pytest.mark.parametrize("regex", [True, False])
  144. def test_contains_na_kwarg_for_nullable_string_dtype(
  145. nullable_string_dtype, na, expected, regex
  146. ):
  147. # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416
  148. values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype)
  149. msg = (
  150. "Allowing a non-bool 'na' in obj.str.contains is deprecated and "
  151. "will raise in a future version"
  152. )
  153. warn = None
  154. if not pd.isna(na) and not isinstance(na, bool):
  155. warn = FutureWarning
  156. with tm.assert_produces_warning(warn, match=msg):
  157. result = values.str.contains("a", na=na, regex=regex)
  158. expected = Series([True, False, False, True, expected], dtype="boolean")
  159. tm.assert_series_equal(result, expected)
  160. def test_contains_moar(any_string_dtype):
  161. # PR #1179
  162. s = Series(
  163. ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
  164. dtype=any_string_dtype,
  165. )
  166. result = s.str.contains("a")
  167. if any_string_dtype == "str":
  168. # NaN propagates as False
  169. expected_dtype = bool
  170. na_value = False
  171. else:
  172. expected_dtype = (
  173. "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  174. )
  175. na_value = np.nan
  176. expected = Series(
  177. [False, False, False, True, True, False, na_value, False, False, True],
  178. dtype=expected_dtype,
  179. )
  180. tm.assert_series_equal(result, expected)
  181. result = s.str.contains("a", case=False)
  182. expected = Series(
  183. [True, False, False, True, True, False, na_value, True, False, True],
  184. dtype=expected_dtype,
  185. )
  186. tm.assert_series_equal(result, expected)
  187. result = s.str.contains("Aa")
  188. expected = Series(
  189. [False, False, False, True, False, False, na_value, False, False, False],
  190. dtype=expected_dtype,
  191. )
  192. tm.assert_series_equal(result, expected)
  193. result = s.str.contains("ba")
  194. expected = Series(
  195. [False, False, False, True, False, False, na_value, False, False, False],
  196. dtype=expected_dtype,
  197. )
  198. tm.assert_series_equal(result, expected)
  199. result = s.str.contains("ba", case=False)
  200. expected = Series(
  201. [False, False, False, True, True, False, na_value, True, False, False],
  202. dtype=expected_dtype,
  203. )
  204. tm.assert_series_equal(result, expected)
  205. def test_contains_nan(any_string_dtype):
  206. # PR #14171
  207. s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype)
  208. result = s.str.contains("foo", na=False)
  209. expected_dtype = (
  210. np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  211. )
  212. expected = Series([False, False, False], dtype=expected_dtype)
  213. tm.assert_series_equal(result, expected)
  214. result = s.str.contains("foo", na=True)
  215. expected = Series([True, True, True], dtype=expected_dtype)
  216. tm.assert_series_equal(result, expected)
  217. # TODO(infer_string)
  218. # this particular combination of events is broken on 2.3
  219. # would require cherry picking #58483, which in turn requires #57481
  220. # which introduce many behavioral changes
  221. if not (
  222. hasattr(any_string_dtype, "storage")
  223. and any_string_dtype.storage == "python"
  224. and any_string_dtype.na_value is np.nan
  225. ):
  226. msg = (
  227. "Allowing a non-bool 'na' in obj.str.contains is deprecated and "
  228. "will raise in a future version"
  229. )
  230. with tm.assert_produces_warning(FutureWarning, match=msg):
  231. result = s.str.contains("foo", na="foo")
  232. if any_string_dtype == "object":
  233. expected = Series(["foo", "foo", "foo"], dtype=np.object_)
  234. elif any_string_dtype.na_value is np.nan:
  235. expected = Series([True, True, True], dtype=np.bool_)
  236. else:
  237. expected = Series([True, True, True], dtype="boolean")
  238. tm.assert_series_equal(result, expected)
  239. result = s.str.contains("foo")
  240. if any_string_dtype == "str":
  241. # NaN propagates as False
  242. expected = Series([False, False, False], dtype=bool)
  243. else:
  244. expected_dtype = (
  245. "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  246. )
  247. expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype)
  248. tm.assert_series_equal(result, expected)
  249. def test_contains_compiled_regex(any_string_dtype):
  250. # GH#61942
  251. expected_dtype = (
  252. np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  253. )
  254. ser = Series(["foo", "bar", "Baz"], dtype=any_string_dtype)
  255. pat = re.compile("ba.")
  256. result = ser.str.contains(pat)
  257. expected = Series([False, True, False], dtype=expected_dtype)
  258. tm.assert_series_equal(result, expected)
  259. # TODO this currently works for pyarrow-backed dtypes but raises for python
  260. if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
  261. result = ser.str.contains(pat, case=False)
  262. expected = Series([False, True, True], dtype=expected_dtype)
  263. tm.assert_series_equal(result, expected)
  264. else:
  265. with pytest.raises(
  266. ValueError, match="cannot process flags argument with a compiled pattern"
  267. ):
  268. ser.str.contains(pat, case=False)
  269. pat = re.compile("ba.", flags=re.IGNORECASE)
  270. result = ser.str.contains(pat)
  271. expected = Series([False, True, True], dtype=expected_dtype)
  272. tm.assert_series_equal(result, expected)
  273. # TODO should this be supported?
  274. with pytest.raises(
  275. ValueError, match="cannot process flags argument with a compiled pattern"
  276. ):
  277. ser.str.contains(pat, flags=re.IGNORECASE)
  278. def test_contains_compiled_regex_flags(any_string_dtype):
  279. # ensure other (than ignorecase) flags are respected
  280. expected_dtype = (
  281. np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  282. )
  283. ser = Series(["foobar", "foo\nbar", "Baz"], dtype=any_string_dtype)
  284. pat = re.compile("^ba")
  285. result = ser.str.contains(pat)
  286. expected = Series([False, False, False], dtype=expected_dtype)
  287. tm.assert_series_equal(result, expected)
  288. pat = re.compile("^ba", flags=re.MULTILINE)
  289. result = ser.str.contains(pat)
  290. expected = Series([False, True, False], dtype=expected_dtype)
  291. tm.assert_series_equal(result, expected)
  292. pat = re.compile("^ba", flags=re.MULTILINE | re.IGNORECASE)
  293. result = ser.str.contains(pat)
  294. expected = Series([False, True, True], dtype=expected_dtype)
  295. tm.assert_series_equal(result, expected)
# --------------------------------------------------------------------------------------
# str.startswith
# --------------------------------------------------------------------------------------
  299. def test_startswith_endswith_validate_na(request, any_string_dtype):
  300. if (
  301. any_string_dtype == "string"
  302. and any_string_dtype.na_value is np.nan
  303. and any_string_dtype.storage == "python"
  304. ):
  305. request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
  306. # GH#59615
  307. ser = Series(
  308. ["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"],
  309. dtype=any_string_dtype,
  310. )
  311. msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated"
  312. with tm.assert_produces_warning(FutureWarning, match=msg):
  313. ser.str.startswith("kapow", na="baz")
  314. msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated"
  315. with tm.assert_produces_warning(FutureWarning, match=msg):
  316. ser.str.endswith("bar", na="baz")
  317. @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")
  318. @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])
  319. @pytest.mark.parametrize("dtype", ["object", "category"])
  320. @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
  321. @pytest.mark.parametrize("na", [True, False])
  322. def test_startswith(pat, dtype, null_value, na, using_infer_string):
  323. # add category dtype parametrizations for GH-36241
  324. values = Series(
  325. ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"],
  326. dtype=dtype,
  327. )
  328. result = values.str.startswith(pat)
  329. exp = Series([False, np.nan, True, False, False, np.nan, True])
  330. if dtype == "object" and null_value is pd.NA:
  331. # GH#18463
  332. exp = exp.fillna(null_value)
  333. elif dtype == "object" and null_value is None:
  334. exp[exp.isna()] = None
  335. elif using_infer_string and dtype == "category":
  336. exp = exp.fillna(False).astype(bool)
  337. tm.assert_series_equal(result, exp)
  338. result = values.str.startswith(pat, na=na)
  339. exp = Series([False, na, True, False, False, na, True])
  340. tm.assert_series_equal(result, exp)
  341. # mixed
  342. mixed = np.array(
  343. ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
  344. dtype=np.object_,
  345. )
  346. rs = Series(mixed).str.startswith("f")
  347. xp = Series([False, np.nan, False, np.nan, np.nan, True, None, np.nan, np.nan])
  348. tm.assert_series_equal(rs, xp)
  349. @pytest.mark.parametrize("na", [None, True, False])
  350. def test_startswith_string_dtype(any_string_dtype, na):
  351. values = Series(
  352. ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."],
  353. dtype=any_string_dtype,
  354. )
  355. result = values.str.startswith("foo", na=na)
  356. expected_dtype = (
  357. (object if na is None else bool)
  358. if is_object_or_nan_string_dtype(any_string_dtype)
  359. else "boolean"
  360. )
  361. if any_string_dtype == "str":
  362. # NaN propagates as False
  363. expected_dtype = bool
  364. if na is None:
  365. na = False
  366. exp = Series(
  367. [False, na, True, False, False, na, True, False, False], dtype=expected_dtype
  368. )
  369. tm.assert_series_equal(result, exp)
  370. result = values.str.startswith("rege.", na=na)
  371. exp = Series(
  372. [False, na, False, False, False, na, False, False, True], dtype=expected_dtype
  373. )
  374. tm.assert_series_equal(result, exp)
# --------------------------------------------------------------------------------------
# str.endswith
# --------------------------------------------------------------------------------------
  378. @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")
  379. @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])
  380. @pytest.mark.parametrize("dtype", ["object", "category"])
  381. @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
  382. @pytest.mark.parametrize("na", [True, False])
  383. def test_endswith(pat, dtype, null_value, na, using_infer_string):
  384. # add category dtype parametrizations for GH-36241
  385. values = Series(
  386. ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"],
  387. dtype=dtype,
  388. )
  389. result = values.str.endswith(pat)
  390. exp = Series([False, np.nan, False, False, True, np.nan, True])
  391. if dtype == "object" and null_value is pd.NA:
  392. # GH#18463
  393. exp = exp.fillna(null_value)
  394. elif dtype == "object" and null_value is None:
  395. exp[exp.isna()] = None
  396. elif using_infer_string and dtype == "category":
  397. exp = exp.fillna(False).astype(bool)
  398. tm.assert_series_equal(result, exp)
  399. result = values.str.endswith(pat, na=na)
  400. exp = Series([False, na, False, False, True, na, True])
  401. tm.assert_series_equal(result, exp)
  402. # mixed
  403. mixed = np.array(
  404. ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
  405. dtype=object,
  406. )
  407. rs = Series(mixed).str.endswith("f")
  408. xp = Series([False, np.nan, False, np.nan, np.nan, False, None, np.nan, np.nan])
  409. tm.assert_series_equal(rs, xp)
  410. @pytest.mark.parametrize("na", [None, True, False])
  411. def test_endswith_string_dtype(any_string_dtype, na):
  412. values = Series(
  413. ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."],
  414. dtype=any_string_dtype,
  415. )
  416. result = values.str.endswith("foo", na=na)
  417. expected_dtype = (
  418. (object if na is None else bool)
  419. if is_object_or_nan_string_dtype(any_string_dtype)
  420. else "boolean"
  421. )
  422. if any_string_dtype == "str":
  423. # NaN propagates as False
  424. expected_dtype = bool
  425. if na is None:
  426. na = False
  427. exp = Series(
  428. [False, na, False, False, True, na, True, False, False], dtype=expected_dtype
  429. )
  430. tm.assert_series_equal(result, exp)
  431. result = values.str.endswith("rege.", na=na)
  432. exp = Series(
  433. [False, na, False, False, False, na, False, False, True], dtype=expected_dtype
  434. )
  435. tm.assert_series_equal(result, exp)
# --------------------------------------------------------------------------------------
# str.replace
# --------------------------------------------------------------------------------------
  439. def test_replace(any_string_dtype):
  440. ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
  441. result = ser.str.replace("BAD[_]*", "", regex=True)
  442. expected = Series(["foobar", np.nan], dtype=any_string_dtype)
  443. tm.assert_series_equal(result, expected)
  444. def test_replace_max_replacements(any_string_dtype):
  445. ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
  446. expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
  447. result = ser.str.replace("BAD[_]*", "", n=1, regex=True)
  448. tm.assert_series_equal(result, expected)
  449. expected = Series(["foo__barBAD", np.nan], dtype=any_string_dtype)
  450. result = ser.str.replace("BAD", "", n=1, regex=False)
  451. tm.assert_series_equal(result, expected)
  452. def test_replace_mixed_object():
  453. ser = Series(
  454. ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
  455. )
  456. result = Series(ser).str.replace("BAD[_]*", "", regex=True)
  457. expected = Series(
  458. ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object
  459. )
  460. tm.assert_series_equal(result, expected)
  461. def test_replace_unicode(any_string_dtype):
  462. ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
  463. expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
  464. result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True)
  465. tm.assert_series_equal(result, expected)
  466. @pytest.mark.parametrize("repl", [None, 3, {"a": "b"}])
  467. @pytest.mark.parametrize("data", [["a", "b", None], ["a", "b", "c", "ad"]])
  468. def test_replace_wrong_repl_type_raises(any_string_dtype, index_or_series, repl, data):
  469. # https://github.com/pandas-dev/pandas/issues/13438
  470. msg = "repl must be a string or callable"
  471. obj = index_or_series(data, dtype=any_string_dtype)
  472. with pytest.raises(TypeError, match=msg):
  473. obj.str.replace("a", repl)
  474. def test_replace_callable(any_string_dtype):
  475. # GH 15055
  476. ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
  477. # test with callable
  478. repl = lambda m: m.group(0).swapcase()
  479. result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True)
  480. expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
  481. tm.assert_series_equal(result, expected)
  482. @pytest.mark.parametrize(
  483. "repl", [lambda: None, lambda m, x: None, lambda m, x, y=None: None]
  484. )
  485. def test_replace_callable_raises(any_string_dtype, repl):
  486. # GH 15055
  487. values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
  488. # test with wrong number of arguments, raising an error
  489. msg = (
  490. r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
  491. r"(?(3)required )positional arguments?"
  492. )
  493. with pytest.raises(TypeError, match=msg):
  494. values.str.replace("a", repl, regex=True)
  495. @pytest.mark.parametrize(
  496. "repl, expected_list",
  497. [
  498. (
  499. r"\g<three> \g<two> \g<one>",
  500. ["Three Two One", "Baz Bar Foo"],
  501. ),
  502. (
  503. r"\g<3> \g<2> \g<1>",
  504. ["Three Two One", "Baz Bar Foo"],
  505. ),
  506. (
  507. r"\g<2>0",
  508. ["Two0", "Bar0"],
  509. ),
  510. (
  511. r"\g<2>0 \1",
  512. ["Two0 One", "Bar0 Foo"],
  513. ),
  514. ],
  515. ids=[
  516. "named_groups_full_swap",
  517. "numbered_groups_full_swap",
  518. "single_group_with_literal",
  519. "mixed_group_reference_with_literal",
  520. ],
  521. )
  522. @pytest.mark.parametrize("use_compile", [True, False])
  523. def test_replace_named_groups_regex_swap(
  524. any_string_dtype, use_compile, repl, expected_list
  525. ):
  526. # GH#57636
  527. ser = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype)
  528. pattern = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
  529. if use_compile:
  530. pattern = re.compile(pattern)
  531. result = ser.str.replace(pattern, repl, regex=True)
  532. expected = Series(expected_list, dtype=any_string_dtype)
  533. tm.assert_series_equal(result, expected)
  534. @pytest.mark.parametrize(
  535. "repl",
  536. [
  537. r"\g<20>",
  538. r"\20",
  539. ],
  540. )
  541. @pytest.mark.parametrize("use_compile", [True, False])
  542. def test_replace_named_groups_regex_swap_expected_fail(
  543. any_string_dtype, repl, use_compile
  544. ):
  545. # GH#57636
  546. pattern = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
  547. if use_compile:
  548. pattern = re.compile(pattern)
  549. ser = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype)
  550. with pytest.raises(re.error, match="invalid group reference"):
  551. ser.str.replace(pattern, repl, regex=True)
  552. def test_replace_callable_named_groups(any_string_dtype):
  553. # test regex named groups
  554. ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype)
  555. pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
  556. repl = lambda m: m.group("middle").swapcase()
  557. result = ser.str.replace(pat, repl, regex=True)
  558. expected = Series(["bAR", np.nan], dtype=any_string_dtype)
  559. tm.assert_series_equal(result, expected)
  560. def test_replace_compiled_regex(any_string_dtype):
  561. # GH 15446
  562. ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
  563. # test with compiled regex
  564. pat = re.compile(r"BAD_*")
  565. result = ser.str.replace(pat, "", regex=True)
  566. expected = Series(["foobar", np.nan], dtype=any_string_dtype)
  567. tm.assert_series_equal(result, expected)
  568. result = ser.str.replace(pat, "", n=1, regex=True)
  569. expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
  570. tm.assert_series_equal(result, expected)
  571. def test_replace_compiled_regex_mixed_object():
  572. pat = re.compile(r"BAD_*")
  573. ser = Series(
  574. ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
  575. )
  576. result = Series(ser).str.replace(pat, "", regex=True)
  577. expected = Series(
  578. ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object
  579. )
  580. tm.assert_series_equal(result, expected)
  581. def test_replace_compiled_regex_unicode(any_string_dtype):
  582. ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
  583. expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
  584. pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
  585. result = ser.str.replace(pat, ", ", regex=True)
  586. tm.assert_series_equal(result, expected)
  587. def test_replace_compiled_regex_raises(any_string_dtype):
  588. # case and flags provided to str.replace will have no effect
  589. # and will produce warnings
  590. ser = Series(["fooBAD__barBAD__bad", np.nan], dtype=any_string_dtype)
  591. pat = re.compile(r"BAD_*")
  592. msg = "case and flags cannot be set when pat is a compiled regex"
  593. with pytest.raises(ValueError, match=msg):
  594. ser.str.replace(pat, "", flags=re.IGNORECASE, regex=True)
  595. with pytest.raises(ValueError, match=msg):
  596. ser.str.replace(pat, "", case=False, regex=True)
  597. with pytest.raises(ValueError, match=msg):
  598. ser.str.replace(pat, "", case=True, regex=True)
  599. def test_replace_compiled_regex_callable(any_string_dtype):
  600. # test with callable
  601. ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
  602. repl = lambda m: m.group(0).swapcase()
  603. pat = re.compile("[a-z][A-Z]{2}")
  604. result = ser.str.replace(pat, repl, n=2, regex=True)
  605. expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
  606. tm.assert_series_equal(result, expected)
  607. @pytest.mark.parametrize(
  608. "regex,expected", [(True, ["bao", "bao", np.nan]), (False, ["bao", "foo", np.nan])]
  609. )
  610. def test_replace_literal(regex, expected, any_string_dtype):
  611. # GH16808 literal replace (regex=False vs regex=True)
  612. ser = Series(["f.o", "foo", np.nan], dtype=any_string_dtype)
  613. expected = Series(expected, dtype=any_string_dtype)
  614. result = ser.str.replace("f.", "ba", regex=regex)
  615. tm.assert_series_equal(result, expected)
  616. def test_replace_literal_callable_raises(any_string_dtype):
  617. ser = Series([], dtype=any_string_dtype)
  618. repl = lambda m: m.group(0).swapcase()
  619. msg = "Cannot use a callable replacement when regex=False"
  620. with pytest.raises(ValueError, match=msg):
  621. ser.str.replace("abc", repl, regex=False)
  622. def test_replace_literal_compiled_raises(any_string_dtype):
  623. ser = Series([], dtype=any_string_dtype)
  624. pat = re.compile("[a-z][A-Z]{2}")
  625. msg = "Cannot use a compiled regex as replacement pattern with regex=False"
  626. with pytest.raises(ValueError, match=msg):
  627. ser.str.replace(pat, "", regex=False)
  628. def test_replace_moar(any_string_dtype):
  629. # PR #1179
  630. ser = Series(
  631. ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
  632. dtype=any_string_dtype,
  633. )
  634. result = ser.str.replace("A", "YYY")
  635. expected = Series(
  636. ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"],
  637. dtype=any_string_dtype,
  638. )
  639. tm.assert_series_equal(result, expected)
  640. result = ser.str.replace("A", "YYY", case=False)
  641. expected = Series(
  642. [
  643. "YYY",
  644. "B",
  645. "C",
  646. "YYYYYYbYYY",
  647. "BYYYcYYY",
  648. "",
  649. np.nan,
  650. "CYYYBYYY",
  651. "dog",
  652. "cYYYt",
  653. ],
  654. dtype=any_string_dtype,
  655. )
  656. tm.assert_series_equal(result, expected)
  657. result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True)
  658. expected = Series(
  659. [
  660. "A",
  661. "B",
  662. "C",
  663. "XX-XX ba",
  664. "XX-XX ca",
  665. "",
  666. np.nan,
  667. "XX-XX BA",
  668. "XX-XX ",
  669. "XX-XX t",
  670. ],
  671. dtype=any_string_dtype,
  672. )
  673. tm.assert_series_equal(result, expected)
  674. def test_replace_not_case_sensitive_not_regex(any_string_dtype):
  675. # https://github.com/pandas-dev/pandas/issues/41602
  676. ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype)
  677. result = ser.str.replace("a", "c", case=False, regex=False)
  678. expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype)
  679. tm.assert_series_equal(result, expected)
  680. result = ser.str.replace("a.", "c.", case=False, regex=False)
  681. expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype)
  682. tm.assert_series_equal(result, expected)
  683. def test_replace_regex(any_string_dtype):
  684. # https://github.com/pandas-dev/pandas/pull/24809
  685. s = Series(["a", "b", "ac", np.nan, ""], dtype=any_string_dtype)
  686. result = s.str.replace("^.$", "a", regex=True)
  687. expected = Series(["a", "a", "ac", np.nan, ""], dtype=any_string_dtype)
  688. tm.assert_series_equal(result, expected)
  689. @pytest.mark.parametrize("regex", [True, False])
  690. def test_replace_regex_single_character(regex, any_string_dtype):
  691. # https://github.com/pandas-dev/pandas/pull/24809, enforced in 2.0
  692. # GH 24804
  693. s = Series(["a.b", ".", "b", np.nan, ""], dtype=any_string_dtype)
  694. result = s.str.replace(".", "a", regex=regex)
  695. if regex:
  696. expected = Series(["aaa", "a", "a", np.nan, ""], dtype=any_string_dtype)
  697. else:
  698. expected = Series(["aab", "a", "b", np.nan, ""], dtype=any_string_dtype)
  699. tm.assert_series_equal(result, expected)
  700. # --------------------------------------------------------------------------------------
  701. # str.match
  702. # --------------------------------------------------------------------------------------
  703. def test_match(any_string_dtype):
  704. if any_string_dtype == "str":
  705. # NaN propagates as False
  706. expected_dtype = bool
  707. na_value = False
  708. else:
  709. expected_dtype = (
  710. "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  711. )
  712. na_value = np.nan
  713. values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
  714. result = values.str.match(".*(BAD[_]+).*(BAD)")
  715. expected = Series([True, na_value, False], dtype=expected_dtype)
  716. tm.assert_series_equal(result, expected)
  717. values = Series(
  718. ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
  719. )
  720. result = values.str.match(".*BAD[_]+.*BAD")
  721. expected = Series([True, True, na_value, False], dtype=expected_dtype)
  722. tm.assert_series_equal(result, expected)
  723. result = values.str.match("BAD[_]+.*BAD")
  724. expected = Series([False, True, na_value, False], dtype=expected_dtype)
  725. tm.assert_series_equal(result, expected)
  726. values = Series(
  727. ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
  728. )
  729. result = values.str.match("^BAD[_]+.*BAD")
  730. expected = Series([False, False, na_value, False], dtype=expected_dtype)
  731. tm.assert_series_equal(result, expected)
  732. result = values.str.match("\\^BAD[_]+.*BAD")
  733. expected = Series([False, True, na_value, False], dtype=expected_dtype)
  734. tm.assert_series_equal(result, expected)
  735. def test_match_mixed_object():
  736. mixed = Series(
  737. [
  738. "aBAD_BAD",
  739. np.nan,
  740. "BAD_b_BAD",
  741. True,
  742. datetime.today(),
  743. "foo",
  744. None,
  745. 1,
  746. 2.0,
  747. ]
  748. )
  749. result = Series(mixed).str.match(".*(BAD[_]+).*(BAD)")
  750. expected = Series([True, np.nan, True, np.nan, np.nan, False, None, np.nan, np.nan])
  751. assert isinstance(result, Series)
  752. tm.assert_series_equal(result, expected)
  753. def test_match_na_kwarg(any_string_dtype):
  754. # GH #6609
  755. s = Series(["a", "b", np.nan], dtype=any_string_dtype)
  756. result = s.str.match("a", na=False)
  757. expected_dtype = (
  758. np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  759. )
  760. expected = Series([True, False, False], dtype=expected_dtype)
  761. tm.assert_series_equal(result, expected)
  762. result = s.str.match("a")
  763. if any_string_dtype == "str":
  764. # NaN propagates as False
  765. expected_dtype = bool
  766. na_value = False
  767. else:
  768. expected_dtype = (
  769. "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  770. )
  771. na_value = np.nan
  772. expected = Series([True, False, na_value], dtype=expected_dtype)
  773. tm.assert_series_equal(result, expected)
  774. def test_match_case_kwarg(any_string_dtype):
  775. values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
  776. result = values.str.match("ab", case=False)
  777. expected_dtype = (
  778. np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  779. )
  780. expected = Series([True, True, True, True], dtype=expected_dtype)
  781. tm.assert_series_equal(result, expected)
  782. def test_match_compiled_regex(any_string_dtype):
  783. # GH#61952
  784. expected_dtype = (
  785. np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  786. )
  787. values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
  788. result = values.str.match(re.compile("ab"))
  789. expected = Series([True, False, True, False], dtype=expected_dtype)
  790. tm.assert_series_equal(result, expected)
  791. # TODO this currently works for pyarrow-backed dtypes but raises for python
  792. if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
  793. result = values.str.match(re.compile("ab"), case=False)
  794. expected = Series([True, True, True, True], dtype=expected_dtype)
  795. tm.assert_series_equal(result, expected)
  796. else:
  797. with pytest.raises(
  798. ValueError, match="cannot process flags argument with a compiled pattern"
  799. ):
  800. values.str.match(re.compile("ab"), case=False)
  801. result = values.str.match(re.compile("ab", flags=re.IGNORECASE))
  802. expected = Series([True, True, True, True], dtype=expected_dtype)
  803. tm.assert_series_equal(result, expected)
  804. with pytest.raises(
  805. ValueError, match="cannot process flags argument with a compiled pattern"
  806. ):
  807. values.str.match(re.compile("ab"), flags=re.IGNORECASE)
  808. @pytest.mark.parametrize(
  809. "pat, case, exp",
  810. [
  811. ["ab", False, [True, False]],
  812. ["Ab", True, [False, False]],
  813. ["bc", True, [False, False]],
  814. ["a[a-z]{1}", False, [True, False]],
  815. ["A[a-z]{1}", True, [False, False]],
  816. # https://github.com/pandas-dev/pandas/issues/61072
  817. ["(bc)|(ab)", True, [True, False]],
  818. ["((bc)|(ab))", True, [True, False]],
  819. ],
  820. )
  821. def test_str_match_extra_cases(any_string_dtype, pat, case, exp):
  822. ser = Series(["abc", "Xab"], dtype=any_string_dtype)
  823. result = ser.str.match(pat, case=case)
  824. expected_dtype = (
  825. np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  826. )
  827. expected = Series(exp, dtype=expected_dtype)
  828. tm.assert_series_equal(result, expected)
  829. # --------------------------------------------------------------------------------------
  830. # str.fullmatch
  831. # --------------------------------------------------------------------------------------
  832. def test_fullmatch(any_string_dtype):
  833. # GH 32806
  834. ser = Series(
  835. ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
  836. )
  837. result = ser.str.fullmatch(".*BAD[_]+.*BAD")
  838. if any_string_dtype == "str":
  839. # NaN propagates as False
  840. expected = Series([True, False, False, False], dtype=bool)
  841. else:
  842. expected_dtype = (
  843. "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  844. )
  845. expected = Series([True, False, np.nan, False], dtype=expected_dtype)
  846. tm.assert_series_equal(result, expected)
  847. def test_fullmatch_dollar_literal(any_string_dtype):
  848. # GH 56652
  849. ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype)
  850. result = ser.str.fullmatch("foo\\$")
  851. if any_string_dtype == "str":
  852. # NaN propagates as False
  853. expected = Series([False, False, False, True], dtype=bool)
  854. else:
  855. expected_dtype = (
  856. "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  857. )
  858. expected = Series([False, False, np.nan, True], dtype=expected_dtype)
  859. tm.assert_series_equal(result, expected)
  860. def test_fullmatch_na_kwarg(any_string_dtype):
  861. ser = Series(
  862. ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
  863. )
  864. result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False)
  865. expected_dtype = (
  866. np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  867. )
  868. expected = Series([True, False, False, False], dtype=expected_dtype)
  869. tm.assert_series_equal(result, expected)
  870. def test_fullmatch_case_kwarg(any_string_dtype):
  871. ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
  872. expected_dtype = (
  873. np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  874. )
  875. expected = Series([True, False, False, False], dtype=expected_dtype)
  876. result = ser.str.fullmatch("ab", case=True)
  877. tm.assert_series_equal(result, expected)
  878. expected = Series([True, True, False, False], dtype=expected_dtype)
  879. result = ser.str.fullmatch("ab", case=False)
  880. tm.assert_series_equal(result, expected)
  881. result = ser.str.fullmatch("ab", flags=re.IGNORECASE)
  882. tm.assert_series_equal(result, expected)
  883. def test_fullmatch_compiled_regex(any_string_dtype):
  884. # GH#61952
  885. expected_dtype = (
  886. np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  887. )
  888. values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
  889. result = values.str.fullmatch(re.compile("ab"))
  890. expected = Series([True, False, False, False], dtype=expected_dtype)
  891. tm.assert_series_equal(result, expected)
  892. # TODO this currently works for pyarrow-backed dtypes but raises for python
  893. if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
  894. result = values.str.fullmatch(re.compile("ab"), case=False)
  895. expected = Series([True, True, False, False], dtype=expected_dtype)
  896. tm.assert_series_equal(result, expected)
  897. else:
  898. with pytest.raises(
  899. ValueError, match="cannot process flags argument with a compiled pattern"
  900. ):
  901. values.str.fullmatch(re.compile("ab"), case=False)
  902. result = values.str.fullmatch(re.compile("ab", flags=re.IGNORECASE))
  903. expected = Series([True, True, False, False], dtype=expected_dtype)
  904. tm.assert_series_equal(result, expected)
  905. with pytest.raises(
  906. ValueError, match="cannot process flags argument with a compiled pattern"
  907. ):
  908. values.str.fullmatch(re.compile("ab"), flags=re.IGNORECASE)
  909. @pytest.mark.parametrize(
  910. "pat, case, na, exp",
  911. # Note: keep cases in sync with
  912. # pandas/tests/extension/test_arrow.py::test_str_fullmatch
  913. [
  914. ["abc", False, None, [True, False, False, None]],
  915. ["Abc", True, None, [False, False, False, None]],
  916. ["bc", True, None, [False, False, False, None]],
  917. ["ab", False, None, [False, False, False, None]],
  918. ["a[a-z]{2}", False, None, [True, False, False, None]],
  919. ["A[a-z]{1}", True, None, [False, False, False, None]],
  920. # GH Issue: #56652
  921. ["abc$", False, None, [True, False, False, None]],
  922. ["abc\\$", False, None, [False, True, False, None]],
  923. ["Abc$", True, None, [False, False, False, None]],
  924. ["Abc\\$", True, None, [False, False, False, None]],
  925. # https://github.com/pandas-dev/pandas/issues/61072
  926. ["(abc)|(abx)", True, None, [True, False, False, None]],
  927. ["((abc)|(abx))", True, None, [True, False, False, None]],
  928. ],
  929. )
  930. def test_str_fullmatch_extra_cases(any_string_dtype, pat, case, na, exp):
  931. ser = Series(["abc", "abc$", "$abc", None], dtype=any_string_dtype)
  932. result = ser.str.fullmatch(pat, case=case, na=na)
  933. if any_string_dtype == "str":
  934. # NaN propagates as False
  935. exp[-1] = False
  936. expected_dtype = bool
  937. else:
  938. expected_dtype = (
  939. "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
  940. )
  941. expected = Series(exp, dtype=expected_dtype)
  942. tm.assert_series_equal(result, expected)
  943. # --------------------------------------------------------------------------------------
  944. # str.findall
  945. # --------------------------------------------------------------------------------------
  946. def test_findall(any_string_dtype):
  947. ser = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"], dtype=any_string_dtype)
  948. result = ser.str.findall("BAD[_]*")
  949. expected = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]])
  950. expected = _convert_na_value(ser, expected)
  951. tm.assert_series_equal(result, expected)
  952. def test_findall_mixed_object():
  953. ser = Series(
  954. [
  955. "fooBAD__barBAD",
  956. np.nan,
  957. "foo",
  958. True,
  959. datetime.today(),
  960. "BAD",
  961. None,
  962. 1,
  963. 2.0,
  964. ]
  965. )
  966. result = ser.str.findall("BAD[_]*")
  967. expected = Series(
  968. [
  969. ["BAD__", "BAD"],
  970. np.nan,
  971. [],
  972. np.nan,
  973. np.nan,
  974. ["BAD"],
  975. None,
  976. np.nan,
  977. np.nan,
  978. ]
  979. )
  980. tm.assert_series_equal(result, expected)
  981. # --------------------------------------------------------------------------------------
  982. # str.find
  983. # --------------------------------------------------------------------------------------
  984. def test_find(any_string_dtype):
  985. ser = Series(
  986. ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype
  987. )
  988. expected_dtype = (
  989. np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
  990. )
  991. result = ser.str.find("EF")
  992. expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype)
  993. tm.assert_series_equal(result, expected)
  994. expected = np.array([v.find("EF") for v in np.array(ser)], dtype=np.int64)
  995. tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)
  996. result = ser.str.rfind("EF")
  997. expected = Series([4, 5, 7, 4, -1], dtype=expected_dtype)
  998. tm.assert_series_equal(result, expected)
  999. expected = np.array([v.rfind("EF") for v in np.array(ser)], dtype=np.int64)
  1000. tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)
  1001. result = ser.str.find("EF", 3)
  1002. expected = Series([4, 3, 7, 4, -1], dtype=expected_dtype)
  1003. tm.assert_series_equal(result, expected)
  1004. expected = np.array([v.find("EF", 3) for v in np.array(ser)], dtype=np.int64)
  1005. tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)
  1006. result = ser.str.rfind("EF", 3)
  1007. expected = Series([4, 5, 7, 4, -1], dtype=expected_dtype)
  1008. tm.assert_series_equal(result, expected)
  1009. expected = np.array([v.rfind("EF", 3) for v in np.array(ser)], dtype=np.int64)
  1010. tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)
  1011. result = ser.str.find("EF", 3, 6)
  1012. expected = Series([4, 3, -1, 4, -1], dtype=expected_dtype)
  1013. tm.assert_series_equal(result, expected)
  1014. expected = np.array([v.find("EF", 3, 6) for v in np.array(ser)], dtype=np.int64)
  1015. tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)
  1016. result = ser.str.rfind("EF", 3, 6)
  1017. expected = Series([4, 3, -1, 4, -1], dtype=expected_dtype)
  1018. tm.assert_series_equal(result, expected)
  1019. expected = np.array([v.rfind("EF", 3, 6) for v in np.array(ser)], dtype=np.int64)
  1020. tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)
  1021. def test_find_bad_arg_raises(any_string_dtype):
  1022. ser = Series([], dtype=any_string_dtype)
  1023. with pytest.raises(TypeError, match="expected a string object, not int"):
  1024. ser.str.find(0)
  1025. with pytest.raises(TypeError, match="expected a string object, not int"):
  1026. ser.str.rfind(0)
  1027. def test_find_nan(any_string_dtype):
  1028. ser = Series(
  1029. ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype
  1030. )
  1031. expected_dtype = (
  1032. np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
  1033. )
  1034. result = ser.str.find("EF")
  1035. expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype)
  1036. tm.assert_series_equal(result, expected)
  1037. result = ser.str.rfind("EF")
  1038. expected = Series([4, np.nan, 7, np.nan, -1], dtype=expected_dtype)
  1039. tm.assert_series_equal(result, expected)
  1040. result = ser.str.find("EF", 3)
  1041. expected = Series([4, np.nan, 7, np.nan, -1], dtype=expected_dtype)
  1042. tm.assert_series_equal(result, expected)
  1043. result = ser.str.rfind("EF", 3)
  1044. expected = Series([4, np.nan, 7, np.nan, -1], dtype=expected_dtype)
  1045. tm.assert_series_equal(result, expected)
  1046. result = ser.str.find("EF", 3, 6)
  1047. expected = Series([4, np.nan, -1, np.nan, -1], dtype=expected_dtype)
  1048. tm.assert_series_equal(result, expected)
  1049. result = ser.str.rfind("EF", 3, 6)
  1050. expected = Series([4, np.nan, -1, np.nan, -1], dtype=expected_dtype)
  1051. tm.assert_series_equal(result, expected)
  1052. # --------------------------------------------------------------------------------------
  1053. # str.translate
  1054. # --------------------------------------------------------------------------------------
  1055. @pytest.mark.parametrize(
  1056. "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
  1057. )
  1058. def test_translate(index_or_series, any_string_dtype, infer_string):
  1059. obj = index_or_series(
  1060. ["abcdefg", "abcc", "cdddfg", "cdefggg"], dtype=any_string_dtype
  1061. )
  1062. table = str.maketrans("abc", "cde")
  1063. result = obj.str.translate(table)
  1064. expected = index_or_series(
  1065. ["cdedefg", "cdee", "edddfg", "edefggg"], dtype=any_string_dtype
  1066. )
  1067. tm.assert_equal(result, expected)
  1068. def test_translate_mixed_object():
  1069. # Series with non-string values
  1070. s = Series(["a", "b", "c", 1.2])
  1071. table = str.maketrans("abc", "cde")
  1072. expected = Series(["c", "d", "e", np.nan], dtype=object)
  1073. result = s.str.translate(table)
  1074. tm.assert_series_equal(result, expected)
  1075. # --------------------------------------------------------------------------------------
  1076. def test_flags_kwarg(any_string_dtype):
  1077. data = {
  1078. "Dave": "dave@google.com",
  1079. "Steve": "steve@gmail.com",
  1080. "Rob": "rob@gmail.com",
  1081. "Wes": np.nan,
  1082. }
  1083. data = Series(data, dtype=any_string_dtype)
  1084. pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
  1085. result = data.str.extract(pat, flags=re.IGNORECASE, expand=True)
  1086. assert result.iloc[0].tolist() == ["dave", "google", "com"]
  1087. result = data.str.match(pat, flags=re.IGNORECASE)
  1088. assert result.iloc[0]
  1089. result = data.str.fullmatch(pat, flags=re.IGNORECASE)
  1090. assert result.iloc[0]
  1091. result = data.str.findall(pat, flags=re.IGNORECASE)
  1092. assert result.iloc[0][0] == ("dave", "google", "com")
  1093. result = data.str.count(pat, flags=re.IGNORECASE)
  1094. assert result.iloc[0] == 1
  1095. msg = "has match groups"
  1096. with tm.assert_produces_warning(UserWarning, match=msg):
  1097. result = data.str.contains(pat, flags=re.IGNORECASE)
  1098. assert result.iloc[0]