test_extract.py 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724
  1. from datetime import datetime
  2. import re
  3. import numpy as np
  4. import pytest
  5. from pandas.core.dtypes.dtypes import ArrowDtype
  6. from pandas import (
  7. DataFrame,
  8. Index,
  9. MultiIndex,
  10. Series,
  11. _testing as tm,
  12. )
  13. def test_extract_expand_kwarg_wrong_type_raises(any_string_dtype):
  14. # TODO: should this raise TypeError
  15. values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
  16. with pytest.raises(ValueError, match="expand must be True or False"):
  17. values.str.extract(".*(BAD[_]+).*(BAD)", expand=None)
  18. def test_extract_expand_kwarg(any_string_dtype):
  19. s = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
  20. expected = DataFrame(["BAD__", np.nan, np.nan], dtype=any_string_dtype)
  21. result = s.str.extract(".*(BAD[_]+).*")
  22. tm.assert_frame_equal(result, expected)
  23. result = s.str.extract(".*(BAD[_]+).*", expand=True)
  24. tm.assert_frame_equal(result, expected)
  25. expected = DataFrame(
  26. [["BAD__", "BAD"], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
  27. )
  28. result = s.str.extract(".*(BAD[_]+).*(BAD)", expand=False)
  29. tm.assert_frame_equal(result, expected)
  30. def test_extract_expand_False_mixed_object():
  31. ser = Series(
  32. ["aBAD_BAD", np.nan, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0]
  33. )
  34. # two groups
  35. result = ser.str.extract(".*(BAD[_]+).*(BAD)", expand=False)
  36. er = [np.nan, np.nan] # empty row
  37. expected = DataFrame(
  38. [["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er], dtype=object
  39. )
  40. tm.assert_frame_equal(result, expected)
  41. # single group
  42. result = ser.str.extract(".*(BAD[_]+).*BAD", expand=False)
  43. expected = Series(
  44. ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan],
  45. dtype=object,
  46. )
  47. tm.assert_series_equal(result, expected)
  48. def test_extract_expand_index_raises():
  49. # GH9980
  50. # Index only works with one regex group since
  51. # multi-group would expand to a frame
  52. idx = Index(["A1", "A2", "A3", "A4", "B5"])
  53. msg = "only one regex group is supported with Index"
  54. with pytest.raises(ValueError, match=msg):
  55. idx.str.extract("([AB])([123])", expand=False)
  56. def test_extract_expand_no_capture_groups_raises(index_or_series, any_string_dtype):
  57. s_or_idx = index_or_series(["A1", "B2", "C3"], dtype=any_string_dtype)
  58. msg = "pattern contains no capture groups"
  59. # no groups
  60. with pytest.raises(ValueError, match=msg):
  61. s_or_idx.str.extract("[ABC][123]", expand=False)
  62. # only non-capturing groups
  63. with pytest.raises(ValueError, match=msg):
  64. s_or_idx.str.extract("(?:[AB]).*", expand=False)
  65. def test_extract_expand_single_capture_group(index_or_series, any_string_dtype):
  66. # single group renames series/index properly
  67. s_or_idx = index_or_series(["A1", "A2"], dtype=any_string_dtype)
  68. result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=False)
  69. expected = index_or_series(["A", "A"], name="uno", dtype=any_string_dtype)
  70. if index_or_series == Series:
  71. tm.assert_series_equal(result, expected)
  72. else:
  73. tm.assert_index_equal(result, expected)
  74. def test_extract_expand_capture_groups(any_string_dtype):
  75. s = Series(["A1", "B2", "C3"], dtype=any_string_dtype)
  76. # one group, no matches
  77. result = s.str.extract("(_)", expand=False)
  78. expected = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype)
  79. tm.assert_series_equal(result, expected)
  80. # two groups, no matches
  81. result = s.str.extract("(_)(_)", expand=False)
  82. expected = DataFrame(
  83. [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
  84. )
  85. tm.assert_frame_equal(result, expected)
  86. # one group, some matches
  87. result = s.str.extract("([AB])[123]", expand=False)
  88. expected = Series(["A", "B", np.nan], dtype=any_string_dtype)
  89. tm.assert_series_equal(result, expected)
  90. # two groups, some matches
  91. result = s.str.extract("([AB])([123])", expand=False)
  92. expected = DataFrame(
  93. [["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
  94. )
  95. tm.assert_frame_equal(result, expected)
  96. # one named group
  97. result = s.str.extract("(?P<letter>[AB])", expand=False)
  98. expected = Series(["A", "B", np.nan], name="letter", dtype=any_string_dtype)
  99. tm.assert_series_equal(result, expected)
  100. # two named groups
  101. result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=False)
  102. expected = DataFrame(
  103. [["A", "1"], ["B", "2"], [np.nan, np.nan]],
  104. columns=["letter", "number"],
  105. dtype=any_string_dtype,
  106. )
  107. tm.assert_frame_equal(result, expected)
  108. # mix named and unnamed groups
  109. result = s.str.extract("([AB])(?P<number>[123])", expand=False)
  110. expected = DataFrame(
  111. [["A", "1"], ["B", "2"], [np.nan, np.nan]],
  112. columns=[0, "number"],
  113. dtype=any_string_dtype,
  114. )
  115. tm.assert_frame_equal(result, expected)
  116. # one normal group, one non-capturing group
  117. result = s.str.extract("([AB])(?:[123])", expand=False)
  118. expected = Series(["A", "B", np.nan], dtype=any_string_dtype)
  119. tm.assert_series_equal(result, expected)
  120. # two normal groups, one non-capturing group
  121. s = Series(["A11", "B22", "C33"], dtype=any_string_dtype)
  122. result = s.str.extract("([AB])([123])(?:[123])", expand=False)
  123. expected = DataFrame(
  124. [["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
  125. )
  126. tm.assert_frame_equal(result, expected)
  127. # one optional group followed by one normal group
  128. s = Series(["A1", "B2", "3"], dtype=any_string_dtype)
  129. result = s.str.extract("(?P<letter>[AB])?(?P<number>[123])", expand=False)
  130. expected = DataFrame(
  131. [["A", "1"], ["B", "2"], [np.nan, "3"]],
  132. columns=["letter", "number"],
  133. dtype=any_string_dtype,
  134. )
  135. tm.assert_frame_equal(result, expected)
  136. # one normal group followed by one optional group
  137. s = Series(["A1", "B2", "C"], dtype=any_string_dtype)
  138. result = s.str.extract("(?P<letter>[ABC])(?P<number>[123])?", expand=False)
  139. expected = DataFrame(
  140. [["A", "1"], ["B", "2"], ["C", np.nan]],
  141. columns=["letter", "number"],
  142. dtype=any_string_dtype,
  143. )
  144. tm.assert_frame_equal(result, expected)
  145. def test_extract_expand_capture_groups_index(index, any_string_dtype):
  146. # https://github.com/pandas-dev/pandas/issues/6348
  147. # not passing index to the extractor
  148. data = ["A1", "B2", "C"]
  149. if len(index) == 0:
  150. pytest.skip("Test requires len(index) > 0")
  151. while len(index) < len(data):
  152. index = index.repeat(2)
  153. index = index[: len(data)]
  154. ser = Series(data, index=index, dtype=any_string_dtype)
  155. result = ser.str.extract(r"(\d)", expand=False)
  156. expected = Series(["1", "2", np.nan], index=index, dtype=any_string_dtype)
  157. tm.assert_series_equal(result, expected)
  158. result = ser.str.extract(r"(?P<letter>\D)(?P<number>\d)?", expand=False)
  159. expected = DataFrame(
  160. [["A", "1"], ["B", "2"], ["C", np.nan]],
  161. columns=["letter", "number"],
  162. index=index,
  163. dtype=any_string_dtype,
  164. )
  165. tm.assert_frame_equal(result, expected)
  166. def test_extract_single_series_name_is_preserved(any_string_dtype):
  167. s = Series(["a3", "b3", "c2"], name="bob", dtype=any_string_dtype)
  168. result = s.str.extract(r"(?P<sue>[a-z])", expand=False)
  169. expected = Series(["a", "b", "c"], name="sue", dtype=any_string_dtype)
  170. tm.assert_series_equal(result, expected)
  171. def test_extract_expand_True(any_string_dtype):
  172. # Contains tests like those in test_match and some others.
  173. s = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
  174. result = s.str.extract(".*(BAD[_]+).*(BAD)", expand=True)
  175. expected = DataFrame(
  176. [["BAD__", "BAD"], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
  177. )
  178. tm.assert_frame_equal(result, expected)
  179. def test_extract_expand_True_mixed_object():
  180. er = [np.nan, np.nan] # empty row
  181. mixed = Series(
  182. [
  183. "aBAD_BAD",
  184. np.nan,
  185. "BAD_b_BAD",
  186. True,
  187. datetime.today(),
  188. "foo",
  189. None,
  190. 1,
  191. 2.0,
  192. ]
  193. )
  194. result = mixed.str.extract(".*(BAD[_]+).*(BAD)", expand=True)
  195. expected = DataFrame(
  196. [["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er], dtype=object
  197. )
  198. tm.assert_frame_equal(result, expected)
  199. def test_extract_expand_True_single_capture_group_raises(
  200. index_or_series, any_string_dtype
  201. ):
  202. # these should work for both Series and Index
  203. # no groups
  204. s_or_idx = index_or_series(["A1", "B2", "C3"], dtype=any_string_dtype)
  205. msg = "pattern contains no capture groups"
  206. with pytest.raises(ValueError, match=msg):
  207. s_or_idx.str.extract("[ABC][123]", expand=True)
  208. # only non-capturing groups
  209. with pytest.raises(ValueError, match=msg):
  210. s_or_idx.str.extract("(?:[AB]).*", expand=True)
  211. def test_extract_expand_True_single_capture_group(index_or_series, any_string_dtype):
  212. # single group renames series/index properly
  213. s_or_idx = index_or_series(["A1", "A2"], dtype=any_string_dtype)
  214. result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=True)
  215. expected = DataFrame({"uno": ["A", "A"]}, dtype=any_string_dtype)
  216. tm.assert_frame_equal(result, expected)
  217. @pytest.mark.parametrize("name", [None, "series_name"])
  218. def test_extract_series(name, any_string_dtype):
  219. # extract should give the same result whether or not the series has a name.
  220. s = Series(["A1", "B2", "C3"], name=name, dtype=any_string_dtype)
  221. # one group, no matches
  222. result = s.str.extract("(_)", expand=True)
  223. expected = DataFrame([np.nan, np.nan, np.nan], dtype=any_string_dtype)
  224. tm.assert_frame_equal(result, expected)
  225. # two groups, no matches
  226. result = s.str.extract("(_)(_)", expand=True)
  227. expected = DataFrame(
  228. [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
  229. )
  230. tm.assert_frame_equal(result, expected)
  231. # one group, some matches
  232. result = s.str.extract("([AB])[123]", expand=True)
  233. expected = DataFrame(["A", "B", np.nan], dtype=any_string_dtype)
  234. tm.assert_frame_equal(result, expected)
  235. # two groups, some matches
  236. result = s.str.extract("([AB])([123])", expand=True)
  237. expected = DataFrame(
  238. [["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
  239. )
  240. tm.assert_frame_equal(result, expected)
  241. # one named group
  242. result = s.str.extract("(?P<letter>[AB])", expand=True)
  243. expected = DataFrame({"letter": ["A", "B", np.nan]}, dtype=any_string_dtype)
  244. tm.assert_frame_equal(result, expected)
  245. # two named groups
  246. result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=True)
  247. expected = DataFrame(
  248. [["A", "1"], ["B", "2"], [np.nan, np.nan]],
  249. columns=["letter", "number"],
  250. dtype=any_string_dtype,
  251. )
  252. tm.assert_frame_equal(result, expected)
  253. # mix named and unnamed groups
  254. result = s.str.extract("([AB])(?P<number>[123])", expand=True)
  255. expected = DataFrame(
  256. [["A", "1"], ["B", "2"], [np.nan, np.nan]],
  257. columns=[0, "number"],
  258. dtype=any_string_dtype,
  259. )
  260. tm.assert_frame_equal(result, expected)
  261. # one normal group, one non-capturing group
  262. result = s.str.extract("([AB])(?:[123])", expand=True)
  263. expected = DataFrame(["A", "B", np.nan], dtype=any_string_dtype)
  264. tm.assert_frame_equal(result, expected)
  265. def test_extract_optional_groups(any_string_dtype):
  266. # two normal groups, one non-capturing group
  267. s = Series(["A11", "B22", "C33"], dtype=any_string_dtype)
  268. result = s.str.extract("([AB])([123])(?:[123])", expand=True)
  269. expected = DataFrame(
  270. [["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
  271. )
  272. tm.assert_frame_equal(result, expected)
  273. # one optional group followed by one normal group
  274. s = Series(["A1", "B2", "3"], dtype=any_string_dtype)
  275. result = s.str.extract("(?P<letter>[AB])?(?P<number>[123])", expand=True)
  276. expected = DataFrame(
  277. [["A", "1"], ["B", "2"], [np.nan, "3"]],
  278. columns=["letter", "number"],
  279. dtype=any_string_dtype,
  280. )
  281. tm.assert_frame_equal(result, expected)
  282. # one normal group followed by one optional group
  283. s = Series(["A1", "B2", "C"], dtype=any_string_dtype)
  284. result = s.str.extract("(?P<letter>[ABC])(?P<number>[123])?", expand=True)
  285. expected = DataFrame(
  286. [["A", "1"], ["B", "2"], ["C", np.nan]],
  287. columns=["letter", "number"],
  288. dtype=any_string_dtype,
  289. )
  290. tm.assert_frame_equal(result, expected)
  291. def test_extract_dataframe_capture_groups_index(index, any_string_dtype):
  292. # GH6348
  293. # not passing index to the extractor
  294. data = ["A1", "B2", "C"]
  295. if len(index) < len(data):
  296. pytest.skip(f"Index needs more than {len(data)} values")
  297. index = index[: len(data)]
  298. s = Series(data, index=index, dtype=any_string_dtype)
  299. result = s.str.extract(r"(\d)", expand=True)
  300. expected = DataFrame(["1", "2", np.nan], index=index, dtype=any_string_dtype)
  301. tm.assert_frame_equal(result, expected)
  302. result = s.str.extract(r"(?P<letter>\D)(?P<number>\d)?", expand=True)
  303. expected = DataFrame(
  304. [["A", "1"], ["B", "2"], ["C", np.nan]],
  305. columns=["letter", "number"],
  306. index=index,
  307. dtype=any_string_dtype,
  308. )
  309. tm.assert_frame_equal(result, expected)
  310. def test_extract_single_group_returns_frame(any_string_dtype):
  311. # GH11386 extract should always return DataFrame, even when
  312. # there is only one group. Prior to v0.18.0, extract returned
  313. # Series when there was only one group in the regex.
  314. s = Series(["a3", "b3", "c2"], name="series_name", dtype=any_string_dtype)
  315. result = s.str.extract(r"(?P<letter>[a-z])", expand=True)
  316. expected = DataFrame({"letter": ["a", "b", "c"]}, dtype=any_string_dtype)
  317. tm.assert_frame_equal(result, expected)
  318. def test_extractall(any_string_dtype):
  319. data = [
  320. "dave@google.com",
  321. "tdhock5@gmail.com",
  322. "maudelaperriere@gmail.com",
  323. "rob@gmail.com some text steve@gmail.com",
  324. "a@b.com some text c@d.com and e@f.com",
  325. np.nan,
  326. "",
  327. ]
  328. expected_tuples = [
  329. ("dave", "google", "com"),
  330. ("tdhock5", "gmail", "com"),
  331. ("maudelaperriere", "gmail", "com"),
  332. ("rob", "gmail", "com"),
  333. ("steve", "gmail", "com"),
  334. ("a", "b", "com"),
  335. ("c", "d", "com"),
  336. ("e", "f", "com"),
  337. ]
  338. pat = r"""
  339. (?P<user>[a-z0-9]+)
  340. @
  341. (?P<domain>[a-z]+)
  342. \.
  343. (?P<tld>[a-z]{2,4})
  344. """
  345. expected_columns = ["user", "domain", "tld"]
  346. s = Series(data, dtype=any_string_dtype)
  347. # extractall should return a DataFrame with one row for each match, indexed by the
  348. # subject from which the match came.
  349. expected_index = MultiIndex.from_tuples(
  350. [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0), (4, 1), (4, 2)],
  351. names=(None, "match"),
  352. )
  353. expected = DataFrame(
  354. expected_tuples, expected_index, expected_columns, dtype=any_string_dtype
  355. )
  356. result = s.str.extractall(pat, flags=re.VERBOSE)
  357. tm.assert_frame_equal(result, expected)
  358. # The index of the input Series should be used to construct the index of the output
  359. # DataFrame:
  360. mi = MultiIndex.from_tuples(
  361. [
  362. ("single", "Dave"),
  363. ("single", "Toby"),
  364. ("single", "Maude"),
  365. ("multiple", "robAndSteve"),
  366. ("multiple", "abcdef"),
  367. ("none", "missing"),
  368. ("none", "empty"),
  369. ]
  370. )
  371. s = Series(data, index=mi, dtype=any_string_dtype)
  372. expected_index = MultiIndex.from_tuples(
  373. [
  374. ("single", "Dave", 0),
  375. ("single", "Toby", 0),
  376. ("single", "Maude", 0),
  377. ("multiple", "robAndSteve", 0),
  378. ("multiple", "robAndSteve", 1),
  379. ("multiple", "abcdef", 0),
  380. ("multiple", "abcdef", 1),
  381. ("multiple", "abcdef", 2),
  382. ],
  383. names=(None, None, "match"),
  384. )
  385. expected = DataFrame(
  386. expected_tuples, expected_index, expected_columns, dtype=any_string_dtype
  387. )
  388. result = s.str.extractall(pat, flags=re.VERBOSE)
  389. tm.assert_frame_equal(result, expected)
  390. # MultiIndexed subject with names.
  391. s = Series(data, index=mi, dtype=any_string_dtype)
  392. s.index.names = ("matches", "description")
  393. expected_index.names = ("matches", "description", "match")
  394. expected = DataFrame(
  395. expected_tuples, expected_index, expected_columns, dtype=any_string_dtype
  396. )
  397. result = s.str.extractall(pat, flags=re.VERBOSE)
  398. tm.assert_frame_equal(result, expected)
  399. @pytest.mark.parametrize(
  400. "pat,expected_names",
  401. [
  402. # optional groups.
  403. ("(?P<letter>[AB])?(?P<number>[123])", ["letter", "number"]),
  404. # only one of two groups has a name.
  405. ("([AB])?(?P<number>[123])", [0, "number"]),
  406. ],
  407. )
  408. def test_extractall_column_names(pat, expected_names, any_string_dtype):
  409. s = Series(["", "A1", "32"], dtype=any_string_dtype)
  410. result = s.str.extractall(pat)
  411. expected = DataFrame(
  412. [("A", "1"), (np.nan, "3"), (np.nan, "2")],
  413. index=MultiIndex.from_tuples([(1, 0), (2, 0), (2, 1)], names=(None, "match")),
  414. columns=expected_names,
  415. dtype=any_string_dtype,
  416. )
  417. tm.assert_frame_equal(result, expected)
  418. def test_extractall_single_group(any_string_dtype):
  419. s = Series(["a3", "b3", "d4c2"], name="series_name", dtype=any_string_dtype)
  420. expected_index = MultiIndex.from_tuples(
  421. [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match")
  422. )
  423. # extractall(one named group) returns DataFrame with one named column.
  424. result = s.str.extractall(r"(?P<letter>[a-z])")
  425. expected = DataFrame(
  426. {"letter": ["a", "b", "d", "c"]}, index=expected_index, dtype=any_string_dtype
  427. )
  428. tm.assert_frame_equal(result, expected)
  429. # extractall(one un-named group) returns DataFrame with one un-named column.
  430. result = s.str.extractall(r"([a-z])")
  431. expected = DataFrame(
  432. ["a", "b", "d", "c"], index=expected_index, dtype=any_string_dtype
  433. )
  434. tm.assert_frame_equal(result, expected)
  435. def test_extractall_single_group_with_quantifier(any_string_dtype):
  436. # GH#13382
  437. # extractall(one un-named group with quantifier) returns DataFrame with one un-named
  438. # column.
  439. s = Series(["ab3", "abc3", "d4cd2"], name="series_name", dtype=any_string_dtype)
  440. result = s.str.extractall(r"([a-z]+)")
  441. expected = DataFrame(
  442. ["ab", "abc", "d", "cd"],
  443. index=MultiIndex.from_tuples(
  444. [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match")
  445. ),
  446. dtype=any_string_dtype,
  447. )
  448. tm.assert_frame_equal(result, expected)
  449. @pytest.mark.parametrize(
  450. "data, names",
  451. [
  452. ([], (None,)),
  453. ([], ("i1",)),
  454. ([], (None, "i2")),
  455. ([], ("i1", "i2")),
  456. (["a3", "b3", "d4c2"], (None,)),
  457. (["a3", "b3", "d4c2"], ("i1", "i2")),
  458. (["a3", "b3", "d4c2"], (None, "i2")),
  459. (["a3", "b3", "d4c2"], ("i1", "i2")),
  460. ],
  461. )
  462. def test_extractall_no_matches(data, names, any_string_dtype):
  463. # GH19075 extractall with no matches should return a valid MultiIndex
  464. n = len(data)
  465. if len(names) == 1:
  466. index = Index(range(n), name=names[0])
  467. else:
  468. tuples = (tuple([i] * (n - 1)) for i in range(n))
  469. index = MultiIndex.from_tuples(tuples, names=names)
  470. s = Series(data, name="series_name", index=index, dtype=any_string_dtype)
  471. expected_index = MultiIndex.from_tuples([], names=(names + ("match",)))
  472. # one un-named group.
  473. result = s.str.extractall("(z)")
  474. expected = DataFrame(columns=[0], index=expected_index, dtype=any_string_dtype)
  475. tm.assert_frame_equal(result, expected)
  476. # two un-named groups.
  477. result = s.str.extractall("(z)(z)")
  478. expected = DataFrame(columns=[0, 1], index=expected_index, dtype=any_string_dtype)
  479. tm.assert_frame_equal(result, expected)
  480. # one named group.
  481. result = s.str.extractall("(?P<first>z)")
  482. expected = DataFrame(
  483. columns=["first"], index=expected_index, dtype=any_string_dtype
  484. )
  485. tm.assert_frame_equal(result, expected)
  486. # two named groups.
  487. result = s.str.extractall("(?P<first>z)(?P<second>z)")
  488. expected = DataFrame(
  489. columns=["first", "second"], index=expected_index, dtype=any_string_dtype
  490. )
  491. tm.assert_frame_equal(result, expected)
  492. # one named, one un-named.
  493. result = s.str.extractall("(z)(?P<second>z)")
  494. expected = DataFrame(
  495. columns=[0, "second"], index=expected_index, dtype=any_string_dtype
  496. )
  497. tm.assert_frame_equal(result, expected)
  498. def test_extractall_stringindex(any_string_dtype):
  499. s = Series(["a1a2", "b1", "c1"], name="xxx", dtype=any_string_dtype)
  500. result = s.str.extractall(r"[ab](?P<digit>\d)")
  501. expected = DataFrame(
  502. {"digit": ["1", "2", "1"]},
  503. index=MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)], names=[None, "match"]),
  504. dtype=any_string_dtype,
  505. )
  506. tm.assert_frame_equal(result, expected)
  507. # index should return the same result as the default index without name thus
  508. # index.name doesn't affect to the result
  509. if any_string_dtype == "object":
  510. for idx in [
  511. Index(["a1a2", "b1", "c1"], dtype=object),
  512. Index(["a1a2", "b1", "c1"], name="xxx", dtype=object),
  513. ]:
  514. result = idx.str.extractall(r"[ab](?P<digit>\d)")
  515. tm.assert_frame_equal(result, expected)
  516. s = Series(
  517. ["a1a2", "b1", "c1"],
  518. name="s_name",
  519. index=Index(["XX", "yy", "zz"], name="idx_name"),
  520. dtype=any_string_dtype,
  521. )
  522. result = s.str.extractall(r"[ab](?P<digit>\d)")
  523. expected = DataFrame(
  524. {"digit": ["1", "2", "1"]},
  525. index=MultiIndex.from_tuples(
  526. [("XX", 0), ("XX", 1), ("yy", 0)], names=["idx_name", "match"]
  527. ),
  528. dtype=any_string_dtype,
  529. )
  530. tm.assert_frame_equal(result, expected)
  531. def test_extractall_no_capture_groups_raises(any_string_dtype):
  532. # Does not make sense to use extractall with a regex that has no capture groups.
  533. # (it returns DataFrame with one column for each capture group)
  534. s = Series(["a3", "b3", "d4c2"], name="series_name", dtype=any_string_dtype)
  535. with pytest.raises(ValueError, match="no capture groups"):
  536. s.str.extractall(r"[a-z]")
  537. def test_extract_index_one_two_groups():
  538. s = Series(["a3", "b3", "d4c2"], index=["A3", "B3", "D4"], name="series_name")
  539. r = s.index.str.extract(r"([A-Z])", expand=True)
  540. e = DataFrame(["A", "B", "D"])
  541. tm.assert_frame_equal(r, e)
  542. # Prior to v0.18.0, index.str.extract(regex with one group)
  543. # returned Index. With more than one group, extract raised an
  544. # error (GH9980). Now extract always returns DataFrame.
  545. r = s.index.str.extract(r"(?P<letter>[A-Z])(?P<digit>[0-9])", expand=True)
  546. e_list = [("A", "3"), ("B", "3"), ("D", "4")]
  547. e = DataFrame(e_list, columns=["letter", "digit"])
  548. tm.assert_frame_equal(r, e)
  549. def test_extractall_same_as_extract(any_string_dtype):
  550. s = Series(["a3", "b3", "c2"], name="series_name", dtype=any_string_dtype)
  551. pattern_two_noname = r"([a-z])([0-9])"
  552. extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
  553. has_multi_index = s.str.extractall(pattern_two_noname)
  554. no_multi_index = has_multi_index.xs(0, level="match")
  555. tm.assert_frame_equal(extract_two_noname, no_multi_index)
  556. pattern_two_named = r"(?P<letter>[a-z])(?P<digit>[0-9])"
  557. extract_two_named = s.str.extract(pattern_two_named, expand=True)
  558. has_multi_index = s.str.extractall(pattern_two_named)
  559. no_multi_index = has_multi_index.xs(0, level="match")
  560. tm.assert_frame_equal(extract_two_named, no_multi_index)
  561. pattern_one_named = r"(?P<group_name>[a-z])"
  562. extract_one_named = s.str.extract(pattern_one_named, expand=True)
  563. has_multi_index = s.str.extractall(pattern_one_named)
  564. no_multi_index = has_multi_index.xs(0, level="match")
  565. tm.assert_frame_equal(extract_one_named, no_multi_index)
  566. pattern_one_noname = r"([a-z])"
  567. extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
  568. has_multi_index = s.str.extractall(pattern_one_noname)
  569. no_multi_index = has_multi_index.xs(0, level="match")
  570. tm.assert_frame_equal(extract_one_noname, no_multi_index)
  571. def test_extractall_same_as_extract_subject_index(any_string_dtype):
  572. # same as above tests, but s has an MultiIndex.
  573. mi = MultiIndex.from_tuples(
  574. [("A", "first"), ("B", "second"), ("C", "third")],
  575. names=("capital", "ordinal"),
  576. )
  577. s = Series(["a3", "b3", "c2"], index=mi, name="series_name", dtype=any_string_dtype)
  578. pattern_two_noname = r"([a-z])([0-9])"
  579. extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
  580. has_match_index = s.str.extractall(pattern_two_noname)
  581. no_match_index = has_match_index.xs(0, level="match")
  582. tm.assert_frame_equal(extract_two_noname, no_match_index)
  583. pattern_two_named = r"(?P<letter>[a-z])(?P<digit>[0-9])"
  584. extract_two_named = s.str.extract(pattern_two_named, expand=True)
  585. has_match_index = s.str.extractall(pattern_two_named)
  586. no_match_index = has_match_index.xs(0, level="match")
  587. tm.assert_frame_equal(extract_two_named, no_match_index)
  588. pattern_one_named = r"(?P<group_name>[a-z])"
  589. extract_one_named = s.str.extract(pattern_one_named, expand=True)
  590. has_match_index = s.str.extractall(pattern_one_named)
  591. no_match_index = has_match_index.xs(0, level="match")
  592. tm.assert_frame_equal(extract_one_named, no_match_index)
  593. pattern_one_noname = r"([a-z])"
  594. extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
  595. has_match_index = s.str.extractall(pattern_one_noname)
  596. no_match_index = has_match_index.xs(0, level="match")
  597. tm.assert_frame_equal(extract_one_noname, no_match_index)
  598. def test_extractall_preserves_dtype():
  599. # Ensure that when extractall is called on a series with specific dtypes set, that
  600. # the dtype is preserved in the resulting DataFrame's column.
  601. pa = pytest.importorskip("pyarrow")
  602. result = Series(["abc", "ab"], dtype=ArrowDtype(pa.string())).str.extractall("(ab)")
  603. assert result.dtypes[0] == "string[pyarrow]"