# test_split_partition.py
  1. from datetime import datetime
  2. import re
  3. import numpy as np
  4. import pytest
  5. import pandas as pd
  6. from pandas import (
  7. DataFrame,
  8. Index,
  9. MultiIndex,
  10. Series,
  11. _testing as tm,
  12. )
  13. from pandas.tests.strings import (
  14. _convert_na_value,
  15. is_object_or_nan_string_dtype,
  16. )
  17. @pytest.mark.parametrize("method", ["split", "rsplit"])
  18. def test_split(any_string_dtype, method):
  19. values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
  20. result = getattr(values.str, method)("_")
  21. exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
  22. exp = _convert_na_value(values, exp)
  23. tm.assert_series_equal(result, exp)
  24. @pytest.mark.parametrize("method", ["split", "rsplit"])
  25. def test_split_more_than_one_char(any_string_dtype, method):
  26. # more than one char
  27. values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype)
  28. result = getattr(values.str, method)("__")
  29. exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
  30. exp = _convert_na_value(values, exp)
  31. tm.assert_series_equal(result, exp)
  32. result = getattr(values.str, method)("__", expand=False)
  33. tm.assert_series_equal(result, exp)
  34. def test_split_more_regex_split(any_string_dtype):
  35. # regex split
  36. values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
  37. result = values.str.split("[,_]")
  38. exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
  39. exp = _convert_na_value(values, exp)
  40. tm.assert_series_equal(result, exp)
  41. def test_split_regex(any_string_dtype):
  42. # GH 43563
  43. # explicit regex = True split
  44. values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
  45. result = values.str.split(r"\.jpg", regex=True)
  46. exp = Series([["xxxjpgzzz", ""]])
  47. tm.assert_series_equal(result, exp)
  48. def test_split_regex_explicit(any_string_dtype):
  49. # explicit regex = True split with compiled regex
  50. regex_pat = re.compile(r".jpg")
  51. values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
  52. result = values.str.split(regex_pat)
  53. exp = Series([["xx", "zzz", ""]])
  54. tm.assert_series_equal(result, exp)
  55. # explicit regex = False split
  56. result = values.str.split(r"\.jpg", regex=False)
  57. exp = Series([["xxxjpgzzz.jpg"]])
  58. tm.assert_series_equal(result, exp)
  59. # non explicit regex split, pattern length == 1
  60. result = values.str.split(r".")
  61. exp = Series([["xxxjpgzzz", "jpg"]])
  62. tm.assert_series_equal(result, exp)
  63. # non explicit regex split, pattern length != 1
  64. result = values.str.split(r".jpg")
  65. exp = Series([["xx", "zzz", ""]])
  66. tm.assert_series_equal(result, exp)
  67. # regex=False with pattern compiled regex raises error
  68. with pytest.raises(
  69. ValueError,
  70. match="Cannot use a compiled regex as replacement pattern with regex=False",
  71. ):
  72. values.str.split(regex_pat, regex=False)
  73. @pytest.mark.parametrize("expand", [None, False])
  74. @pytest.mark.parametrize("method", ["split", "rsplit"])
  75. def test_split_object_mixed(expand, method):
  76. mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
  77. result = getattr(mixed.str, method)("_", expand=expand)
  78. exp = Series(
  79. [
  80. ["a", "b", "c"],
  81. np.nan,
  82. ["d", "e", "f"],
  83. np.nan,
  84. np.nan,
  85. None,
  86. np.nan,
  87. np.nan,
  88. ]
  89. )
  90. assert isinstance(result, Series)
  91. tm.assert_almost_equal(result, exp)
  92. @pytest.mark.parametrize("method", ["split", "rsplit"])
  93. @pytest.mark.parametrize("n", [None, 0])
  94. def test_split_n(any_string_dtype, method, n):
  95. s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype)
  96. expected = Series([["a", "b"], pd.NA, ["b", "c"]])
  97. result = getattr(s.str, method)(" ", n=n)
  98. expected = _convert_na_value(s, expected)
  99. tm.assert_series_equal(result, expected)
  100. def test_rsplit(any_string_dtype):
  101. # regex split is not supported by rsplit
  102. values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
  103. result = values.str.rsplit("[,_]")
  104. exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]])
  105. exp = _convert_na_value(values, exp)
  106. tm.assert_series_equal(result, exp)
  107. def test_rsplit_max_number(any_string_dtype):
  108. # setting max number of splits, make sure it's from reverse
  109. values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
  110. result = values.str.rsplit("_", n=1)
  111. exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]])
  112. exp = _convert_na_value(values, exp)
  113. tm.assert_series_equal(result, exp)
  114. def test_split_blank_string(any_string_dtype):
  115. # expand blank split GH 20067
  116. values = Series([""], name="test", dtype=any_string_dtype)
  117. result = values.str.split(expand=True)
  118. exp = DataFrame([[]], dtype=any_string_dtype) # NOTE: this is NOT an empty df
  119. tm.assert_frame_equal(result, exp)
  120. def test_split_blank_string_with_non_empty(any_string_dtype):
  121. values = Series(["a b c", "a b", "", " "], name="test", dtype=any_string_dtype)
  122. result = values.str.split(expand=True)
  123. exp = DataFrame(
  124. [
  125. ["a", "b", "c"],
  126. ["a", "b", None],
  127. [None, None, None],
  128. [None, None, None],
  129. ],
  130. dtype=any_string_dtype,
  131. )
  132. tm.assert_frame_equal(result, exp)
  133. @pytest.mark.parametrize("method", ["split", "rsplit"])
  134. def test_split_noargs(any_string_dtype, method):
  135. # #1859
  136. s = Series(["Wes McKinney", "Travis Oliphant"], dtype=any_string_dtype)
  137. result = getattr(s.str, method)()
  138. expected = ["Travis", "Oliphant"]
  139. assert result[1] == expected
  140. @pytest.mark.parametrize(
  141. "data, pat",
  142. [
  143. (["bd asdf jfg", "kjasdflqw asdfnfk"], None),
  144. (["bd asdf jfg", "kjasdflqw asdfnfk"], "asdf"),
  145. (["bd_asdf_jfg", "kjasdflqw_asdfnfk"], "_"),
  146. ],
  147. )
  148. @pytest.mark.parametrize("n", [-1, 0])
  149. def test_split_maxsplit(data, pat, any_string_dtype, n):
  150. # re.split 0, str.split -1
  151. s = Series(data, dtype=any_string_dtype)
  152. result = s.str.split(pat=pat, n=n)
  153. xp = s.str.split(pat=pat)
  154. tm.assert_series_equal(result, xp)
  155. @pytest.mark.parametrize(
  156. "data, pat, expected",
  157. [
  158. (
  159. ["split once", "split once too!"],
  160. None,
  161. Series({0: ["split", "once"], 1: ["split", "once too!"]}),
  162. ),
  163. (
  164. ["split_once", "split_once_too!"],
  165. "_",
  166. Series({0: ["split", "once"], 1: ["split", "once_too!"]}),
  167. ),
  168. ],
  169. )
  170. def test_split_no_pat_with_nonzero_n(data, pat, expected, any_string_dtype):
  171. s = Series(data, dtype=any_string_dtype)
  172. result = s.str.split(pat=pat, n=1)
  173. tm.assert_series_equal(expected, result, check_index_type=False)
  174. def test_split_to_dataframe_no_splits(any_string_dtype):
  175. s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)
  176. result = s.str.split("_", expand=True)
  177. exp = DataFrame({0: Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)})
  178. tm.assert_frame_equal(result, exp)
  179. def test_split_to_dataframe(any_string_dtype):
  180. s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype)
  181. result = s.str.split("_", expand=True)
  182. exp = DataFrame(
  183. {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
  184. dtype=any_string_dtype,
  185. )
  186. tm.assert_frame_equal(result, exp)
  187. def test_split_to_dataframe_unequal_splits(any_string_dtype):
  188. s = Series(
  189. ["some_unequal_splits", "one_of_these_things_is_not"], dtype=any_string_dtype
  190. )
  191. result = s.str.split("_", expand=True)
  192. exp = DataFrame(
  193. {
  194. 0: ["some", "one"],
  195. 1: ["unequal", "of"],
  196. 2: ["splits", "these"],
  197. 3: [None, "things"],
  198. 4: [None, "is"],
  199. 5: [None, "not"],
  200. },
  201. dtype=any_string_dtype,
  202. )
  203. tm.assert_frame_equal(result, exp)
  204. def test_split_to_dataframe_with_index(any_string_dtype):
  205. s = Series(
  206. ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype
  207. )
  208. result = s.str.split("_", expand=True)
  209. exp = DataFrame(
  210. {0: ["some", "with"], 1: ["splits", "index"]},
  211. index=["preserve", "me"],
  212. dtype=any_string_dtype,
  213. )
  214. tm.assert_frame_equal(result, exp)
  215. with pytest.raises(ValueError, match="expand must be"):
  216. s.str.split("_", expand="not_a_boolean")
  217. def test_split_to_multiindex_expand_no_splits():
  218. # https://github.com/pandas-dev/pandas/issues/23677
  219. idx = Index(["nosplit", "alsonosplit", np.nan])
  220. result = idx.str.split("_", expand=True)
  221. exp = idx
  222. tm.assert_index_equal(result, exp)
  223. assert result.nlevels == 1
  224. def test_split_to_multiindex_expand():
  225. idx = Index(["some_equal_splits", "with_no_nans", np.nan, None])
  226. result = idx.str.split("_", expand=True)
  227. exp = MultiIndex.from_tuples(
  228. [
  229. ("some", "equal", "splits"),
  230. ("with", "no", "nans"),
  231. [np.nan, np.nan, np.nan],
  232. [None, None, None],
  233. ]
  234. )
  235. tm.assert_index_equal(result, exp)
  236. assert result.nlevels == 3
  237. def test_split_to_multiindex_expand_unequal_splits():
  238. idx = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None])
  239. result = idx.str.split("_", expand=True)
  240. exp = MultiIndex.from_tuples(
  241. [
  242. ("some", "unequal", "splits", np.nan, np.nan, np.nan),
  243. ("one", "of", "these", "things", "is", "not"),
  244. (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan),
  245. (None, None, None, None, None, None),
  246. ]
  247. )
  248. tm.assert_index_equal(result, exp)
  249. assert result.nlevels == 6
  250. with pytest.raises(ValueError, match="expand must be"):
  251. idx.str.split("_", expand="not_a_boolean")
  252. def test_rsplit_to_dataframe_expand_no_splits(any_string_dtype):
  253. s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)
  254. result = s.str.rsplit("_", expand=True)
  255. exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}, dtype=any_string_dtype)
  256. tm.assert_frame_equal(result, exp)
  257. def test_rsplit_to_dataframe_expand(any_string_dtype):
  258. s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype)
  259. result = s.str.rsplit("_", expand=True)
  260. exp = DataFrame(
  261. {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
  262. dtype=any_string_dtype,
  263. )
  264. tm.assert_frame_equal(result, exp)
  265. result = s.str.rsplit("_", expand=True, n=2)
  266. exp = DataFrame(
  267. {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
  268. dtype=any_string_dtype,
  269. )
  270. tm.assert_frame_equal(result, exp)
  271. result = s.str.rsplit("_", expand=True, n=1)
  272. exp = DataFrame(
  273. {0: ["some_equal", "with_no"], 1: ["splits", "nans"]}, dtype=any_string_dtype
  274. )
  275. tm.assert_frame_equal(result, exp)
  276. def test_rsplit_to_dataframe_expand_with_index(any_string_dtype):
  277. s = Series(
  278. ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype
  279. )
  280. result = s.str.rsplit("_", expand=True)
  281. exp = DataFrame(
  282. {0: ["some", "with"], 1: ["splits", "index"]},
  283. index=["preserve", "me"],
  284. dtype=any_string_dtype,
  285. )
  286. tm.assert_frame_equal(result, exp)
  287. def test_rsplit_to_multiindex_expand_no_split():
  288. idx = Index(["nosplit", "alsonosplit"])
  289. result = idx.str.rsplit("_", expand=True)
  290. exp = idx
  291. tm.assert_index_equal(result, exp)
  292. assert result.nlevels == 1
  293. def test_rsplit_to_multiindex_expand():
  294. idx = Index(["some_equal_splits", "with_no_nans"])
  295. result = idx.str.rsplit("_", expand=True)
  296. exp = MultiIndex.from_tuples([("some", "equal", "splits"), ("with", "no", "nans")])
  297. tm.assert_index_equal(result, exp)
  298. assert result.nlevels == 3
  299. def test_rsplit_to_multiindex_expand_n():
  300. idx = Index(["some_equal_splits", "with_no_nans"])
  301. result = idx.str.rsplit("_", expand=True, n=1)
  302. exp = MultiIndex.from_tuples([("some_equal", "splits"), ("with_no", "nans")])
  303. tm.assert_index_equal(result, exp)
  304. assert result.nlevels == 2
  305. def test_split_nan_expand(any_string_dtype):
  306. # gh-18450
  307. s = Series(["foo,bar,baz", np.nan], dtype=any_string_dtype)
  308. result = s.str.split(",", expand=True)
  309. exp = DataFrame(
  310. [["foo", "bar", "baz"], [np.nan, np.nan, np.nan]], dtype=any_string_dtype
  311. )
  312. tm.assert_frame_equal(result, exp)
  313. # check that these are actually np.nan/pd.NA and not None
  314. # TODO see GH 18463
  315. # tm.assert_frame_equal does not differentiate
  316. if is_object_or_nan_string_dtype(any_string_dtype):
  317. assert all(np.isnan(x) for x in result.iloc[1])
  318. else:
  319. assert all(x is pd.NA for x in result.iloc[1])
  320. def test_split_with_name_series(any_string_dtype):
  321. # GH 12617
  322. # should preserve name
  323. s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
  324. res = s.str.split(",")
  325. exp = Series([["a", "b"], ["c", "d"]], name="xxx")
  326. tm.assert_series_equal(res, exp)
  327. res = s.str.split(",", expand=True)
  328. exp = DataFrame([["a", "b"], ["c", "d"]], dtype=any_string_dtype)
  329. tm.assert_frame_equal(res, exp)
  330. def test_split_with_name_index():
  331. # GH 12617
  332. idx = Index(["a,b", "c,d"], name="xxx")
  333. res = idx.str.split(",")
  334. exp = Index([["a", "b"], ["c", "d"]], name="xxx")
  335. assert res.nlevels == 1
  336. tm.assert_index_equal(res, exp)
  337. res = idx.str.split(",", expand=True)
  338. exp = MultiIndex.from_tuples([("a", "b"), ("c", "d")])
  339. assert res.nlevels == 2
  340. tm.assert_index_equal(res, exp)
  341. @pytest.mark.parametrize(
  342. "method, exp",
  343. [
  344. [
  345. "partition",
  346. [
  347. ("a", "__", "b__c"),
  348. ("c", "__", "d__e"),
  349. np.nan,
  350. ("f", "__", "g__h"),
  351. None,
  352. ],
  353. ],
  354. [
  355. "rpartition",
  356. [
  357. ("a__b", "__", "c"),
  358. ("c__d", "__", "e"),
  359. np.nan,
  360. ("f__g", "__", "h"),
  361. None,
  362. ],
  363. ],
  364. ],
  365. )
  366. def test_partition_series_more_than_one_char(method, exp, any_string_dtype):
  367. # https://github.com/pandas-dev/pandas/issues/23558
  368. # more than one char
  369. s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None], dtype=any_string_dtype)
  370. result = getattr(s.str, method)("__", expand=False)
  371. expected = Series(exp)
  372. expected = _convert_na_value(s, expected)
  373. tm.assert_series_equal(result, expected)
  374. @pytest.mark.parametrize(
  375. "method, exp",
  376. [
  377. [
  378. "partition",
  379. [("a", " ", "b c"), ("c", " ", "d e"), np.nan, ("f", " ", "g h"), None],
  380. ],
  381. [
  382. "rpartition",
  383. [("a b", " ", "c"), ("c d", " ", "e"), np.nan, ("f g", " ", "h"), None],
  384. ],
  385. ],
  386. )
  387. def test_partition_series_none(any_string_dtype, method, exp):
  388. # https://github.com/pandas-dev/pandas/issues/23558
  389. # None
  390. s = Series(["a b c", "c d e", np.nan, "f g h", None], dtype=any_string_dtype)
  391. result = getattr(s.str, method)(expand=False)
  392. expected = Series(exp)
  393. expected = _convert_na_value(s, expected)
  394. tm.assert_series_equal(result, expected)
  395. @pytest.mark.parametrize(
  396. "method, exp",
  397. [
  398. [
  399. "partition",
  400. [("abc", "", ""), ("cde", "", ""), np.nan, ("fgh", "", ""), None],
  401. ],
  402. [
  403. "rpartition",
  404. [("", "", "abc"), ("", "", "cde"), np.nan, ("", "", "fgh"), None],
  405. ],
  406. ],
  407. )
  408. def test_partition_series_not_split(any_string_dtype, method, exp):
  409. # https://github.com/pandas-dev/pandas/issues/23558
  410. # Not split
  411. s = Series(["abc", "cde", np.nan, "fgh", None], dtype=any_string_dtype)
  412. result = getattr(s.str, method)("_", expand=False)
  413. expected = Series(exp)
  414. expected = _convert_na_value(s, expected)
  415. tm.assert_series_equal(result, expected)
  416. @pytest.mark.parametrize(
  417. "method, exp",
  418. [
  419. [
  420. "partition",
  421. [("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h")],
  422. ],
  423. [
  424. "rpartition",
  425. [("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h")],
  426. ],
  427. ],
  428. )
  429. def test_partition_series_unicode(any_string_dtype, method, exp):
  430. # https://github.com/pandas-dev/pandas/issues/23558
  431. # unicode
  432. s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
  433. result = getattr(s.str, method)("_", expand=False)
  434. expected = Series(exp)
  435. expected = _convert_na_value(s, expected)
  436. tm.assert_series_equal(result, expected)
  437. @pytest.mark.parametrize("method", ["partition", "rpartition"])
  438. def test_partition_series_stdlib(any_string_dtype, method):
  439. # https://github.com/pandas-dev/pandas/issues/23558
  440. # compare to standard lib
  441. s = Series(["A_B_C", "B_C_D", "E_F_G", "EFGHEF"], dtype=any_string_dtype)
  442. result = getattr(s.str, method)("_", expand=False).tolist()
  443. assert result == [getattr(v, method)("_") for v in s]
  444. @pytest.mark.parametrize(
  445. "method, expand, exp, exp_levels",
  446. [
  447. [
  448. "partition",
  449. False,
  450. np.array(
  451. [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None],
  452. dtype=object,
  453. ),
  454. 1,
  455. ],
  456. [
  457. "rpartition",
  458. False,
  459. np.array(
  460. [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None],
  461. dtype=object,
  462. ),
  463. 1,
  464. ],
  465. ],
  466. )
  467. def test_partition_index(method, expand, exp, exp_levels):
  468. # https://github.com/pandas-dev/pandas/issues/23558
  469. values = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None])
  470. result = getattr(values.str, method)("_", expand=expand)
  471. exp = Index(exp)
  472. tm.assert_index_equal(result, exp)
  473. assert result.nlevels == exp_levels
  474. @pytest.mark.parametrize(
  475. "method, exp",
  476. [
  477. [
  478. "partition",
  479. {
  480. 0: ["a", "c", np.nan, "f", None],
  481. 1: ["_", "_", np.nan, "_", None],
  482. 2: ["b_c", "d_e", np.nan, "g_h", None],
  483. },
  484. ],
  485. [
  486. "rpartition",
  487. {
  488. 0: ["a_b", "c_d", np.nan, "f_g", None],
  489. 1: ["_", "_", np.nan, "_", None],
  490. 2: ["c", "e", np.nan, "h", None],
  491. },
  492. ],
  493. ],
  494. )
  495. def test_partition_to_dataframe(any_string_dtype, method, exp):
  496. # https://github.com/pandas-dev/pandas/issues/23558
  497. s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype)
  498. result = getattr(s.str, method)("_")
  499. expected = DataFrame(
  500. exp,
  501. dtype=any_string_dtype,
  502. )
  503. tm.assert_frame_equal(result, expected)
  504. @pytest.mark.parametrize(
  505. "method, exp",
  506. [
  507. [
  508. "partition",
  509. {
  510. 0: ["a", "c", np.nan, "f", None],
  511. 1: ["_", "_", np.nan, "_", None],
  512. 2: ["b_c", "d_e", np.nan, "g_h", None],
  513. },
  514. ],
  515. [
  516. "rpartition",
  517. {
  518. 0: ["a_b", "c_d", np.nan, "f_g", None],
  519. 1: ["_", "_", np.nan, "_", None],
  520. 2: ["c", "e", np.nan, "h", None],
  521. },
  522. ],
  523. ],
  524. )
  525. def test_partition_to_dataframe_from_series(any_string_dtype, method, exp):
  526. # https://github.com/pandas-dev/pandas/issues/23558
  527. s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype)
  528. result = getattr(s.str, method)("_", expand=True)
  529. expected = DataFrame(
  530. exp,
  531. dtype=any_string_dtype,
  532. )
  533. tm.assert_frame_equal(result, expected)
  534. def test_partition_with_name(any_string_dtype):
  535. # GH 12617
  536. s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
  537. result = s.str.partition(",")
  538. expected = DataFrame(
  539. {0: ["a", "c"], 1: [",", ","], 2: ["b", "d"]}, dtype=any_string_dtype
  540. )
  541. tm.assert_frame_equal(result, expected)
  542. def test_partition_with_name_expand(any_string_dtype):
  543. # GH 12617
  544. # should preserve name
  545. s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
  546. result = s.str.partition(",", expand=False)
  547. expected = Series([("a", ",", "b"), ("c", ",", "d")], name="xxx")
  548. tm.assert_series_equal(result, expected)
  549. def test_partition_index_with_name():
  550. idx = Index(["a,b", "c,d"], name="xxx")
  551. result = idx.str.partition(",")
  552. expected = MultiIndex.from_tuples([("a", ",", "b"), ("c", ",", "d")])
  553. assert result.nlevels == 3
  554. tm.assert_index_equal(result, expected)
  555. def test_partition_index_with_name_expand_false():
  556. idx = Index(["a,b", "c,d"], name="xxx")
  557. # should preserve name
  558. result = idx.str.partition(",", expand=False)
  559. expected = Index(np.array([("a", ",", "b"), ("c", ",", "d")]), name="xxx")
  560. assert result.nlevels == 1
  561. tm.assert_index_equal(result, expected)
  562. @pytest.mark.parametrize("method", ["partition", "rpartition"])
  563. def test_partition_sep_kwarg(any_string_dtype, method):
  564. # GH 22676; depr kwarg "pat" in favor of "sep"
  565. s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
  566. expected = getattr(s.str, method)(sep="_")
  567. result = getattr(s.str, method)("_")
  568. tm.assert_frame_equal(result, expected)
  569. def test_get():
  570. ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
  571. result = ser.str.split("_").str.get(1)
  572. expected = Series(["b", "d", np.nan, "g"], dtype=object)
  573. tm.assert_series_equal(result, expected)
  574. def test_get_mixed_object():
  575. ser = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0])
  576. result = ser.str.split("_").str.get(1)
  577. expected = Series(
  578. ["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan], dtype=object
  579. )
  580. tm.assert_series_equal(result, expected)
  581. @pytest.mark.parametrize("idx", [2, -3])
  582. def test_get_bounds(idx):
  583. ser = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"])
  584. result = ser.str.split("_").str.get(idx)
  585. expected = Series(["3", "8", np.nan], dtype=object)
  586. tm.assert_series_equal(result, expected)
  587. @pytest.mark.parametrize(
  588. "idx, exp", [[2, [3, 3, np.nan, "b"]], [-1, [3, 3, np.nan, np.nan]]]
  589. )
  590. def test_get_complex(idx, exp):
  591. # GH 20671, getting value not in dict raising `KeyError`
  592. ser = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: "a", 2: "b", 3: "c"}])
  593. result = ser.str.get(idx)
  594. expected = Series(exp)
  595. tm.assert_series_equal(result, expected)
  596. @pytest.mark.parametrize("to_type", [tuple, list, np.array])
  597. def test_get_complex_nested(to_type):
  598. ser = Series([to_type([to_type([1, 2])])])
  599. result = ser.str.get(0)
  600. expected = Series([to_type([1, 2])])
  601. tm.assert_series_equal(result, expected)
  602. result = ser.str.get(1)
  603. expected = Series([np.nan])
  604. tm.assert_series_equal(result, expected)
  605. def test_get_strings(any_string_dtype):
  606. ser = Series(["a", "ab", np.nan, "abc"], dtype=any_string_dtype)
  607. result = ser.str.get(2)
  608. expected = Series([np.nan, np.nan, np.nan, "c"], dtype=any_string_dtype)
  609. tm.assert_series_equal(result, expected)