# test_cat.py
import re

import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Series,
    _testing as tm,
    concat,
    option_context,
)
  14. @pytest.mark.parametrize("other", [None, Series, Index])
  15. def test_str_cat_name(index_or_series, other):
  16. # GH 21053
  17. box = index_or_series
  18. values = ["a", "b"]
  19. if other:
  20. other = other(values)
  21. else:
  22. other = values
  23. result = box(values, name="name").str.cat(other, sep=",")
  24. assert result.name == "name"
  25. @pytest.mark.parametrize(
  26. "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
  27. )
  28. def test_str_cat(index_or_series, infer_string):
  29. with option_context("future.infer_string", infer_string):
  30. box = index_or_series
  31. # test_cat above tests "str_cat" from ndarray;
  32. # here testing "str.cat" from Series/Index to ndarray/list
  33. s = box(["a", "a", "b", "b", "c", np.nan])
  34. # single array
  35. result = s.str.cat()
  36. expected = "aabbc"
  37. assert result == expected
  38. result = s.str.cat(na_rep="-")
  39. expected = "aabbc-"
  40. assert result == expected
  41. result = s.str.cat(sep="_", na_rep="NA")
  42. expected = "a_a_b_b_c_NA"
  43. assert result == expected
  44. t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object)
  45. expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"])
  46. # Series/Index with array
  47. result = s.str.cat(t, na_rep="-")
  48. tm.assert_equal(result, expected)
  49. # Series/Index with list
  50. result = s.str.cat(list(t), na_rep="-")
  51. tm.assert_equal(result, expected)
  52. # errors for incorrect lengths
  53. rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
  54. z = Series(["1", "2", "3"])
  55. with pytest.raises(ValueError, match=rgx):
  56. s.str.cat(z.values)
  57. with pytest.raises(ValueError, match=rgx):
  58. s.str.cat(list(z))
  59. def test_str_cat_raises_intuitive_error(index_or_series):
  60. # GH 11334
  61. box = index_or_series
  62. s = box(["a", "b", "c", "d"])
  63. message = "Did you mean to supply a `sep` keyword?"
  64. with pytest.raises(ValueError, match=message):
  65. s.str.cat("|")
  66. with pytest.raises(ValueError, match=message):
  67. s.str.cat(" ")
  68. @pytest.mark.parametrize(
  69. "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
  70. )
  71. @pytest.mark.parametrize("sep", ["", None])
  72. @pytest.mark.parametrize("dtype_target", ["object", "category"])
  73. @pytest.mark.parametrize("dtype_caller", ["object", "category"])
  74. def test_str_cat_categorical(
  75. index_or_series, dtype_caller, dtype_target, sep, infer_string
  76. ):
  77. box = index_or_series
  78. with option_context("future.infer_string", infer_string):
  79. s = Index(["a", "a", "b", "a"], dtype=dtype_caller)
  80. s = s if box == Index else Series(s, index=s, dtype=s.dtype)
  81. t = Index(["b", "a", "b", "c"], dtype=dtype_target)
  82. expected = Index(
  83. ["ab", "aa", "bb", "ac"], dtype=object if dtype_caller == "object" else None
  84. )
  85. expected = (
  86. expected
  87. if box == Index
  88. else Series(
  89. expected, index=Index(s, dtype=dtype_caller), dtype=expected.dtype
  90. )
  91. )
  92. # Series/Index with unaligned Index -> t.values
  93. result = s.str.cat(t.values, sep=sep)
  94. tm.assert_equal(result, expected)
  95. # Series/Index with Series having matching Index
  96. t = Series(t.values, index=Index(s, dtype=dtype_caller))
  97. result = s.str.cat(t, sep=sep)
  98. tm.assert_equal(result, expected)
  99. # Series/Index with Series.values
  100. result = s.str.cat(t.values, sep=sep)
  101. tm.assert_equal(result, expected)
  102. # Series/Index with Series having different Index
  103. t = Series(t.values, index=t.values)
  104. expected = Index(
  105. ["aa", "aa", "bb", "bb", "aa"],
  106. dtype=object if dtype_caller == "object" else None,
  107. )
  108. dtype = object if dtype_caller == "object" else s.dtype.categories.dtype
  109. expected = (
  110. expected
  111. if box == Index
  112. else Series(
  113. expected,
  114. index=Index(expected.str[:1], dtype=dtype),
  115. dtype=expected.dtype,
  116. )
  117. )
  118. result = s.str.cat(t, sep=sep)
  119. tm.assert_equal(result, expected)
  120. @pytest.mark.parametrize(
  121. "data",
  122. [[1, 2, 3], [0.1, 0.2, 0.3], [1, 2, "b"]],
  123. ids=["integers", "floats", "mixed"],
  124. )
  125. # without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b']
  126. @pytest.mark.parametrize(
  127. "box",
  128. [Series, Index, list, lambda x: np.array(x, dtype=object)],
  129. ids=["Series", "Index", "list", "np.array"],
  130. )
  131. def test_str_cat_wrong_dtype_raises(box, data):
  132. # GH 22722
  133. s = Series(["a", "b", "c"])
  134. t = box(data)
  135. msg = "Concatenation requires list-likes containing only strings.*"
  136. with pytest.raises(TypeError, match=msg):
  137. # need to use outer and na_rep, as otherwise Index would not raise
  138. s.str.cat(t, join="outer", na_rep="-")
  139. def test_str_cat_mixed_inputs(index_or_series):
  140. box = index_or_series
  141. s = Index(["a", "b", "c", "d"])
  142. s = s if box == Index else Series(s, index=s)
  143. t = Series(["A", "B", "C", "D"], index=s.values)
  144. d = concat([t, Series(s, index=s)], axis=1)
  145. expected = Index(["aAa", "bBb", "cCc", "dDd"])
  146. expected = expected if box == Index else Series(expected.values, index=s.values)
  147. # Series/Index with DataFrame
  148. result = s.str.cat(d)
  149. tm.assert_equal(result, expected)
  150. # Series/Index with two-dimensional ndarray
  151. result = s.str.cat(d.values)
  152. tm.assert_equal(result, expected)
  153. # Series/Index with list of Series
  154. result = s.str.cat([t, s])
  155. tm.assert_equal(result, expected)
  156. # Series/Index with mixed list of Series/array
  157. result = s.str.cat([t, s.values])
  158. tm.assert_equal(result, expected)
  159. # Series/Index with list of Series; different indexes
  160. t.index = ["b", "c", "d", "a"]
  161. expected = box(["aDa", "bAb", "cBc", "dCd"])
  162. expected = expected if box == Index else Series(expected.values, index=s.values)
  163. result = s.str.cat([t, s])
  164. tm.assert_equal(result, expected)
  165. # Series/Index with mixed list; different index
  166. result = s.str.cat([t, s.values])
  167. tm.assert_equal(result, expected)
  168. # Series/Index with DataFrame; different indexes
  169. d.index = ["b", "c", "d", "a"]
  170. expected = box(["aDd", "bAa", "cBb", "dCc"])
  171. expected = expected if box == Index else Series(expected.values, index=s.values)
  172. result = s.str.cat(d)
  173. tm.assert_equal(result, expected)
  174. # errors for incorrect lengths
  175. rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
  176. z = Series(["1", "2", "3"])
  177. e = concat([z, z], axis=1)
  178. # two-dimensional ndarray
  179. with pytest.raises(ValueError, match=rgx):
  180. s.str.cat(e.values)
  181. # list of list-likes
  182. with pytest.raises(ValueError, match=rgx):
  183. s.str.cat([z.values, s.values])
  184. # mixed list of Series/list-like
  185. with pytest.raises(ValueError, match=rgx):
  186. s.str.cat([z.values, s])
  187. # errors for incorrect arguments in list-like
  188. rgx = "others must be Series, Index, DataFrame,.*"
  189. # make sure None/NaN do not crash checks in _get_series_list
  190. u = Series(["a", np.nan, "c", None])
  191. # mix of string and Series
  192. with pytest.raises(TypeError, match=rgx):
  193. s.str.cat([u, "u"])
  194. # DataFrame in list
  195. with pytest.raises(TypeError, match=rgx):
  196. s.str.cat([u, d])
  197. # 2-dim ndarray in list
  198. with pytest.raises(TypeError, match=rgx):
  199. s.str.cat([u, d.values])
  200. # nested lists
  201. with pytest.raises(TypeError, match=rgx):
  202. s.str.cat([u, [u, d]])
  203. # forbidden input type: set
  204. # GH 23009
  205. with pytest.raises(TypeError, match=rgx):
  206. s.str.cat(set(u))
  207. # forbidden input type: set in list
  208. # GH 23009
  209. with pytest.raises(TypeError, match=rgx):
  210. s.str.cat([u, set(u)])
  211. # other forbidden input type, e.g. int
  212. with pytest.raises(TypeError, match=rgx):
  213. s.str.cat(1)
  214. # nested list-likes
  215. with pytest.raises(TypeError, match=rgx):
  216. s.str.cat(iter([t.values, list(s)]))
  217. @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
  218. def test_str_cat_align_indexed(index_or_series, join):
  219. # https://github.com/pandas-dev/pandas/issues/18657
  220. box = index_or_series
  221. s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"])
  222. t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"])
  223. sa, ta = s.align(t, join=join)
  224. # result after manual alignment of inputs
  225. expected = sa.str.cat(ta, na_rep="-")
  226. if box == Index:
  227. s = Index(s)
  228. sa = Index(sa)
  229. expected = Index(expected)
  230. result = s.str.cat(t, join=join, na_rep="-")
  231. tm.assert_equal(result, expected)
  232. @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
  233. def test_str_cat_align_mixed_inputs(join):
  234. s = Series(["a", "b", "c", "d"])
  235. t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])
  236. d = concat([t, t], axis=1)
  237. expected_outer = Series(["aaa", "bbb", "c--", "ddd", "-ee"])
  238. expected = expected_outer.loc[s.index.join(t.index, how=join)]
  239. # list of Series
  240. result = s.str.cat([t, t], join=join, na_rep="-")
  241. tm.assert_series_equal(result, expected)
  242. # DataFrame
  243. result = s.str.cat(d, join=join, na_rep="-")
  244. tm.assert_series_equal(result, expected)
  245. # mixed list of indexed/unindexed
  246. u = np.array(["A", "B", "C", "D"])
  247. expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"])
  248. # joint index of rhs [t, u]; u will be forced have index of s
  249. rhs_idx = (
  250. t.index.intersection(s.index)
  251. if join == "inner"
  252. else t.index.union(s.index)
  253. if join == "outer"
  254. else t.index.append(s.index.difference(t.index))
  255. )
  256. expected = expected_outer.loc[s.index.join(rhs_idx, how=join)]
  257. result = s.str.cat([t, u], join=join, na_rep="-")
  258. tm.assert_series_equal(result, expected)
  259. with pytest.raises(TypeError, match="others must be Series,.*"):
  260. # nested lists are forbidden
  261. s.str.cat([t, list(u)], join=join)
  262. # errors for incorrect lengths
  263. rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
  264. z = Series(["1", "2", "3"]).values
  265. # unindexed object of wrong length
  266. with pytest.raises(ValueError, match=rgx):
  267. s.str.cat(z, join=join)
  268. # unindexed object of wrong length in list
  269. with pytest.raises(ValueError, match=rgx):
  270. s.str.cat([t, z], join=join)
  271. def test_str_cat_all_na(index_or_series, index_or_series2):
  272. # GH 24044
  273. box = index_or_series
  274. other = index_or_series2
  275. # check that all NaNs in caller / target work
  276. s = Index(["a", "b", "c", "d"])
  277. s = s if box == Index else Series(s, index=s)
  278. t = other([np.nan] * 4, dtype=object)
  279. # add index of s for alignment
  280. t = t if other == Index else Series(t, index=s)
  281. # all-NA target
  282. if box == Series:
  283. expected = Series([np.nan] * 4, index=s.index, dtype=s.dtype)
  284. else: # box == Index
  285. # TODO: Strimg option, this should return string dtype
  286. expected = Index([np.nan] * 4, dtype=object)
  287. result = s.str.cat(t, join="left")
  288. tm.assert_equal(result, expected)
  289. # all-NA caller (only for Series)
  290. if other == Series:
  291. expected = Series([np.nan] * 4, dtype=object, index=t.index)
  292. result = t.str.cat(s, join="left")
  293. tm.assert_series_equal(result, expected)
  294. def test_str_cat_special_cases():
  295. s = Series(["a", "b", "c", "d"])
  296. t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])
  297. # iterator of elements with different types
  298. expected = Series(["aaa", "bbb", "c-c", "ddd", "-e-"])
  299. result = s.str.cat(iter([t, s.values]), join="outer", na_rep="-")
  300. tm.assert_series_equal(result, expected)
  301. # right-align with different indexes in others
  302. expected = Series(["aa-", "d-d"], index=[0, 3])
  303. result = s.str.cat([t.loc[[0]], t.loc[[3]]], join="right", na_rep="-")
  304. tm.assert_series_equal(result, expected)
  305. def test_cat_on_filtered_index():
  306. df = DataFrame(
  307. index=MultiIndex.from_product(
  308. [[2011, 2012], [1, 2, 3]], names=["year", "month"]
  309. )
  310. )
  311. df = df.reset_index()
  312. df = df[df.month > 1]
  313. str_year = df.year.astype("str")
  314. str_month = df.month.astype("str")
  315. str_both = str_year.str.cat(str_month, sep=" ")
  316. assert str_both.loc[1] == "2011 2"
  317. str_multiple = str_year.str.cat([str_month, str_month], sep=" ")
  318. assert str_multiple.loc[1] == "2011 2 2"
  319. @pytest.mark.parametrize("klass", [tuple, list, np.array, Series, Index])
  320. def test_cat_different_classes(klass):
  321. # https://github.com/pandas-dev/pandas/issues/33425
  322. s = Series(["a", "b", "c"])
  323. result = s.str.cat(klass(["x", "y", "z"]))
  324. expected = Series(["ax", "by", "cz"])
  325. tm.assert_series_equal(result, expected)
  326. def test_cat_on_series_dot_str():
  327. # GH 28277
  328. ps = Series(["AbC", "de", "FGHI", "j", "kLLLm"])
  329. message = re.escape(
  330. "others must be Series, Index, DataFrame, np.ndarray "
  331. "or list-like (either containing only strings or "
  332. "containing only objects of type Series/Index/"
  333. "np.ndarray[1-dim])"
  334. )
  335. with pytest.raises(TypeError, match=message):
  336. ps.str.cat(others=ps.str)