test_map.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604
  1. from collections import (
  2. Counter,
  3. defaultdict,
  4. )
  5. from decimal import Decimal
  6. import math
  7. import numpy as np
  8. import pytest
  9. import pandas as pd
  10. from pandas import (
  11. DataFrame,
  12. Index,
  13. MultiIndex,
  14. Series,
  15. bdate_range,
  16. date_range,
  17. isna,
  18. timedelta_range,
  19. )
  20. import pandas._testing as tm
  21. def test_series_map_box_timedelta():
  22. # GH#11349
  23. ser = Series(timedelta_range("1 day 1 s", periods=5, freq="h"))
  24. def f(x):
  25. return x.total_seconds()
  26. ser.map(f)
  27. def test_map_callable(datetime_series):
  28. with np.errstate(all="ignore"):
  29. tm.assert_series_equal(datetime_series.map(np.sqrt), np.sqrt(datetime_series))
  30. # map function element-wise
  31. tm.assert_series_equal(datetime_series.map(math.exp), np.exp(datetime_series))
  32. # empty series
  33. s = Series(dtype=object, name="foo", index=Index([], name="bar"))
  34. rs = s.map(lambda x: x)
  35. tm.assert_series_equal(s, rs)
  36. # check all metadata (GH 9322)
  37. assert s is not rs
  38. assert s.index is rs.index
  39. assert s.dtype == rs.dtype
  40. assert s.name == rs.name
  41. # index but no data
  42. s = Series(index=[1, 2, 3], dtype=np.float64)
  43. rs = s.map(lambda x: x)
  44. tm.assert_series_equal(s, rs)
  45. def test_map_same_length_inference_bug():
  46. s = Series([1, 2])
  47. def f(x):
  48. return (x, x + 1)
  49. s = Series([1, 2, 3])
  50. result = s.map(f)
  51. expected = Series([(1, 2), (2, 3), (3, 4)])
  52. tm.assert_series_equal(result, expected)
  53. s = Series(["foo,bar"])
  54. result = s.map(lambda x: x.split(","))
  55. expected = Series([("foo", "bar")])
  56. tm.assert_series_equal(result, expected)
  57. def test_series_map_box_timestamps():
  58. # GH#2689, GH#2627
  59. ser = Series(date_range("1/1/2000", periods=3))
  60. def func(x):
  61. return (x.hour, x.day, x.month)
  62. result = ser.map(func)
  63. expected = Series([(0, 1, 1), (0, 2, 1), (0, 3, 1)])
  64. tm.assert_series_equal(result, expected)
  65. def test_map_series_stringdtype(any_string_dtype, using_infer_string):
  66. # map test on StringDType, GH#40823
  67. ser1 = Series(
  68. data=["cat", "dog", "rabbit"],
  69. index=["id1", "id2", "id3"],
  70. dtype=any_string_dtype,
  71. )
  72. ser2 = Series(["id3", "id2", "id1", "id7000"], dtype=any_string_dtype)
  73. result = ser2.map(ser1)
  74. item = pd.NA
  75. if ser2.dtype == object:
  76. item = np.nan
  77. expected = Series(data=["rabbit", "dog", "cat", item], dtype=any_string_dtype)
  78. if using_infer_string and any_string_dtype == "object":
  79. expected = expected.astype("str")
  80. tm.assert_series_equal(result, expected)
  81. @pytest.mark.parametrize(
  82. "data, expected_dtype",
  83. [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], "str")],
  84. )
  85. def test_map_categorical_with_nan_values(data, expected_dtype):
  86. # GH 20714 bug fixed in: GH 24275
  87. def func(val):
  88. return val.split("-")[0]
  89. s = Series(data, dtype="category")
  90. result = s.map(func, na_action="ignore")
  91. expected = Series(["1", "1", np.nan], dtype=expected_dtype)
  92. tm.assert_series_equal(result, expected)
  93. def test_map_empty_integer_series():
  94. # GH52384
  95. s = Series([], dtype=int)
  96. result = s.map(lambda x: x)
  97. tm.assert_series_equal(result, s)
  98. def test_map_empty_integer_series_with_datetime_index():
  99. # GH 21245
  100. s = Series([], index=date_range(start="2018-01-01", periods=0), dtype=int)
  101. result = s.map(lambda x: x)
  102. tm.assert_series_equal(result, s)
  103. @pytest.mark.parametrize("func", [str, lambda x: str(x)])
  104. def test_map_simple_str_callables_same_as_astype(
  105. string_series, func, using_infer_string
  106. ):
  107. # test that we are evaluating row-by-row first
  108. # before vectorized evaluation
  109. result = string_series.map(func)
  110. expected = string_series.astype(str if not using_infer_string else "str")
  111. tm.assert_series_equal(result, expected)
  112. def test_list_raises(string_series):
  113. with pytest.raises(TypeError, match="'list' object is not callable"):
  114. string_series.map([lambda x: x])
  115. def test_map():
  116. data = {
  117. "A": [0.0, 1.0, 2.0, 3.0, 4.0],
  118. "B": [0.0, 1.0, 0.0, 1.0, 0.0],
  119. "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
  120. "D": bdate_range("1/1/2009", periods=5),
  121. }
  122. source = Series(data["B"], index=data["C"])
  123. target = Series(data["C"][:4], index=data["D"][:4])
  124. merged = target.map(source)
  125. for k, v in merged.items():
  126. assert v == source[target[k]]
  127. # input could be a dict
  128. merged = target.map(source.to_dict())
  129. for k, v in merged.items():
  130. assert v == source[target[k]]
  131. def test_map_datetime(datetime_series):
  132. # function
  133. result = datetime_series.map(lambda x: x * 2)
  134. tm.assert_series_equal(result, datetime_series * 2)
  135. def test_map_category():
  136. # GH 10324
  137. a = Series([1, 2, 3, 4])
  138. b = Series(["even", "odd", "even", "odd"], dtype="category")
  139. c = Series(["even", "odd", "even", "odd"])
  140. exp = Series(["odd", "even", "odd", np.nan], dtype="category")
  141. tm.assert_series_equal(a.map(b), exp)
  142. exp = Series(["odd", "even", "odd", np.nan])
  143. tm.assert_series_equal(a.map(c), exp)
  144. def test_map_category_numeric():
  145. a = Series(["a", "b", "c", "d"])
  146. b = Series([1, 2, 3, 4], index=pd.CategoricalIndex(["b", "c", "d", "e"]))
  147. c = Series([1, 2, 3, 4], index=Index(["b", "c", "d", "e"]))
  148. exp = Series([np.nan, 1, 2, 3])
  149. tm.assert_series_equal(a.map(b), exp)
  150. exp = Series([np.nan, 1, 2, 3])
  151. tm.assert_series_equal(a.map(c), exp)
  152. def test_map_category_string():
  153. a = Series(["a", "b", "c", "d"])
  154. b = Series(
  155. ["B", "C", "D", "E"],
  156. dtype="category",
  157. index=pd.CategoricalIndex(["b", "c", "d", "e"]),
  158. )
  159. c = Series(["B", "C", "D", "E"], index=Index(["b", "c", "d", "e"]))
  160. exp = Series(
  161. pd.Categorical([np.nan, "B", "C", "D"], categories=["B", "C", "D", "E"])
  162. )
  163. tm.assert_series_equal(a.map(b), exp)
  164. exp = Series([np.nan, "B", "C", "D"])
  165. tm.assert_series_equal(a.map(c), exp)
  166. @pytest.mark.filterwarnings(r"ignore:Dtype inference:FutureWarning")
  167. def test_map_empty(request, index):
  168. if isinstance(index, MultiIndex):
  169. request.applymarker(
  170. pytest.mark.xfail(
  171. reason="Initializing a Series from a MultiIndex is not supported"
  172. )
  173. )
  174. s = Series(index)
  175. result = s.map({})
  176. expected = Series(np.nan, index=s.index)
  177. tm.assert_series_equal(result, expected)
  178. def test_map_compat():
  179. # related GH 8024
  180. s = Series([True, True, False], index=[1, 2, 3])
  181. result = s.map({True: "foo", False: "bar"})
  182. expected = Series(["foo", "foo", "bar"], index=[1, 2, 3])
  183. tm.assert_series_equal(result, expected)
  184. def test_map_int():
  185. left = Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4})
  186. right = Series({1: 11, 2: 22, 3: 33})
  187. assert left.dtype == np.float64
  188. assert issubclass(right.dtype.type, np.integer)
  189. merged = left.map(right)
  190. assert merged.dtype == np.float64
  191. assert isna(merged["d"])
  192. assert not isna(merged["c"])
  193. def test_map_type_inference():
  194. s = Series(range(3))
  195. s2 = s.map(lambda x: np.where(x == 0, 0, 1))
  196. assert issubclass(s2.dtype.type, np.integer)
  197. def test_map_decimal(string_series):
  198. result = string_series.map(lambda x: Decimal(str(x)))
  199. assert result.dtype == np.object_
  200. assert isinstance(result.iloc[0], Decimal)
  201. def test_map_na_exclusion():
  202. s = Series([1.5, np.nan, 3, np.nan, 5])
  203. result = s.map(lambda x: x * 2, na_action="ignore")
  204. exp = s * 2
  205. tm.assert_series_equal(result, exp)
  206. def test_map_dict_with_tuple_keys():
  207. """
  208. Due to new MultiIndex-ing behaviour in v0.14.0,
  209. dicts with tuple keys passed to map were being
  210. converted to a multi-index, preventing tuple values
  211. from being mapped properly.
  212. """
  213. # GH 18496
  214. df = DataFrame({"a": [(1,), (2,), (3, 4), (5, 6)]})
  215. label_mappings = {(1,): "A", (2,): "B", (3, 4): "A", (5, 6): "B"}
  216. df["labels"] = df["a"].map(label_mappings)
  217. df["expected_labels"] = Series(["A", "B", "A", "B"], index=df.index)
  218. # All labels should be filled now
  219. tm.assert_series_equal(df["labels"], df["expected_labels"], check_names=False)
  220. def test_map_counter():
  221. s = Series(["a", "b", "c"], index=[1, 2, 3])
  222. counter = Counter()
  223. counter["b"] = 5
  224. counter["c"] += 1
  225. result = s.map(counter)
  226. expected = Series([0, 5, 1], index=[1, 2, 3])
  227. tm.assert_series_equal(result, expected)
  228. def test_map_defaultdict():
  229. s = Series([1, 2, 3], index=["a", "b", "c"])
  230. default_dict = defaultdict(lambda: "blank")
  231. default_dict[1] = "stuff"
  232. result = s.map(default_dict)
  233. expected = Series(["stuff", "blank", "blank"], index=["a", "b", "c"])
  234. tm.assert_series_equal(result, expected)
  235. def test_map_dict_na_key():
  236. # https://github.com/pandas-dev/pandas/issues/17648
  237. # Checks that np.nan key is appropriately mapped
  238. s = Series([1, 2, np.nan])
  239. expected = Series(["a", "b", "c"])
  240. result = s.map({1: "a", 2: "b", np.nan: "c"})
  241. tm.assert_series_equal(result, expected)
  242. @pytest.mark.parametrize("na_action", [None, "ignore"])
  243. def test_map_defaultdict_na_key(na_action):
  244. # GH 48813
  245. s = Series([1, 2, np.nan])
  246. default_map = defaultdict(lambda: "missing", {1: "a", 2: "b", np.nan: "c"})
  247. result = s.map(default_map, na_action=na_action)
  248. expected = Series({0: "a", 1: "b", 2: "c" if na_action is None else np.nan})
  249. tm.assert_series_equal(result, expected)
  250. @pytest.mark.parametrize("na_action", [None, "ignore"])
  251. def test_map_defaultdict_missing_key(na_action):
  252. # GH 48813
  253. s = Series([1, 2, np.nan])
  254. default_map = defaultdict(lambda: "missing", {1: "a", 2: "b", 3: "c"})
  255. result = s.map(default_map, na_action=na_action)
  256. expected = Series({0: "a", 1: "b", 2: "missing" if na_action is None else np.nan})
  257. tm.assert_series_equal(result, expected)
  258. @pytest.mark.parametrize("na_action", [None, "ignore"])
  259. def test_map_defaultdict_unmutated(na_action):
  260. # GH 48813
  261. s = Series([1, 2, np.nan])
  262. default_map = defaultdict(lambda: "missing", {1: "a", 2: "b", np.nan: "c"})
  263. expected_default_map = default_map.copy()
  264. s.map(default_map, na_action=na_action)
  265. assert default_map == expected_default_map
  266. @pytest.mark.parametrize("arg_func", [dict, Series])
  267. def test_map_dict_ignore_na(arg_func):
  268. # GH#47527
  269. mapping = arg_func({1: 10, np.nan: 42})
  270. ser = Series([1, np.nan, 2])
  271. result = ser.map(mapping, na_action="ignore")
  272. expected = Series([10, np.nan, np.nan])
  273. tm.assert_series_equal(result, expected)
  274. def test_map_defaultdict_ignore_na():
  275. # GH#47527
  276. mapping = defaultdict(int, {1: 10, np.nan: 42})
  277. ser = Series([1, np.nan, 2])
  278. result = ser.map(mapping)
  279. expected = Series([10, 42, 0])
  280. tm.assert_series_equal(result, expected)
  281. @pytest.mark.parametrize(
  282. "na_action, expected",
  283. [(None, Series([10.0, 42.0, np.nan])), ("ignore", Series([10, np.nan, np.nan]))],
  284. )
  285. def test_map_categorical_na_ignore(na_action, expected):
  286. # GH#47527
  287. values = pd.Categorical([1, np.nan, 2], categories=[10, 1, 2])
  288. ser = Series(values)
  289. result = ser.map({1: 10, np.nan: 42}, na_action=na_action)
  290. tm.assert_series_equal(result, expected)
  291. def test_map_dict_subclass_with_missing():
  292. """
  293. Test Series.map with a dictionary subclass that defines __missing__,
  294. i.e. sets a default value (GH #15999).
  295. """
  296. class DictWithMissing(dict):
  297. def __missing__(self, key):
  298. return "missing"
  299. s = Series([1, 2, 3])
  300. dictionary = DictWithMissing({3: "three"})
  301. result = s.map(dictionary)
  302. expected = Series(["missing", "missing", "three"])
  303. tm.assert_series_equal(result, expected)
  304. def test_map_dict_subclass_without_missing():
  305. class DictWithoutMissing(dict):
  306. pass
  307. s = Series([1, 2, 3])
  308. dictionary = DictWithoutMissing({3: "three"})
  309. result = s.map(dictionary)
  310. expected = Series([np.nan, np.nan, "three"])
  311. tm.assert_series_equal(result, expected)
  312. def test_map_abc_mapping(non_dict_mapping_subclass):
  313. # https://github.com/pandas-dev/pandas/issues/29733
  314. # Check collections.abc.Mapping support as mapper for Series.map
  315. s = Series([1, 2, 3])
  316. not_a_dictionary = non_dict_mapping_subclass({3: "three"})
  317. result = s.map(not_a_dictionary)
  318. expected = Series([np.nan, np.nan, "three"])
  319. tm.assert_series_equal(result, expected)
  320. def test_map_abc_mapping_with_missing(non_dict_mapping_subclass):
  321. # https://github.com/pandas-dev/pandas/issues/29733
  322. # Check collections.abc.Mapping support as mapper for Series.map
  323. class NonDictMappingWithMissing(non_dict_mapping_subclass):
  324. def __missing__(self, key):
  325. return "missing"
  326. s = Series([1, 2, 3])
  327. not_a_dictionary = NonDictMappingWithMissing({3: "three"})
  328. result = s.map(not_a_dictionary)
  329. # __missing__ is a dict concept, not a Mapping concept,
  330. # so it should not change the result!
  331. expected = Series([np.nan, np.nan, "three"])
  332. tm.assert_series_equal(result, expected)
  333. def test_map_box_dt64(unit):
  334. vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]
  335. ser = Series(vals).dt.as_unit(unit)
  336. assert ser.dtype == f"datetime64[{unit}]"
  337. # boxed value must be Timestamp instance
  338. res = ser.map(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}")
  339. exp = Series(["Timestamp_1_None", "Timestamp_2_None"])
  340. tm.assert_series_equal(res, exp)
  341. def test_map_box_dt64tz(unit):
  342. vals = [
  343. pd.Timestamp("2011-01-01", tz="US/Eastern"),
  344. pd.Timestamp("2011-01-02", tz="US/Eastern"),
  345. ]
  346. ser = Series(vals).dt.as_unit(unit)
  347. assert ser.dtype == f"datetime64[{unit}, US/Eastern]"
  348. res = ser.map(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}")
  349. exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"])
  350. tm.assert_series_equal(res, exp)
  351. def test_map_box_td64(unit):
  352. # timedelta
  353. vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")]
  354. ser = Series(vals).dt.as_unit(unit)
  355. assert ser.dtype == f"timedelta64[{unit}]"
  356. res = ser.map(lambda x: f"{type(x).__name__}_{x.days}")
  357. exp = Series(["Timedelta_1", "Timedelta_2"])
  358. tm.assert_series_equal(res, exp)
  359. def test_map_box_period():
  360. # period
  361. vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")]
  362. ser = Series(vals)
  363. assert ser.dtype == "Period[M]"
  364. res = ser.map(lambda x: f"{type(x).__name__}_{x.freqstr}")
  365. exp = Series(["Period_M", "Period_M"])
  366. tm.assert_series_equal(res, exp)
  367. @pytest.mark.parametrize("na_action", [None, "ignore"])
  368. def test_map_categorical(na_action, using_infer_string):
  369. values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
  370. s = Series(values, name="XX", index=list("abcdefg"))
  371. result = s.map(lambda x: x.lower(), na_action=na_action)
  372. exp_values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True)
  373. exp = Series(exp_values, name="XX", index=list("abcdefg"))
  374. tm.assert_series_equal(result, exp)
  375. tm.assert_categorical_equal(result.values, exp_values)
  376. result = s.map(lambda x: "A", na_action=na_action)
  377. exp = Series(["A"] * 7, name="XX", index=list("abcdefg"))
  378. tm.assert_series_equal(result, exp)
  379. assert result.dtype == object if not using_infer_string else "str"
  380. @pytest.mark.parametrize(
  381. "na_action, expected",
  382. (
  383. [None, Series(["A", "B", "nan"], name="XX")],
  384. [
  385. "ignore",
  386. Series(
  387. ["A", "B", np.nan],
  388. name="XX",
  389. dtype=pd.CategoricalDtype(list("DCBA"), True),
  390. ),
  391. ],
  392. ),
  393. )
  394. def test_map_categorical_na_action(na_action, expected):
  395. dtype = pd.CategoricalDtype(list("DCBA"), ordered=True)
  396. values = pd.Categorical(list("AB") + [np.nan], dtype=dtype)
  397. s = Series(values, name="XX")
  398. result = s.map(str, na_action=na_action)
  399. tm.assert_series_equal(result, expected)
  400. def test_map_datetimetz():
  401. values = date_range("2011-01-01", "2011-01-02", freq="h").tz_localize("Asia/Tokyo")
  402. s = Series(values, name="XX")
  403. # keep tz
  404. result = s.map(lambda x: x + pd.offsets.Day())
  405. exp_values = date_range("2011-01-02", "2011-01-03", freq="h").tz_localize(
  406. "Asia/Tokyo"
  407. )
  408. exp = Series(exp_values, name="XX")
  409. tm.assert_series_equal(result, exp)
  410. result = s.map(lambda x: x.hour)
  411. exp = Series(list(range(24)) + [0], name="XX", dtype=np.int64)
  412. tm.assert_series_equal(result, exp)
  413. # not vectorized
  414. def f(x):
  415. if not isinstance(x, pd.Timestamp):
  416. raise ValueError
  417. return str(x.tz)
  418. result = s.map(f)
  419. exp = Series(["Asia/Tokyo"] * 25, name="XX")
  420. tm.assert_series_equal(result, exp)
  421. @pytest.mark.parametrize(
  422. "vals,mapping,exp",
  423. [
  424. (list("abc"), {np.nan: "not NaN"}, [np.nan] * 3 + ["not NaN"]),
  425. (list("abc"), {"a": "a letter"}, ["a letter"] + [np.nan] * 3),
  426. (list(range(3)), {0: 42}, [42] + [np.nan] * 3),
  427. ],
  428. )
  429. def test_map_missing_mixed(vals, mapping, exp):
  430. # GH20495
  431. s = Series(vals + [np.nan])
  432. result = s.map(mapping)
  433. exp = Series(exp)
  434. tm.assert_series_equal(result, exp)
  435. def test_map_scalar_on_date_time_index_aware_series():
  436. # GH 25959
  437. # Calling map on a localized time series should not cause an error
  438. series = Series(
  439. np.arange(10, dtype=np.float64),
  440. index=date_range("2020-01-01", periods=10, tz="UTC"),
  441. name="ts",
  442. )
  443. result = Series(series.index).map(lambda x: 1)
  444. tm.assert_series_equal(result, Series(np.ones(len(series)), dtype="int64"))
  445. def test_map_float_to_string_precision():
  446. # GH 13228
  447. ser = Series(1 / 3)
  448. result = ser.map(lambda val: str(val)).to_dict()
  449. expected = {0: "0.3333333333333333"}
  450. assert result == expected
  451. def test_map_to_timedelta():
  452. list_of_valid_strings = ["00:00:01", "00:00:02"]
  453. a = pd.to_timedelta(list_of_valid_strings)
  454. b = Series(list_of_valid_strings).map(pd.to_timedelta)
  455. tm.assert_series_equal(Series(a), b)
  456. list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT]
  457. a = pd.to_timedelta(list_of_strings)
  458. ser = Series(list_of_strings)
  459. b = ser.map(pd.to_timedelta)
  460. tm.assert_series_equal(Series(a), b)
  461. def test_map_type():
  462. # GH 46719
  463. s = Series([3, "string", float], index=["a", "b", "c"])
  464. result = s.map(type)
  465. expected = Series([int, str, type], index=["a", "b", "c"])
  466. tm.assert_series_equal(result, expected)