test_explode.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. import pandas._testing as tm
  5. def test_basic():
  6. s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"), name="foo")
  7. result = s.explode()
  8. expected = pd.Series(
  9. [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object, name="foo"
  10. )
  11. tm.assert_series_equal(result, expected)
  12. def test_mixed_type():
  13. s = pd.Series(
  14. [[0, 1, 2], np.nan, None, np.array([]), pd.Series(["a", "b"])], name="foo"
  15. )
  16. result = s.explode()
  17. expected = pd.Series(
  18. [0, 1, 2, np.nan, None, np.nan, "a", "b"],
  19. index=[0, 0, 0, 1, 2, 3, 4, 4],
  20. dtype=object,
  21. name="foo",
  22. )
  23. tm.assert_series_equal(result, expected)
  24. def test_empty():
  25. s = pd.Series(dtype=object)
  26. result = s.explode()
  27. expected = s.copy()
  28. tm.assert_series_equal(result, expected)
  29. def test_nested_lists():
  30. s = pd.Series([[[1, 2, 3]], [1, 2], 1])
  31. result = s.explode()
  32. expected = pd.Series([[1, 2, 3], 1, 2, 1], index=[0, 1, 1, 2])
  33. tm.assert_series_equal(result, expected)
  34. def test_multi_index():
  35. s = pd.Series(
  36. [[0, 1, 2], np.nan, [], (3, 4)],
  37. name="foo",
  38. index=pd.MultiIndex.from_product([list("ab"), range(2)], names=["foo", "bar"]),
  39. )
  40. result = s.explode()
  41. index = pd.MultiIndex.from_tuples(
  42. [("a", 0), ("a", 0), ("a", 0), ("a", 1), ("b", 0), ("b", 1), ("b", 1)],
  43. names=["foo", "bar"],
  44. )
  45. expected = pd.Series(
  46. [0, 1, 2, np.nan, np.nan, 3, 4], index=index, dtype=object, name="foo"
  47. )
  48. tm.assert_series_equal(result, expected)
  49. def test_large():
  50. s = pd.Series([range(256)]).explode()
  51. result = s.explode()
  52. tm.assert_series_equal(result, s)
  53. def test_invert_array():
  54. df = pd.DataFrame({"a": pd.date_range("20190101", periods=3, tz="UTC")})
  55. listify = df.apply(lambda x: x.array, axis=1)
  56. result = listify.explode()
  57. tm.assert_series_equal(result, df["a"].rename())
  58. @pytest.mark.parametrize(
  59. "s", [pd.Series([1, 2, 3]), pd.Series(pd.date_range("2019", periods=3, tz="UTC"))]
  60. )
  61. def test_non_object_dtype(s):
  62. result = s.explode()
  63. tm.assert_series_equal(result, s)
  64. def test_typical_usecase():
  65. df = pd.DataFrame(
  66. [{"var1": "a,b,c", "var2": 1}, {"var1": "d,e,f", "var2": 2}],
  67. columns=["var1", "var2"],
  68. )
  69. exploded = df.var1.str.split(",").explode()
  70. result = df[["var2"]].join(exploded)
  71. expected = pd.DataFrame(
  72. {"var2": [1, 1, 1, 2, 2, 2], "var1": list("abcdef")},
  73. columns=["var2", "var1"],
  74. index=[0, 0, 0, 1, 1, 1],
  75. )
  76. tm.assert_frame_equal(result, expected)
  77. def test_nested_EA():
  78. # a nested EA array
  79. s = pd.Series(
  80. [
  81. pd.date_range("20170101", periods=3, tz="UTC"),
  82. pd.date_range("20170104", periods=3, tz="UTC"),
  83. ]
  84. )
  85. result = s.explode()
  86. expected = pd.Series(
  87. pd.date_range("20170101", periods=6, tz="UTC"), index=[0, 0, 0, 1, 1, 1]
  88. )
  89. tm.assert_series_equal(result, expected)
  90. def test_duplicate_index():
  91. # GH 28005
  92. s = pd.Series([[1, 2], [3, 4]], index=[0, 0])
  93. result = s.explode()
  94. expected = pd.Series([1, 2, 3, 4], index=[0, 0, 0, 0], dtype=object)
  95. tm.assert_series_equal(result, expected)
  96. def test_ignore_index():
  97. # GH 34932
  98. s = pd.Series([[1, 2], [3, 4]])
  99. result = s.explode(ignore_index=True)
  100. expected = pd.Series([1, 2, 3, 4], index=[0, 1, 2, 3], dtype=object)
  101. tm.assert_series_equal(result, expected)
  102. def test_explode_sets():
  103. # https://github.com/pandas-dev/pandas/issues/35614
  104. s = pd.Series([{"a", "b", "c"}], index=[1])
  105. result = s.explode().sort_values()
  106. expected = pd.Series(["a", "b", "c"], index=[1, 1, 1])
  107. tm.assert_series_equal(result, expected)
  108. def test_explode_scalars_can_ignore_index():
  109. # https://github.com/pandas-dev/pandas/issues/40487
  110. s = pd.Series([1, 2, 3], index=["a", "b", "c"])
  111. result = s.explode(ignore_index=True)
  112. expected = pd.Series([1, 2, 3])
  113. tm.assert_series_equal(result, expected)
  114. @pytest.mark.parametrize("ignore_index", [True, False])
  115. def test_explode_pyarrow_list_type(ignore_index):
  116. # GH 53602
  117. pa = pytest.importorskip("pyarrow")
  118. data = [
  119. [None, None],
  120. [1],
  121. [],
  122. [2, 3],
  123. None,
  124. ]
  125. ser = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
  126. result = ser.explode(ignore_index=ignore_index)
  127. expected = pd.Series(
  128. data=[None, None, 1, None, 2, 3, None],
  129. index=None if ignore_index else [0, 0, 1, 2, 3, 3, 4],
  130. dtype=pd.ArrowDtype(pa.int64()),
  131. )
  132. tm.assert_series_equal(result, expected)
  133. @pytest.mark.parametrize("ignore_index", [True, False])
  134. def test_explode_pyarrow_non_list_type(ignore_index):
  135. pa = pytest.importorskip("pyarrow")
  136. data = [1, 2, 3]
  137. ser = pd.Series(data, dtype=pd.ArrowDtype(pa.int64()))
  138. result = ser.explode(ignore_index=ignore_index)
  139. expected = pd.Series([1, 2, 3], dtype="int64[pyarrow]", index=[0, 1, 2])
  140. tm.assert_series_equal(result, expected)
  141. def test_str_dtype():
  142. # https://github.com/pandas-dev/pandas/pull/61623
  143. ser = pd.Series(["x", "y"], dtype="str")
  144. result = ser.explode()
  145. assert result is not ser
  146. tm.assert_series_equal(result, ser)