test_join.py 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. import numpy as np
  2. import pytest
  3. from pandas import (
  4. DataFrame,
  5. Index,
  6. Interval,
  7. MultiIndex,
  8. Series,
  9. StringDtype,
  10. )
  11. import pandas._testing as tm
  12. @pytest.mark.parametrize(
  13. "other", [Index(["three", "one", "two"]), Index(["one"]), Index(["one", "three"])]
  14. )
  15. def test_join_level(idx, other, join_type):
  16. join_index, lidx, ridx = other.join(
  17. idx, how=join_type, level="second", return_indexers=True
  18. )
  19. exp_level = other.join(idx.levels[1], how=join_type)
  20. assert join_index.levels[0].equals(idx.levels[0])
  21. assert join_index.levels[1].equals(exp_level)
  22. # pare down levels
  23. mask = np.array([x[1] in exp_level for x in idx], dtype=bool)
  24. exp_values = idx.values[mask]
  25. tm.assert_numpy_array_equal(join_index.values, exp_values)
  26. if join_type in ("outer", "inner"):
  27. join_index2, ridx2, lidx2 = idx.join(
  28. other, how=join_type, level="second", return_indexers=True
  29. )
  30. assert join_index.equals(join_index2)
  31. tm.assert_numpy_array_equal(lidx, lidx2)
  32. tm.assert_numpy_array_equal(ridx, ridx2)
  33. tm.assert_numpy_array_equal(join_index2.values, exp_values)
  34. def test_join_level_corner_case(idx):
  35. # some corner cases
  36. index = Index(["three", "one", "two"])
  37. result = index.join(idx, level="second")
  38. assert isinstance(result, MultiIndex)
  39. with pytest.raises(TypeError, match="Join.*MultiIndex.*ambiguous"):
  40. idx.join(idx, level=1)
  41. def test_join_self(idx, join_type):
  42. result = idx.join(idx, how=join_type)
  43. expected = idx
  44. if join_type == "outer":
  45. expected = expected.sort_values()
  46. tm.assert_index_equal(result, expected)
  47. def test_join_multi():
  48. # GH 10665
  49. midx = MultiIndex.from_product([np.arange(4), np.arange(4)], names=["a", "b"])
  50. idx = Index([1, 2, 5], name="b")
  51. # inner
  52. jidx, lidx, ridx = midx.join(idx, how="inner", return_indexers=True)
  53. exp_idx = MultiIndex.from_product([np.arange(4), [1, 2]], names=["a", "b"])
  54. exp_lidx = np.array([1, 2, 5, 6, 9, 10, 13, 14], dtype=np.intp)
  55. exp_ridx = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.intp)
  56. tm.assert_index_equal(jidx, exp_idx)
  57. tm.assert_numpy_array_equal(lidx, exp_lidx)
  58. tm.assert_numpy_array_equal(ridx, exp_ridx)
  59. # flip
  60. jidx, ridx, lidx = idx.join(midx, how="inner", return_indexers=True)
  61. tm.assert_index_equal(jidx, exp_idx)
  62. tm.assert_numpy_array_equal(lidx, exp_lidx)
  63. tm.assert_numpy_array_equal(ridx, exp_ridx)
  64. # keep MultiIndex
  65. jidx, lidx, ridx = midx.join(idx, how="left", return_indexers=True)
  66. exp_ridx = np.array(
  67. [-1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1], dtype=np.intp
  68. )
  69. tm.assert_index_equal(jidx, midx)
  70. assert lidx is None
  71. tm.assert_numpy_array_equal(ridx, exp_ridx)
  72. # flip
  73. jidx, ridx, lidx = idx.join(midx, how="right", return_indexers=True)
  74. tm.assert_index_equal(jidx, midx)
  75. assert lidx is None
  76. tm.assert_numpy_array_equal(ridx, exp_ridx)
  77. def test_join_multi_wrong_order():
  78. # GH 25760
  79. # GH 28956
  80. midx1 = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
  81. midx2 = MultiIndex.from_product([[1, 2], [3, 4]], names=["b", "a"])
  82. join_idx, lidx, ridx = midx1.join(midx2, return_indexers=True)
  83. exp_ridx = np.array([-1, -1, -1, -1], dtype=np.intp)
  84. tm.assert_index_equal(midx1, join_idx)
  85. assert lidx is None
  86. tm.assert_numpy_array_equal(ridx, exp_ridx)
  87. def test_join_multi_return_indexers():
  88. # GH 34074
  89. midx1 = MultiIndex.from_product([[1, 2], [3, 4], [5, 6]], names=["a", "b", "c"])
  90. midx2 = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
  91. result = midx1.join(midx2, return_indexers=False)
  92. tm.assert_index_equal(result, midx1)
  93. def test_join_overlapping_interval_level():
  94. # GH 44096
  95. idx_1 = MultiIndex.from_tuples(
  96. [
  97. (1, Interval(0.0, 1.0)),
  98. (1, Interval(1.0, 2.0)),
  99. (1, Interval(2.0, 5.0)),
  100. (2, Interval(0.0, 1.0)),
  101. (2, Interval(1.0, 3.0)), # interval limit is here at 3.0, not at 2.0
  102. (2, Interval(3.0, 5.0)),
  103. ],
  104. names=["num", "interval"],
  105. )
  106. idx_2 = MultiIndex.from_tuples(
  107. [
  108. (1, Interval(2.0, 5.0)),
  109. (1, Interval(0.0, 1.0)),
  110. (1, Interval(1.0, 2.0)),
  111. (2, Interval(3.0, 5.0)),
  112. (2, Interval(0.0, 1.0)),
  113. (2, Interval(1.0, 3.0)),
  114. ],
  115. names=["num", "interval"],
  116. )
  117. expected = MultiIndex.from_tuples(
  118. [
  119. (1, Interval(0.0, 1.0)),
  120. (1, Interval(1.0, 2.0)),
  121. (1, Interval(2.0, 5.0)),
  122. (2, Interval(0.0, 1.0)),
  123. (2, Interval(1.0, 3.0)),
  124. (2, Interval(3.0, 5.0)),
  125. ],
  126. names=["num", "interval"],
  127. )
  128. result = idx_1.join(idx_2, how="outer")
  129. tm.assert_index_equal(result, expected)
  130. def test_join_midx_ea():
  131. # GH#49277
  132. midx = MultiIndex.from_arrays(
  133. [Series([1, 1, 3], dtype="Int64"), Series([1, 2, 3], dtype="Int64")],
  134. names=["a", "b"],
  135. )
  136. midx2 = MultiIndex.from_arrays(
  137. [Series([1], dtype="Int64"), Series([3], dtype="Int64")], names=["a", "c"]
  138. )
  139. result = midx.join(midx2, how="inner")
  140. expected = MultiIndex.from_arrays(
  141. [
  142. Series([1, 1], dtype="Int64"),
  143. Series([1, 2], dtype="Int64"),
  144. Series([3, 3], dtype="Int64"),
  145. ],
  146. names=["a", "b", "c"],
  147. )
  148. tm.assert_index_equal(result, expected)
  149. def test_join_midx_string():
  150. # GH#49277
  151. midx = MultiIndex.from_arrays(
  152. [
  153. Series(["a", "a", "c"], dtype=StringDtype()),
  154. Series(["a", "b", "c"], dtype=StringDtype()),
  155. ],
  156. names=["a", "b"],
  157. )
  158. midx2 = MultiIndex.from_arrays(
  159. [Series(["a"], dtype=StringDtype()), Series(["c"], dtype=StringDtype())],
  160. names=["a", "c"],
  161. )
  162. result = midx.join(midx2, how="inner")
  163. expected = MultiIndex.from_arrays(
  164. [
  165. Series(["a", "a"], dtype=StringDtype()),
  166. Series(["a", "b"], dtype=StringDtype()),
  167. Series(["c", "c"], dtype=StringDtype()),
  168. ],
  169. names=["a", "b", "c"],
  170. )
  171. tm.assert_index_equal(result, expected)
  172. def test_join_multi_with_nan():
  173. # GH29252
  174. df1 = DataFrame(
  175. data={"col1": [1.1, 1.2]},
  176. index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]),
  177. )
  178. df2 = DataFrame(
  179. data={"col2": [2.1, 2.2]},
  180. index=MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"]),
  181. )
  182. result = df1.join(df2)
  183. expected = DataFrame(
  184. data={"col1": [1.1, 1.2], "col2": [np.nan, 2.2]},
  185. index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]),
  186. )
  187. tm.assert_frame_equal(result, expected)
  188. @pytest.mark.parametrize("val", [0, 5])
  189. def test_join_dtypes(any_numeric_ea_dtype, val):
  190. # GH#49830
  191. midx = MultiIndex.from_arrays([Series([1, 2], dtype=any_numeric_ea_dtype), [3, 4]])
  192. midx2 = MultiIndex.from_arrays(
  193. [Series([1, val, val], dtype=any_numeric_ea_dtype), [3, 4, 4]]
  194. )
  195. result = midx.join(midx2, how="outer")
  196. expected = MultiIndex.from_arrays(
  197. [Series([val, val, 1, 2], dtype=any_numeric_ea_dtype), [4, 4, 3, 4]]
  198. ).sort_values()
  199. tm.assert_index_equal(result, expected)
  200. def test_join_dtypes_all_nan(any_numeric_ea_dtype):
  201. # GH#49830
  202. midx = MultiIndex.from_arrays(
  203. [Series([1, 2], dtype=any_numeric_ea_dtype), [np.nan, np.nan]]
  204. )
  205. midx2 = MultiIndex.from_arrays(
  206. [Series([1, 0, 0], dtype=any_numeric_ea_dtype), [np.nan, np.nan, np.nan]]
  207. )
  208. result = midx.join(midx2, how="outer")
  209. expected = MultiIndex.from_arrays(
  210. [
  211. Series([0, 0, 1, 2], dtype=any_numeric_ea_dtype),
  212. [np.nan, np.nan, np.nan, np.nan],
  213. ]
  214. )
  215. tm.assert_index_equal(result, expected)
  216. def test_join_index_levels():
  217. # GH#53093
  218. midx = midx = MultiIndex.from_tuples([("a", "2019-02-01"), ("a", "2019-02-01")])
  219. midx2 = MultiIndex.from_tuples([("a", "2019-01-31")])
  220. result = midx.join(midx2, how="outer")
  221. expected = MultiIndex.from_tuples(
  222. [("a", "2019-01-31"), ("a", "2019-02-01"), ("a", "2019-02-01")]
  223. )
  224. tm.assert_index_equal(result.levels[1], expected.levels[1])
  225. tm.assert_index_equal(result, expected)