# test_pairwise.py (16 KB)
  1. import numpy as np
  2. import pytest
  3. from pandas.compat import IS64
  4. from pandas import (
  5. DataFrame,
  6. Index,
  7. MultiIndex,
  8. Series,
  9. date_range,
  10. )
  11. import pandas._testing as tm
  12. from pandas.core.algorithms import safe_sort
  13. @pytest.fixture(
  14. params=[
  15. DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]),
  16. DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]),
  17. DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", "C"]),
  18. DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1.0, 0]),
  19. DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0.0, 1]),
  20. DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", 1]),
  21. DataFrame([[2.0, 4.0], [1.0, 2.0], [5.0, 2.0], [8.0, 1.0]], columns=[1, 0.0]),
  22. DataFrame([[2, 4.0], [1, 2.0], [5, 2.0], [8, 1.0]], columns=[0, 1.0]),
  23. DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.0]], columns=[1.0, "X"]),
  24. ]
  25. )
  26. def pairwise_frames(request):
  27. """Pairwise frames test_pairwise"""
  28. return request.param
  29. @pytest.fixture
  30. def pairwise_target_frame():
  31. """Pairwise target frame for test_pairwise"""
  32. return DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1])
  33. @pytest.fixture
  34. def pairwise_other_frame():
  35. """Pairwise other frame for test_pairwise"""
  36. return DataFrame(
  37. [[None, 1, 1], [None, 1, 2], [None, 3, 2], [None, 8, 1]],
  38. columns=["Y", "Z", "X"],
  39. )
  40. def test_rolling_cov(series):
  41. A = series
  42. B = A + np.random.default_rng(2).standard_normal(len(A))
  43. result = A.rolling(window=50, min_periods=25).cov(B)
  44. tm.assert_almost_equal(result.iloc[-1], np.cov(A[-50:], B[-50:])[0, 1])
  45. def test_rolling_corr(series):
  46. A = series
  47. B = A + np.random.default_rng(2).standard_normal(len(A))
  48. result = A.rolling(window=50, min_periods=25).corr(B)
  49. tm.assert_almost_equal(result.iloc[-1], np.corrcoef(A[-50:], B[-50:])[0, 1])
  50. def test_rolling_corr_bias_correction():
  51. # test for correct bias correction
  52. a = Series(
  53. np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20)
  54. )
  55. b = a.copy()
  56. a[:5] = np.nan
  57. b[:10] = np.nan
  58. result = a.rolling(window=len(a), min_periods=1).corr(b)
  59. tm.assert_almost_equal(result.iloc[-1], a.corr(b))
  60. @pytest.mark.parametrize("func", ["cov", "corr"])
  61. def test_rolling_pairwise_cov_corr(func, frame):
  62. result = getattr(frame.rolling(window=10, min_periods=5), func)()
  63. result = result.loc[(slice(None), 1), 5]
  64. result.index = result.index.droplevel(1)
  65. expected = getattr(frame[1].rolling(window=10, min_periods=5), func)(frame[5])
  66. tm.assert_series_equal(result, expected, check_names=False)
  67. @pytest.mark.parametrize("method", ["corr", "cov"])
  68. def test_flex_binary_frame(method, frame):
  69. series = frame[1]
  70. res = getattr(series.rolling(window=10), method)(frame)
  71. res2 = getattr(frame.rolling(window=10), method)(series)
  72. exp = frame.apply(lambda x: getattr(series.rolling(window=10), method)(x))
  73. tm.assert_frame_equal(res, exp)
  74. tm.assert_frame_equal(res2, exp)
  75. frame2 = frame.copy()
  76. frame2 = DataFrame(
  77. np.random.default_rng(2).standard_normal(frame2.shape),
  78. index=frame2.index,
  79. columns=frame2.columns,
  80. )
  81. res3 = getattr(frame.rolling(window=10), method)(frame2)
  82. exp = DataFrame(
  83. {k: getattr(frame[k].rolling(window=10), method)(frame2[k]) for k in frame}
  84. )
  85. tm.assert_frame_equal(res3, exp)
  86. @pytest.mark.parametrize("window", range(7))
  87. def test_rolling_corr_with_zero_variance(window):
  88. # GH 18430
  89. s = Series(np.zeros(20))
  90. other = Series(np.arange(20))
  91. assert s.rolling(window=window).corr(other=other).isna().all()
  92. def test_corr_sanity():
  93. # GH 3155
  94. df = DataFrame(
  95. np.array(
  96. [
  97. [0.87024726, 0.18505595],
  98. [0.64355431, 0.3091617],
  99. [0.92372966, 0.50552513],
  100. [0.00203756, 0.04520709],
  101. [0.84780328, 0.33394331],
  102. [0.78369152, 0.63919667],
  103. ]
  104. )
  105. )
  106. res = df[0].rolling(5, center=True).corr(df[1])
  107. assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res)
  108. df = DataFrame(np.random.default_rng(2).random((30, 2)))
  109. res = df[0].rolling(5, center=True).corr(df[1])
  110. assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res)
  111. def test_rolling_cov_diff_length():
  112. # GH 7512
  113. s1 = Series([1, 2, 3], index=[0, 1, 2])
  114. s2 = Series([1, 3], index=[0, 2])
  115. result = s1.rolling(window=3, min_periods=2).cov(s2)
  116. expected = Series([None, None, 2.0])
  117. tm.assert_series_equal(result, expected)
  118. s2a = Series([1, None, 3], index=[0, 1, 2])
  119. result = s1.rolling(window=3, min_periods=2).cov(s2a)
  120. tm.assert_series_equal(result, expected)
  121. def test_rolling_corr_diff_length():
  122. # GH 7512
  123. s1 = Series([1, 2, 3], index=[0, 1, 2])
  124. s2 = Series([1, 3], index=[0, 2])
  125. result = s1.rolling(window=3, min_periods=2).corr(s2)
  126. expected = Series([None, None, 1.0])
  127. tm.assert_series_equal(result, expected)
  128. s2a = Series([1, None, 3], index=[0, 1, 2])
  129. result = s1.rolling(window=3, min_periods=2).corr(s2a)
  130. tm.assert_series_equal(result, expected)
  131. @pytest.mark.parametrize(
  132. "f",
  133. [
  134. lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)),
  135. lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)),
  136. ],
  137. )
  138. def test_rolling_functions_window_non_shrinkage_binary(f):
  139. # corr/cov return a MI DataFrame
  140. df = DataFrame(
  141. [[1, 5], [3, 2], [3, 9], [-1, 0]],
  142. columns=Index(["A", "B"], name="foo"),
  143. index=Index(range(4), name="bar"),
  144. )
  145. df_expected = DataFrame(
  146. columns=Index(["A", "B"], name="foo"),
  147. index=MultiIndex.from_product([df.index, df.columns], names=["bar", "foo"]),
  148. dtype="float64",
  149. )
  150. df_result = f(df)
  151. tm.assert_frame_equal(df_result, df_expected)
  152. @pytest.mark.parametrize(
  153. "f",
  154. [
  155. lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)),
  156. lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)),
  157. ],
  158. )
  159. def test_moment_functions_zero_length_pairwise(f):
  160. df1 = DataFrame()
  161. df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar"))
  162. df2["a"] = df2["a"].astype("float64")
  163. df1_expected = DataFrame(index=MultiIndex.from_product([df1.index, df1.columns]))
  164. df2_expected = DataFrame(
  165. index=MultiIndex.from_product([df2.index, df2.columns], names=["bar", "foo"]),
  166. columns=Index(["a"], name="foo"),
  167. dtype="float64",
  168. )
  169. df1_result = f(df1)
  170. tm.assert_frame_equal(df1_result, df1_expected)
  171. df2_result = f(df2)
  172. tm.assert_frame_equal(df2_result, df2_expected)
  173. class TestPairwise:
  174. # GH 7738
  175. @pytest.mark.parametrize("f", [lambda x: x.cov(), lambda x: x.corr()])
  176. def test_no_flex(self, pairwise_frames, pairwise_target_frame, f):
  177. # DataFrame methods (which do not call flex_binary_moment())
  178. result = f(pairwise_frames)
  179. tm.assert_index_equal(result.index, pairwise_frames.columns)
  180. tm.assert_index_equal(result.columns, pairwise_frames.columns)
  181. expected = f(pairwise_target_frame)
  182. # since we have sorted the results
  183. # we can only compare non-nans
  184. result = result.dropna().values
  185. expected = expected.dropna().values
  186. tm.assert_numpy_array_equal(result, expected, check_dtype=False)
  187. @pytest.mark.parametrize(
  188. "f",
  189. [
  190. lambda x: x.expanding().cov(pairwise=True),
  191. lambda x: x.expanding().corr(pairwise=True),
  192. lambda x: x.rolling(window=3).cov(pairwise=True),
  193. lambda x: x.rolling(window=3).corr(pairwise=True),
  194. lambda x: x.ewm(com=3).cov(pairwise=True),
  195. lambda x: x.ewm(com=3).corr(pairwise=True),
  196. ],
  197. )
  198. def test_pairwise_with_self(self, pairwise_frames, pairwise_target_frame, f):
  199. # DataFrame with itself, pairwise=True
  200. # note that we may construct the 1st level of the MI
  201. # in a non-monotonic way, so compare accordingly
  202. result = f(pairwise_frames)
  203. tm.assert_index_equal(
  204. result.index.levels[0], pairwise_frames.index, check_names=False
  205. )
  206. tm.assert_index_equal(
  207. safe_sort(result.index.levels[1]),
  208. safe_sort(pairwise_frames.columns.unique()),
  209. )
  210. tm.assert_index_equal(result.columns, pairwise_frames.columns)
  211. expected = f(pairwise_target_frame)
  212. # since we have sorted the results
  213. # we can only compare non-nans
  214. result = result.dropna().values
  215. expected = expected.dropna().values
  216. tm.assert_numpy_array_equal(result, expected, check_dtype=False)
  217. @pytest.mark.parametrize(
  218. "f",
  219. [
  220. lambda x: x.expanding().cov(pairwise=False),
  221. lambda x: x.expanding().corr(pairwise=False),
  222. lambda x: x.rolling(window=3).cov(pairwise=False),
  223. lambda x: x.rolling(window=3).corr(pairwise=False),
  224. lambda x: x.ewm(com=3).cov(pairwise=False),
  225. lambda x: x.ewm(com=3).corr(pairwise=False),
  226. ],
  227. )
  228. def test_no_pairwise_with_self(self, pairwise_frames, pairwise_target_frame, f):
  229. # DataFrame with itself, pairwise=False
  230. result = f(pairwise_frames)
  231. tm.assert_index_equal(result.index, pairwise_frames.index)
  232. tm.assert_index_equal(result.columns, pairwise_frames.columns)
  233. expected = f(pairwise_target_frame)
  234. # since we have sorted the results
  235. # we can only compare non-nans
  236. result = result.dropna().values
  237. expected = expected.dropna().values
  238. tm.assert_numpy_array_equal(result, expected, check_dtype=False)
  239. @pytest.mark.parametrize(
  240. "f",
  241. [
  242. lambda x, y: x.expanding().cov(y, pairwise=True),
  243. lambda x, y: x.expanding().corr(y, pairwise=True),
  244. lambda x, y: x.rolling(window=3).cov(y, pairwise=True),
  245. # TODO: We're missing a flag somewhere in meson
  246. pytest.param(
  247. lambda x, y: x.rolling(window=3).corr(y, pairwise=True),
  248. marks=pytest.mark.xfail(
  249. not IS64, reason="Precision issues on 32 bit", strict=False
  250. ),
  251. ),
  252. lambda x, y: x.ewm(com=3).cov(y, pairwise=True),
  253. lambda x, y: x.ewm(com=3).corr(y, pairwise=True),
  254. ],
  255. )
  256. def test_pairwise_with_other(
  257. self, pairwise_frames, pairwise_target_frame, pairwise_other_frame, f
  258. ):
  259. # DataFrame with another DataFrame, pairwise=True
  260. result = f(pairwise_frames, pairwise_other_frame)
  261. tm.assert_index_equal(
  262. result.index.levels[0], pairwise_frames.index, check_names=False
  263. )
  264. tm.assert_index_equal(
  265. safe_sort(result.index.levels[1]),
  266. safe_sort(pairwise_other_frame.columns.unique()),
  267. )
  268. expected = f(pairwise_target_frame, pairwise_other_frame)
  269. # since we have sorted the results
  270. # we can only compare non-nans
  271. result = result.dropna().values
  272. expected = expected.dropna().values
  273. tm.assert_numpy_array_equal(result, expected, check_dtype=False)
  274. @pytest.mark.filterwarnings("ignore:RuntimeWarning")
  275. @pytest.mark.parametrize(
  276. "f",
  277. [
  278. lambda x, y: x.expanding().cov(y, pairwise=False),
  279. lambda x, y: x.expanding().corr(y, pairwise=False),
  280. lambda x, y: x.rolling(window=3).cov(y, pairwise=False),
  281. lambda x, y: x.rolling(window=3).corr(y, pairwise=False),
  282. lambda x, y: x.ewm(com=3).cov(y, pairwise=False),
  283. lambda x, y: x.ewm(com=3).corr(y, pairwise=False),
  284. ],
  285. )
  286. def test_no_pairwise_with_other(self, pairwise_frames, pairwise_other_frame, f):
  287. # DataFrame with another DataFrame, pairwise=False
  288. result = (
  289. f(pairwise_frames, pairwise_other_frame)
  290. if pairwise_frames.columns.is_unique
  291. else None
  292. )
  293. if result is not None:
  294. # we can have int and str columns
  295. expected_index = pairwise_frames.index.union(pairwise_other_frame.index)
  296. expected_columns = pairwise_frames.columns.union(
  297. pairwise_other_frame.columns
  298. )
  299. tm.assert_index_equal(result.index, expected_index)
  300. tm.assert_index_equal(result.columns, expected_columns)
  301. else:
  302. with pytest.raises(ValueError, match="'arg1' columns are not unique"):
  303. f(pairwise_frames, pairwise_other_frame)
  304. with pytest.raises(ValueError, match="'arg2' columns are not unique"):
  305. f(pairwise_other_frame, pairwise_frames)
  306. @pytest.mark.parametrize(
  307. "f",
  308. [
  309. lambda x, y: x.expanding().cov(y),
  310. lambda x, y: x.expanding().corr(y),
  311. lambda x, y: x.rolling(window=3).cov(y),
  312. lambda x, y: x.rolling(window=3).corr(y),
  313. lambda x, y: x.ewm(com=3).cov(y),
  314. lambda x, y: x.ewm(com=3).corr(y),
  315. ],
  316. )
  317. def test_pairwise_with_series(self, pairwise_frames, pairwise_target_frame, f):
  318. # DataFrame with a Series
  319. result = f(pairwise_frames, Series([1, 1, 3, 8]))
  320. tm.assert_index_equal(result.index, pairwise_frames.index)
  321. tm.assert_index_equal(result.columns, pairwise_frames.columns)
  322. expected = f(pairwise_target_frame, Series([1, 1, 3, 8]))
  323. # since we have sorted the results
  324. # we can only compare non-nans
  325. result = result.dropna().values
  326. expected = expected.dropna().values
  327. tm.assert_numpy_array_equal(result, expected, check_dtype=False)
  328. result = f(Series([1, 1, 3, 8]), pairwise_frames)
  329. tm.assert_index_equal(result.index, pairwise_frames.index)
  330. tm.assert_index_equal(result.columns, pairwise_frames.columns)
  331. expected = f(Series([1, 1, 3, 8]), pairwise_target_frame)
  332. # since we have sorted the results
  333. # we can only compare non-nans
  334. result = result.dropna().values
  335. expected = expected.dropna().values
  336. tm.assert_numpy_array_equal(result, expected, check_dtype=False)
  337. def test_corr_freq_memory_error(self):
  338. # GH 31789
  339. s = Series(range(5), index=date_range("2020", periods=5))
  340. result = s.rolling("12h").corr(s)
  341. expected = Series([np.nan] * 5, index=date_range("2020", periods=5))
  342. tm.assert_series_equal(result, expected)
  343. def test_cov_mulittindex(self):
  344. # GH 34440
  345. columns = MultiIndex.from_product([list("ab"), list("xy"), list("AB")])
  346. index = range(3)
  347. df = DataFrame(np.arange(24).reshape(3, 8), index=index, columns=columns)
  348. result = df.ewm(alpha=0.1).cov()
  349. index = MultiIndex.from_product([range(3), list("ab"), list("xy"), list("AB")])
  350. columns = MultiIndex.from_product([list("ab"), list("xy"), list("AB")])
  351. expected = DataFrame(
  352. np.vstack(
  353. (
  354. np.full((8, 8), np.nan),
  355. np.full((8, 8), 32.000000),
  356. np.full((8, 8), 63.881919),
  357. )
  358. ),
  359. index=index,
  360. columns=columns,
  361. )
  362. tm.assert_frame_equal(result, expected)
  363. def test_multindex_columns_pairwise_func(self):
  364. # GH 21157
  365. columns = MultiIndex.from_arrays([["M", "N"], ["P", "Q"]], names=["a", "b"])
  366. df = DataFrame(np.ones((5, 2)), columns=columns)
  367. result = df.rolling(3).corr()
  368. expected = DataFrame(
  369. np.nan,
  370. index=MultiIndex.from_arrays(
  371. [
  372. np.repeat(np.arange(5, dtype=np.int64), 2),
  373. ["M", "N"] * 5,
  374. ["P", "Q"] * 5,
  375. ],
  376. names=[None, "a", "b"],
  377. ),
  378. columns=columns,
  379. )
  380. tm.assert_frame_equal(result, expected)