test_missing.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. DataFrame,
  6. Index,
  7. date_range,
  8. )
  9. import pandas._testing as tm
  @pytest.mark.parametrize("func", ["ffill", "bfill"])
  def test_groupby_column_index_name_lost_fill_funcs(func):
      """Check the columns' name survives a grouped fill (GH 29764).

      The frame's column index is named ``"idx"``; after selecting ``a`` and
      ``b`` from the groupby and applying ``func``, the resulting columns must
      still carry that name.
      """
      named_columns = Index(["type", "a", "b"], name="idx")
      rows = [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]]
      frame = DataFrame(rows, columns=named_columns)

      fill = getattr(frame.groupby(["type"])[["a", "b"]], func)
      filled_columns = fill().columns

      tm.assert_index_equal(filled_columns, Index(["a", "b"], name="idx"))
  @pytest.mark.parametrize("func", ["ffill", "bfill"])
  def test_groupby_fill_duplicate_column_names(func):
      """Grouped fill on a frame with duplicated column labels (GH 25610).

      Concatenating the two frames side by side yields two ``field1`` columns;
      the issue reference records that this case used to raise ``ValueError``.
      """
      left = DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]})
      right = DataFrame({"field1": [1, np.nan, 4]})
      combined = pd.concat([left, right], axis=1)

      result = getattr(combined.groupby(by=["field2"]), func)()

      # The grouping column is absent from the expected output; both
      # "field1" columns are expected, with the NaN left unfilled.
      expected = DataFrame(
          [[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"]
      )
      tm.assert_frame_equal(result, expected)
  def test_ffill_missing_arguments():
      """Grouped ``fillna()`` without any argument must raise (GH 14955)."""
      frame = DataFrame({"a": [1, 2], "b": [1, 1]})

      expect_deprecation = tm.assert_produces_warning(
          FutureWarning, match="DataFrameGroupBy.fillna is deprecated"
      )
      expect_error = pytest.raises(ValueError, match="Must specify a fill")

      # Both must hold for the same call: the deprecation warning is emitted
      # and the call then fails with the ValueError.
      with expect_deprecation, expect_error:
          frame.groupby("b").fillna()
  @pytest.mark.parametrize(
      "method, expected", [("ffill", [None, "a", "a"]), ("bfill", ["a", "a", None])]
  )
  def test_fillna_with_string_dtype(method, expected):
      """Grouped ``fillna(method=...)`` on a ``string``-dtype column (GH 40250).

      All rows share one group key, so the fill runs over the whole column and
      the result is compared against a ``string``-dtype frame built from
      ``expected``.
      """
      values = pd.array([None, "a", None], dtype="string")
      grouped = DataFrame({"a": values, "b": [0, 0, 0]}).groupby("b")

      with tm.assert_produces_warning(
          FutureWarning, match="DataFrameGroupBy.fillna is deprecated"
      ):
          result = grouped.fillna(method=method)

      expected_frame = DataFrame({"a": pd.array(expected, dtype="string")})
      tm.assert_frame_equal(result, expected_frame)
  51. def test_fill_consistency():
  52. # GH9221
  53. # pass thru keyword arguments to the generated wrapper
  54. # are set if the passed kw is None (only)
  55. df = DataFrame(
  56. index=pd.MultiIndex.from_product(
  57. [["value1", "value2"], date_range("2014-01-01", "2014-01-06")]
  58. ),
  59. columns=Index(["1", "2"], name="id"),
  60. )
  61. df["1"] = [
  62. np.nan,
  63. 1,
  64. np.nan,
  65. np.nan,
  66. 11,
  67. np.nan,
  68. np.nan,
  69. 2,
  70. np.nan,
  71. np.nan,
  72. 22,
  73. np.nan,
  74. ]
  75. df["2"] = [
  76. np.nan,
  77. 3,
  78. np.nan,
  79. np.nan,
  80. 33,
  81. np.nan,
  82. np.nan,
  83. 4,
  84. np.nan,
  85. np.nan,
  86. 44,
  87. np.nan,
  88. ]
  89. msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
  90. with tm.assert_produces_warning(FutureWarning, match=msg):
  91. expected = df.groupby(level=0, axis=0).fillna(method="ffill")
  92. msg = "DataFrame.groupby with axis=1 is deprecated"
  93. with tm.assert_produces_warning(FutureWarning, match=msg):
  94. result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T
  95. tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("method", ["ffill", "bfill"])
  @pytest.mark.parametrize("dropna", [True, False])
  @pytest.mark.parametrize("has_nan_group", [True, False])
  def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
      """Grouped ``ffill``/``bfill`` when the second group key may be NaN (GH 34725).

      Input and expected frames are both described as row positions into a
      two-row base frame. ``-1`` is not one of the base frame's labels, so it
      stands for a row of missing values after ``reindex``.
      """
      base = DataFrame([(1, 0.1), (2, 0.2)])
      second_key = np.nan if has_nan_group else "b"

      df = base.reindex([-1, 0, -1, -1, 1, -1]).reset_index(drop=True)
      df["group_col"] = pd.Series(["a"] * 3 + [second_key] * 3)

      fill = getattr(df.groupby(by="group_col", dropna=dropna), method)
      result = fill(limit=None)

      # Expected row positions, keyed by (method, dropna, has_nan_group).
      expected_positions = {
          ("ffill", True, True): [-1, 0, 0, -1, -1, -1],
          ("ffill", True, False): [-1, 0, 0, -1, 1, 1],
          ("ffill", False, True): [-1, 0, 0, -1, 1, 1],
          ("ffill", False, False): [-1, 0, 0, -1, 1, 1],
          ("bfill", True, True): [0, 0, -1, -1, -1, -1],
          ("bfill", True, False): [0, 0, -1, 1, 1, -1],
          ("bfill", False, True): [0, 0, -1, 1, 1, -1],
          ("bfill", False, False): [0, 0, -1, 1, 1, -1],
      }
      positions = expected_positions.get((method, dropna, has_nan_group))
      expected = base.reindex(positions).reset_index(drop=True)
      # columns are a 'take' on df.columns, which are object dtype
      expected.columns = expected.columns.astype(object)

      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)])
  @pytest.mark.parametrize("func", ["first", "last", "max", "min"])
  def test_min_count(func, min_count, value):
      """Grouped first/last/max/min with a ``min_count`` argument (GH#37821).

      Column ``b`` holds a single non-missing value and ``c`` holds none;
      ``value`` is the result expected for ``b`` under the given ``min_count``.
      """
      data = {"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3}
      reducer = getattr(DataFrame(data).groupby("a"), func)

      result = reducer(min_count=min_count)

      group_index = Index([1], name="a")
      expected = DataFrame({"b": [value], "c": [np.nan]}, index=group_index)
      tm.assert_frame_equal(result, expected)
  def test_indices_with_missing():
      """``GroupBy.indices`` for keys that include a missing value (GH 9304).

      The third row has NaN in grouping column ``a``; the expected mapping
      contains only the two fully-populated key tuples, each mapped to the
      positional index of its single row.
      """
      df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
      result = df.groupby(["a", "b"]).indices
      expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}

      # A plain ``result == expected`` calls ``bool()`` on each elementwise
      # ndarray comparison, which is only defined for single-element arrays:
      # a length mismatch would surface as an ambiguous-truth ValueError
      # rather than a failed assertion. Compare keys and positions explicitly.
      assert set(result) == set(expected)
      for key, positions in expected.items():
          assert np.array_equal(result[key], positions)