test_sample.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. import numpy as np
  2. import pytest
  3. from pandas import (
  4. DataFrame,
  5. Index,
  6. Series,
  7. )
  8. import pandas._testing as tm
  9. import pandas.core.common as com
  class TestSample:
      """Tests for ``sample`` that run against both Series and DataFrame.

      Each test gets its object either from the ``obj`` fixture below or by
      building one with the ``frame_or_series`` fixture. That fixture is not
      defined in this file (presumably a conftest fixture that yields the
      ``Series`` and ``DataFrame`` constructors -- confirm there).
      """

      @pytest.fixture
      def obj(self, frame_or_series):
          """Return a 10-element Series or a 10x10 DataFrame of seeded normals."""
          if frame_or_series is Series:
              arr = np.random.default_rng(2).standard_normal(10)
          else:
              arr = np.random.default_rng(2).standard_normal((10, 10))
          return frame_or_series(arr, dtype=None)

      @pytest.mark.parametrize("test", list(range(10)))
      def test_sample(self, test, obj):
          """Sampling twice with an equal ``random_state`` gives equal results."""
          # GH#2419
          # Check behavior of the random_state argument: results must be stable
          # for an integer seed and for identically seeded generators.
          # The integer seed is derived from the run index so the ten
          # parametrized runs do not all reuse one and the same seed.
          seed = np.random.default_rng(test).integers(0, 100)
          tm.assert_equal(
              obj.sample(n=4, random_state=seed), obj.sample(n=4, random_state=seed)
          )
          tm.assert_equal(
              obj.sample(frac=0.7, random_state=seed),
              obj.sample(frac=0.7, random_state=seed),
          )

          # A new Generator is created for every call; passing one shared
          # Generator would advance its state between the two samples.
          tm.assert_equal(
              obj.sample(n=4, random_state=np.random.default_rng(test)),
              obj.sample(n=4, random_state=np.random.default_rng(test)),
          )
          tm.assert_equal(
              obj.sample(frac=0.7, random_state=np.random.default_rng(test)),
              obj.sample(frac=0.7, random_state=np.random.default_rng(test)),
          )
          tm.assert_equal(
              obj.sample(
                  frac=2,
                  replace=True,
                  random_state=np.random.default_rng(test),
              ),
              obj.sample(
                  frac=2,
                  replace=True,
                  random_state=np.random.default_rng(test),
              ),
          )

          # The run index itself is also used directly as an integer seed.
          tm.assert_equal(
              obj.sample(n=4, random_state=test), obj.sample(n=4, random_state=test)
          )
          tm.assert_equal(
              obj.sample(frac=0.7, random_state=test),
              obj.sample(frac=0.7, random_state=test),
          )

      def test_sample_lengths(self, obj):
          """``n`` and ``frac`` select the expected number of entries."""
          # The comparison has to sit outside ``len(...)``. Writing
          # ``len(obj.sample(...) == k)`` measures an element-wise comparison
          # result, which is always non-empty, so the assertion could never fail.
          # ``obj`` has 10 entries along the sampled axis, so the fractions
          # 0.34 and 0.36 are expected to give 3 and 4 entries respectively.
          assert len(obj.sample(n=4)) == 4
          assert len(obj.sample(frac=0.34)) == 3
          assert len(obj.sample(frac=0.36)) == 4

      def test_sample_invalid_random_state(self, obj):
          """An unsupported ``random_state`` value raises ``ValueError``."""
          msg = (
              "random_state must be an integer, array-like, a BitGenerator, Generator, "
              "a numpy RandomState, or None"
          )
          with pytest.raises(ValueError, match=msg):
              obj.sample(random_state="a_string")

      def test_sample_wont_accept_n_and_frac(self, obj):
          """Passing both ``n`` and ``frac`` raises ``ValueError``."""
          msg = "Please enter a value for `frac` OR `n`, not both"
          with pytest.raises(ValueError, match=msg):
              obj.sample(n=3, frac=0.3)

      def test_sample_requires_positive_n_frac(self, obj):
          """Negative ``n`` or ``frac`` raises ``ValueError``."""
          with pytest.raises(
              ValueError,
              match="A negative number of rows requested. Please provide `n` >= 0",
          ):
              obj.sample(n=-3)
          with pytest.raises(
              ValueError,
              match="A negative number of rows requested. Please provide `frac` >= 0",
          ):
              obj.sample(frac=-0.3)

      def test_sample_requires_integer_n(self, obj):
          """A float ``n`` raises ``ValueError``."""
          with pytest.raises(ValueError, match="Only integers accepted as `n` values"):
              obj.sample(n=3.2)

      def test_sample_invalid_weight_lengths(self, obj):
          """Weights whose length does not fit the sampled axis are rejected."""
          msg = "Weights and axis to be sampled must be of same length"
          with pytest.raises(ValueError, match=msg):
              obj.sample(n=3, weights=[0, 1])

          # Inputs are built outside the ``raises`` blocks so that only the
          # ``sample`` call itself can satisfy the expectation.
          bad_weights = [0.5] * 11
          with pytest.raises(ValueError, match=msg):
              obj.sample(n=3, weights=bad_weights)

          bad_weight_series = Series([0, 0, 0.2])
          with pytest.raises(ValueError, match="Fewer non-zero entries in p than size"):
              obj.sample(n=4, weights=bad_weight_series)

      def test_sample_negative_weights(self, obj):
          """Negative weights raise ``ValueError``."""
          bad_weights = [-0.1] * 10
          # NOTE(review): "many" looks like a typo for "may", but this pattern
          # has to match the message pandas raises -- confirm against the
          # library before changing it.
          msg = "weight vector many not include negative values"
          with pytest.raises(ValueError, match=msg):
              obj.sample(n=3, weights=bad_weights)

      def test_sample_inf_weights(self, obj):
          """``inf`` and ``-inf`` weights raise ``ValueError``."""
          msg = "weight vector may not include `inf` values"

          weights_with_inf = [0.1] * 10
          weights_with_inf[0] = np.inf
          with pytest.raises(ValueError, match=msg):
              obj.sample(n=3, weights=weights_with_inf)

          weights_with_ninf = [0.1] * 10
          weights_with_ninf[0] = -np.inf
          with pytest.raises(ValueError, match=msg):
              obj.sample(n=3, weights=weights_with_ninf)

      def test_sample_zero_weights(self, obj):
          """All-zero weights raise ``ValueError``."""
          zero_weights = [0] * 10
          with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"):
              obj.sample(n=3, weights=zero_weights)

      def test_sample_missing_weights(self, obj):
          """All-NaN weights raise the same error as all-zero weights."""
          nan_weights = [np.nan] * 10
          with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"):
              obj.sample(n=3, weights=nan_weights)

      def test_sample_none_weights(self, obj):
          """``None`` weights count as zero, leaving position 5 as the only choice."""
          weights_with_None = [None] * 10
          weights_with_None[5] = 0.5
          tm.assert_equal(
              obj.sample(n=1, axis=0, weights=weights_with_None), obj.iloc[5:6]
          )

      @pytest.mark.parametrize(
          "func_str,arg",
          [
              ("np.array", [2, 3, 1, 0]),
              ("np.random.MT19937", 3),
              ("np.random.PCG64", 11),
          ],
      )
      def test_sample_random_state(self, func_str, arg, frame_or_series):
          """Array and BitGenerator states sample like ``com.random_state`` of them."""
          # GH#32503
          obj = DataFrame({"col1": range(10, 20), "col2": range(20, 30)})
          obj = tm.get_obj(obj, frame_or_series)
          # ``func_str`` only ever holds the hard-coded literals from the
          # parametrize list above, so ``eval`` never sees external input.
          # The state object is rebuilt for each call so that both samples
          # start from the same initial state.
          result = obj.sample(n=3, random_state=eval(func_str)(arg))
          expected = obj.sample(n=3, random_state=com.random_state(eval(func_str)(arg)))
          tm.assert_equal(result, expected)

      def test_sample_generator(self, frame_or_series):
          """A shared Generator advances; equally seeded Generators agree."""
          # GH#38100
          obj = frame_or_series(np.arange(100))
          rng = np.random.default_rng(2)

          # Consecutive calls should advance the seed
          result1 = obj.sample(n=50, random_state=rng)
          result2 = obj.sample(n=50, random_state=rng)
          assert not (result1.index.values == result2.index.values).all()

          # Matching generator initialization must give same result
          result1 = obj.sample(n=50, random_state=np.random.default_rng(11))
          result2 = obj.sample(n=50, random_state=np.random.default_rng(11))
          tm.assert_equal(result1, result2)

      def test_sample_upsampling_without_replacement(self, frame_or_series):
          """``frac > 1`` with ``replace=False`` raises ``ValueError``."""
          # GH#27451
          obj = DataFrame({"A": list("abc")})
          obj = tm.get_obj(obj, frame_or_series)

          msg = (
              "Replace has to be set to `True` when "
              "upsampling the population `frac` > 1."
          )
          with pytest.raises(ValueError, match=msg):
              obj.sample(frac=2, replace=False)
  class TestSampleDataFrame:
      """``sample`` tests that are relevant only for DataFrame.

      These take no frame-or-series parametrization; they are as fully
      parametrized as they can get.
      """

      def test_sample(self):
          """Degenerate weights, string weights and the ``axis`` argument."""
          # GH#2419
          # All weight sits on position 5, so only that row can be drawn.
          one_hot = [1 if pos == 5 else 0 for pos in range(10)]
          frame = DataFrame(
              {
                  "col1": range(10, 20),
                  "col2": range(20, 30),
                  "colString": ["a"] * 10,
                  "easyweights": one_hot,
              }
          )
          drawn = frame.sample(n=1, weights="easyweights")
          tm.assert_frame_equal(drawn, frame.iloc[5:6])

          # A string weight is an error for a Series and for a DataFrame
          # sampled along axis=1.
          series = Series(range(10))
          with pytest.raises(
              ValueError,
              match="Strings cannot be passed as weights when sampling from a Series.",
          ):
              series.sample(n=3, weights="weight_column")

          rows_only_msg = (
              "Strings can only be passed to weights when sampling from rows on a "
              "DataFrame"
          )
          with pytest.raises(ValueError, match=rows_only_msg):
              frame.sample(n=1, weights="weight_column", axis=1)

          # A string that names no column is reported as a KeyError.
          missing_col_msg = "'String passed to weights not a valid column'"
          with pytest.raises(KeyError, match=missing_col_msg):
              frame.sample(n=3, weights="not_a_real_column_name")

          # Weights that do not sum to one are re-normalized.
          first_row_only = [0.5] + [0] * 9
          tm.assert_frame_equal(
              frame.sample(n=1, weights=first_row_only), frame.iloc[:1]
          )

          ###
          # Test axis argument
          ###
          frame = DataFrame({"col1": range(10), "col2": ["a"] * 10})
          col2_only = [0, 1]
          # The integer and the string spelling of the column axis.
          for col_axis in (1, "columns"):
              tm.assert_frame_equal(
                  frame.sample(n=1, axis=col_axis, weights=col2_only),
                  frame[["col2"]],
              )

          row5_only = [0] * 5 + [0.5] + [0] * 4
          # Both string spellings of the row axis.
          for row_axis in ("rows", "index"):
              tm.assert_frame_equal(
                  frame.sample(n=1, axis=row_axis, weights=row5_only),
                  frame.iloc[5:6],
              )

          # Out-of-range or unknown axis values are rejected.
          bad_axes = [
              (2, "No axis named 2 for object type DataFrame"),
              ("not_a_name", "No axis named not_a_name for object type DataFrame"),
          ]
          for bad_axis, axis_msg in bad_axes:
              with pytest.raises(ValueError, match=axis_msg):
                  frame.sample(n=1, axis=bad_axis)

          series = Series(range(10))
          with pytest.raises(ValueError, match="No axis named 1 for object type Series"):
              series.sample(n=1, axis=1)

          # Weight length is compared against the axis being sampled:
          # ten weights do not fit two columns.
          length_msg = "Weights and axis to be sampled must be of same length"
          with pytest.raises(ValueError, match=length_msg):
              frame.sample(n=1, axis=1, weights=[0.5] * 10)

      def test_sample_axis1(self):
          """Column weights with ``axis=1``; the default axis equals ``axis=0``."""
          frame = DataFrame(
              {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10}
          )
          # All weight on the third column.
          last_col_only = [0, 0, 1]
          picked = frame.sample(n=1, axis=1, weights=last_col_only)
          tm.assert_frame_equal(picked, frame[["colString"]])

          # Leaving ``axis`` out must match an explicit ``axis=0``.
          default_axis = frame.sample(n=3, random_state=42)
          explicit_axis = frame.sample(n=3, axis=0, random_state=42)
          tm.assert_frame_equal(default_axis, explicit_axis)

      def test_sample_aligns_weights_with_frame(self):
          """Series weights are matched to the frame by index label."""
          frame = DataFrame(
              {"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3]
          )
          aligned_weights = [
              # Same labels as the frame, in a different order.
              Series([1, 0, 0], index=[3, 5, 9]),
              # Label 10 is absent from the frame, so its weight is dropped.
              Series([0.001, 0, 10000], index=[3, 5, 10]),
              # Label 9 has no weight and is filled with zero.
              Series([0.01, 0], index=[3, 5]),
          ]
          for weights in aligned_weights:
              tm.assert_frame_equal(frame.loc[[3]], frame.sample(1, weights=weights))

          # No label shared between the weights and the frame.
          disjoint = Series([1, 0], index=[1, 2])
          with pytest.raises(ValueError, match="Invalid weights: weights sum to zero"):
              frame.sample(1, weights=disjoint)

      def test_sample_is_copy(self):
          """Assigning a column to a sampled frame emits no warning."""
          # GH#27357, GH#30784: the result of sample must be an actual copy that
          # does not track the parent frame / give SettingWithCopy warnings.
          values = np.random.default_rng(2).standard_normal((10, 3))
          parent = DataFrame(values, columns=["a", "b", "c"])
          sampled = parent.sample(3)

          with tm.assert_produces_warning(None):
              sampled["d"] = 1

      def test_sample_does_not_modify_weights(self):
          """``sample`` leaves the caller's weights untouched."""
          # GH-42843
          # numpy array weights must not be modified in place
          weights_arr = np.array([np.nan, 1, np.nan])
          arr_snapshot = weights_arr.copy()
          Series([1, 2, 3]).sample(weights=weights_arr)
          tm.assert_numpy_array_equal(weights_arr, arr_snapshot)

          # a DataFrame column used as weights must not be modified in place
          frame = DataFrame({"values": [1, 1, 1], "weights": [1, np.nan, np.nan]})
          col_snapshot = frame["weights"].copy()
          frame.sample(frac=1.0, replace=True, weights="weights")
          tm.assert_series_equal(frame["weights"], col_snapshot)

      def test_sample_ignore_index(self):
          """``ignore_index=True`` gives the result a fresh 0..n-1 index."""
          # GH 38581
          frame = DataFrame(
              {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10}
          )
          sampled = frame.sample(3, ignore_index=True)
          tm.assert_index_equal(sampled.index, Index(range(3)), exact=True)