test_hashing.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. DataFrame,
  6. Index,
  7. MultiIndex,
  8. Series,
  9. period_range,
  10. timedelta_range,
  11. )
  12. import pandas._testing as tm
  13. from pandas.core.util.hashing import hash_tuples
  14. from pandas.util import (
  15. hash_array,
  16. hash_pandas_object,
  17. )
  18. @pytest.fixture(
  19. params=[
  20. Series([1, 2, 3] * 3, dtype="int32"),
  21. Series([None, 2.5, 3.5] * 3, dtype="float32"),
  22. Series(["a", "b", "c"] * 3, dtype="category"),
  23. Series(["d", "e", "f"] * 3),
  24. Series([True, False, True] * 3),
  25. Series(pd.date_range("20130101", periods=9)),
  26. Series(pd.date_range("20130101", periods=9, tz="US/Eastern")),
  27. Series(timedelta_range("2000", periods=9)),
  28. ]
  29. )
  30. def series(request):
  31. return request.param
  32. @pytest.fixture(params=[True, False])
  33. def index(request):
  34. return request.param
  35. def test_consistency():
  36. # Check that our hash doesn't change because of a mistake
  37. # in the actual code; this is the ground truth.
  38. result = hash_pandas_object(Index(["foo", "bar", "baz"]))
  39. expected = Series(
  40. np.array(
  41. [3600424527151052760, 1374399572096150070, 477881037637427054],
  42. dtype="uint64",
  43. ),
  44. index=["foo", "bar", "baz"],
  45. )
  46. tm.assert_series_equal(result, expected)
  47. def test_hash_array(series):
  48. arr = series.values
  49. tm.assert_numpy_array_equal(hash_array(arr), hash_array(arr))
  50. @pytest.mark.parametrize("dtype", ["U", object])
  51. def test_hash_array_mixed(dtype):
  52. result1 = hash_array(np.array(["3", "4", "All"]))
  53. result2 = hash_array(np.array([3, 4, "All"], dtype=dtype))
  54. tm.assert_numpy_array_equal(result1, result2)
  55. @pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")])
  56. def test_hash_array_errors(val):
  57. msg = "must pass a ndarray-like"
  58. with pytest.raises(TypeError, match=msg):
  59. hash_array(val)
  60. def test_hash_array_index_exception():
  61. # GH42003 TypeError instead of AttributeError
  62. obj = pd.DatetimeIndex(["2018-10-28 01:20:00"], tz="Europe/Berlin")
  63. msg = "Use hash_pandas_object instead"
  64. with pytest.raises(TypeError, match=msg):
  65. hash_array(obj)
  66. def test_hash_tuples():
  67. tuples = [(1, "one"), (1, "two"), (2, "one")]
  68. result = hash_tuples(tuples)
  69. expected = hash_pandas_object(MultiIndex.from_tuples(tuples)).values
  70. tm.assert_numpy_array_equal(result, expected)
  71. # We only need to support MultiIndex and list-of-tuples
  72. msg = "|".join(["object is not iterable", "zip argument #1 must support iteration"])
  73. with pytest.raises(TypeError, match=msg):
  74. hash_tuples(tuples[0])
  75. @pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")])
  76. def test_hash_tuples_err(val):
  77. msg = "must be convertible to a list-of-tuples"
  78. with pytest.raises(TypeError, match=msg):
  79. hash_tuples(val)
  80. def test_multiindex_unique():
  81. mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)])
  82. assert mi.is_unique is True
  83. result = hash_pandas_object(mi)
  84. assert result.is_unique is True
  85. def test_multiindex_objects():
  86. mi = MultiIndex(
  87. levels=[["b", "d", "a"], [1, 2, 3]],
  88. codes=[[0, 1, 0, 2], [2, 0, 0, 1]],
  89. names=["col1", "col2"],
  90. )
  91. recons = mi._sort_levels_monotonic()
  92. # These are equal.
  93. assert mi.equals(recons)
  94. assert Index(mi.values).equals(Index(recons.values))
  95. @pytest.mark.parametrize(
  96. "obj",
  97. [
  98. Series([1, 2, 3]),
  99. Series([1.0, 1.5, 3.2]),
  100. Series([1.0, 1.5, np.nan]),
  101. Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
  102. Series(["a", "b", "c"]),
  103. Series(["a", np.nan, "c"]),
  104. Series(["a", None, "c"]),
  105. Series([True, False, True]),
  106. Series(dtype=object),
  107. DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}),
  108. DataFrame(),
  109. DataFrame(np.full((10, 4), np.nan)),
  110. DataFrame(
  111. {
  112. "A": [0.0, 1.0, 2.0, 3.0, 4.0],
  113. "B": [0.0, 1.0, 0.0, 1.0, 0.0],
  114. "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
  115. "D": pd.date_range("20130101", periods=5),
  116. }
  117. ),
  118. DataFrame(range(5), index=pd.date_range("2020-01-01", periods=5)),
  119. Series(range(5), index=pd.date_range("2020-01-01", periods=5)),
  120. Series(period_range("2020-01-01", periods=10, freq="D")),
  121. Series(pd.date_range("20130101", periods=3, tz="US/Eastern")),
  122. ],
  123. )
  124. def test_hash_pandas_object(obj, index):
  125. a = hash_pandas_object(obj, index=index)
  126. b = hash_pandas_object(obj, index=index)
  127. tm.assert_series_equal(a, b)
  128. @pytest.mark.parametrize(
  129. "obj",
  130. [
  131. Series([1, 2, 3]),
  132. Series([1.0, 1.5, 3.2]),
  133. Series([1.0, 1.5, np.nan]),
  134. Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
  135. Series(["a", "b", "c"]),
  136. Series(["a", np.nan, "c"]),
  137. Series(["a", None, "c"]),
  138. Series([True, False, True]),
  139. DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}),
  140. DataFrame(np.full((10, 4), np.nan)),
  141. DataFrame(
  142. {
  143. "A": [0.0, 1.0, 2.0, 3.0, 4.0],
  144. "B": [0.0, 1.0, 0.0, 1.0, 0.0],
  145. "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
  146. "D": pd.date_range("20130101", periods=5),
  147. }
  148. ),
  149. DataFrame(range(5), index=pd.date_range("2020-01-01", periods=5)),
  150. Series(range(5), index=pd.date_range("2020-01-01", periods=5)),
  151. Series(period_range("2020-01-01", periods=10, freq="D")),
  152. Series(pd.date_range("20130101", periods=3, tz="US/Eastern")),
  153. ],
  154. )
  155. def test_hash_pandas_object_diff_index_non_empty(obj):
  156. a = hash_pandas_object(obj, index=True)
  157. b = hash_pandas_object(obj, index=False)
  158. assert not (a == b).all()
  159. @pytest.mark.parametrize(
  160. "obj",
  161. [
  162. Index([1, 2, 3]),
  163. Index([True, False, True]),
  164. timedelta_range("1 day", periods=2),
  165. period_range("2020-01-01", freq="D", periods=2),
  166. MultiIndex.from_product(
  167. [range(5), ["foo", "bar", "baz"], pd.date_range("20130101", periods=2)]
  168. ),
  169. MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]),
  170. ],
  171. )
  172. def test_hash_pandas_index(obj, index):
  173. a = hash_pandas_object(obj, index=index)
  174. b = hash_pandas_object(obj, index=index)
  175. tm.assert_series_equal(a, b)
  176. def test_hash_pandas_series(series, index):
  177. a = hash_pandas_object(series, index=index)
  178. b = hash_pandas_object(series, index=index)
  179. tm.assert_series_equal(a, b)
  180. def test_hash_pandas_series_diff_index(series):
  181. a = hash_pandas_object(series, index=True)
  182. b = hash_pandas_object(series, index=False)
  183. assert not (a == b).all()
  184. @pytest.mark.parametrize(
  185. "obj", [Series([], dtype="float64"), Series([], dtype="object"), Index([])]
  186. )
  187. def test_hash_pandas_empty_object(obj, index):
  188. # These are by-definition the same with
  189. # or without the index as the data is empty.
  190. a = hash_pandas_object(obj, index=index)
  191. b = hash_pandas_object(obj, index=index)
  192. tm.assert_series_equal(a, b)
  193. @pytest.mark.parametrize(
  194. "s1",
  195. [
  196. Series(["a", "b", "c", "d"]),
  197. Series([1000, 2000, 3000, 4000]),
  198. Series(pd.date_range(0, periods=4)),
  199. ],
  200. )
  201. @pytest.mark.parametrize("categorize", [True, False])
  202. def test_categorical_consistency(s1, categorize):
  203. # see gh-15143
  204. #
  205. # Check that categoricals hash consistent with their values,
  206. # not codes. This should work for categoricals of any dtype.
  207. s2 = s1.astype("category").cat.set_categories(s1)
  208. s3 = s2.cat.set_categories(list(reversed(s1)))
  209. # These should all hash identically.
  210. h1 = hash_pandas_object(s1, categorize=categorize)
  211. h2 = hash_pandas_object(s2, categorize=categorize)
  212. h3 = hash_pandas_object(s3, categorize=categorize)
  213. tm.assert_series_equal(h1, h2)
  214. tm.assert_series_equal(h1, h3)
  215. def test_categorical_with_nan_consistency():
  216. c = pd.Categorical.from_codes(
  217. [-1, 0, 1, 2, 3, 4], categories=pd.date_range("2012-01-01", periods=5, name="B")
  218. )
  219. expected = hash_array(c, categorize=False)
  220. c = pd.Categorical.from_codes([-1, 0], categories=[pd.Timestamp("2012-01-01")])
  221. result = hash_array(c, categorize=False)
  222. assert result[0] in expected
  223. assert result[1] in expected
  224. def test_pandas_errors():
  225. msg = "Unexpected type for hashing"
  226. with pytest.raises(TypeError, match=msg):
  227. hash_pandas_object(pd.Timestamp("20130101"))
  228. def test_hash_keys():
  229. # Using different hash keys, should have
  230. # different hashes for the same data.
  231. #
  232. # This only matters for object dtypes.
  233. obj = Series(list("abc"))
  234. a = hash_pandas_object(obj, hash_key="9876543210123456")
  235. b = hash_pandas_object(obj, hash_key="9876543210123465")
  236. assert (a != b).all()
  237. def test_df_hash_keys():
  238. # DataFrame version of the test_hash_keys.
  239. # https://github.com/pandas-dev/pandas/issues/41404
  240. obj = DataFrame({"x": np.arange(3), "y": list("abc")})
  241. a = hash_pandas_object(obj, hash_key="9876543210123456")
  242. b = hash_pandas_object(obj, hash_key="9876543210123465")
  243. assert (a != b).all()
  244. def test_df_encoding():
  245. # Check that DataFrame recognizes optional encoding.
  246. # https://github.com/pandas-dev/pandas/issues/41404
  247. # https://github.com/pandas-dev/pandas/pull/42049
  248. obj = DataFrame({"x": np.arange(3), "y": list("a+c")})
  249. a = hash_pandas_object(obj, encoding="utf8")
  250. b = hash_pandas_object(obj, encoding="utf7")
  251. # Note that the "+" is encoded as "+-" in utf-7.
  252. assert a[0] == b[0]
  253. assert a[1] != b[1]
  254. assert a[2] == b[2]
  255. def test_invalid_key():
  256. # This only matters for object dtypes.
  257. msg = "key should be a 16-byte string encoded"
  258. with pytest.raises(ValueError, match=msg):
  259. hash_pandas_object(Series(list("abc")), hash_key="foo")
  260. def test_already_encoded(index):
  261. # If already encoded, then ok.
  262. obj = Series(list("abc")).str.encode("utf8")
  263. a = hash_pandas_object(obj, index=index)
  264. b = hash_pandas_object(obj, index=index)
  265. tm.assert_series_equal(a, b)
  266. def test_alternate_encoding(index):
  267. obj = Series(list("abc"))
  268. a = hash_pandas_object(obj, index=index)
  269. b = hash_pandas_object(obj, index=index)
  270. tm.assert_series_equal(a, b)
  271. @pytest.mark.parametrize("l_exp", range(8))
  272. @pytest.mark.parametrize("l_add", [0, 1])
  273. def test_same_len_hash_collisions(l_exp, l_add):
  274. length = 2 ** (l_exp + 8) + l_add
  275. idx = np.array([str(i) for i in range(length)], dtype=object)
  276. result = hash_array(idx, "utf8")
  277. assert not result[0] == result[1]
  278. def test_hash_collisions():
  279. # Hash collisions are bad.
  280. #
  281. # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
  282. hashes = [
  283. "Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9",
  284. "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe",
  285. ]
  286. # These should be different.
  287. result1 = hash_array(np.asarray(hashes[0:1], dtype=object), "utf8")
  288. expected1 = np.array([14963968704024874985], dtype=np.uint64)
  289. tm.assert_numpy_array_equal(result1, expected1)
  290. result2 = hash_array(np.asarray(hashes[1:2], dtype=object), "utf8")
  291. expected2 = np.array([16428432627716348016], dtype=np.uint64)
  292. tm.assert_numpy_array_equal(result2, expected2)
  293. result = hash_array(np.asarray(hashes, dtype=object), "utf8")
  294. tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0))
  295. @pytest.mark.parametrize(
  296. "data, result_data",
  297. [
  298. [[tuple("1"), tuple("2")], [10345501319357378243, 8331063931016360761]],
  299. [[(1,), (2,)], [9408946347443669104, 3278256261030523334]],
  300. ],
  301. )
  302. def test_hash_with_tuple(data, result_data):
  303. # GH#28969 array containing a tuple raises on call to arr.astype(str)
  304. # apparently a numpy bug github.com/numpy/numpy/issues/9441
  305. df = DataFrame({"data": data})
  306. result = hash_pandas_object(df)
  307. expected = Series(result_data, dtype=np.uint64)
  308. tm.assert_series_equal(result, expected)
  309. def test_hashable_tuple_args():
  310. # require that the elements of such tuples are themselves hashable
  311. df3 = DataFrame(
  312. {
  313. "data": [
  314. (
  315. 1,
  316. [],
  317. ),
  318. (
  319. 2,
  320. {},
  321. ),
  322. ]
  323. }
  324. )
  325. with pytest.raises(TypeError, match="unhashable type: 'list'"):
  326. hash_pandas_object(df3)
  327. def test_hash_object_none_key():
  328. # https://github.com/pandas-dev/pandas/issues/30887
  329. result = pd.util.hash_pandas_object(Series(["a", "b"]), hash_key=None)
  330. expected = Series([4578374827886788867, 17338122309987883691], dtype="uint64")
  331. tm.assert_series_equal(result, expected)