test_json.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490
  1. import collections
  2. import operator
  3. import sys
  4. import numpy as np
  5. import pytest
  6. import pandas as pd
  7. import pandas._testing as tm
  8. from pandas.tests.extension import base
  9. from pandas.tests.extension.json.array import (
  10. JSONArray,
  11. JSONDtype,
  12. make_data,
  13. )
  14. # We intentionally don't run base.BaseSetitemTests because pandas'
  15. # internals has trouble setting sequences of values into scalar positions.
  16. unhashable = pytest.mark.xfail(reason="Unhashable")
  17. @pytest.fixture
  18. def dtype():
  19. return JSONDtype()
  20. @pytest.fixture
  21. def data():
  22. """Length-100 PeriodArray for semantics test."""
  23. data = make_data()
  24. # Why the while loop? NumPy is unable to construct an ndarray from
  25. # equal-length ndarrays. Many of our operations involve coercing the
  26. # EA to an ndarray of objects. To avoid random test failures, we ensure
  27. # that our data is coercible to an ndarray. Several tests deal with only
  28. # the first two elements, so that's what we'll check.
  29. while len(data[0]) == len(data[1]):
  30. data = make_data()
  31. return JSONArray(data)
  32. @pytest.fixture
  33. def data_missing():
  34. """Length 2 array with [NA, Valid]"""
  35. return JSONArray([{}, {"a": 10}])
  36. @pytest.fixture
  37. def data_for_sorting():
  38. return JSONArray([{"b": 1}, {"c": 4}, {"a": 2, "c": 3}])
  39. @pytest.fixture
  40. def data_missing_for_sorting():
  41. return JSONArray([{"b": 1}, {}, {"a": 4}])
  42. @pytest.fixture
  43. def na_cmp():
  44. return operator.eq
  45. @pytest.fixture
  46. def data_for_grouping():
  47. return JSONArray(
  48. [
  49. {"b": 1},
  50. {"b": 1},
  51. {},
  52. {},
  53. {"a": 0, "c": 2},
  54. {"a": 0, "c": 2},
  55. {"b": 1},
  56. {"c": 2},
  57. ]
  58. )
  59. class TestJSONArray(base.ExtensionTests):
  60. @pytest.mark.xfail(
  61. reason="comparison method not implemented for JSONArray (GH-37867)"
  62. )
  63. def test_contains(self, data):
  64. # GH-37867
  65. super().test_contains(data)
  66. @pytest.mark.xfail(reason="not implemented constructor from dtype")
  67. def test_from_dtype(self, data):
  68. # construct from our dtype & string dtype
  69. super().test_from_dtype(data)
  70. @pytest.mark.xfail(reason="RecursionError, GH-33900")
  71. def test_series_constructor_no_data_with_index(self, dtype, na_value):
  72. # RecursionError: maximum recursion depth exceeded in comparison
  73. rec_limit = sys.getrecursionlimit()
  74. try:
  75. # Limit to avoid stack overflow on Windows CI
  76. sys.setrecursionlimit(100)
  77. super().test_series_constructor_no_data_with_index(dtype, na_value)
  78. finally:
  79. sys.setrecursionlimit(rec_limit)
  80. @pytest.mark.xfail(reason="RecursionError, GH-33900")
  81. def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
  82. # RecursionError: maximum recursion depth exceeded in comparison
  83. rec_limit = sys.getrecursionlimit()
  84. try:
  85. # Limit to avoid stack overflow on Windows CI
  86. sys.setrecursionlimit(100)
  87. super().test_series_constructor_scalar_na_with_index(dtype, na_value)
  88. finally:
  89. sys.setrecursionlimit(rec_limit)
  90. @pytest.mark.xfail(reason="collection as scalar, GH-33901")
  91. def test_series_constructor_scalar_with_index(self, data, dtype):
  92. # TypeError: All values must be of type <class 'collections.abc.Mapping'>
  93. rec_limit = sys.getrecursionlimit()
  94. try:
  95. # Limit to avoid stack overflow on Windows CI
  96. sys.setrecursionlimit(100)
  97. super().test_series_constructor_scalar_with_index(data, dtype)
  98. finally:
  99. sys.setrecursionlimit(rec_limit)
  100. @pytest.mark.xfail(reason="Different definitions of NA")
  101. def test_stack(self):
  102. """
  103. The test does .astype(object).stack(future_stack=True). If we happen to have
  104. any missing values in `data`, then we'll end up with different
  105. rows since we consider `{}` NA, but `.astype(object)` doesn't.
  106. """
  107. super().test_stack()
  108. @pytest.mark.xfail(reason="dict for NA")
  109. def test_unstack(self, data, index):
  110. # The base test has NaN for the expected NA value.
  111. # this matches otherwise
  112. return super().test_unstack(data, index)
  113. @pytest.mark.xfail(reason="Setting a dict as a scalar")
  114. def test_fillna_series(self):
  115. """We treat dictionaries as a mapping in fillna, not a scalar."""
  116. super().test_fillna_series()
  117. @pytest.mark.xfail(reason="Setting a dict as a scalar")
  118. def test_fillna_frame(self):
  119. """We treat dictionaries as a mapping in fillna, not a scalar."""
  120. super().test_fillna_frame()
  121. @pytest.mark.parametrize(
  122. "limit_area, input_ilocs, expected_ilocs",
  123. [
  124. ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
  125. ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
  126. ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
  127. ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
  128. ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
  129. ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
  130. ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
  131. ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
  132. ],
  133. )
  134. def test_ffill_limit_area(
  135. self, data_missing, limit_area, input_ilocs, expected_ilocs
  136. ):
  137. # GH#56616
  138. msg = "JSONArray does not implement limit_area"
  139. with pytest.raises(NotImplementedError, match=msg):
  140. super().test_ffill_limit_area(
  141. data_missing, limit_area, input_ilocs, expected_ilocs
  142. )
  143. @unhashable
  144. def test_value_counts(self, all_data, dropna):
  145. super().test_value_counts(all_data, dropna)
  146. @unhashable
  147. def test_value_counts_with_normalize(self, data):
  148. super().test_value_counts_with_normalize(data)
  149. @unhashable
  150. def test_sort_values_frame(self):
  151. # TODO (EA.factorize): see if _values_for_factorize allows this.
  152. super().test_sort_values_frame()
  153. @pytest.mark.parametrize("ascending", [True, False])
  154. def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
  155. super().test_sort_values(data_for_sorting, ascending, sort_by_key)
  156. @pytest.mark.parametrize("ascending", [True, False])
  157. def test_sort_values_missing(
  158. self, data_missing_for_sorting, ascending, sort_by_key
  159. ):
  160. super().test_sort_values_missing(
  161. data_missing_for_sorting, ascending, sort_by_key
  162. )
  163. @pytest.mark.xfail(reason="combine for JSONArray not supported")
  164. def test_combine_le(self, data_repeated):
  165. super().test_combine_le(data_repeated)
  166. @pytest.mark.xfail(
  167. reason="combine for JSONArray not supported - "
  168. "may pass depending on random data",
  169. strict=False,
  170. raises=AssertionError,
  171. )
  172. def test_combine_first(self, data):
  173. super().test_combine_first(data)
  174. @pytest.mark.xfail(reason="broadcasting error")
  175. def test_where_series(self, data, na_value):
  176. # Fails with
  177. # *** ValueError: operands could not be broadcast together
  178. # with shapes (4,) (4,) (0,)
  179. super().test_where_series(data, na_value)
  180. @pytest.mark.xfail(reason="Can't compare dicts.")
  181. def test_searchsorted(self, data_for_sorting):
  182. super().test_searchsorted(data_for_sorting)
  183. @pytest.mark.xfail(reason="Can't compare dicts.")
  184. def test_equals(self, data, na_value, as_series):
  185. super().test_equals(data, na_value, as_series)
  186. @pytest.mark.skip("fill-value is interpreted as a dict of values")
  187. def test_fillna_copy_frame(self, data_missing):
  188. super().test_fillna_copy_frame(data_missing)
  189. def test_equals_same_data_different_object(
  190. self, data, using_copy_on_write, request
  191. ):
  192. if using_copy_on_write:
  193. mark = pytest.mark.xfail(reason="Fails with CoW")
  194. request.applymarker(mark)
  195. super().test_equals_same_data_different_object(data)
  196. @pytest.mark.xfail(reason="failing on np.array(self, dtype=str)")
  197. def test_astype_str(self):
  198. """This currently fails in NumPy on np.array(self, dtype=str) with
  199. *** ValueError: setting an array element with a sequence
  200. """
  201. super().test_astype_str()
  202. @unhashable
  203. def test_groupby_extension_transform(self):
  204. """
  205. This currently fails in Series.name.setter, since the
  206. name must be hashable, but the value is a dictionary.
  207. I think this is what we want, i.e. `.name` should be the original
  208. values, and not the values for factorization.
  209. """
  210. super().test_groupby_extension_transform()
  211. @unhashable
  212. def test_groupby_extension_apply(self):
  213. """
  214. This fails in Index._do_unique_check with
  215. > hash(val)
  216. E TypeError: unhashable type: 'UserDict' with
  217. I suspect that once we support Index[ExtensionArray],
  218. we'll be able to dispatch unique.
  219. """
  220. super().test_groupby_extension_apply()
  221. @unhashable
  222. def test_groupby_extension_agg(self):
  223. """
  224. This fails when we get to tm.assert_series_equal when left.index
  225. contains dictionaries, which are not hashable.
  226. """
  227. super().test_groupby_extension_agg()
  228. @unhashable
  229. def test_groupby_extension_no_sort(self):
  230. """
  231. This fails when we get to tm.assert_series_equal when left.index
  232. contains dictionaries, which are not hashable.
  233. """
  234. super().test_groupby_extension_no_sort()
  235. def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
  236. if len(data[0]) != 1:
  237. mark = pytest.mark.xfail(reason="raises in coercing to Series")
  238. request.applymarker(mark)
  239. super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
  240. def test_compare_array(self, data, comparison_op, request):
  241. if comparison_op.__name__ in ["eq", "ne"]:
  242. mark = pytest.mark.xfail(reason="Comparison methods not implemented")
  243. request.applymarker(mark)
  244. super().test_compare_array(data, comparison_op)
  245. @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
  246. def test_setitem_loc_scalar_mixed(self, data):
  247. super().test_setitem_loc_scalar_mixed(data)
  248. @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
  249. def test_setitem_loc_scalar_multiple_homogoneous(self, data):
  250. super().test_setitem_loc_scalar_multiple_homogoneous(data)
  251. @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
  252. def test_setitem_iloc_scalar_mixed(self, data):
  253. super().test_setitem_iloc_scalar_mixed(data)
  254. @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
  255. def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
  256. super().test_setitem_iloc_scalar_multiple_homogoneous(data)
  257. @pytest.mark.parametrize(
  258. "mask",
  259. [
  260. np.array([True, True, True, False, False]),
  261. pd.array([True, True, True, False, False], dtype="boolean"),
  262. pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"),
  263. ],
  264. ids=["numpy-array", "boolean-array", "boolean-array-na"],
  265. )
  266. def test_setitem_mask(self, data, mask, box_in_series, request):
  267. if box_in_series:
  268. mark = pytest.mark.xfail(
  269. reason="cannot set using a list-like indexer with a different length"
  270. )
  271. request.applymarker(mark)
  272. elif not isinstance(mask, np.ndarray):
  273. mark = pytest.mark.xfail(reason="Issues unwanted DeprecationWarning")
  274. request.applymarker(mark)
  275. super().test_setitem_mask(data, mask, box_in_series)
  276. def test_setitem_mask_raises(self, data, box_in_series, request):
  277. if not box_in_series:
  278. mark = pytest.mark.xfail(reason="Fails to raise")
  279. request.applymarker(mark)
  280. super().test_setitem_mask_raises(data, box_in_series)
  281. @pytest.mark.xfail(
  282. reason="cannot set using a list-like indexer with a different length"
  283. )
  284. def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
  285. super().test_setitem_mask_boolean_array_with_na(data, box_in_series)
  286. @pytest.mark.parametrize(
  287. "idx",
  288. [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
  289. ids=["list", "integer-array", "numpy-array"],
  290. )
  291. def test_setitem_integer_array(self, data, idx, box_in_series, request):
  292. if box_in_series:
  293. mark = pytest.mark.xfail(
  294. reason="cannot set using a list-like indexer with a different length"
  295. )
  296. request.applymarker(mark)
  297. super().test_setitem_integer_array(data, idx, box_in_series)
  298. @pytest.mark.xfail(reason="list indices must be integers or slices, not NAType")
  299. @pytest.mark.parametrize(
  300. "idx, box_in_series",
  301. [
  302. ([0, 1, 2, pd.NA], False),
  303. pytest.param(
  304. [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948")
  305. ),
  306. (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
  307. (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
  308. ],
  309. ids=["list-False", "list-True", "integer-array-False", "integer-array-True"],
  310. )
  311. def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
  312. super().test_setitem_integer_with_missing_raises(data, idx, box_in_series)
  313. @pytest.mark.xfail(reason="Fails to raise")
  314. def test_setitem_scalar_key_sequence_raise(self, data):
  315. super().test_setitem_scalar_key_sequence_raise(data)
  316. def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request):
  317. if "full_slice" in request.node.name:
  318. mark = pytest.mark.xfail(reason="slice is not iterable")
  319. request.applymarker(mark)
  320. super().test_setitem_with_expansion_dataframe_column(data, full_indexer)
  321. @pytest.mark.xfail(reason="slice is not iterable")
  322. def test_setitem_frame_2d_values(self, data):
  323. super().test_setitem_frame_2d_values(data)
  324. @pytest.mark.xfail(
  325. reason="cannot set using a list-like indexer with a different length"
  326. )
  327. @pytest.mark.parametrize("setter", ["loc", None])
  328. def test_setitem_mask_broadcast(self, data, setter):
  329. super().test_setitem_mask_broadcast(data, setter)
  330. @pytest.mark.xfail(
  331. reason="cannot set using a slice indexer with a different length"
  332. )
  333. def test_setitem_slice(self, data, box_in_series):
  334. super().test_setitem_slice(data, box_in_series)
  335. @pytest.mark.xfail(reason="slice object is not iterable")
  336. def test_setitem_loc_iloc_slice(self, data):
  337. super().test_setitem_loc_iloc_slice(data)
  338. @pytest.mark.xfail(reason="slice object is not iterable")
  339. def test_setitem_slice_mismatch_length_raises(self, data):
  340. super().test_setitem_slice_mismatch_length_raises(data)
  341. @pytest.mark.xfail(reason="slice object is not iterable")
  342. def test_setitem_slice_array(self, data):
  343. super().test_setitem_slice_array(data)
  344. @pytest.mark.xfail(reason="Fail to raise")
  345. def test_setitem_invalid(self, data, invalid_scalar):
  346. super().test_setitem_invalid(data, invalid_scalar)
  347. @pytest.mark.xfail(reason="only integer scalar arrays can be converted")
  348. def test_setitem_2d_values(self, data):
  349. super().test_setitem_2d_values(data)
  350. @pytest.mark.xfail(reason="data type 'json' not understood")
  351. @pytest.mark.parametrize("engine", ["c", "python"])
  352. def test_EA_types(self, engine, data, request):
  353. super().test_EA_types(engine, data, request)
  354. def custom_assert_series_equal(left, right, *args, **kwargs):
  355. # NumPy doesn't handle an array of equal-length UserDicts.
  356. # The default assert_series_equal eventually does a
  357. # Series.values, which raises. We work around it by
  358. # converting the UserDicts to dicts.
  359. if left.dtype.name == "json":
  360. assert left.dtype == right.dtype
  361. left = pd.Series(
  362. JSONArray(left.values.astype(object)), index=left.index, name=left.name
  363. )
  364. right = pd.Series(
  365. JSONArray(right.values.astype(object)),
  366. index=right.index,
  367. name=right.name,
  368. )
  369. tm.assert_series_equal(left, right, *args, **kwargs)
  370. def custom_assert_frame_equal(left, right, *args, **kwargs):
  371. obj_type = kwargs.get("obj", "DataFrame")
  372. tm.assert_index_equal(
  373. left.columns,
  374. right.columns,
  375. exact=kwargs.get("check_column_type", "equiv"),
  376. check_names=kwargs.get("check_names", True),
  377. check_exact=kwargs.get("check_exact", False),
  378. check_categorical=kwargs.get("check_categorical", True),
  379. obj=f"{obj_type}.columns",
  380. )
  381. jsons = (left.dtypes == "json").index
  382. for col in jsons:
  383. custom_assert_series_equal(left[col], right[col], *args, **kwargs)
  384. left = left.drop(columns=jsons)
  385. right = right.drop(columns=jsons)
  386. tm.assert_frame_equal(left, right, *args, **kwargs)
  387. def test_custom_asserts():
  388. # This would always trigger the KeyError from trying to put
  389. # an array of equal-length UserDicts inside an ndarray.
  390. data = JSONArray(
  391. [
  392. collections.UserDict({"a": 1}),
  393. collections.UserDict({"b": 2}),
  394. collections.UserDict({"c": 3}),
  395. ]
  396. )
  397. a = pd.Series(data)
  398. custom_assert_series_equal(a, a)
  399. custom_assert_frame_equal(a.to_frame(), a.to_frame())
  400. b = pd.Series(data.take([0, 0, 1]))
  401. msg = r"Series are different"
  402. with pytest.raises(AssertionError, match=msg):
  403. custom_assert_series_equal(a, b)
  404. with pytest.raises(AssertionError, match=msg):
  405. custom_assert_frame_equal(a.to_frame(), b.to_frame())