test_pandas.py 76 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188
  1. import datetime
  2. from datetime import timedelta
  3. from decimal import Decimal
  4. from io import (
  5. BytesIO,
  6. StringIO,
  7. )
  8. import json
  9. import os
  10. import sys
  11. import time
  12. import numpy as np
  13. import pytest
  14. from pandas._config import using_string_dtype
  15. from pandas.compat import IS64
  16. import pandas.util._test_decorators as td
  17. import pandas as pd
  18. from pandas import (
  19. NA,
  20. DataFrame,
  21. DatetimeIndex,
  22. Index,
  23. RangeIndex,
  24. Series,
  25. Timestamp,
  26. date_range,
  27. read_json,
  28. )
  29. import pandas._testing as tm
  30. from pandas.io.json import ujson_dumps
  31. def test_literal_json_deprecation():
  32. # PR 53409
  33. expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
  34. jsonl = """{"a": 1, "b": 2}
  35. {"a": 3, "b": 4}
  36. {"a": 5, "b": 6}
  37. {"a": 7, "b": 8}"""
  38. msg = (
  39. "Passing literal json to 'read_json' is deprecated and "
  40. "will be removed in a future version. To read from a "
  41. "literal string, wrap it in a 'StringIO' object."
  42. )
  43. with tm.assert_produces_warning(FutureWarning, match=msg):
  44. try:
  45. read_json(jsonl, lines=False)
  46. except ValueError:
  47. pass
  48. with tm.assert_produces_warning(FutureWarning, match=msg):
  49. read_json(expected.to_json(), lines=False)
  50. with tm.assert_produces_warning(FutureWarning, match=msg):
  51. result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
  52. tm.assert_frame_equal(result, expected)
  53. with tm.assert_produces_warning(FutureWarning, match=msg):
  54. try:
  55. result = read_json(
  56. '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n',
  57. lines=False,
  58. )
  59. except ValueError:
  60. pass
  61. with tm.assert_produces_warning(FutureWarning, match=msg):
  62. try:
  63. result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=False)
  64. except ValueError:
  65. pass
  66. tm.assert_frame_equal(result, expected)
  def assert_json_roundtrip_equal(result, expected, orient):
      """Compare a round-tripped frame with its source, allowing for orient loss.

      For the "records" and "values" orients the expected frame's index is
      reset before comparing; for "values" its columns are additionally
      replaced by a positional range. The comparison itself is delegated to
      ``tm.assert_frame_equal``.
      """
      if orient == "values":
          expected = expected.reset_index(drop=True)
          expected.columns = range(expected.shape[1])
      elif orient == "records":
          expected = expected.reset_index(drop=True)
      tm.assert_frame_equal(result, expected)
  73. class TestPandasContainer:
  @pytest.fixture
  def categorical_frame(self):
      """Build a 30-row frame indexed by a ``CategoricalIndex`` named "E".

      Columns "A".."D" hold standard-normal draws seeded by their position,
      "E" holds the index labels in reverse order and "sort" holds 0..29.
      """
      labels = ["bah"] * 5 + ["bar"] * 5 + ["baz"] * 5 + ["foo"] * 15
      columns = {}
      for seed, letter in enumerate("ABCD"):
          columns[letter] = np.random.default_rng(seed).standard_normal(30)
      columns["E"] = labels[::-1]
      columns["sort"] = np.arange(30, dtype="int64")
      return DataFrame(columns, index=pd.CategoricalIndex(labels, name="E"))
  @pytest.fixture
  def datetime_series(self):
      """Return a ten-element float series named "ts" on a freq-less date index."""
      # Same as usual datetime_series, but with index freq set to None,
      # since that doesn't round-trip, see GH#33711
      freqless_index = date_range("2020-01-01", periods=10)._with_freq(None)
      values = 1.1 * np.arange(10, dtype=np.float64)
      return Series(values, index=freqless_index, name="ts")
  @pytest.fixture
  def datetime_frame(self):
      """Return a 30x4 standard-normal frame on a freq-less business-day index."""
      # Same as usual datetime_frame, but with index freq set to None,
      # since that doesn't round-trip, see GH#33711
      values = np.random.default_rng(2).standard_normal((30, 4))
      business_days = date_range("2000-01-01", periods=30, freq="B")
      frame = DataFrame(values, columns=Index(list("ABCD")), index=business_days)
      frame.index = frame.index._with_freq(None)
      return frame
  def test_frame_double_encoded_labels(self, orient):
      """Round-trip a frame whose labels contain quotes, slashes and backslashes."""
      frame = DataFrame(
          [["a", "b"], ["c", "d"]],
          index=['index " 1', "index / 2"],
          columns=["a \\ b", "y / z"],
      )
      payload = frame.to_json(orient=orient)
      roundtripped = read_json(StringIO(payload), orient=orient)
      assert_json_roundtrip_equal(roundtripped, frame.copy(), orient)
  @pytest.mark.parametrize("orient", ["split", "records", "values"])
  def test_frame_non_unique_index(self, orient):
      """A duplicated index round-trips for the split/records/values orients."""
      frame = DataFrame([["a", "b"], ["c", "d"]], index=[1, 1], columns=["x", "y"])
      payload = frame.to_json(orient=orient)
      roundtripped = read_json(StringIO(payload), orient=orient)
      assert_json_roundtrip_equal(roundtripped, frame.copy(), orient)
  123. @pytest.mark.parametrize("orient", ["index", "columns"])
  124. def test_frame_non_unique_index_raises(self, orient):
  125. df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 1], columns=["x", "y"])
  126. msg = f"DataFrame index must be unique for orient='{orient}'"
  127. with pytest.raises(ValueError, match=msg):
  128. df.to_json(orient=orient)
  129. @pytest.mark.parametrize("orient", ["split", "values"])
  130. @pytest.mark.parametrize(
  131. "data",
  132. [
  133. [["a", "b"], ["c", "d"]],
  134. [[1.5, 2.5], [3.5, 4.5]],
  135. [[1, 2.5], [3, 4.5]],
  136. [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]],
  137. ],
  138. )
  139. def test_frame_non_unique_columns(self, orient, data):
  140. df = DataFrame(data, index=[1, 2], columns=["x", "x"])
  141. result = read_json(
  142. StringIO(df.to_json(orient=orient)), orient=orient, convert_dates=["x"]
  143. )
  144. if orient == "values":
  145. expected = DataFrame(data)
  146. if expected.iloc[:, 0].dtype == "datetime64[ns]":
  147. # orient == "values" by default will write Timestamp objects out
  148. # in milliseconds; these are internally stored in nanosecond,
  149. # so divide to get where we need
  150. # TODO: a to_epoch method would also solve; see GH 14772
  151. expected.isetitem(0, expected.iloc[:, 0].astype(np.int64) // 1000000)
  152. elif orient == "split":
  153. expected = df
  154. expected.columns = ["x", "x.1"]
  155. tm.assert_frame_equal(result, expected)
  156. @pytest.mark.parametrize("orient", ["index", "columns", "records"])
  157. def test_frame_non_unique_columns_raises(self, orient):
  158. df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 2], columns=["x", "x"])
  159. msg = f"DataFrame columns must be unique for orient='{orient}'"
  160. with pytest.raises(ValueError, match=msg):
  161. df.to_json(orient=orient)
  162. def test_frame_default_orient(self, float_frame):
  163. assert float_frame.to_json() == float_frame.to_json(orient="columns")
  @pytest.mark.parametrize("dtype", [False, float])
  @pytest.mark.parametrize("convert_axes", [True, False])
  def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame):
      """Round-trip the ``float_frame`` fixture through every orient."""
      payload = float_frame.to_json(orient=orient)
      roundtripped = read_json(
          StringIO(payload), orient=orient, convert_axes=convert_axes, dtype=dtype
      )
      assert_json_roundtrip_equal(roundtripped, float_frame, orient)
  @pytest.mark.parametrize("dtype", [False, np.int64])
  @pytest.mark.parametrize("convert_axes", [True, False])
  def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame):
      """Round-trip the ``int_frame`` fixture through every orient."""
      payload = int_frame.to_json(orient=orient)
      roundtripped = read_json(
          StringIO(payload), orient=orient, convert_axes=convert_axes, dtype=dtype
      )
      assert_json_roundtrip_equal(roundtripped, int_frame, orient)
  @pytest.mark.parametrize("dtype", [None, np.float64, int, "U3"])
  @pytest.mark.parametrize("convert_axes", [True, False])
  def test_roundtrip_str_axes(self, orient, convert_axes, dtype):
      """Round-trip a zero-filled frame whose axis labels are numeric strings."""
      frame = DataFrame(
          np.zeros((200, 4)),
          columns=[str(i) for i in range(4)],
          index=[str(i) for i in range(200)],
          dtype=dtype,
      )
      payload = StringIO(frame.to_json(orient=orient))
      result = read_json(
          payload, orient=orient, convert_axes=convert_axes, dtype=dtype
      )

      expected = frame.copy()
      if not dtype:
          expected = expected.astype(np.int64)

      # index columns, and records orients cannot fully preserve the string
      # dtype for axes as the index and column labels are used as keys in
      # JSON objects. JSON keys are by definition strings, so there's no way
      # to disambiguate whether those keys actually were strings or numeric
      # beforehand and numeric wins out.
      if convert_axes:
          if orient in ("index", "columns", "records", "split"):
              expected.columns = expected.columns.astype(np.int64)
          if orient in ("index", "columns"):
              expected.index = expected.index.astype(np.int64)

      assert_json_roundtrip_equal(result, expected, orient)
  @pytest.mark.parametrize("convert_axes", [True, False])
  def test_roundtrip_categorical(
      self, request, orient, categorical_frame, convert_axes, using_infer_string
  ):
      """Round-trip a frame indexed by a ``CategoricalIndex``.

      The "index" and "columns" orients are marked as expected failures
      because the fixture's index contains repeated labels.
      """
      # TODO: create a better frame to test with and improve coverage
      if orient in ("index", "columns"):
          request.applymarker(
              pytest.mark.xfail(
                  # Reason text previously carried a stray closing parenthesis.
                  reason=f"Can't have duplicate index values for orient '{orient}'"
              )
          )

      data = StringIO(categorical_frame.to_json(orient=orient))
      result = read_json(data, orient=orient, convert_axes=convert_axes)

      expected = categorical_frame.copy()
      # Categorical not preserved
      expected.index = expected.index.astype(
          str if not using_infer_string else "str"
      )
      expected.index.name = None  # index names aren't preserved in JSON
      assert_json_roundtrip_equal(result, expected, orient)
  224. @pytest.mark.parametrize("convert_axes", [True, False])
  225. def test_roundtrip_empty(self, orient, convert_axes):
  226. empty_frame = DataFrame()
  227. data = StringIO(empty_frame.to_json(orient=orient))
  228. result = read_json(data, orient=orient, convert_axes=convert_axes)
  229. if orient == "split":
  230. idx = Index([], dtype=(float if convert_axes else object))
  231. expected = DataFrame(index=idx, columns=idx)
  232. elif orient in ["index", "columns"]:
  233. expected = DataFrame()
  234. else:
  235. expected = empty_frame.copy()
  236. tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("convert_axes", [True, False])
  def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame):
      """Round-trip a frame with a datetime index, with and without axis conversion."""
      # TODO: improve coverage with date_format parameter
      payload = datetime_frame.to_json(orient=orient)
      result = read_json(
          StringIO(payload), orient=orient, convert_axes=convert_axes
      )
      expected = datetime_frame.copy()

      if not convert_axes:  # one off for ts handling
          # DTI gets converted to epoch values
          epoch_ms = expected.index.view(np.int64) // 1000000
          if orient != "split":  # TODO: handle consistently across orients
              epoch_ms = epoch_ms.astype(str)
          expected.index = epoch_ms

      assert_json_roundtrip_equal(result, expected, orient)
  @pytest.mark.parametrize("convert_axes", [True, False])
  def test_roundtrip_mixed(self, orient, convert_axes):
      """Round-trip a frame mixing float, string and boolean columns."""
      frame = DataFrame(
          data={
              "A": [0.0, 1.0, 2.0, 3.0, 4.0],
              "B": [0.0, 1.0, 0.0, 1.0, 0.0],
              "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
              "D": [True, False, True, False, True],
          },
          index=Index(["a", "b", "c", "d", "e"]),
      )
      payload = StringIO(frame.to_json(orient=orient))
      result = read_json(payload, orient=orient, convert_axes=convert_axes)

      expected = frame.copy()
      # The numeric columns are compared as int64.
      numeric_as_int = expected.select_dtypes("number").astype(np.int64)
      expected = expected.assign(**numeric_as_int)

      assert_json_roundtrip_equal(result, expected, orient)
  265. @pytest.mark.xfail(
  266. reason="#50456 Column multiindex is stored and loaded differently",
  267. raises=AssertionError,
  268. )
  269. @pytest.mark.parametrize(
  270. "columns",
  271. [
  272. [["2022", "2022"], ["JAN", "FEB"]],
  273. [["2022", "2023"], ["JAN", "JAN"]],
  274. [["2022", "2022"], ["JAN", "JAN"]],
  275. ],
  276. )
  277. def test_roundtrip_multiindex(self, columns):
  278. df = DataFrame(
  279. [[1, 2], [3, 4]],
  280. columns=pd.MultiIndex.from_arrays(columns),
  281. )
  282. data = StringIO(df.to_json(orient="split"))
  283. result = read_json(data, orient="split")
  284. tm.assert_frame_equal(result, df)
  285. @pytest.mark.parametrize(
  286. "data,msg,orient",
  287. [
  288. ('{"key":b:a:d}', "Expected object or value", "columns"),
  289. # too few indices
  290. (
  291. '{"columns":["A","B"],'
  292. '"index":["2","3"],'
  293. '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}',
  294. "|".join(
  295. [
  296. r"Length of values \(3\) does not match length of index \(2\)",
  297. ]
  298. ),
  299. "split",
  300. ),
  301. # too many columns
  302. (
  303. '{"columns":["A","B","C"],'
  304. '"index":["1","2","3"],'
  305. '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}',
  306. "3 columns passed, passed data had 2 columns",
  307. "split",
  308. ),
  309. # bad key
  310. (
  311. '{"badkey":["A","B"],'
  312. '"index":["2","3"],'
  313. '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}',
  314. r"unexpected key\(s\): badkey",
  315. "split",
  316. ),
  317. ],
  318. )
  319. def test_frame_from_json_bad_data_raises(self, data, msg, orient):
  320. with pytest.raises(ValueError, match=msg):
  321. read_json(StringIO(data), orient=orient)
  322. @pytest.mark.parametrize("dtype", [True, False])
  323. @pytest.mark.parametrize("convert_axes", [True, False])
  324. def test_frame_from_json_missing_data(self, orient, convert_axes, dtype):
  325. num_df = DataFrame([[1, 2], [4, 5, 6]])
  326. result = read_json(
  327. StringIO(num_df.to_json(orient=orient)),
  328. orient=orient,
  329. convert_axes=convert_axes,
  330. dtype=dtype,
  331. )
  332. assert np.isnan(result.iloc[0, 2])
  333. obj_df = DataFrame([["1", "2"], ["4", "5", "6"]])
  334. result = read_json(
  335. StringIO(obj_df.to_json(orient=orient)),
  336. orient=orient,
  337. convert_axes=convert_axes,
  338. dtype=dtype,
  339. )
  340. assert np.isnan(result.iloc[0, 2])
  341. @pytest.mark.parametrize("dtype", [True, False])
  342. def test_frame_read_json_dtype_missing_value(self, dtype):
  343. # GH28501 Parse missing values using read_json with dtype=False
  344. # to NaN instead of None
  345. result = read_json(StringIO("[null]"), dtype=dtype)
  346. expected = DataFrame([np.nan])
  347. tm.assert_frame_equal(result, expected)
  348. @pytest.mark.parametrize("inf", [np.inf, -np.inf])
  349. @pytest.mark.parametrize("dtype", [True, False])
  350. def test_frame_infinity(self, inf, dtype):
  351. # infinities get mapped to nulls which get mapped to NaNs during
  352. # deserialisation
  353. df = DataFrame([[1, 2], [4, 5, 6]])
  354. df.loc[0, 2] = inf
  355. data = StringIO(df.to_json())
  356. result = read_json(data, dtype=dtype)
  357. assert np.isnan(result.iloc[0, 2])
  358. @pytest.mark.skipif(not IS64, reason="not compliant on 32-bit, xref #15865")
  359. @pytest.mark.parametrize(
  360. "value,precision,expected_val",
  361. [
  362. (0.95, 1, 1.0),
  363. (1.95, 1, 2.0),
  364. (-1.95, 1, -2.0),
  365. (0.995, 2, 1.0),
  366. (0.9995, 3, 1.0),
  367. (0.99999999999999944, 15, 1.0),
  368. ],
  369. )
  370. def test_frame_to_json_float_precision(self, value, precision, expected_val):
  371. df = DataFrame([{"a_float": value}])
  372. encoded = df.to_json(double_precision=precision)
  373. assert encoded == f'{{"a_float":{{"0":{expected_val}}}}}'
  374. def test_frame_to_json_except(self):
  375. df = DataFrame([1, 2, 3])
  376. msg = "Invalid value 'garbage' for option 'orient'"
  377. with pytest.raises(ValueError, match=msg):
  378. df.to_json(orient="garbage")
  379. def test_frame_empty(self):
  380. df = DataFrame(columns=["jim", "joe"])
  381. assert not df._is_mixed_type
  382. data = StringIO(df.to_json())
  383. result = read_json(data, dtype=dict(df.dtypes))
  384. tm.assert_frame_equal(result, df, check_index_type=False)
  385. def test_frame_empty_to_json(self):
  386. # GH 7445
  387. df = DataFrame({"test": []}, index=[])
  388. result = df.to_json(orient="columns")
  389. expected = '{"test":{}}'
  390. assert result == expected
  391. def test_frame_empty_mixedtype(self):
  392. # mixed type
  393. df = DataFrame(columns=["jim", "joe"])
  394. df["joe"] = df["joe"].astype("i8")
  395. assert df._is_mixed_type
  396. data = df.to_json()
  397. tm.assert_frame_equal(
  398. read_json(StringIO(data), dtype=dict(df.dtypes)),
  399. df,
  400. check_index_type=False,
  401. )
  402. def test_frame_mixedtype_orient(self): # GH10289
  403. vals = [
  404. [10, 1, "foo", 0.1, 0.01],
  405. [20, 2, "bar", 0.2, 0.02],
  406. [30, 3, "baz", 0.3, 0.03],
  407. [40, 4, "qux", 0.4, 0.04],
  408. ]
  409. df = DataFrame(
  410. vals, index=list("abcd"), columns=["1st", "2nd", "3rd", "4th", "5th"]
  411. )
  412. assert df._is_mixed_type
  413. right = df.copy()
  414. for orient in ["split", "index", "columns"]:
  415. inp = StringIO(df.to_json(orient=orient))
  416. left = read_json(inp, orient=orient, convert_axes=False)
  417. tm.assert_frame_equal(left, right)
  418. right.index = RangeIndex(len(df))
  419. inp = StringIO(df.to_json(orient="records"))
  420. left = read_json(inp, orient="records", convert_axes=False)
  421. tm.assert_frame_equal(left, right)
  422. right.columns = RangeIndex(df.shape[1])
  423. inp = StringIO(df.to_json(orient="values"))
  424. left = read_json(inp, orient="values", convert_axes=False)
  425. tm.assert_frame_equal(left, right)
  def test_v12_compat(self, datapath):
      """Read the stored ``tsframe_v012`` JSON files and compare with a rebuilt frame."""
      # freq doesn't roundtrip
      dti = DatetimeIndex(
          np.asarray(date_range("2000-01-03", "2000-01-07")), freq=None
      )
      values = [
          [1.56808523, 0.65727391, 1.81021139, -0.17251653],
          [-0.2550111, -0.08072427, -0.03202878, -0.17581665],
          [1.51493992, 0.11805825, 1.629455, -1.31506612],
          [-0.02765498, 0.44679743, 0.33192641, -0.27885413],
          [0.05951614, -2.69652057, 1.28163262, 0.34703478],
      ]
      expected = DataFrame(values, columns=["A", "B", "C", "D"], index=dti)
      expected["date"] = Timestamp("19920106 18:21:32.12").as_unit("ns")
      date_pos = expected.columns.get_loc("date")
      expected.iloc[3, date_pos] = Timestamp("20130101")
      expected["modified"] = expected["date"]
      modified_pos = expected.columns.get_loc("modified")
      expected.iloc[1, modified_pos] = pd.NaT

      data_dir = datapath("io", "json", "data")

      epoch_result = read_json(os.path.join(data_dir, "tsframe_v012.json"))
      tm.assert_frame_equal(expected, epoch_result)

      # The ISO file is compared without the "modified" column.
      expected_iso = expected.drop(["modified"], axis=1)
      iso_result = read_json(os.path.join(data_dir, "tsframe_iso_v012.json"))
      tm.assert_frame_equal(expected_iso, iso_result, check_column_type=False)
  453. def test_blocks_compat_GH9037(self, using_infer_string):
  454. index = date_range("20000101", periods=10, freq="h")
  455. # freq doesn't round-trip
  456. index = DatetimeIndex(list(index), freq=None)
  457. df_mixed = DataFrame(
  458. {
  459. "float_1": [
  460. -0.92077639,
  461. 0.77434435,
  462. 1.25234727,
  463. 0.61485564,
  464. -0.60316077,
  465. 0.24653374,
  466. 0.28668979,
  467. -2.51969012,
  468. 0.95748401,
  469. -1.02970536,
  470. ],
  471. "int_1": [
  472. 19680418,
  473. 75337055,
  474. 99973684,
  475. 65103179,
  476. 79373900,
  477. 40314334,
  478. 21290235,
  479. 4991321,
  480. 41903419,
  481. 16008365,
  482. ],
  483. "str_1": [
  484. "78c608f1",
  485. "64a99743",
  486. "13d2ff52",
  487. "ca7f4af2",
  488. "97236474",
  489. "bde7e214",
  490. "1a6bde47",
  491. "b1190be5",
  492. "7a669144",
  493. "8d64d068",
  494. ],
  495. "float_2": [
  496. -0.0428278,
  497. -1.80872357,
  498. 3.36042349,
  499. -0.7573685,
  500. -0.48217572,
  501. 0.86229683,
  502. 1.08935819,
  503. 0.93898739,
  504. -0.03030452,
  505. 1.43366348,
  506. ],
  507. "str_2": [
  508. "14f04af9",
  509. "d085da90",
  510. "4bcfac83",
  511. "81504caf",
  512. "2ffef4a9",
  513. "08e2f5c4",
  514. "07e1af03",
  515. "addbd4a7",
  516. "1f6a09ba",
  517. "4bfc4d87",
  518. ],
  519. "int_2": [
  520. 86967717,
  521. 98098830,
  522. 51927505,
  523. 20372254,
  524. 12601730,
  525. 20884027,
  526. 34193846,
  527. 10561746,
  528. 24867120,
  529. 76131025,
  530. ],
  531. },
  532. index=index,
  533. )
  534. # JSON deserialisation always creates unicode strings
  535. df_mixed.columns = df_mixed.columns.astype(
  536. np.str_ if not using_infer_string else "str"
  537. )
  538. data = StringIO(df_mixed.to_json(orient="split"))
  539. df_roundtrip = read_json(data, orient="split")
  540. tm.assert_frame_equal(
  541. df_mixed,
  542. df_roundtrip,
  543. check_index_type=True,
  544. check_column_type=True,
  545. by_blocks=True,
  546. check_exact=True,
  547. )
  548. def test_frame_nonprintable_bytes(self):
  549. # GH14256: failing column caused segfaults, if it is not the last one
  550. class BinaryThing:
  551. def __init__(self, hexed) -> None:
  552. self.hexed = hexed
  553. self.binary = bytes.fromhex(hexed)
  554. def __str__(self) -> str:
  555. return self.hexed
  556. hexed = "574b4454ba8c5eb4f98a8f45"
  557. binthing = BinaryThing(hexed)
  558. # verify the proper conversion of printable content
  559. df_printable = DataFrame({"A": [binthing.hexed]})
  560. assert df_printable.to_json() == f'{{"A":{{"0":"{hexed}"}}}}'
  561. # check if non-printable content throws appropriate Exception
  562. df_nonprintable = DataFrame({"A": [binthing]})
  563. msg = "Unsupported UTF-8 sequence length when encoding string"
  564. with pytest.raises(OverflowError, match=msg):
  565. df_nonprintable.to_json()
  566. # the same with multiple columns threw segfaults
  567. df_mixed = DataFrame({"A": [binthing], "B": [1]}, columns=["A", "B"])
  568. with pytest.raises(OverflowError, match=msg):
  569. df_mixed.to_json()
  570. # default_handler should resolve exceptions for non-string types
  571. result = df_nonprintable.to_json(default_handler=str)
  572. expected = f'{{"A":{{"0":"{hexed}"}}}}'
  573. assert result == expected
  574. assert (
  575. df_mixed.to_json(default_handler=str)
  576. == f'{{"A":{{"0":"{hexed}"}},"B":{{"0":1}}}}'
  577. )
  578. def test_label_overflow(self):
  579. # GH14256: buffer length not checked when writing label
  580. result = DataFrame({"bar" * 100000: [1], "foo": [1337]}).to_json()
  581. expected = f'{{"{"bar" * 100000}":{{"0":1}},"foo":{{"0":1337}}}}'
  582. assert result == expected
  583. def test_series_non_unique_index(self):
  584. s = Series(["a", "b"], index=[1, 1])
  585. msg = "Series index must be unique for orient='index'"
  586. with pytest.raises(ValueError, match=msg):
  587. s.to_json(orient="index")
  588. tm.assert_series_equal(
  589. s,
  590. read_json(
  591. StringIO(s.to_json(orient="split")), orient="split", typ="series"
  592. ),
  593. )
  594. unserialized = read_json(
  595. StringIO(s.to_json(orient="records")), orient="records", typ="series"
  596. )
  597. tm.assert_equal(s.values, unserialized.values)
  598. def test_series_default_orient(self, string_series):
  599. assert string_series.to_json() == string_series.to_json(orient="index")
  600. def test_series_roundtrip_simple(self, orient, string_series, using_infer_string):
  601. data = StringIO(string_series.to_json(orient=orient))
  602. result = read_json(data, typ="series", orient=orient)
  603. expected = string_series
  604. if using_infer_string and orient in ("split", "index", "columns"):
  605. # These schemas don't contain dtypes, so we infer string
  606. expected.index = expected.index.astype("str")
  607. if orient in ("values", "records"):
  608. expected = expected.reset_index(drop=True)
  609. if orient != "split":
  610. expected.name = None
  611. tm.assert_series_equal(result, expected)
  612. @pytest.mark.parametrize("dtype", [False, None])
  613. def test_series_roundtrip_object(self, orient, dtype, object_series):
  614. data = StringIO(object_series.to_json(orient=orient))
  615. result = read_json(data, typ="series", orient=orient, dtype=dtype)
  616. expected = object_series
  617. if orient in ("values", "records"):
  618. expected = expected.reset_index(drop=True)
  619. if orient != "split":
  620. expected.name = None
  621. if using_string_dtype():
  622. expected = expected.astype("str")
  623. tm.assert_series_equal(result, expected)
  624. def test_series_roundtrip_empty(self, orient):
  625. empty_series = Series([], index=[], dtype=np.float64)
  626. data = StringIO(empty_series.to_json(orient=orient))
  627. result = read_json(data, typ="series", orient=orient)
  628. expected = empty_series.reset_index(drop=True)
  629. if orient in ("split"):
  630. expected.index = expected.index.astype(np.float64)
  631. tm.assert_series_equal(result, expected)
  632. def test_series_roundtrip_timeseries(self, orient, datetime_series):
  633. data = StringIO(datetime_series.to_json(orient=orient))
  634. result = read_json(data, typ="series", orient=orient)
  635. expected = datetime_series
  636. if orient in ("values", "records"):
  637. expected = expected.reset_index(drop=True)
  638. if orient != "split":
  639. expected.name = None
  640. tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize("dtype", [np.float64, int])
  def test_series_roundtrip_numeric(self, orient, dtype):
      """Round-trip an integer-valued Series with string labels through JSON."""
      # NOTE(review): the parametrized ``dtype`` is never used in the body, so
      # both parametrizations build the same Series -- confirm intent.
      labels = ["a", "b", "c", "d", "e", "f"]
      ser = Series(range(6), index=labels)
      buf = StringIO(ser.to_json(orient=orient))
      roundtripped = read_json(buf, typ="series", orient=orient)

      want = ser.copy()
      if orient in ("values", "records"):
          want = want.reset_index(drop=True)

      tm.assert_series_equal(roundtripped, want)
  def test_series_to_json_except(self):
      """An unknown ``orient`` makes ``Series.to_json`` raise ``ValueError``."""
      ser = Series([1, 2, 3])
      with pytest.raises(
          ValueError, match="Invalid value 'garbage' for option 'orient'"
      ):
          ser.to_json(orient="garbage")
  655. def test_series_from_json_precise_float(self):
  656. s = Series([4.56, 4.56, 4.56])
  657. result = read_json(StringIO(s.to_json()), typ="series", precise_float=True)
  658. tm.assert_series_equal(result, s, check_index_type=False)
  659. def test_series_with_dtype(self):
  660. # GH 21986
  661. s = Series([4.56, 4.56, 4.56])
  662. result = read_json(StringIO(s.to_json()), typ="series", dtype=np.int64)
  663. expected = Series([4] * 3)
  664. tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize(
      "dtype,expected",
      [
          (True, Series(["2000-01-01"], dtype="datetime64[ns]")),
          (False, Series([946684800000])),
      ],
  )
  def test_series_with_dtype_datetime(self, dtype, expected):
      """Read a serialized datetime Series with ``dtype`` True/False and compare."""
      source = Series(["2000-01-01"], dtype="datetime64[ns]")
      payload = StringIO(source.to_json())
      tm.assert_series_equal(read_json(payload, typ="series", dtype=dtype), expected)
  677. def test_frame_from_json_precise_float(self):
  678. df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]])
  679. result = read_json(StringIO(df.to_json()), precise_float=True)
  680. tm.assert_frame_equal(result, df)
  681. def test_typ(self):
  682. s = Series(range(6), index=["a", "b", "c", "d", "e", "f"], dtype="int64")
  683. result = read_json(StringIO(s.to_json()), typ=None)
  684. tm.assert_series_equal(result, s)
  685. def test_reconstruction_index(self):
  686. df = DataFrame([[1, 2, 3], [4, 5, 6]])
  687. result = read_json(StringIO(df.to_json()))
  688. tm.assert_frame_equal(result, df)
  689. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["A", "B", "C"])
  690. result = read_json(StringIO(df.to_json()))
  691. tm.assert_frame_equal(result, df)
  def test_path(self, float_frame, int_frame, datetime_frame):
      """Smoke test: each fixture frame can be written to, and read from, a path."""
      frames = (float_frame, int_frame, datetime_frame)
      with tm.ensure_clean("test.json") as json_path:
          for frame in frames:
              # No comparison is made; the calls only need to succeed.
              frame.to_json(json_path)
              read_json(json_path)
  697. def test_axis_dates(self, datetime_series, datetime_frame):
  698. # frame
  699. json = StringIO(datetime_frame.to_json())
  700. result = read_json(json)
  701. tm.assert_frame_equal(result, datetime_frame)
  702. # series
  703. json = StringIO(datetime_series.to_json())
  704. result = read_json(json, typ="series")
  705. tm.assert_series_equal(result, datetime_series, check_names=False)
  706. assert result.name is None
  707. def test_convert_dates(self, datetime_series, datetime_frame):
  708. # frame
  709. df = datetime_frame
  710. df["date"] = Timestamp("20130101").as_unit("ns")
  711. json = StringIO(df.to_json())
  712. result = read_json(json)
  713. tm.assert_frame_equal(result, df)
  714. df["foo"] = 1.0
  715. json = StringIO(df.to_json(date_unit="ns"))
  716. result = read_json(json, convert_dates=False)
  717. expected = df.copy()
  718. expected["date"] = expected["date"].values.view("i8")
  719. expected["foo"] = expected["foo"].astype("int64")
  720. tm.assert_frame_equal(result, expected)
  721. # series
  722. ts = Series(Timestamp("20130101").as_unit("ns"), index=datetime_series.index)
  723. json = StringIO(ts.to_json())
  724. result = read_json(json, typ="series")
  725. tm.assert_series_equal(result, ts)
  @pytest.mark.parametrize("date_format", ["epoch", "iso"])
  @pytest.mark.parametrize("as_object", [True, False])
  @pytest.mark.parametrize("date_typ", [datetime.date, datetime.datetime, Timestamp])
  def test_date_index_and_values(self, date_format, as_object, date_typ):
      """Dates used as both index and values serialize per ``date_format``."""
      values = [date_typ(year=2020, month=1, day=1), pd.NaT]
      if as_object:
          # a plain string is appended when ``as_object`` is set
          values.append("a")

      ser = Series(values, index=values)
      result = ser.to_json(date_format=date_format)

      expected = (
          '{"1577836800000":1577836800000,"null":null}'
          if date_format == "epoch"
          else '{"2020-01-01T00:00:00.000":"2020-01-01T00:00:00.000","null":null}'
      )
      if as_object:
          expected = expected.replace("}", ',"a":"a"}')

      assert result == expected
  @pytest.mark.parametrize(
      "infer_word",
      [
          "trade_time",
          "date",
          "datetime",
          "sold_at",
          "modified",
          "timestamp",
          "timestamps",
      ],
  )
  def test_convert_dates_infer(self, infer_word):
      """A column named ``infer_word`` is expected to be parsed as datetimes."""
      # GH10747
      records = [{"id": 1, infer_word: 1036713600000}, {"id": 2}]
      wanted_columns = ["id", infer_word]
      expected = DataFrame(
          [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=wanted_columns
      )

      parsed = read_json(StringIO(ujson_dumps(records)))
      tm.assert_frame_equal(parsed[wanted_columns], expected)
  @pytest.mark.parametrize(
      "date,date_unit",
      [
          ("20130101 20:43:42.123", None),
          ("20130101 20:43:42", "s"),
          ("20130101 20:43:42.123", "ms"),
          ("20130101 20:43:42.123456", "us"),
          ("20130101 20:43:42.123456789", "ns"),
      ],
  )
  def test_date_format_frame(self, date, date_unit, datetime_frame):
      """A frame written with ``date_format="iso"`` round-trips, NaT included."""
      frame = datetime_frame
      frame["date"] = Timestamp(date).as_unit("ns")
      date_col = frame.columns.get_loc("date")
      for row in (1, 5):
          frame.iloc[row, date_col] = pd.NaT

      # Only forward ``date_unit`` when the parametrization supplies one.
      to_json_kwargs = {"date_format": "iso"}
      if date_unit:
          to_json_kwargs["date_unit"] = date_unit
      payload = frame.to_json(**to_json_kwargs)

      roundtripped = read_json(StringIO(payload))
      tm.assert_frame_equal(roundtripped, frame.copy())
  def test_date_format_frame_raises(self, datetime_frame):
      """An unknown ``date_unit`` makes ``DataFrame.to_json`` raise ``ValueError``."""
      with pytest.raises(
          ValueError, match="Invalid value 'foo' for option 'date_unit'"
      ):
          datetime_frame.to_json(date_format="iso", date_unit="foo")
  @pytest.mark.parametrize(
      "date,date_unit",
      [
          ("20130101 20:43:42.123", None),
          ("20130101 20:43:42", "s"),
          ("20130101 20:43:42.123", "ms"),
          ("20130101 20:43:42.123456", "us"),
          ("20130101 20:43:42.123456789", "ns"),
      ],
  )
  def test_date_format_series(self, date, date_unit, datetime_series):
      """A Series written with ``date_format="iso"`` round-trips, NaT included."""
      ser = Series(Timestamp(date).as_unit("ns"), index=datetime_series.index)
      for position in (1, 5):
          ser.iloc[position] = pd.NaT

      # Only forward ``date_unit`` when the parametrization supplies one.
      to_json_kwargs = {"date_format": "iso"}
      if date_unit:
          to_json_kwargs["date_unit"] = date_unit
      payload = ser.to_json(**to_json_kwargs)

      roundtripped = read_json(StringIO(payload), typ="series")
      tm.assert_series_equal(roundtripped, ser.copy())
  def test_date_format_series_raises(self, datetime_series):
      """An unknown ``date_unit`` makes ``Series.to_json`` raise ``ValueError``."""
      stamp = Timestamp("20130101 20:43:42.123")
      ser = Series(stamp, index=datetime_series.index)
      with pytest.raises(
          ValueError, match="Invalid value 'foo' for option 'date_unit'"
      ):
          ser.to_json(date_format="iso", date_unit="foo")
  @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
  def test_date_unit(self, unit, datetime_frame):
      """Epoch dates written with ``unit`` read back with it forced or detected."""
      frame = datetime_frame
      frame["date"] = Timestamp("20130101 20:43:42").as_unit("ns")
      date_col = frame.columns.get_loc("date")

      # Overwrite a few rows with other dates and a missing value.
      overrides = {
          1: Timestamp("19710101 20:43:42"),
          2: Timestamp("21460101 20:43:42"),
          4: pd.NaT,
      }
      for row, value in overrides.items():
          frame.iloc[row, date_col] = value

      payload = frame.to_json(date_format="epoch", date_unit=unit)

      # first force the date unit, then let read_json detect it
      for read_unit in (unit, None):
          roundtripped = read_json(StringIO(payload), date_unit=read_unit)
          tm.assert_frame_equal(roundtripped, frame)
  @pytest.mark.parametrize("unit", ["s", "ms", "us"])
  def test_iso_non_nano_datetimes(self, unit):
      """Non-nanosecond datetimes in an index or column serialize correctly."""
      # Test that numpy datetimes
      # in an Index or a column with non-nano resolution can be serialized
      # correctly
      # GH53686
      unit_dtype = f"datetime64[{unit}]"
      index = DatetimeIndex(
          [np.datetime64("2023-01-01T11:22:33.123456", unit)],
          dtype=unit_dtype,
      )
      date_col = Series(
          [np.datetime64("2022-01-01T11:22:33.123456", unit)],
          dtype=unit_dtype,
          index=index,
      )
      date_obj_col = Series(
          [np.datetime64("2023-01-01T11:22:33.123456", unit)],
          dtype=object,
          index=index,
      )
      frame = DataFrame({"date": date_col, "date_obj": date_obj_col})

      buf = StringIO()
      frame.to_json(buf, date_format="iso", date_unit=unit)
      buf.seek(0)

      # read_json always reads datetimes in nanosecond resolution
      # TODO: check_dtype/check_index_type should be removable
      # once read_json gets non-nano support
      roundtripped = read_json(buf, convert_dates=["date", "date_obj"])
      tm.assert_frame_equal(
          roundtripped,
          frame,
          check_index_type=False,
          check_dtype=False,
      )
  868. def test_weird_nested_json(self):
  869. # this used to core dump the parser
  870. s = r"""{
  871. "status": "success",
  872. "data": {
  873. "posts": [
  874. {
  875. "id": 1,
  876. "title": "A blog post",
  877. "body": "Some useful content"
  878. },
  879. {
  880. "id": 2,
  881. "title": "Another blog post",
  882. "body": "More content"
  883. }
  884. ]
  885. }
  886. }"""
  887. read_json(StringIO(s))
  888. def test_doc_example(self):
  889. dfj2 = DataFrame(
  890. np.random.default_rng(2).standard_normal((5, 2)), columns=list("AB")
  891. )
  892. dfj2["date"] = Timestamp("20130101")
  893. dfj2["ints"] = range(5)
  894. dfj2["bools"] = True
  895. dfj2.index = date_range("20130101", periods=5)
  896. json = StringIO(dfj2.to_json())
  897. result = read_json(json, dtype={"ints": np.int64, "bools": np.bool_})
  898. tm.assert_frame_equal(result, result)
  def test_round_trip_exception(self, datapath):
      """The ``teams.csv`` data survives a JSON round trip after realignment."""
      # GH 3867
      csv_path = datapath("io", "json", "data", "teams.csv")
      source = pd.read_csv(csv_path)

      roundtripped = read_json(StringIO(source.to_json()))
      # realign rows and columns with the source frame before comparing
      aligned = roundtripped.reindex(index=source.index, columns=source.columns)

      with tm.assert_produces_warning(
          FutureWarning, match="The 'downcast' keyword in fillna is deprecated"
      ):
          aligned = aligned.fillna(np.nan, downcast=False)

      tm.assert_frame_equal(aligned, source)
  @pytest.mark.network
  @pytest.mark.single_cpu
  @pytest.mark.parametrize(
      "field,dtype",
      [
          ["created_at", pd.DatetimeTZDtype(tz="UTC")],
          ["closed_at", "datetime64[ns]"],
          ["updated_at", pd.DatetimeTZDtype(tz="UTC")],
      ],
  )
  def test_url(self, field, dtype, httpserver):
      """Date fields read from a URL get the parametrized dtype."""
      payload = '{"created_at": ["2023-06-23T18:21:36Z"], "closed_at": ["2023-06-23T18:21:36"], "updated_at": ["2023-06-23T18:21:36Z"]}\n'  # noqa: E501
      httpserver.serve_content(content=payload)

      frame = read_json(httpserver.url, convert_dates=True)
      assert frame[field].dtype == dtype
  925. def test_timedelta(self):
  926. converter = lambda x: pd.to_timedelta(x, unit="ms")
  927. ser = Series([timedelta(23), timedelta(seconds=5)])
  928. assert ser.dtype == "timedelta64[ns]"
  929. result = read_json(StringIO(ser.to_json()), typ="series").apply(converter)
  930. tm.assert_series_equal(result, ser)
  931. ser = Series([timedelta(23), timedelta(seconds=5)], index=Index([0, 1]))
  932. assert ser.dtype == "timedelta64[ns]"
  933. result = read_json(StringIO(ser.to_json()), typ="series").apply(converter)
  934. tm.assert_series_equal(result, ser)
  935. frame = DataFrame([timedelta(23), timedelta(seconds=5)])
  936. assert frame[0].dtype == "timedelta64[ns]"
  937. tm.assert_frame_equal(
  938. frame, read_json(StringIO(frame.to_json())).apply(converter)
  939. )
  940. def test_timedelta2(self):
  941. frame = DataFrame(
  942. {
  943. "a": [timedelta(days=23), timedelta(seconds=5)],
  944. "b": [1, 2],
  945. "c": date_range(start="20130101", periods=2),
  946. }
  947. )
  948. data = StringIO(frame.to_json(date_unit="ns"))
  949. result = read_json(data)
  950. result["a"] = pd.to_timedelta(result.a, unit="ns")
  951. result["c"] = pd.to_datetime(result.c)
  952. tm.assert_frame_equal(frame, result)
  953. def test_mixed_timedelta_datetime(self):
  954. td = timedelta(23)
  955. ts = Timestamp("20130101")
  956. frame = DataFrame({"a": [td, ts]}, dtype=object)
  957. expected = DataFrame(
  958. {"a": [pd.Timedelta(td).as_unit("ns")._value, ts.as_unit("ns")._value]}
  959. )
  960. data = StringIO(frame.to_json(date_unit="ns"))
  961. result = read_json(data, dtype={"a": "int64"})
  962. tm.assert_frame_equal(result, expected, check_index_type=False)
  @pytest.mark.parametrize("as_object", [True, False])
  @pytest.mark.parametrize("date_format", ["iso", "epoch"])
  @pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta])
  def test_timedelta_to_json(self, as_object, date_format, timedelta_typ):
      """Timedeltas used as index and values serialize per ``date_format``."""
      # GH28156: to_json not correctly formatting Timedelta
      values = [timedelta_typ(days=1), timedelta_typ(days=2), pd.NaT]
      if as_object:
          # a plain string is appended when ``as_object`` is set
          values.append("a")
      ser = Series(values, index=values)

      expected = (
          '{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}'
          if date_format == "iso"
          else '{"86400000":86400000,"172800000":172800000,"null":null}'
      )
      if as_object:
          expected = expected.replace("}", ',"a":"a"}')

      assert ser.to_json(date_format=date_format) == expected
  @pytest.mark.parametrize("as_object", [True, False])
  @pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta])
  def test_timedelta_to_json_fractional_precision(self, as_object, timedelta_typ):
      """A 42 ms timedelta serializes as ``42`` for both key and value."""
      values = [timedelta_typ(milliseconds=42)]
      ser = Series(values, index=values)
      if as_object:
          ser = ser.astype(object)

      assert ser.to_json() == '{"42":42}'
  992. def test_default_handler(self):
  993. value = object()
  994. frame = DataFrame({"a": [7, value]})
  995. expected = DataFrame({"a": [7, str(value)]})
  996. result = read_json(StringIO(frame.to_json(default_handler=str)))
  997. tm.assert_frame_equal(expected, result, check_index_type=False)
  998. def test_default_handler_indirect(self):
  999. def default(obj):
  1000. if isinstance(obj, complex):
  1001. return [("mathjs", "Complex"), ("re", obj.real), ("im", obj.imag)]
  1002. return str(obj)
  1003. df_list = [
  1004. 9,
  1005. DataFrame(
  1006. {"a": [1, "STR", complex(4, -5)], "b": [float("nan"), None, "N/A"]},
  1007. columns=["a", "b"],
  1008. ),
  1009. ]
  1010. expected = (
  1011. '[9,[[1,null],["STR",null],[[["mathjs","Complex"],'
  1012. '["re",4.0],["im",-5.0]],"N\\/A"]]]'
  1013. )
  1014. assert (
  1015. ujson_dumps(df_list, default_handler=default, orient="values") == expected
  1016. )
  1017. def test_default_handler_numpy_unsupported_dtype(self):
  1018. # GH12554 to_json raises 'Unhandled numpy dtype 15'
  1019. df = DataFrame(
  1020. {"a": [1, 2.3, complex(4, -5)], "b": [float("nan"), None, complex(1.2, 0)]},
  1021. columns=["a", "b"],
  1022. )
  1023. expected = (
  1024. '[["(1+0j)","(nan+0j)"],'
  1025. '["(2.3+0j)","(nan+0j)"],'
  1026. '["(4-5j)","(1.2+0j)"]]'
  1027. )
  1028. assert df.to_json(default_handler=str, orient="values") == expected
  def test_default_handler_raises(self):
      """An exception raised inside ``default_handler`` propagates out of ``to_json``."""
      msg = "raisin"

      def my_handler_raises(obj):
          raise TypeError(msg)

      # once with a plain object, once with a complex number
      for unsupported in (object(), complex(4, -5)):
          frame = DataFrame({"a": [1, 2, unsupported]})
          with pytest.raises(TypeError, match=msg):
              frame.to_json(default_handler=my_handler_raises)
  1041. def test_categorical(self):
  1042. # GH4377 df.to_json segfaults with non-ndarray blocks
  1043. df = DataFrame({"A": ["a", "b", "c", "a", "b", "b", "a"]})
  1044. df["B"] = df["A"]
  1045. expected = df.to_json()
  1046. df["B"] = df["A"].astype("category")
  1047. assert expected == df.to_json()
  1048. s = df["A"]
  1049. sc = df["B"]
  1050. assert s.to_json() == sc.to_json()
  1051. def test_datetime_tz(self):
  1052. # GH4377 df.to_json segfaults with non-ndarray blocks
  1053. tz_range = date_range("20130101", periods=3, tz="US/Eastern")
  1054. tz_naive = tz_range.tz_convert("utc").tz_localize(None)
  1055. df = DataFrame({"A": tz_range, "B": date_range("20130101", periods=3)})
  1056. df_naive = df.copy()
  1057. df_naive["A"] = tz_naive
  1058. expected = df_naive.to_json()
  1059. assert expected == df.to_json()
  1060. stz = Series(tz_range)
  1061. s_naive = Series(tz_naive)
  1062. assert stz.to_json() == s_naive.to_json()
  1063. def test_sparse(self):
  1064. # GH4377 df.to_json segfaults with non-ndarray blocks
  1065. df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
  1066. df.loc[:8] = np.nan
  1067. sdf = df.astype("Sparse")
  1068. expected = df.to_json()
  1069. assert expected == sdf.to_json()
  1070. s = Series(np.random.default_rng(2).standard_normal(10))
  1071. s.loc[:8] = np.nan
  1072. ss = s.astype("Sparse")
  1073. expected = s.to_json()
  1074. assert expected == ss.to_json()
  @pytest.mark.parametrize(
      "ts",
      [
          Timestamp("2013-01-10 05:00:00Z"),
          Timestamp("2013-01-10 00:00:00", tz="US/Eastern"),
          Timestamp("2013-01-10 00:00:00-0500"),
      ],
  )
  def test_tz_is_utc(self, ts):
      """tz-aware timestamps dump to the same UTC ISO string with a ``Z`` suffix."""
      expected = '"2013-01-10T05:00:00.000Z"'

      # pandas Timestamp
      assert ujson_dumps(ts, iso_dates=True) == expected

      # stdlib datetime built from the same instant
      as_pydatetime = ts.to_pydatetime()
      assert ujson_dumps(as_pydatetime, iso_dates=True) == expected
  1088. def test_tz_is_naive(self):
  1089. ts = Timestamp("2013-01-10 05:00:00")
  1090. exp = '"2013-01-10T05:00:00.000"'
  1091. assert ujson_dumps(ts, iso_dates=True) == exp
  1092. dt = ts.to_pydatetime()
  1093. assert ujson_dumps(dt, iso_dates=True) == exp
  @pytest.mark.parametrize(
      "tz_range",
      [
          date_range("2013-01-01 05:00:00Z", periods=2),
          date_range("2013-01-01 00:00:00", periods=2, tz="US/Eastern"),
          date_range("2013-01-01 00:00:00-0500", periods=2),
      ],
  )
  def test_tz_range_is_utc(self, tz_range):
      """tz-aware ranges dump to UTC ISO strings as index, object array and frame."""
      exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]'
      dfexp = (
          '{"DT":{'
          '"0":"2013-01-01T05:00:00.000Z",'
          '"1":"2013-01-02T05:00:00.000Z"}}'
      )

      assert ujson_dumps(tz_range, iso_dates=True) == exp
      dti = DatetimeIndex(tz_range)
      # Ensure datetimes in object array are serialized correctly
      # in addition to the normal DTI case
      assert ujson_dumps(dti, iso_dates=True) == exp
      assert ujson_dumps(dti.astype(object), iso_dates=True) == exp

      df = DataFrame({"DT": dti})
      result = ujson_dumps(df, iso_dates=True)
      assert result == dfexp
      # Compare the object-dtype frame with the expected payload too; the
      # former bare ``assert ujson_dumps(...)`` only checked for a non-empty
      # string and so could not detect wrong output.
      assert ujson_dumps(df.astype({"DT": object}), iso_dates=True) == dfexp
  1119. def test_tz_range_is_naive(self):
  1120. dti = date_range("2013-01-01 05:00:00", periods=2)
  1121. exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]'
  1122. dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}'
  1123. # Ensure datetimes in object array are serialized correctly
  1124. # in addition to the normal DTI case
  1125. assert ujson_dumps(dti, iso_dates=True) == exp
  1126. assert ujson_dumps(dti.astype(object), iso_dates=True) == exp
  1127. df = DataFrame({"DT": dti})
  1128. result = ujson_dumps(df, iso_dates=True)
  1129. assert result == dfexp
  1130. assert ujson_dumps(df.astype({"DT": object}), iso_dates=True)
  1131. def test_read_inline_jsonl(self):
  1132. # GH9180
  1133. result = read_json(StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'), lines=True)
  1134. expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
  1135. tm.assert_frame_equal(result, expected)
  @pytest.mark.single_cpu
  @td.skip_if_not_us_locale
  def test_read_s3_jsonl(self, s3_public_bucket_with_data, s3so):
      """``items.jsonl`` from the S3 bucket fixture reads as the expected frame."""
      # GH17200
      url = f"s3n://{s3_public_bucket_with_data.name}/items.jsonl"
      parsed = read_json(url, lines=True, storage_options=s3so)
      tm.assert_frame_equal(parsed, DataFrame([[1, 2], [1, 2]], columns=["a", "b"]))
  1147. def test_read_local_jsonl(self):
  1148. # GH17200
  1149. with tm.ensure_clean("tmp_items.json") as path:
  1150. with open(path, "w", encoding="utf-8") as infile:
  1151. infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
  1152. result = read_json(path, lines=True)
  1153. expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
  1154. tm.assert_frame_equal(result, expected)
  1155. def test_read_jsonl_unicode_chars(self):
  1156. # GH15132: non-ascii unicode characters
  1157. # \u201d == RIGHT DOUBLE QUOTATION MARK
  1158. # simulate file handle
  1159. json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
  1160. json = StringIO(json)
  1161. result = read_json(json, lines=True)
  1162. expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
  1163. tm.assert_frame_equal(result, expected)
  1164. # simulate string
  1165. json = StringIO('{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n')
  1166. result = read_json(json, lines=True)
  1167. expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
  1168. tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
  def test_to_json_large_numbers(self, bigNum):
      """Integers just outside the ``sys.maxsize`` range are written verbatim."""
      # GH34473
      digits = str(bigNum)

      ser = Series(bigNum, dtype=object, index=["articleId"])
      assert ser.to_json() == '{"articleId":' + digits + "}"

      frame = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0])
      assert frame.to_json() == '{"0":{"articleId":' + digits + "}}"
  @pytest.mark.parametrize("bigNum", [-(2**63) - 1, 2**64])
  def test_read_json_large_numbers(self, bigNum):
      """Out-of-range integers make ``read_json`` raise ``ValueError``."""
      # GH20599, 26068
      msg = r"Value is too small|Value is too big"
      digits = str(bigNum)

      # flat object first, then the nested one
      flat = StringIO('{"articleId":' + digits + "}")
      with pytest.raises(ValueError, match=msg):
          read_json(flat)

      nested = StringIO('{"0":{"articleId":' + digits + "}}")
      with pytest.raises(ValueError, match=msg):
          read_json(nested)
  1190. def test_read_json_large_numbers2(self):
  1191. # GH18842
  1192. json = '{"articleId": "1404366058080022500245"}'
  1193. json = StringIO(json)
  1194. result = read_json(json, typ="series")
  1195. expected = Series(1.404366e21, index=["articleId"])
  1196. tm.assert_series_equal(result, expected)
  1197. json = '{"0": {"articleId": "1404366058080022500245"}}'
  1198. json = StringIO(json)
  1199. result = read_json(json)
  1200. expected = DataFrame(1.404366e21, index=["articleId"], columns=[0])
  1201. tm.assert_frame_equal(result, expected)
  1202. def test_to_jsonl(self):
  1203. # GH9180
  1204. df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
  1205. result = df.to_json(orient="records", lines=True)
  1206. expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n'
  1207. assert result == expected
  1208. df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"])
  1209. result = df.to_json(orient="records", lines=True)
  1210. expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n'
  1211. assert result == expected
  1212. tm.assert_frame_equal(read_json(StringIO(result), lines=True), df)
  1213. # GH15096: escaped characters in columns and data
  1214. df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"])
  1215. result = df.to_json(orient="records", lines=True)
  1216. expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n'
  1217. assert result == expected
  1218. tm.assert_frame_equal(read_json(StringIO(result), lines=True), df)
  # TODO: there is a near-identical test for pytables; can we share?
  @pytest.mark.xfail(reason="GH#13774 encoding kwarg not supported", raises=TypeError)
  @pytest.mark.parametrize(
      "val",
      [
          [b"E\xc9, 17", b"", b"a", b"b", b"c"],
          [b"E\xc9, 17", b"a", b"b", b"c"],
          [b"EE, 17", b"", b"a", b"b", b"c"],
          [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"],
          [b"", b"a", b"b", b"c"],
          [b"\xf8\xfc", b"a", b"b", b"c"],
          [b"A\xf8\xfc", b"", b"a", b"b", b"c"],
          [np.nan, b"", b"b", b"c"],
          [b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
      ],
  )
  @pytest.mark.parametrize("dtype", ["category", object])
  def test_latin_encoding(self, dtype, val):
      """Round-trip latin-1 text through a JSON file (currently expected to fail)."""
      # GH 13774
      ser = Series(
          [x.decode("latin-1") if isinstance(x, bytes) else x for x in val],
          dtype=dtype,
      )
      encoding = "latin-1"
      with tm.ensure_clean("test.json") as path:
          ser.to_json(path, encoding=encoding)
          # Read the file that was just written. ``StringIO(path)`` made
          # read_json parse the path text itself as JSON, and the default
          # ``typ`` returns a DataFrame that cannot be compared with ``ser``.
          retr = read_json(path, typ="series", encoding=encoding)
          tm.assert_series_equal(ser, retr, check_categorical=False)
  1247. def test_data_frame_size_after_to_json(self):
  1248. # GH15344
  1249. df = DataFrame({"a": [str(1)]})
  1250. size_before = df.memory_usage(index=True, deep=True).sum()
  1251. df.to_json()
  1252. size_after = df.memory_usage(index=True, deep=True).sum()
  1253. assert size_before == size_after
  @pytest.mark.parametrize(
      "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]]
  )
  @pytest.mark.parametrize("columns", [["a", "b"], ["1", "2"], ["1.", "2."]])
  def test_from_json_to_json_table_index_and_columns(self, index, columns):
      """``orient="table"`` round-trips the given index and column labels."""
      # GH25433 GH25435
      frame = DataFrame([[1, 2], [3, 4]], index=index, columns=columns)
      table_buf = StringIO(frame.to_json(orient="table"))
      tm.assert_frame_equal(read_json(table_buf, orient="table"), frame)
  1264. def test_from_json_to_json_table_dtypes(self):
  1265. # GH21345
  1266. expected = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]})
  1267. dfjson = expected.to_json(orient="table")
  1268. result = read_json(StringIO(dfjson), orient="table")
  1269. tm.assert_frame_equal(result, expected)
  # TODO: We are casting to string which coerces None to NaN before casting back
  # to object, ending up with incorrect na values
  @pytest.mark.xfail(using_string_dtype(), reason="incorrect na conversion")
  @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"])
  def test_to_json_from_json_columns_dtypes(self, orient):
      """Per-column ``dtype`` passed to ``read_json`` restores a mixed-dtype frame."""
      # GH21892 GH33205
      columns = {
          "Integer": Series([1, 2, 3], dtype="int64"),
          "Float": Series([None, 2.0, 3.0], dtype="float64"),
          "Object": Series([None, "", "c"], dtype="object"),
          "Bool": Series([True, False, True], dtype="bool"),
          "Category": Series(["a", "b", None], dtype="category"),
          "Datetime": Series(
              ["2020-01-01", None, "2020-01-03"], dtype="datetime64[ns]"
          ),
      }
      expected = DataFrame.from_dict(columns)

      # dtype requested for each column when reading back
      read_dtypes = {
          "Integer": "int64",
          "Float": "float64",
          "Object": "object",
          "Bool": "bool",
          "Category": "category",
          "Datetime": "datetime64[ns]",
      }
      buf = StringIO(expected.to_json(orient=orient))
      result = read_json(buf, orient=orient, dtype=read_dtypes)
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}])
  def test_read_json_table_dtype_raises(self, dtype):
      """Passing ``dtype`` together with ``orient="table"`` raises ``ValueError``."""
      # GH21345
      df = DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["5", "6"]})
      dfjson = df.to_json(orient="table")
      msg = "cannot pass both dtype and orient='table'"
      with pytest.raises(ValueError, match=msg):
          # Wrap the payload in StringIO like the other tests in this file:
          # passing a literal JSON string to read_json is deprecated.
          read_json(StringIO(dfjson), orient="table", dtype=dtype)
  @pytest.mark.parametrize("orient", ["index", "columns", "records", "values"])
  def test_read_json_table_empty_axes_dtype(self, orient):
      """Reading ``{}`` yields axes equal to those of an empty ``DataFrame``."""
      # GH28558
      empty = DataFrame()
      parsed = read_json(StringIO("{}"), orient=orient, convert_axes=True)
      tm.assert_index_equal(parsed.index, empty.index)
      tm.assert_index_equal(parsed.columns, empty.columns)
  def test_read_json_table_convert_axes_raises(self):
      """Passing ``convert_axes`` with ``orient="table"`` raises ``ValueError``."""
      # GH25433 GH25435
      df = DataFrame([[1, 2], [3, 4]], index=[1.0, 2.0], columns=["1.", "2."])
      dfjson = df.to_json(orient="table")
      msg = "cannot pass both convert_axes and orient='table'"
      with pytest.raises(ValueError, match=msg):
          # Wrap the payload in StringIO like the other tests in this file:
          # passing a literal JSON string to read_json is deprecated.
          read_json(StringIO(dfjson), orient="table", convert_axes=True)
  @pytest.mark.parametrize(
      "data, expected",
      [
          (
              DataFrame([[1, 2], [4, 5]], columns=["a", "b"]),
              {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]},
          ),
          (
              DataFrame([[1, 2], [4, 5]], columns=["a", "b"]).rename_axis("foo"),
              {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]},
          ),
          (
              DataFrame(
                  [[1, 2], [4, 5]], columns=["a", "b"], index=[["a", "b"], ["c", "d"]]
              ),
              {"columns": ["a", "b"], "data": [[1, 2], [4, 5]]},
          ),
          (Series([1, 2, 3], name="A"), {"name": "A", "data": [1, 2, 3]}),
          (
              Series([1, 2, 3], name="A").rename_axis("foo"),
              {"name": "A", "data": [1, 2, 3]},
          ),
          (
              Series([1, 2], name="A", index=[["a", "b"], ["c", "d"]]),
              {"name": "A", "data": [1, 2]},
          ),
      ],
  )
  def test_index_false_to_json_split(self, data, expected):
      """``index=False`` with ``orient="split"`` leaves the index out of the output."""
      # GH 17394
      # Testing index=False in to_json with orient='split'
      serialized = data.to_json(orient="split", index=False)
      assert json.loads(serialized) == expected
  @pytest.mark.parametrize(
      "data",
      [
          (DataFrame([[1, 2], [4, 5]], columns=["a", "b"])),
          (DataFrame([[1, 2], [4, 5]], columns=["a", "b"]).rename_axis("foo")),
          (
              DataFrame(
                  [[1, 2], [4, 5]], columns=["a", "b"], index=[["a", "b"], ["c", "d"]]
              )
          ),
          (Series([1, 2, 3], name="A")),
          (Series([1, 2, 3], name="A").rename_axis("foo")),
          (Series([1, 2], name="A", index=[["a", "b"], ["c", "d"]])),
      ],
  )
  def test_index_false_to_json_table(self, data):
      """``index=False`` with ``orient="table"`` gives an index-free schema and data."""
      # GH 17394
      # Testing index=False in to_json with orient='table'
      decoded = json.loads(data.to_json(orient="table", index=False))

      schema = pd.io.json.build_table_schema(data, index=False)
      records = DataFrame(data).to_dict(orient="records")
      assert decoded == {"schema": schema, "data": records}
  1383. @pytest.mark.parametrize("orient", ["index", "columns"])
  1384. def test_index_false_error_to_json(self, orient):
  1385. # GH 17394, 25513
  1386. # Testing error message from to_json with index=False
  1387. df = DataFrame([[1, 2], [4, 5]], columns=["a", "b"])
  1388. msg = (
  1389. "'index=False' is only valid when 'orient' is 'split', "
  1390. "'table', 'records', or 'values'"
  1391. )
  1392. with pytest.raises(ValueError, match=msg):
  1393. df.to_json(orient=orient, index=False)
  1394. @pytest.mark.parametrize("orient", ["records", "values"])
  1395. def test_index_true_error_to_json(self, orient):
  1396. # GH 25513
  1397. # Testing error message from to_json with index=True
  1398. df = DataFrame([[1, 2], [4, 5]], columns=["a", "b"])
  1399. msg = (
  1400. "'index=True' is only valid when 'orient' is 'split', "
  1401. "'table', 'index', or 'columns'"
  1402. )
  1403. with pytest.raises(ValueError, match=msg):
  1404. df.to_json(orient=orient, index=True)
  1405. @pytest.mark.parametrize("orient", ["split", "table"])
  1406. @pytest.mark.parametrize("index", [True, False])
  1407. def test_index_false_from_json_to_json(self, orient, index):
  1408. # GH25170
  1409. # Test index=False in from_json to_json
  1410. expected = DataFrame({"a": [1, 2], "b": [3, 4]})
  1411. dfjson = expected.to_json(orient=orient, index=index)
  1412. result = read_json(StringIO(dfjson), orient=orient)
  1413. tm.assert_frame_equal(result, expected)
  1414. def test_read_timezone_information(self):
  1415. # GH 25546
  1416. result = read_json(
  1417. StringIO('{"2019-01-01T11:00:00.000Z":88}'), typ="series", orient="index"
  1418. )
  1419. exp_dti = DatetimeIndex(["2019-01-01 11:00:00"], dtype="M8[ns, UTC]")
  1420. expected = Series([88], index=exp_dti)
  1421. tm.assert_series_equal(result, expected)
  1422. @pytest.mark.parametrize(
  1423. "url",
  1424. [
  1425. "s3://example-fsspec/",
  1426. "gcs://another-fsspec/file.json",
  1427. "https://example-site.com/data",
  1428. "some-protocol://data.txt",
  1429. ],
  1430. )
  1431. def test_read_json_with_url_value(self, url):
  1432. # GH 36271
  1433. result = read_json(StringIO(f'{{"url":{{"0":"{url}"}}}}'))
  1434. expected = DataFrame({"url": [url]})
  1435. tm.assert_frame_equal(result, expected)
  1436. @pytest.mark.parametrize(
  1437. "compression",
  1438. ["", ".gz", ".bz2", ".tar"],
  1439. )
  1440. def test_read_json_with_very_long_file_path(self, compression):
  1441. # GH 46718
  1442. long_json_path = f'{"a" * 1000}.json{compression}'
  1443. with pytest.raises(
  1444. FileNotFoundError, match=f"File {long_json_path} does not exist"
  1445. ):
  1446. # path too long for Windows is handled in file_exists() but raises in
  1447. # _get_data_from_filepath()
  1448. read_json(long_json_path)
  1449. @pytest.mark.parametrize(
  1450. "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")]
  1451. )
  1452. def test_timedelta_as_label(self, date_format, key):
  1453. df = DataFrame([[1]], columns=[pd.Timedelta("1D")])
  1454. expected = f'{{"{key}":{{"0":1}}}}'
  1455. result = df.to_json(date_format=date_format)
  1456. assert result == expected
  1457. @pytest.mark.parametrize(
  1458. "orient,expected",
  1459. [
  1460. ("index", "{\"('a', 'b')\":{\"('c', 'd')\":1}}"),
  1461. ("columns", "{\"('c', 'd')\":{\"('a', 'b')\":1}}"),
  1462. # TODO: the below have separate encoding procedures
  1463. pytest.param(
  1464. "split",
  1465. "",
  1466. marks=pytest.mark.xfail(
  1467. reason="Produces JSON but not in a consistent manner"
  1468. ),
  1469. ),
  1470. pytest.param(
  1471. "table",
  1472. "",
  1473. marks=pytest.mark.xfail(
  1474. reason="Produces JSON but not in a consistent manner"
  1475. ),
  1476. ),
  1477. ],
  1478. )
  1479. def test_tuple_labels(self, orient, expected):
  1480. # GH 20500
  1481. df = DataFrame([[1]], index=[("a", "b")], columns=[("c", "d")])
  1482. result = df.to_json(orient=orient)
  1483. assert result == expected
  1484. @pytest.mark.parametrize("indent", [1, 2, 4])
  1485. def test_to_json_indent(self, indent):
  1486. # GH 12004
  1487. df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"])
  1488. result = df.to_json(indent=indent)
  1489. spaces = " " * indent
  1490. expected = f"""{{
  1491. {spaces}"a":{{
  1492. {spaces}{spaces}"0":"foo",
  1493. {spaces}{spaces}"1":"baz"
  1494. {spaces}}},
  1495. {spaces}"b":{{
  1496. {spaces}{spaces}"0":"bar",
  1497. {spaces}{spaces}"1":"qux"
  1498. {spaces}}}
  1499. }}"""
  1500. assert result == expected
  1501. @pytest.mark.skipif(
  1502. using_string_dtype(),
  1503. reason="Adjust expected when infer_string is default, no bug here, "
  1504. "just a complicated parametrization",
  1505. )
  1506. @pytest.mark.parametrize(
  1507. "orient,expected",
  1508. [
  1509. (
  1510. "split",
  1511. """{
  1512. "columns":[
  1513. "a",
  1514. "b"
  1515. ],
  1516. "index":[
  1517. 0,
  1518. 1
  1519. ],
  1520. "data":[
  1521. [
  1522. "foo",
  1523. "bar"
  1524. ],
  1525. [
  1526. "baz",
  1527. "qux"
  1528. ]
  1529. ]
  1530. }""",
  1531. ),
  1532. (
  1533. "records",
  1534. """[
  1535. {
  1536. "a":"foo",
  1537. "b":"bar"
  1538. },
  1539. {
  1540. "a":"baz",
  1541. "b":"qux"
  1542. }
  1543. ]""",
  1544. ),
  1545. (
  1546. "index",
  1547. """{
  1548. "0":{
  1549. "a":"foo",
  1550. "b":"bar"
  1551. },
  1552. "1":{
  1553. "a":"baz",
  1554. "b":"qux"
  1555. }
  1556. }""",
  1557. ),
  1558. (
  1559. "columns",
  1560. """{
  1561. "a":{
  1562. "0":"foo",
  1563. "1":"baz"
  1564. },
  1565. "b":{
  1566. "0":"bar",
  1567. "1":"qux"
  1568. }
  1569. }""",
  1570. ),
  1571. (
  1572. "values",
  1573. """[
  1574. [
  1575. "foo",
  1576. "bar"
  1577. ],
  1578. [
  1579. "baz",
  1580. "qux"
  1581. ]
  1582. ]""",
  1583. ),
  1584. (
  1585. "table",
  1586. """{
  1587. "schema":{
  1588. "fields":[
  1589. {
  1590. "name":"index",
  1591. "type":"integer"
  1592. },
  1593. {
  1594. "name":"a",
  1595. "type":"string"
  1596. },
  1597. {
  1598. "name":"b",
  1599. "type":"string"
  1600. }
  1601. ],
  1602. "primaryKey":[
  1603. "index"
  1604. ],
  1605. "pandas_version":"1.4.0"
  1606. },
  1607. "data":[
  1608. {
  1609. "index":0,
  1610. "a":"foo",
  1611. "b":"bar"
  1612. },
  1613. {
  1614. "index":1,
  1615. "a":"baz",
  1616. "b":"qux"
  1617. }
  1618. ]
  1619. }""",
  1620. ),
  1621. ],
  1622. )
  1623. def test_json_indent_all_orients(self, orient, expected):
  1624. # GH 12004
  1625. df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"])
  1626. result = df.to_json(orient=orient, indent=4)
  1627. assert result == expected
  1628. def test_json_negative_indent_raises(self):
  1629. with pytest.raises(ValueError, match="must be a nonnegative integer"):
  1630. DataFrame().to_json(indent=-1)
  1631. def test_emca_262_nan_inf_support(self):
  1632. # GH 12213
  1633. data = StringIO(
  1634. '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]'
  1635. )
  1636. result = read_json(data)
  1637. expected = DataFrame(
  1638. ["a", None, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"]
  1639. )
  1640. tm.assert_frame_equal(result, expected)
  1641. def test_frame_int_overflow(self):
  1642. # GH 30320
  1643. encoded_json = json.dumps([{"col": "31900441201190696999"}, {"col": "Text"}])
  1644. expected = DataFrame({"col": ["31900441201190696999", "Text"]})
  1645. result = read_json(StringIO(encoded_json))
  1646. tm.assert_frame_equal(result, expected)
  1647. @pytest.mark.parametrize(
  1648. "dataframe,expected",
  1649. [
  1650. (
  1651. DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]}),
  1652. '{"(0, \'x\')":1,"(0, \'y\')":"a","(1, \'x\')":2,'
  1653. '"(1, \'y\')":"b","(2, \'x\')":3,"(2, \'y\')":"c"}',
  1654. )
  1655. ],
  1656. )
  1657. def test_json_multiindex(self, dataframe, expected):
  1658. series = dataframe.stack(future_stack=True)
  1659. result = series.to_json(orient="index")
  1660. assert result == expected
  1661. @pytest.mark.single_cpu
  1662. def test_to_s3(self, s3_public_bucket, s3so):
  1663. # GH 28375
  1664. mock_bucket_name, target_file = s3_public_bucket.name, "test.json"
  1665. df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
  1666. df.to_json(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)
  1667. timeout = 5
  1668. while True:
  1669. if target_file in (obj.key for obj in s3_public_bucket.objects.all()):
  1670. break
  1671. time.sleep(0.1)
  1672. timeout -= 0.1
  1673. assert timeout > 0, "Timed out waiting for file to appear on moto"
  1674. def test_json_pandas_nulls(self, nulls_fixture, request):
  1675. # GH 31615
  1676. if isinstance(nulls_fixture, Decimal):
  1677. mark = pytest.mark.xfail(reason="not implemented")
  1678. request.applymarker(mark)
  1679. result = DataFrame([[nulls_fixture]]).to_json()
  1680. assert result == '{"0":{"0":null}}'
  1681. def test_readjson_bool_series(self):
  1682. # GH31464
  1683. result = read_json(StringIO("[true, true, false]"), typ="series")
  1684. expected = Series([True, True, False])
  1685. tm.assert_series_equal(result, expected)
  1686. def test_to_json_multiindex_escape(self):
  1687. # GH 15273
  1688. df = DataFrame(
  1689. True,
  1690. index=date_range("2017-01-20", "2017-01-23"),
  1691. columns=["foo", "bar"],
  1692. ).stack(future_stack=True)
  1693. result = df.to_json()
  1694. expected = (
  1695. "{\"(Timestamp('2017-01-20 00:00:00'), 'foo')\":true,"
  1696. "\"(Timestamp('2017-01-20 00:00:00'), 'bar')\":true,"
  1697. "\"(Timestamp('2017-01-21 00:00:00'), 'foo')\":true,"
  1698. "\"(Timestamp('2017-01-21 00:00:00'), 'bar')\":true,"
  1699. "\"(Timestamp('2017-01-22 00:00:00'), 'foo')\":true,"
  1700. "\"(Timestamp('2017-01-22 00:00:00'), 'bar')\":true,"
  1701. "\"(Timestamp('2017-01-23 00:00:00'), 'foo')\":true,"
  1702. "\"(Timestamp('2017-01-23 00:00:00'), 'bar')\":true}"
  1703. )
  1704. assert result == expected
  1705. def test_to_json_series_of_objects(self):
  1706. class _TestObject:
  1707. def __init__(self, a, b, _c, d) -> None:
  1708. self.a = a
  1709. self.b = b
  1710. self._c = _c
  1711. self.d = d
  1712. def e(self):
  1713. return 5
  1714. # JSON keys should be all non-callable non-underscore attributes, see GH-42768
  1715. series = Series([_TestObject(a=1, b=2, _c=3, d=4)])
  1716. assert json.loads(series.to_json()) == {"0": {"a": 1, "b": 2, "d": 4}}
  1717. @pytest.mark.parametrize(
  1718. "data,expected",
  1719. [
  1720. (
  1721. Series({0: -6 + 8j, 1: 0 + 1j, 2: 9 - 5j}),
  1722. '{"0":{"imag":8.0,"real":-6.0},'
  1723. '"1":{"imag":1.0,"real":0.0},'
  1724. '"2":{"imag":-5.0,"real":9.0}}',
  1725. ),
  1726. (
  1727. Series({0: -9.39 + 0.66j, 1: 3.95 + 9.32j, 2: 4.03 - 0.17j}),
  1728. '{"0":{"imag":0.66,"real":-9.39},'
  1729. '"1":{"imag":9.32,"real":3.95},'
  1730. '"2":{"imag":-0.17,"real":4.03}}',
  1731. ),
  1732. (
  1733. DataFrame([[-2 + 3j, -1 - 0j], [4 - 3j, -0 - 10j]]),
  1734. '{"0":{"0":{"imag":3.0,"real":-2.0},'
  1735. '"1":{"imag":-3.0,"real":4.0}},'
  1736. '"1":{"0":{"imag":0.0,"real":-1.0},'
  1737. '"1":{"imag":-10.0,"real":0.0}}}',
  1738. ),
  1739. (
  1740. DataFrame(
  1741. [[-0.28 + 0.34j, -1.08 - 0.39j], [0.41 - 0.34j, -0.78 - 1.35j]]
  1742. ),
  1743. '{"0":{"0":{"imag":0.34,"real":-0.28},'
  1744. '"1":{"imag":-0.34,"real":0.41}},'
  1745. '"1":{"0":{"imag":-0.39,"real":-1.08},'
  1746. '"1":{"imag":-1.35,"real":-0.78}}}',
  1747. ),
  1748. ],
  1749. )
  1750. def test_complex_data_tojson(self, data, expected):
  1751. # GH41174
  1752. result = data.to_json()
  1753. assert result == expected
  1754. def test_json_uint64(self):
  1755. # GH21073
  1756. expected = (
  1757. '{"columns":["col1"],"index":[0,1],'
  1758. '"data":[[13342205958987758245],[12388075603347835679]]}'
  1759. )
  1760. df = DataFrame(data={"col1": [13342205958987758245, 12388075603347835679]})
  1761. result = df.to_json(orient="split")
  1762. assert result == expected
  1763. @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
  1764. def test_read_json_dtype_backend(
  1765. self, string_storage, dtype_backend, orient, using_infer_string
  1766. ):
  1767. # GH#50750
  1768. df = DataFrame(
  1769. {
  1770. "a": Series([1, np.nan, 3], dtype="Int64"),
  1771. "b": Series([1, 2, 3], dtype="Int64"),
  1772. "c": Series([1.5, np.nan, 2.5], dtype="Float64"),
  1773. "d": Series([1.5, 2.0, 2.5], dtype="Float64"),
  1774. "e": [True, False, None],
  1775. "f": [True, False, True],
  1776. "g": ["a", "b", "c"],
  1777. "h": ["a", "b", None],
  1778. }
  1779. )
  1780. out = df.to_json(orient=orient)
  1781. with pd.option_context("mode.string_storage", string_storage):
  1782. result = read_json(
  1783. StringIO(out), dtype_backend=dtype_backend, orient=orient
  1784. )
  1785. if dtype_backend == "pyarrow":
  1786. pa = pytest.importorskip("pyarrow")
  1787. string_dtype = pd.ArrowDtype(pa.string())
  1788. else:
  1789. string_dtype = pd.StringDtype(string_storage)
  1790. expected = DataFrame(
  1791. {
  1792. "a": Series([1, np.nan, 3], dtype="Int64"),
  1793. "b": Series([1, 2, 3], dtype="Int64"),
  1794. "c": Series([1.5, np.nan, 2.5], dtype="Float64"),
  1795. "d": Series([1.5, 2.0, 2.5], dtype="Float64"),
  1796. "e": Series([True, False, NA], dtype="boolean"),
  1797. "f": Series([True, False, True], dtype="boolean"),
  1798. "g": Series(["a", "b", "c"], dtype=string_dtype),
  1799. "h": Series(["a", "b", None], dtype=string_dtype),
  1800. }
  1801. )
  1802. if dtype_backend == "pyarrow":
  1803. pa = pytest.importorskip("pyarrow")
  1804. from pandas.arrays import ArrowExtensionArray
  1805. expected = DataFrame(
  1806. {
  1807. col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
  1808. for col in expected.columns
  1809. }
  1810. )
  1811. if orient == "values":
  1812. expected.columns = list(range(8))
  1813. # the storage of the str columns' Index is also affected by the
  1814. # string_storage setting -> ignore that for checking the result
  1815. tm.assert_frame_equal(result, expected, check_column_type=False)
  1816. @pytest.mark.parametrize("orient", ["split", "records", "index"])
  1817. def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
  1818. # GH#50750
  1819. pa = pytest.importorskip("pyarrow")
  1820. ser = Series([1, np.nan, 3], dtype="Int64")
  1821. out = ser.to_json(orient=orient)
  1822. with pd.option_context("mode.string_storage", string_storage):
  1823. result = read_json(
  1824. StringIO(out), dtype_backend=dtype_backend, orient=orient, typ="series"
  1825. )
  1826. expected = Series([1, np.nan, 3], dtype="Int64")
  1827. if dtype_backend == "pyarrow":
  1828. from pandas.arrays import ArrowExtensionArray
  1829. expected = Series(ArrowExtensionArray(pa.array(expected, from_pandas=True)))
  1830. tm.assert_series_equal(result, expected)
  1831. def test_invalid_dtype_backend(self):
  1832. msg = (
  1833. "dtype_backend numpy is invalid, only 'numpy_nullable' and "
  1834. "'pyarrow' are allowed."
  1835. )
  1836. with pytest.raises(ValueError, match=msg):
  1837. read_json("test", dtype_backend="numpy")
  1838. def test_invalid_engine():
  1839. # GH 48893
  1840. ser = Series(range(1))
  1841. out = ser.to_json()
  1842. with pytest.raises(ValueError, match="The engine type foo"):
  1843. read_json(out, engine="foo")
  1844. def test_pyarrow_engine_lines_false():
  1845. # GH 48893
  1846. ser = Series(range(1))
  1847. out = ser.to_json()
  1848. with pytest.raises(ValueError, match="currently pyarrow engine only supports"):
  1849. read_json(out, engine="pyarrow", lines=False)
  1850. def test_json_roundtrip_string_inference(orient):
  1851. df = DataFrame(
  1852. [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"]
  1853. )
  1854. out = df.to_json()
  1855. with pd.option_context("future.infer_string", True):
  1856. result = read_json(StringIO(out))
  1857. dtype = pd.StringDtype(na_value=np.nan)
  1858. expected = DataFrame(
  1859. [["a", "b"], ["c", "d"]],
  1860. dtype=dtype,
  1861. index=Index(["row 1", "row 2"], dtype=dtype),
  1862. columns=Index(["col 1", "col 2"], dtype=dtype),
  1863. )
  1864. tm.assert_frame_equal(result, expected)
  1865. def test_json_pos_args_deprecation():
  1866. # GH-54229
  1867. df = DataFrame({"a": [1, 2, 3]})
  1868. msg = (
  1869. r"Starting with pandas version 3.0 all arguments of to_json except for the "
  1870. r"argument 'path_or_buf' will be keyword-only."
  1871. )
  1872. with tm.assert_produces_warning(FutureWarning, match=msg):
  1873. buf = BytesIO()
  1874. df.to_json(buf, "split")
  1875. @td.skip_if_no("pyarrow")
  1876. def test_to_json_ea_null():
  1877. # GH#57224
  1878. df = DataFrame(
  1879. {
  1880. "a": Series([1, NA], dtype="int64[pyarrow]"),
  1881. "b": Series([2, NA], dtype="Int64"),
  1882. }
  1883. )
  1884. result = df.to_json(orient="records", lines=True)
  1885. expected = """{"a":1,"b":2}
  1886. {"a":null,"b":null}
  1887. """
  1888. assert result == expected
  1889. def test_read_json_lines_rangeindex():
  1890. # GH 57429
  1891. data = """
  1892. {"a": 1, "b": 2}
  1893. {"a": 3, "b": 4}
  1894. """
  1895. result = read_json(StringIO(data), lines=True).index
  1896. expected = RangeIndex(2)
  1897. tm.assert_index_equal(result, expected, exact=True)