test_readlines.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543
  1. from collections.abc import Iterator
  2. from io import StringIO
  3. from pathlib import Path
  4. import numpy as np
  5. import pytest
  6. import pandas as pd
  7. from pandas import (
  8. DataFrame,
  9. read_json,
  10. )
  11. import pandas._testing as tm
  12. from pandas.io.json._json import JsonReader
# Module-wide marker applied to every test in this file: ignore any
# DeprecationWarning whose message starts with
# "Passing a BlockManager to DataFrame".
# NOTE(review): presumably triggered by a third-party code path (pyarrow?)
# rather than by these tests directly -- confirm before removing.
pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
  16. @pytest.fixture
  17. def lines_json_df():
  18. df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
  19. return df.to_json(lines=True, orient="records")
  20. @pytest.fixture(params=["ujson", "pyarrow"])
  21. def engine(request):
  22. if request.param == "pyarrow":
  23. pytest.importorskip("pyarrow.json")
  24. return request.param
  25. def test_read_jsonl():
  26. # GH9180
  27. result = read_json(StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'), lines=True)
  28. expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
  29. tm.assert_frame_equal(result, expected)
  30. def test_read_jsonl_engine_pyarrow(datapath, engine):
  31. result = read_json(
  32. datapath("io", "json", "data", "line_delimited.json"),
  33. lines=True,
  34. engine=engine,
  35. )
  36. expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})
  37. tm.assert_frame_equal(result, expected)
  38. def test_read_datetime(request, engine):
  39. # GH33787
  40. if engine == "pyarrow":
  41. # GH 48893
  42. reason = "Pyarrow only supports a file path as an input and line delimited json"
  43. request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
  44. df = DataFrame(
  45. [([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")],
  46. columns=["accounts", "date", "name"],
  47. )
  48. json_line = df.to_json(lines=True, orient="records")
  49. if engine == "pyarrow":
  50. result = read_json(StringIO(json_line), engine=engine)
  51. else:
  52. result = read_json(StringIO(json_line), engine=engine)
  53. expected = DataFrame(
  54. [[1, "2020-03-05", "hector"], [2, "2020-04-08T09:58:49+00:00", "hector"]],
  55. columns=["accounts", "date", "name"],
  56. )
  57. tm.assert_frame_equal(result, expected)
  58. def test_read_jsonl_unicode_chars():
  59. # GH15132: non-ascii unicode characters
  60. # \u201d == RIGHT DOUBLE QUOTATION MARK
  61. # simulate file handle
  62. json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
  63. json = StringIO(json)
  64. result = read_json(json, lines=True)
  65. expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
  66. tm.assert_frame_equal(result, expected)
  67. # simulate string
  68. json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
  69. result = read_json(StringIO(json), lines=True)
  70. expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
  71. tm.assert_frame_equal(result, expected)
  72. def test_to_jsonl():
  73. # GH9180
  74. df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
  75. result = df.to_json(orient="records", lines=True)
  76. expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n'
  77. assert result == expected
  78. df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"])
  79. result = df.to_json(orient="records", lines=True)
  80. expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n'
  81. assert result == expected
  82. tm.assert_frame_equal(read_json(StringIO(result), lines=True), df)
  83. # GH15096: escaped characters in columns and data
  84. df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"])
  85. result = df.to_json(orient="records", lines=True)
  86. expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n'
  87. assert result == expected
  88. tm.assert_frame_equal(read_json(StringIO(result), lines=True), df)
  89. def test_to_jsonl_count_new_lines():
  90. # GH36888
  91. df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
  92. actual_new_lines_count = df.to_json(orient="records", lines=True).count("\n")
  93. expected_new_lines_count = 2
  94. assert actual_new_lines_count == expected_new_lines_count
  95. @pytest.mark.parametrize("chunksize", [1, 1.0])
  96. def test_readjson_chunks(request, lines_json_df, chunksize, engine):
  97. # Basic test that read_json(chunks=True) gives the same result as
  98. # read_json(chunks=False)
  99. # GH17048: memory usage when lines=True
  100. if engine == "pyarrow":
  101. # GH 48893
  102. reason = (
  103. "Pyarrow only supports a file path as an input and line delimited json"
  104. "and doesn't support chunksize parameter."
  105. )
  106. request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
  107. unchunked = read_json(StringIO(lines_json_df), lines=True)
  108. with read_json(
  109. StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine
  110. ) as reader:
  111. chunked = pd.concat(reader)
  112. tm.assert_frame_equal(chunked, unchunked)
  113. def test_readjson_chunksize_requires_lines(lines_json_df, engine):
  114. msg = "chunksize can only be passed if lines=True"
  115. with pytest.raises(ValueError, match=msg):
  116. with read_json(
  117. StringIO(lines_json_df), lines=False, chunksize=2, engine=engine
  118. ) as _:
  119. pass
  120. def test_readjson_chunks_series(request, engine):
  121. if engine == "pyarrow":
  122. # GH 48893
  123. reason = (
  124. "Pyarrow only supports a file path as an input and line delimited json"
  125. "and doesn't support chunksize parameter."
  126. )
  127. request.applymarker(pytest.mark.xfail(reason=reason))
  128. # Test reading line-format JSON to Series with chunksize param
  129. s = pd.Series({"A": 1, "B": 2})
  130. strio = StringIO(s.to_json(lines=True, orient="records"))
  131. unchunked = read_json(strio, lines=True, typ="Series", engine=engine)
  132. strio = StringIO(s.to_json(lines=True, orient="records"))
  133. with read_json(
  134. strio, lines=True, typ="Series", chunksize=1, engine=engine
  135. ) as reader:
  136. chunked = pd.concat(reader)
  137. tm.assert_series_equal(chunked, unchunked)
  138. def test_readjson_each_chunk(request, lines_json_df, engine):
  139. if engine == "pyarrow":
  140. # GH 48893
  141. reason = (
  142. "Pyarrow only supports a file path as an input and line delimited json"
  143. "and doesn't support chunksize parameter."
  144. )
  145. request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
  146. # Other tests check that the final result of read_json(chunksize=True)
  147. # is correct. This checks the intermediate chunks.
  148. with read_json(
  149. StringIO(lines_json_df), lines=True, chunksize=2, engine=engine
  150. ) as reader:
  151. chunks = list(reader)
  152. assert chunks[0].shape == (2, 2)
  153. assert chunks[1].shape == (1, 2)
  154. def test_readjson_chunks_from_file(request, engine):
  155. if engine == "pyarrow":
  156. # GH 48893
  157. reason = (
  158. "Pyarrow only supports a file path as an input and line delimited json"
  159. "and doesn't support chunksize parameter."
  160. )
  161. request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
  162. with tm.ensure_clean("test.json") as path:
  163. df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
  164. df.to_json(path, lines=True, orient="records")
  165. with read_json(path, lines=True, chunksize=1, engine=engine) as reader:
  166. chunked = pd.concat(reader)
  167. unchunked = read_json(path, lines=True, engine=engine)
  168. tm.assert_frame_equal(unchunked, chunked)
  169. @pytest.mark.parametrize("chunksize", [None, 1])
  170. def test_readjson_chunks_closes(chunksize):
  171. with tm.ensure_clean("test.json") as path:
  172. df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
  173. df.to_json(path, lines=True, orient="records")
  174. reader = JsonReader(
  175. path,
  176. orient=None,
  177. typ="frame",
  178. dtype=True,
  179. convert_axes=True,
  180. convert_dates=True,
  181. keep_default_dates=True,
  182. precise_float=False,
  183. date_unit=None,
  184. encoding=None,
  185. lines=True,
  186. chunksize=chunksize,
  187. compression=None,
  188. nrows=None,
  189. )
  190. with reader:
  191. reader.read()
  192. assert (
  193. reader.handles.handle.closed
  194. ), f"didn't close stream with chunksize = {chunksize}"
  195. @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
  196. def test_readjson_invalid_chunksize(lines_json_df, chunksize, engine):
  197. msg = r"'chunksize' must be an integer >=1"
  198. with pytest.raises(ValueError, match=msg):
  199. with read_json(
  200. StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine
  201. ) as _:
  202. pass
  203. @pytest.mark.parametrize("chunksize", [None, 1, 2])
  204. def test_readjson_chunks_multiple_empty_lines(chunksize):
  205. j = """
  206. {"A":1,"B":4}
  207. {"A":2,"B":5}
  208. {"A":3,"B":6}
  209. """
  210. orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
  211. test = read_json(StringIO(j), lines=True, chunksize=chunksize)
  212. if chunksize is not None:
  213. with test:
  214. test = pd.concat(test)
  215. tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}")
  216. def test_readjson_unicode(request, monkeypatch, engine):
  217. if engine == "pyarrow":
  218. # GH 48893
  219. reason = (
  220. "Pyarrow only supports a file path as an input and line delimited json"
  221. "and doesn't support chunksize parameter."
  222. )
  223. request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
  224. with tm.ensure_clean("test.json") as path:
  225. monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949")
  226. with open(path, "w", encoding="utf-8") as f:
  227. f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}')
  228. result = read_json(path, engine=engine)
  229. expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
  230. tm.assert_frame_equal(result, expected)
  231. @pytest.mark.parametrize("nrows", [1, 2])
  232. def test_readjson_nrows(nrows, engine):
  233. # GH 33916
  234. # Test reading line-format JSON to Series with nrows param
  235. jsonl = """{"a": 1, "b": 2}
  236. {"a": 3, "b": 4}
  237. {"a": 5, "b": 6}
  238. {"a": 7, "b": 8}"""
  239. result = read_json(StringIO(jsonl), lines=True, nrows=nrows)
  240. expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
  241. tm.assert_frame_equal(result, expected)
  242. @pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
  243. def test_readjson_nrows_chunks(request, nrows, chunksize, engine):
  244. # GH 33916
  245. # Test reading line-format JSON to Series with nrows and chunksize param
  246. if engine == "pyarrow":
  247. # GH 48893
  248. reason = (
  249. "Pyarrow only supports a file path as an input and line delimited json"
  250. "and doesn't support chunksize parameter."
  251. )
  252. request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
  253. jsonl = """{"a": 1, "b": 2}
  254. {"a": 3, "b": 4}
  255. {"a": 5, "b": 6}
  256. {"a": 7, "b": 8}"""
  257. if engine != "pyarrow":
  258. with read_json(
  259. StringIO(jsonl), lines=True, nrows=nrows, chunksize=chunksize, engine=engine
  260. ) as reader:
  261. chunked = pd.concat(reader)
  262. else:
  263. with read_json(
  264. jsonl, lines=True, nrows=nrows, chunksize=chunksize, engine=engine
  265. ) as reader:
  266. chunked = pd.concat(reader)
  267. expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
  268. tm.assert_frame_equal(chunked, expected)
  269. def test_readjson_nrows_requires_lines(engine):
  270. # GH 33916
  271. # Test ValueError raised if nrows is set without setting lines in read_json
  272. jsonl = """{"a": 1, "b": 2}
  273. {"a": 3, "b": 4}
  274. {"a": 5, "b": 6}
  275. {"a": 7, "b": 8}"""
  276. msg = "nrows can only be passed if lines=True"
  277. with pytest.raises(ValueError, match=msg):
  278. read_json(jsonl, lines=False, nrows=2, engine=engine)
  279. def test_readjson_lines_chunks_fileurl(request, datapath, engine):
  280. # GH 27135
  281. # Test reading line-format JSON from file url
  282. if engine == "pyarrow":
  283. # GH 48893
  284. reason = (
  285. "Pyarrow only supports a file path as an input and line delimited json"
  286. "and doesn't support chunksize parameter."
  287. )
  288. request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))
  289. df_list_expected = [
  290. DataFrame([[1, 2]], columns=["a", "b"], index=[0]),
  291. DataFrame([[3, 4]], columns=["a", "b"], index=[1]),
  292. DataFrame([[5, 6]], columns=["a", "b"], index=[2]),
  293. ]
  294. os_path = datapath("io", "json", "data", "line_delimited.json")
  295. file_url = Path(os_path).as_uri()
  296. with read_json(file_url, lines=True, chunksize=1, engine=engine) as url_reader:
  297. for index, chuck in enumerate(url_reader):
  298. tm.assert_frame_equal(chuck, df_list_expected[index])
  299. def test_chunksize_is_incremental():
  300. # See https://github.com/pandas-dev/pandas/issues/34548
  301. jsonl = (
  302. """{"a": 1, "b": 2}
  303. {"a": 3, "b": 4}
  304. {"a": 5, "b": 6}
  305. {"a": 7, "b": 8}\n"""
  306. * 1000
  307. )
  308. class MyReader:
  309. def __init__(self, contents) -> None:
  310. self.read_count = 0
  311. self.stringio = StringIO(contents)
  312. def read(self, *args):
  313. self.read_count += 1
  314. return self.stringio.read(*args)
  315. def __iter__(self) -> Iterator:
  316. self.read_count += 1
  317. return iter(self.stringio)
  318. reader = MyReader(jsonl)
  319. assert len(list(read_json(reader, lines=True, chunksize=100))) > 1
  320. assert reader.read_count > 10
  321. @pytest.mark.parametrize("orient_", ["split", "index", "table"])
  322. def test_to_json_append_orient(orient_):
  323. # GH 35849
  324. # Test ValueError when orient is not 'records'
  325. df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
  326. msg = (
  327. r"mode='a' \(append\) is only supported when "
  328. "lines is True and orient is 'records'"
  329. )
  330. with pytest.raises(ValueError, match=msg):
  331. df.to_json(mode="a", orient=orient_)
  332. def test_to_json_append_lines():
  333. # GH 35849
  334. # Test ValueError when lines is not True
  335. df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
  336. msg = (
  337. r"mode='a' \(append\) is only supported when "
  338. "lines is True and orient is 'records'"
  339. )
  340. with pytest.raises(ValueError, match=msg):
  341. df.to_json(mode="a", lines=False, orient="records")
  342. @pytest.mark.parametrize("mode_", ["r", "x"])
  343. def test_to_json_append_mode(mode_):
  344. # GH 35849
  345. # Test ValueError when mode is not supported option
  346. df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
  347. msg = (
  348. f"mode={mode_} is not a valid option."
  349. "Only 'w' and 'a' are currently supported."
  350. )
  351. with pytest.raises(ValueError, match=msg):
  352. df.to_json(mode=mode_, lines=False, orient="records")
  353. def test_to_json_append_output_consistent_columns():
  354. # GH 35849
  355. # Testing that resulting output reads in as expected.
  356. # Testing same columns, new rows
  357. df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
  358. df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
  359. expected = DataFrame({"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]})
  360. with tm.ensure_clean("test.json") as path:
  361. # Save dataframes to the same file
  362. df1.to_json(path, lines=True, orient="records")
  363. df2.to_json(path, mode="a", lines=True, orient="records")
  364. # Read path file
  365. result = read_json(path, lines=True)
  366. tm.assert_frame_equal(result, expected)
  367. def test_to_json_append_output_inconsistent_columns():
  368. # GH 35849
  369. # Testing that resulting output reads in as expected.
  370. # Testing one new column, one old column, new rows
  371. df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
  372. df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
  373. expected = DataFrame(
  374. {
  375. "col1": [1, 2, None, None],
  376. "col2": ["a", "b", "e", "f"],
  377. "col3": [np.nan, np.nan, "!", "#"],
  378. }
  379. )
  380. with tm.ensure_clean("test.json") as path:
  381. # Save dataframes to the same file
  382. df1.to_json(path, mode="a", lines=True, orient="records")
  383. df3.to_json(path, mode="a", lines=True, orient="records")
  384. # Read path file
  385. result = read_json(path, lines=True)
  386. tm.assert_frame_equal(result, expected)
  387. def test_to_json_append_output_different_columns():
  388. # GH 35849
  389. # Testing that resulting output reads in as expected.
  390. # Testing same, differing and new columns
  391. df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
  392. df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
  393. df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
  394. df4 = DataFrame({"col4": [True, False]})
  395. expected = DataFrame(
  396. {
  397. "col1": [1, 2, 3, 4, None, None, None, None],
  398. "col2": ["a", "b", "c", "d", "e", "f", np.nan, np.nan],
  399. "col3": [np.nan, np.nan, np.nan, np.nan, "!", "#", np.nan, np.nan],
  400. "col4": [None, None, None, None, None, None, True, False],
  401. }
  402. ).astype({"col4": "float"})
  403. with tm.ensure_clean("test.json") as path:
  404. # Save dataframes to the same file
  405. df1.to_json(path, mode="a", lines=True, orient="records")
  406. df2.to_json(path, mode="a", lines=True, orient="records")
  407. df3.to_json(path, mode="a", lines=True, orient="records")
  408. df4.to_json(path, mode="a", lines=True, orient="records")
  409. # Read path file
  410. result = read_json(path, lines=True)
  411. tm.assert_frame_equal(result, expected)
  412. def test_to_json_append_output_different_columns_reordered():
  413. # GH 35849
  414. # Testing that resulting output reads in as expected.
  415. # Testing specific result column order.
  416. df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
  417. df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
  418. df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
  419. df4 = DataFrame({"col4": [True, False]})
  420. # df4, df3, df2, df1 (in that order)
  421. expected = DataFrame(
  422. {
  423. "col4": [True, False, None, None, None, None, None, None],
  424. "col2": [np.nan, np.nan, "e", "f", "c", "d", "a", "b"],
  425. "col3": [np.nan, np.nan, "!", "#", np.nan, np.nan, np.nan, np.nan],
  426. "col1": [None, None, None, None, 3, 4, 1, 2],
  427. }
  428. ).astype({"col4": "float"})
  429. with tm.ensure_clean("test.json") as path:
  430. # Save dataframes to the same file
  431. df4.to_json(path, mode="a", lines=True, orient="records")
  432. df3.to_json(path, mode="a", lines=True, orient="records")
  433. df2.to_json(path, mode="a", lines=True, orient="records")
  434. df1.to_json(path, mode="a", lines=True, orient="records")
  435. # Read path file
  436. result = read_json(path, lines=True)
  437. tm.assert_frame_equal(result, expected)