_json.py 47 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494
  1. from __future__ import annotations
  2. from abc import (
  3. ABC,
  4. abstractmethod,
  5. )
  6. from collections import abc
  7. from io import StringIO
  8. from itertools import islice
  9. from typing import (
  10. TYPE_CHECKING,
  11. Any,
  12. Callable,
  13. Generic,
  14. Literal,
  15. TypeVar,
  16. final,
  17. overload,
  18. )
  19. import warnings
  20. import numpy as np
  21. from pandas._libs import lib
  22. from pandas._libs.json import (
  23. ujson_dumps,
  24. ujson_loads,
  25. )
  26. from pandas._libs.tslibs import iNaT
  27. from pandas.compat._optional import import_optional_dependency
  28. from pandas.errors import AbstractMethodError
  29. from pandas.util._decorators import doc
  30. from pandas.util._exceptions import find_stack_level
  31. from pandas.util._validators import check_dtype_backend
  32. from pandas.core.dtypes.common import (
  33. ensure_str,
  34. is_string_dtype,
  35. )
  36. from pandas.core.dtypes.dtypes import PeriodDtype
  37. from pandas import (
  38. DataFrame,
  39. Index,
  40. MultiIndex,
  41. Series,
  42. isna,
  43. notna,
  44. to_datetime,
  45. )
  46. from pandas.core.reshape.concat import concat
  47. from pandas.core.shared_docs import _shared_docs
  48. from pandas.io._util import arrow_table_to_pandas
  49. from pandas.io.common import (
  50. IOHandles,
  51. dedup_names,
  52. extension_to_compression,
  53. file_exists,
  54. get_handle,
  55. is_fsspec_url,
  56. is_potential_multi_index,
  57. is_url,
  58. stringify_path,
  59. )
  60. from pandas.io.json._normalize import convert_to_line_delimits
  61. from pandas.io.json._table_schema import (
  62. build_table_schema,
  63. parse_table_schema,
  64. )
  65. from pandas.io.parsers.readers import validate_integer
  66. if TYPE_CHECKING:
  67. from collections.abc import (
  68. Hashable,
  69. Mapping,
  70. )
  71. from types import TracebackType
  72. from pandas._typing import (
  73. CompressionOptions,
  74. DtypeArg,
  75. DtypeBackend,
  76. FilePath,
  77. IndexLabel,
  78. JSONEngine,
  79. JSONSerializable,
  80. ReadBuffer,
  81. Self,
  82. StorageOptions,
  83. WriteBuffer,
  84. )
  85. from pandas.core.generic import NDFrame
  86. FrameSeriesStrT = TypeVar("FrameSeriesStrT", bound=Literal["frame", "series"])
  87. # interface to/from
  88. @overload
  89. def to_json(
  90. path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes],
  91. obj: NDFrame,
  92. orient: str | None = ...,
  93. date_format: str = ...,
  94. double_precision: int = ...,
  95. force_ascii: bool = ...,
  96. date_unit: str = ...,
  97. default_handler: Callable[[Any], JSONSerializable] | None = ...,
  98. lines: bool = ...,
  99. compression: CompressionOptions = ...,
  100. index: bool | None = ...,
  101. indent: int = ...,
  102. storage_options: StorageOptions = ...,
  103. mode: Literal["a", "w"] = ...,
  104. ) -> None:
  105. ...
  106. @overload
  107. def to_json(
  108. path_or_buf: None,
  109. obj: NDFrame,
  110. orient: str | None = ...,
  111. date_format: str = ...,
  112. double_precision: int = ...,
  113. force_ascii: bool = ...,
  114. date_unit: str = ...,
  115. default_handler: Callable[[Any], JSONSerializable] | None = ...,
  116. lines: bool = ...,
  117. compression: CompressionOptions = ...,
  118. index: bool | None = ...,
  119. indent: int = ...,
  120. storage_options: StorageOptions = ...,
  121. mode: Literal["a", "w"] = ...,
  122. ) -> str:
  123. ...
  124. def to_json(
  125. path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] | None,
  126. obj: NDFrame,
  127. orient: str | None = None,
  128. date_format: str = "epoch",
  129. double_precision: int = 10,
  130. force_ascii: bool = True,
  131. date_unit: str = "ms",
  132. default_handler: Callable[[Any], JSONSerializable] | None = None,
  133. lines: bool = False,
  134. compression: CompressionOptions = "infer",
  135. index: bool | None = None,
  136. indent: int = 0,
  137. storage_options: StorageOptions | None = None,
  138. mode: Literal["a", "w"] = "w",
  139. ) -> str | None:
  140. if orient in ["records", "values"] and index is True:
  141. raise ValueError(
  142. "'index=True' is only valid when 'orient' is 'split', 'table', "
  143. "'index', or 'columns'."
  144. )
  145. elif orient in ["index", "columns"] and index is False:
  146. raise ValueError(
  147. "'index=False' is only valid when 'orient' is 'split', 'table', "
  148. "'records', or 'values'."
  149. )
  150. elif index is None:
  151. # will be ignored for orient='records' and 'values'
  152. index = True
  153. if lines and orient != "records":
  154. raise ValueError("'lines' keyword only valid when 'orient' is records")
  155. if mode not in ["a", "w"]:
  156. msg = (
  157. f"mode={mode} is not a valid option."
  158. "Only 'w' and 'a' are currently supported."
  159. )
  160. raise ValueError(msg)
  161. if mode == "a" and (not lines or orient != "records"):
  162. msg = (
  163. "mode='a' (append) is only supported when "
  164. "lines is True and orient is 'records'"
  165. )
  166. raise ValueError(msg)
  167. if orient == "table" and isinstance(obj, Series):
  168. obj = obj.to_frame(name=obj.name or "values")
  169. writer: type[Writer]
  170. if orient == "table" and isinstance(obj, DataFrame):
  171. writer = JSONTableWriter
  172. elif isinstance(obj, Series):
  173. writer = SeriesWriter
  174. elif isinstance(obj, DataFrame):
  175. writer = FrameWriter
  176. else:
  177. raise NotImplementedError("'obj' should be a Series or a DataFrame")
  178. s = writer(
  179. obj,
  180. orient=orient,
  181. date_format=date_format,
  182. double_precision=double_precision,
  183. ensure_ascii=force_ascii,
  184. date_unit=date_unit,
  185. default_handler=default_handler,
  186. index=index,
  187. indent=indent,
  188. ).write()
  189. if lines:
  190. s = convert_to_line_delimits(s)
  191. if path_or_buf is not None:
  192. # apply compression and byte/text conversion
  193. with get_handle(
  194. path_or_buf, mode, compression=compression, storage_options=storage_options
  195. ) as handles:
  196. handles.handle.write(s)
  197. else:
  198. return s
  199. return None
  200. class Writer(ABC):
  201. _default_orient: str
  202. def __init__(
  203. self,
  204. obj: NDFrame,
  205. orient: str | None,
  206. date_format: str,
  207. double_precision: int,
  208. ensure_ascii: bool,
  209. date_unit: str,
  210. index: bool,
  211. default_handler: Callable[[Any], JSONSerializable] | None = None,
  212. indent: int = 0,
  213. ) -> None:
  214. self.obj = obj
  215. if orient is None:
  216. orient = self._default_orient
  217. self.orient = orient
  218. self.date_format = date_format
  219. self.double_precision = double_precision
  220. self.ensure_ascii = ensure_ascii
  221. self.date_unit = date_unit
  222. self.default_handler = default_handler
  223. self.index = index
  224. self.indent = indent
  225. self.is_copy = None
  226. self._format_axes()
  227. def _format_axes(self) -> None:
  228. raise AbstractMethodError(self)
  229. def write(self) -> str:
  230. iso_dates = self.date_format == "iso"
  231. return ujson_dumps(
  232. self.obj_to_write,
  233. orient=self.orient,
  234. double_precision=self.double_precision,
  235. ensure_ascii=self.ensure_ascii,
  236. date_unit=self.date_unit,
  237. iso_dates=iso_dates,
  238. default_handler=self.default_handler,
  239. indent=self.indent,
  240. )
  241. @property
  242. @abstractmethod
  243. def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
  244. """Object to write in JSON format."""
  245. class SeriesWriter(Writer):
  246. _default_orient = "index"
  247. @property
  248. def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
  249. if not self.index and self.orient == "split":
  250. return {"name": self.obj.name, "data": self.obj.values}
  251. else:
  252. return self.obj
  253. def _format_axes(self) -> None:
  254. if not self.obj.index.is_unique and self.orient == "index":
  255. raise ValueError(f"Series index must be unique for orient='{self.orient}'")
  256. class FrameWriter(Writer):
  257. _default_orient = "columns"
  258. @property
  259. def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
  260. if not self.index and self.orient == "split":
  261. obj_to_write = self.obj.to_dict(orient="split")
  262. del obj_to_write["index"]
  263. else:
  264. obj_to_write = self.obj
  265. return obj_to_write
  266. def _format_axes(self) -> None:
  267. """
  268. Try to format axes if they are datelike.
  269. """
  270. if not self.obj.index.is_unique and self.orient in ("index", "columns"):
  271. raise ValueError(
  272. f"DataFrame index must be unique for orient='{self.orient}'."
  273. )
  274. if not self.obj.columns.is_unique and self.orient in (
  275. "index",
  276. "columns",
  277. "records",
  278. ):
  279. raise ValueError(
  280. f"DataFrame columns must be unique for orient='{self.orient}'."
  281. )
  282. class JSONTableWriter(FrameWriter):
  283. _default_orient = "records"
  284. def __init__(
  285. self,
  286. obj,
  287. orient: str | None,
  288. date_format: str,
  289. double_precision: int,
  290. ensure_ascii: bool,
  291. date_unit: str,
  292. index: bool,
  293. default_handler: Callable[[Any], JSONSerializable] | None = None,
  294. indent: int = 0,
  295. ) -> None:
  296. """
  297. Adds a `schema` attribute with the Table Schema, resets
  298. the index (can't do in caller, because the schema inference needs
  299. to know what the index is, forces orient to records, and forces
  300. date_format to 'iso'.
  301. """
  302. super().__init__(
  303. obj,
  304. orient,
  305. date_format,
  306. double_precision,
  307. ensure_ascii,
  308. date_unit,
  309. index,
  310. default_handler=default_handler,
  311. indent=indent,
  312. )
  313. if date_format != "iso":
  314. msg = (
  315. "Trying to write with `orient='table'` and "
  316. f"`date_format='{date_format}'`. Table Schema requires dates "
  317. "to be formatted with `date_format='iso'`"
  318. )
  319. raise ValueError(msg)
  320. self.schema = build_table_schema(obj, index=self.index)
  321. # NotImplemented on a column MultiIndex
  322. if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
  323. raise NotImplementedError(
  324. "orient='table' is not supported for MultiIndex columns"
  325. )
  326. # TODO: Do this timedelta properly in objToJSON.c See GH #15137
  327. if (
  328. (obj.ndim == 1)
  329. and (obj.name in set(obj.index.names))
  330. or len(obj.columns.intersection(obj.index.names))
  331. ):
  332. msg = "Overlapping names between the index and columns"
  333. raise ValueError(msg)
  334. obj = obj.copy()
  335. timedeltas = obj.select_dtypes(include=["timedelta"]).columns
  336. if len(timedeltas):
  337. obj[timedeltas] = obj[timedeltas].map(lambda x: x.isoformat())
  338. # Convert PeriodIndex to datetimes before serializing
  339. if isinstance(obj.index.dtype, PeriodDtype):
  340. obj.index = obj.index.to_timestamp()
  341. # exclude index from obj if index=False
  342. if not self.index:
  343. self.obj = obj.reset_index(drop=True)
  344. else:
  345. self.obj = obj.reset_index(drop=False)
  346. self.date_format = "iso"
  347. self.orient = "records"
  348. self.index = index
  349. @property
  350. def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
  351. return {"schema": self.schema, "data": self.obj}
  352. @overload
  353. def read_json(
  354. path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
  355. *,
  356. orient: str | None = ...,
  357. typ: Literal["frame"] = ...,
  358. dtype: DtypeArg | None = ...,
  359. convert_axes: bool | None = ...,
  360. convert_dates: bool | list[str] = ...,
  361. keep_default_dates: bool = ...,
  362. precise_float: bool = ...,
  363. date_unit: str | None = ...,
  364. encoding: str | None = ...,
  365. encoding_errors: str | None = ...,
  366. lines: bool = ...,
  367. chunksize: int,
  368. compression: CompressionOptions = ...,
  369. nrows: int | None = ...,
  370. storage_options: StorageOptions = ...,
  371. dtype_backend: DtypeBackend | lib.NoDefault = ...,
  372. engine: JSONEngine = ...,
  373. ) -> JsonReader[Literal["frame"]]:
  374. ...
  375. @overload
  376. def read_json(
  377. path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
  378. *,
  379. orient: str | None = ...,
  380. typ: Literal["series"],
  381. dtype: DtypeArg | None = ...,
  382. convert_axes: bool | None = ...,
  383. convert_dates: bool | list[str] = ...,
  384. keep_default_dates: bool = ...,
  385. precise_float: bool = ...,
  386. date_unit: str | None = ...,
  387. encoding: str | None = ...,
  388. encoding_errors: str | None = ...,
  389. lines: bool = ...,
  390. chunksize: int,
  391. compression: CompressionOptions = ...,
  392. nrows: int | None = ...,
  393. storage_options: StorageOptions = ...,
  394. dtype_backend: DtypeBackend | lib.NoDefault = ...,
  395. engine: JSONEngine = ...,
  396. ) -> JsonReader[Literal["series"]]:
  397. ...
  398. @overload
  399. def read_json(
  400. path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
  401. *,
  402. orient: str | None = ...,
  403. typ: Literal["series"],
  404. dtype: DtypeArg | None = ...,
  405. convert_axes: bool | None = ...,
  406. convert_dates: bool | list[str] = ...,
  407. keep_default_dates: bool = ...,
  408. precise_float: bool = ...,
  409. date_unit: str | None = ...,
  410. encoding: str | None = ...,
  411. encoding_errors: str | None = ...,
  412. lines: bool = ...,
  413. chunksize: None = ...,
  414. compression: CompressionOptions = ...,
  415. nrows: int | None = ...,
  416. storage_options: StorageOptions = ...,
  417. dtype_backend: DtypeBackend | lib.NoDefault = ...,
  418. engine: JSONEngine = ...,
  419. ) -> Series:
  420. ...
  421. @overload
  422. def read_json(
  423. path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
  424. *,
  425. orient: str | None = ...,
  426. typ: Literal["frame"] = ...,
  427. dtype: DtypeArg | None = ...,
  428. convert_axes: bool | None = ...,
  429. convert_dates: bool | list[str] = ...,
  430. keep_default_dates: bool = ...,
  431. precise_float: bool = ...,
  432. date_unit: str | None = ...,
  433. encoding: str | None = ...,
  434. encoding_errors: str | None = ...,
  435. lines: bool = ...,
  436. chunksize: None = ...,
  437. compression: CompressionOptions = ...,
  438. nrows: int | None = ...,
  439. storage_options: StorageOptions = ...,
  440. dtype_backend: DtypeBackend | lib.NoDefault = ...,
  441. engine: JSONEngine = ...,
  442. ) -> DataFrame:
  443. ...
  444. @doc(
  445. storage_options=_shared_docs["storage_options"],
  446. decompression_options=_shared_docs["decompression_options"] % "path_or_buf",
  447. )
  448. def read_json(
  449. path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
  450. *,
  451. orient: str | None = None,
  452. typ: Literal["frame", "series"] = "frame",
  453. dtype: DtypeArg | None = None,
  454. convert_axes: bool | None = None,
  455. convert_dates: bool | list[str] = True,
  456. keep_default_dates: bool = True,
  457. precise_float: bool = False,
  458. date_unit: str | None = None,
  459. encoding: str | None = None,
  460. encoding_errors: str | None = "strict",
  461. lines: bool = False,
  462. chunksize: int | None = None,
  463. compression: CompressionOptions = "infer",
  464. nrows: int | None = None,
  465. storage_options: StorageOptions | None = None,
  466. dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
  467. engine: JSONEngine = "ujson",
  468. ) -> DataFrame | Series | JsonReader:
  469. """
  470. Convert a JSON string to pandas object.
  471. Parameters
  472. ----------
  473. path_or_buf : a valid JSON str, path object or file-like object
  474. Any valid string path is acceptable. The string could be a URL. Valid
  475. URL schemes include http, ftp, s3, and file. For file URLs, a host is
  476. expected. A local file could be:
  477. ``file://localhost/path/to/table.json``.
  478. If you want to pass in a path object, pandas accepts any
  479. ``os.PathLike``.
  480. By file-like object, we refer to objects with a ``read()`` method,
  481. such as a file handle (e.g. via builtin ``open`` function)
  482. or ``StringIO``.
  483. .. deprecated:: 2.1.0
  484. Passing json literal strings is deprecated.
  485. orient : str, optional
  486. Indication of expected JSON string format.
  487. Compatible JSON strings can be produced by ``to_json()`` with a
  488. corresponding orient value.
  489. The set of possible orients is:
  490. - ``'split'`` : dict like
  491. ``{{index -> [index], columns -> [columns], data -> [values]}}``
  492. - ``'records'`` : list like
  493. ``[{{column -> value}}, ... , {{column -> value}}]``
  494. - ``'index'`` : dict like ``{{index -> {{column -> value}}}}``
  495. - ``'columns'`` : dict like ``{{column -> {{index -> value}}}}``
  496. - ``'values'`` : just the values array
  497. - ``'table'`` : dict like ``{{'schema': {{schema}}, 'data': {{data}}}}``
  498. The allowed and default values depend on the value
  499. of the `typ` parameter.
  500. * when ``typ == 'series'``,
  501. - allowed orients are ``{{'split','records','index'}}``
  502. - default is ``'index'``
  503. - The Series index must be unique for orient ``'index'``.
  504. * when ``typ == 'frame'``,
  505. - allowed orients are ``{{'split','records','index',
  506. 'columns','values', 'table'}}``
  507. - default is ``'columns'``
  508. - The DataFrame index must be unique for orients ``'index'`` and
  509. ``'columns'``.
  510. - The DataFrame columns must be unique for orients ``'index'``,
  511. ``'columns'``, and ``'records'``.
  512. typ : {{'frame', 'series'}}, default 'frame'
  513. The type of object to recover.
  514. dtype : bool or dict, default None
  515. If True, infer dtypes; if a dict of column to dtype, then use those;
  516. if False, then don't infer dtypes at all, applies only to the data.
  517. For all ``orient`` values except ``'table'``, default is True.
  518. convert_axes : bool, default None
  519. Try to convert the axes to the proper dtypes.
  520. For all ``orient`` values except ``'table'``, default is True.
  521. convert_dates : bool or list of str, default True
  522. If True then default datelike columns may be converted (depending on
  523. keep_default_dates).
  524. If False, no dates will be converted.
  525. If a list of column names, then those columns will be converted and
  526. default datelike columns may also be converted (depending on
  527. keep_default_dates).
  528. keep_default_dates : bool, default True
  529. If parsing dates (convert_dates is not False), then try to parse the
  530. default datelike columns.
  531. A column label is datelike if
  532. * it ends with ``'_at'``,
  533. * it ends with ``'_time'``,
  534. * it begins with ``'timestamp'``,
  535. * it is ``'modified'``, or
  536. * it is ``'date'``.
  537. precise_float : bool, default False
  538. Set to enable usage of higher precision (strtod) function when
  539. decoding string to double values. Default (False) is to use fast but
  540. less precise builtin functionality.
  541. date_unit : str, default None
  542. The timestamp unit to detect if converting dates. The default behaviour
  543. is to try and detect the correct precision, but if this is not desired
  544. then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
  545. milliseconds, microseconds or nanoseconds respectively.
  546. encoding : str, default is 'utf-8'
  547. The encoding to use to decode py3 bytes.
  548. encoding_errors : str, optional, default "strict"
  549. How encoding errors are treated. `List of possible values
  550. <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
  551. .. versionadded:: 1.3.0
  552. lines : bool, default False
  553. Read the file as a json object per line.
  554. chunksize : int, optional
  555. Return JsonReader object for iteration.
  556. See the `line-delimited json docs
  557. <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json>`_
  558. for more information on ``chunksize``.
  559. This can only be passed if `lines=True`.
  560. If this is None, the file will be read into memory all at once.
  561. {decompression_options}
  562. .. versionchanged:: 1.4.0 Zstandard support.
  563. nrows : int, optional
  564. The number of lines from the line-delimited jsonfile that has to be read.
  565. This can only be passed if `lines=True`.
  566. If this is None, all the rows will be returned.
  567. {storage_options}
  568. dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
  569. Back-end data type applied to the resultant :class:`DataFrame`
  570. (still experimental). Behaviour is as follows:
  571. * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
  572. (default).
  573. * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
  574. DataFrame.
  575. .. versionadded:: 2.0
  576. engine : {{"ujson", "pyarrow"}}, default "ujson"
  577. Parser engine to use. The ``"pyarrow"`` engine is only available when
  578. ``lines=True``.
  579. .. versionadded:: 2.0
  580. Returns
  581. -------
  582. Series, DataFrame, or pandas.api.typing.JsonReader
  583. A JsonReader is returned when ``chunksize`` is not ``0`` or ``None``.
  584. Otherwise, the type returned depends on the value of ``typ``.
  585. See Also
  586. --------
  587. DataFrame.to_json : Convert a DataFrame to a JSON string.
  588. Series.to_json : Convert a Series to a JSON string.
  589. json_normalize : Normalize semi-structured JSON data into a flat table.
  590. Notes
  591. -----
  592. Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
  593. :class:`Index` name of `index` gets written with :func:`to_json`, the
  594. subsequent read operation will incorrectly set the :class:`Index` name to
  595. ``None``. This is because `index` is also used by :func:`DataFrame.to_json`
  596. to denote a missing :class:`Index` name, and the subsequent
  597. :func:`read_json` operation cannot distinguish between the two. The same
  598. limitation is encountered with a :class:`MultiIndex` and any names
  599. beginning with ``'level_'``.
  600. Examples
  601. --------
  602. >>> from io import StringIO
  603. >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
  604. ... index=['row 1', 'row 2'],
  605. ... columns=['col 1', 'col 2'])
  606. Encoding/decoding a Dataframe using ``'split'`` formatted JSON:
  607. >>> df.to_json(orient='split')
  608. '\
  609. {{\
  610. "columns":["col 1","col 2"],\
  611. "index":["row 1","row 2"],\
  612. "data":[["a","b"],["c","d"]]\
  613. }}\
  614. '
  615. >>> pd.read_json(StringIO(_), orient='split')
  616. col 1 col 2
  617. row 1 a b
  618. row 2 c d
  619. Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
  620. >>> df.to_json(orient='index')
  621. '{{"row 1":{{"col 1":"a","col 2":"b"}},"row 2":{{"col 1":"c","col 2":"d"}}}}'
  622. >>> pd.read_json(StringIO(_), orient='index')
  623. col 1 col 2
  624. row 1 a b
  625. row 2 c d
  626. Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
  627. Note that index labels are not preserved with this encoding.
  628. >>> df.to_json(orient='records')
  629. '[{{"col 1":"a","col 2":"b"}},{{"col 1":"c","col 2":"d"}}]'
  630. >>> pd.read_json(StringIO(_), orient='records')
  631. col 1 col 2
  632. 0 a b
  633. 1 c d
  634. Encoding with Table Schema
  635. >>> df.to_json(orient='table')
  636. '\
  637. {{"schema":{{"fields":[\
  638. {{"name":"index","type":"string"}},\
  639. {{"name":"col 1","type":"string"}},\
  640. {{"name":"col 2","type":"string"}}],\
  641. "primaryKey":["index"],\
  642. "pandas_version":"1.4.0"}},\
  643. "data":[\
  644. {{"index":"row 1","col 1":"a","col 2":"b"}},\
  645. {{"index":"row 2","col 1":"c","col 2":"d"}}]\
  646. }}\
  647. '
  648. The following example uses ``dtype_backend="numpy_nullable"``
  649. >>> data = '''{{"index": {{"0": 0, "1": 1}},
  650. ... "a": {{"0": 1, "1": null}},
  651. ... "b": {{"0": 2.5, "1": 4.5}},
  652. ... "c": {{"0": true, "1": false}},
  653. ... "d": {{"0": "a", "1": "b"}},
  654. ... "e": {{"0": 1577.2, "1": 1577.1}}}}'''
  655. >>> pd.read_json(StringIO(data), dtype_backend="numpy_nullable")
  656. index a b c d e
  657. 0 0 1 2.5 True a 1577.2
  658. 1 1 <NA> 4.5 False b 1577.1
  659. """
  660. if orient == "table" and dtype:
  661. raise ValueError("cannot pass both dtype and orient='table'")
  662. if orient == "table" and convert_axes:
  663. raise ValueError("cannot pass both convert_axes and orient='table'")
  664. check_dtype_backend(dtype_backend)
  665. if dtype is None and orient != "table":
  666. # error: Incompatible types in assignment (expression has type "bool", variable
  667. # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float],
  668. # Type[int], Type[complex], Type[bool], Type[object], Dict[Hashable,
  669. # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float],
  670. # Type[int], Type[complex], Type[bool], Type[object]]], None]")
  671. dtype = True # type: ignore[assignment]
  672. if convert_axes is None and orient != "table":
  673. convert_axes = True
  674. json_reader = JsonReader(
  675. path_or_buf,
  676. orient=orient,
  677. typ=typ,
  678. dtype=dtype,
  679. convert_axes=convert_axes,
  680. convert_dates=convert_dates,
  681. keep_default_dates=keep_default_dates,
  682. precise_float=precise_float,
  683. date_unit=date_unit,
  684. encoding=encoding,
  685. lines=lines,
  686. chunksize=chunksize,
  687. compression=compression,
  688. nrows=nrows,
  689. storage_options=storage_options,
  690. encoding_errors=encoding_errors,
  691. dtype_backend=dtype_backend,
  692. engine=engine,
  693. )
  694. if chunksize:
  695. return json_reader
  696. else:
  697. return json_reader.read()
  698. class JsonReader(abc.Iterator, Generic[FrameSeriesStrT]):
  699. """
  700. JsonReader provides an interface for reading in a JSON file.
  701. If initialized with ``lines=True`` and ``chunksize``, can be iterated over
  702. ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
  703. whole document.
  704. """
  705. def __init__(
  706. self,
  707. filepath_or_buffer,
  708. orient,
  709. typ: FrameSeriesStrT,
  710. dtype,
  711. convert_axes: bool | None,
  712. convert_dates,
  713. keep_default_dates: bool,
  714. precise_float: bool,
  715. date_unit,
  716. encoding,
  717. lines: bool,
  718. chunksize: int | None,
  719. compression: CompressionOptions,
  720. nrows: int | None,
  721. storage_options: StorageOptions | None = None,
  722. encoding_errors: str | None = "strict",
  723. dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
  724. engine: JSONEngine = "ujson",
  725. ) -> None:
  726. self.orient = orient
  727. self.typ = typ
  728. self.dtype = dtype
  729. self.convert_axes = convert_axes
  730. self.convert_dates = convert_dates
  731. self.keep_default_dates = keep_default_dates
  732. self.precise_float = precise_float
  733. self.date_unit = date_unit
  734. self.encoding = encoding
  735. self.engine = engine
  736. self.compression = compression
  737. self.storage_options = storage_options
  738. self.lines = lines
  739. self.chunksize = chunksize
  740. self.nrows_seen = 0
  741. self.nrows = nrows
  742. self.encoding_errors = encoding_errors
  743. self.handles: IOHandles[str] | None = None
  744. self.dtype_backend = dtype_backend
  745. if self.engine not in {"pyarrow", "ujson"}:
  746. raise ValueError(
  747. f"The engine type {self.engine} is currently not supported."
  748. )
  749. if self.chunksize is not None:
  750. self.chunksize = validate_integer("chunksize", self.chunksize, 1)
  751. if not self.lines:
  752. raise ValueError("chunksize can only be passed if lines=True")
  753. if self.engine == "pyarrow":
  754. raise ValueError(
  755. "currently pyarrow engine doesn't support chunksize parameter"
  756. )
  757. if self.nrows is not None:
  758. self.nrows = validate_integer("nrows", self.nrows, 0)
  759. if not self.lines:
  760. raise ValueError("nrows can only be passed if lines=True")
  761. if (
  762. isinstance(filepath_or_buffer, str)
  763. and not self.lines
  764. and "\n" in filepath_or_buffer
  765. ):
  766. warnings.warn(
  767. "Passing literal json to 'read_json' is deprecated and "
  768. "will be removed in a future version. To read from a "
  769. "literal string, wrap it in a 'StringIO' object.",
  770. FutureWarning,
  771. stacklevel=find_stack_level(),
  772. )
  773. if self.engine == "pyarrow":
  774. if not self.lines:
  775. raise ValueError(
  776. "currently pyarrow engine only supports "
  777. "the line-delimited JSON format"
  778. )
  779. self.data = filepath_or_buffer
  780. elif self.engine == "ujson":
  781. data = self._get_data_from_filepath(filepath_or_buffer)
  782. self.data = self._preprocess_data(data)
  783. def _preprocess_data(self, data):
  784. """
  785. At this point, the data either has a `read` attribute (e.g. a file
  786. object or a StringIO) or is a string that is a JSON document.
  787. If self.chunksize, we prepare the data for the `__next__` method.
  788. Otherwise, we read it into memory for the `read` method.
  789. """
  790. if hasattr(data, "read") and not (self.chunksize or self.nrows):
  791. with self:
  792. data = data.read()
  793. if not hasattr(data, "read") and (self.chunksize or self.nrows):
  794. data = StringIO(data)
  795. return data
  796. def _get_data_from_filepath(self, filepath_or_buffer):
  797. """
  798. The function read_json accepts three input types:
  799. 1. filepath (string-like)
  800. 2. file-like object (e.g. open file object, StringIO)
  801. 3. JSON string
  802. This method turns (1) into (2) to simplify the rest of the processing.
  803. It returns input types (2) and (3) unchanged.
  804. It raises FileNotFoundError if the input is a string ending in
  805. one of .json, .json.gz, .json.bz2, etc. but no such file exists.
  806. """
  807. # if it is a string but the file does not exist, it might be a JSON string
  808. filepath_or_buffer = stringify_path(filepath_or_buffer)
  809. if (
  810. not isinstance(filepath_or_buffer, str)
  811. or is_url(filepath_or_buffer)
  812. or is_fsspec_url(filepath_or_buffer)
  813. or file_exists(filepath_or_buffer)
  814. ):
  815. self.handles = get_handle(
  816. filepath_or_buffer,
  817. "r",
  818. encoding=self.encoding,
  819. compression=self.compression,
  820. storage_options=self.storage_options,
  821. errors=self.encoding_errors,
  822. )
  823. filepath_or_buffer = self.handles.handle
  824. elif (
  825. isinstance(filepath_or_buffer, str)
  826. and filepath_or_buffer.lower().endswith(
  827. (".json",) + tuple(f".json{c}" for c in extension_to_compression)
  828. )
  829. and not file_exists(filepath_or_buffer)
  830. ):
  831. raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")
  832. else:
  833. warnings.warn(
  834. "Passing literal json to 'read_json' is deprecated and "
  835. "will be removed in a future version. To read from a "
  836. "literal string, wrap it in a 'StringIO' object.",
  837. FutureWarning,
  838. stacklevel=find_stack_level(),
  839. )
  840. return filepath_or_buffer
  841. def _combine_lines(self, lines) -> str:
  842. """
  843. Combines a list of JSON objects into one JSON object.
  844. """
  845. return (
  846. f'[{",".join([line for line in (line.strip() for line in lines) if line])}]'
  847. )
  848. @overload
  849. def read(self: JsonReader[Literal["frame"]]) -> DataFrame:
  850. ...
  851. @overload
  852. def read(self: JsonReader[Literal["series"]]) -> Series:
  853. ...
  854. @overload
  855. def read(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
  856. ...
  857. def read(self) -> DataFrame | Series:
  858. """
  859. Read the whole JSON input into a pandas object.
  860. """
  861. obj: DataFrame | Series
  862. with self:
  863. if self.engine == "pyarrow":
  864. pyarrow_json = import_optional_dependency("pyarrow.json")
  865. pa_table = pyarrow_json.read_json(self.data)
  866. return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend)
  867. elif self.engine == "ujson":
  868. if self.lines:
  869. if self.chunksize:
  870. obj = concat(self)
  871. elif self.nrows:
  872. lines = list(islice(self.data, self.nrows))
  873. lines_json = self._combine_lines(lines)
  874. obj = self._get_object_parser(lines_json)
  875. else:
  876. data = ensure_str(self.data)
  877. data_lines = data.split("\n")
  878. obj = self._get_object_parser(self._combine_lines(data_lines))
  879. else:
  880. obj = self._get_object_parser(self.data)
  881. if self.dtype_backend is not lib.no_default:
  882. return obj.convert_dtypes(
  883. infer_objects=False, dtype_backend=self.dtype_backend
  884. )
  885. else:
  886. return obj
  887. def _get_object_parser(self, json) -> DataFrame | Series:
  888. """
  889. Parses a json document into a pandas object.
  890. """
  891. typ = self.typ
  892. dtype = self.dtype
  893. kwargs = {
  894. "orient": self.orient,
  895. "dtype": self.dtype,
  896. "convert_axes": self.convert_axes,
  897. "convert_dates": self.convert_dates,
  898. "keep_default_dates": self.keep_default_dates,
  899. "precise_float": self.precise_float,
  900. "date_unit": self.date_unit,
  901. "dtype_backend": self.dtype_backend,
  902. }
  903. obj = None
  904. if typ == "frame":
  905. obj = FrameParser(json, **kwargs).parse()
  906. if typ == "series" or obj is None:
  907. if not isinstance(dtype, bool):
  908. kwargs["dtype"] = dtype
  909. obj = SeriesParser(json, **kwargs).parse()
  910. return obj
  911. def close(self) -> None:
  912. """
  913. If we opened a stream earlier, in _get_data_from_filepath, we should
  914. close it.
  915. If an open stream or file was passed, we leave it open.
  916. """
  917. if self.handles is not None:
  918. self.handles.close()
  919. def __iter__(self) -> Self:
  920. return self
  921. @overload
  922. def __next__(self: JsonReader[Literal["frame"]]) -> DataFrame:
  923. ...
  924. @overload
  925. def __next__(self: JsonReader[Literal["series"]]) -> Series:
  926. ...
  927. @overload
  928. def __next__(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
  929. ...
  930. def __next__(self) -> DataFrame | Series:
  931. if self.nrows and self.nrows_seen >= self.nrows:
  932. self.close()
  933. raise StopIteration
  934. lines = list(islice(self.data, self.chunksize))
  935. if not lines:
  936. self.close()
  937. raise StopIteration
  938. try:
  939. lines_json = self._combine_lines(lines)
  940. obj = self._get_object_parser(lines_json)
  941. # Make sure that the returned objects have the right index.
  942. obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
  943. self.nrows_seen += len(obj)
  944. except Exception as ex:
  945. self.close()
  946. raise ex
  947. if self.dtype_backend is not lib.no_default:
  948. return obj.convert_dtypes(
  949. infer_objects=False, dtype_backend=self.dtype_backend
  950. )
  951. else:
  952. return obj
  953. def __enter__(self) -> Self:
  954. return self
  955. def __exit__(
  956. self,
  957. exc_type: type[BaseException] | None,
  958. exc_value: BaseException | None,
  959. traceback: TracebackType | None,
  960. ) -> None:
  961. self.close()
  962. class Parser:
  963. _split_keys: tuple[str, ...]
  964. _default_orient: str
  965. _STAMP_UNITS = ("s", "ms", "us", "ns")
  966. _MIN_STAMPS = {
  967. "s": 31536000,
  968. "ms": 31536000000,
  969. "us": 31536000000000,
  970. "ns": 31536000000000000,
  971. }
  972. json: str
  973. def __init__(
  974. self,
  975. json: str,
  976. orient,
  977. dtype: DtypeArg | None = None,
  978. convert_axes: bool = True,
  979. convert_dates: bool | list[str] = True,
  980. keep_default_dates: bool = False,
  981. precise_float: bool = False,
  982. date_unit=None,
  983. dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
  984. ) -> None:
  985. self.json = json
  986. if orient is None:
  987. orient = self._default_orient
  988. self.orient = orient
  989. self.dtype = dtype
  990. if date_unit is not None:
  991. date_unit = date_unit.lower()
  992. if date_unit not in self._STAMP_UNITS:
  993. raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}")
  994. self.min_stamp = self._MIN_STAMPS[date_unit]
  995. else:
  996. self.min_stamp = self._MIN_STAMPS["s"]
  997. self.precise_float = precise_float
  998. self.convert_axes = convert_axes
  999. self.convert_dates = convert_dates
  1000. self.date_unit = date_unit
  1001. self.keep_default_dates = keep_default_dates
  1002. self.obj: DataFrame | Series | None = None
  1003. self.dtype_backend = dtype_backend
  1004. @final
  1005. def check_keys_split(self, decoded: dict) -> None:
  1006. """
  1007. Checks that dict has only the appropriate keys for orient='split'.
  1008. """
  1009. bad_keys = set(decoded.keys()).difference(set(self._split_keys))
  1010. if bad_keys:
  1011. bad_keys_joined = ", ".join(bad_keys)
  1012. raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}")
  1013. @final
  1014. def parse(self):
  1015. self._parse()
  1016. if self.obj is None:
  1017. return None
  1018. if self.convert_axes:
  1019. self._convert_axes()
  1020. self._try_convert_types()
  1021. return self.obj
  1022. def _parse(self) -> None:
  1023. raise AbstractMethodError(self)
  1024. @final
  1025. def _convert_axes(self) -> None:
  1026. """
  1027. Try to convert axes.
  1028. """
  1029. obj = self.obj
  1030. assert obj is not None # for mypy
  1031. for axis_name in obj._AXIS_ORDERS:
  1032. ax = obj._get_axis(axis_name)
  1033. ser = Series(ax, dtype=ax.dtype, copy=False)
  1034. new_ser, result = self._try_convert_data(
  1035. name=axis_name,
  1036. data=ser,
  1037. use_dtypes=False,
  1038. convert_dates=True,
  1039. is_axis=True,
  1040. )
  1041. if result:
  1042. new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False)
  1043. setattr(self.obj, axis_name, new_axis)
  1044. def _try_convert_types(self) -> None:
  1045. raise AbstractMethodError(self)
  1046. @final
  1047. def _try_convert_data(
  1048. self,
  1049. name: Hashable,
  1050. data: Series,
  1051. use_dtypes: bool = True,
  1052. convert_dates: bool | list[str] = True,
  1053. is_axis: bool = False,
  1054. ) -> tuple[Series, bool]:
  1055. """
  1056. Try to parse a Series into a column by inferring dtype.
  1057. """
  1058. # don't try to coerce, unless a force conversion
  1059. if use_dtypes:
  1060. if not self.dtype:
  1061. if all(notna(data)):
  1062. return data, False
  1063. with warnings.catch_warnings():
  1064. warnings.filterwarnings(
  1065. "ignore",
  1066. "Downcasting object dtype arrays",
  1067. category=FutureWarning,
  1068. )
  1069. filled = data.fillna(np.nan)
  1070. return filled, True
  1071. elif self.dtype is True:
  1072. pass
  1073. else:
  1074. # dtype to force
  1075. dtype = (
  1076. self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
  1077. )
  1078. if dtype is not None:
  1079. try:
  1080. return data.astype(dtype), True
  1081. except (TypeError, ValueError):
  1082. return data, False
  1083. if convert_dates:
  1084. new_data, result = self._try_convert_to_date(data)
  1085. if result:
  1086. return new_data, True
  1087. converted = False
  1088. if self.dtype_backend is not lib.no_default and not is_axis:
  1089. # Fall through for conversion later on
  1090. return data, True
  1091. elif is_string_dtype(data.dtype):
  1092. # try float
  1093. try:
  1094. data = data.astype("float64")
  1095. converted = True
  1096. except (TypeError, ValueError):
  1097. pass
  1098. if data.dtype.kind == "f" and data.dtype != "float64":
  1099. # coerce floats to 64
  1100. try:
  1101. data = data.astype("float64")
  1102. converted = True
  1103. except (TypeError, ValueError):
  1104. pass
  1105. # don't coerce 0-len data
  1106. if len(data) and data.dtype in ("float", "object"):
  1107. # coerce ints if we can
  1108. try:
  1109. new_data = data.astype("int64")
  1110. if (new_data == data).all():
  1111. data = new_data
  1112. converted = True
  1113. except (TypeError, ValueError, OverflowError):
  1114. pass
  1115. if data.dtype == "int" and data.dtype != "int64":
  1116. # coerce ints to 64
  1117. try:
  1118. data = data.astype("int64")
  1119. converted = True
  1120. except (TypeError, ValueError):
  1121. pass
  1122. # if we have an index, we want to preserve dtypes
  1123. if name == "index" and len(data):
  1124. if self.orient == "split":
  1125. return data, False
  1126. return data, converted
  1127. @final
  1128. def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]:
  1129. """
  1130. Try to parse a ndarray like into a date column.
  1131. Try to coerce object in epoch/iso formats and integer/float in epoch
  1132. formats. Return a boolean if parsing was successful.
  1133. """
  1134. # no conversion on empty
  1135. if not len(data):
  1136. return data, False
  1137. new_data = data
  1138. if new_data.dtype == "string":
  1139. new_data = new_data.astype(object)
  1140. if new_data.dtype == "object":
  1141. try:
  1142. new_data = data.astype("int64")
  1143. except OverflowError:
  1144. return data, False
  1145. except (TypeError, ValueError):
  1146. pass
  1147. # ignore numbers that are out of range
  1148. if issubclass(new_data.dtype.type, np.number):
  1149. in_range = (
  1150. isna(new_data._values)
  1151. | (new_data > self.min_stamp)
  1152. | (new_data._values == iNaT)
  1153. )
  1154. if not in_range.all():
  1155. return data, False
  1156. date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
  1157. for date_unit in date_units:
  1158. try:
  1159. with warnings.catch_warnings():
  1160. warnings.filterwarnings(
  1161. "ignore",
  1162. ".*parsing datetimes with mixed time "
  1163. "zones will raise an error",
  1164. category=FutureWarning,
  1165. )
  1166. new_data = to_datetime(new_data, errors="raise", unit=date_unit)
  1167. except (ValueError, OverflowError, TypeError):
  1168. continue
  1169. return new_data, True
  1170. return data, False
  1171. class SeriesParser(Parser):
  1172. _default_orient = "index"
  1173. _split_keys = ("name", "index", "data")
  1174. obj: Series | None
  1175. def _parse(self) -> None:
  1176. data = ujson_loads(self.json, precise_float=self.precise_float)
  1177. if self.orient == "split":
  1178. decoded = {str(k): v for k, v in data.items()}
  1179. self.check_keys_split(decoded)
  1180. self.obj = Series(**decoded)
  1181. else:
  1182. self.obj = Series(data)
  1183. def _try_convert_types(self) -> None:
  1184. if self.obj is None:
  1185. return
  1186. obj, result = self._try_convert_data(
  1187. "data", self.obj, convert_dates=self.convert_dates
  1188. )
  1189. if result:
  1190. self.obj = obj
  1191. class FrameParser(Parser):
  1192. _default_orient = "columns"
  1193. _split_keys = ("columns", "index", "data")
  1194. obj: DataFrame | None
  1195. def _parse(self) -> None:
  1196. json = self.json
  1197. orient = self.orient
  1198. if orient == "columns":
  1199. self.obj = DataFrame(
  1200. ujson_loads(json, precise_float=self.precise_float), dtype=None
  1201. )
  1202. elif orient == "split":
  1203. decoded = {
  1204. str(k): v
  1205. for k, v in ujson_loads(json, precise_float=self.precise_float).items()
  1206. }
  1207. self.check_keys_split(decoded)
  1208. orig_names = [
  1209. (tuple(col) if isinstance(col, list) else col)
  1210. for col in decoded["columns"]
  1211. ]
  1212. decoded["columns"] = dedup_names(
  1213. orig_names,
  1214. is_potential_multi_index(orig_names, None),
  1215. )
  1216. self.obj = DataFrame(dtype=None, **decoded)
  1217. elif orient == "index":
  1218. self.obj = DataFrame.from_dict(
  1219. ujson_loads(json, precise_float=self.precise_float),
  1220. dtype=None,
  1221. orient="index",
  1222. )
  1223. elif orient == "table":
  1224. self.obj = parse_table_schema(json, precise_float=self.precise_float)
  1225. else:
  1226. self.obj = DataFrame(
  1227. ujson_loads(json, precise_float=self.precise_float), dtype=None
  1228. )
  1229. def _process_converter(
  1230. self,
  1231. f: Callable[[Hashable, Series], tuple[Series, bool]],
  1232. filt: Callable[[Hashable], bool] | None = None,
  1233. ) -> None:
  1234. """
  1235. Take a conversion function and possibly recreate the frame.
  1236. """
  1237. if filt is None:
  1238. filt = lambda col: True
  1239. obj = self.obj
  1240. assert obj is not None # for mypy
  1241. needs_new_obj = False
  1242. new_obj = {}
  1243. for i, (col, c) in enumerate(obj.items()):
  1244. if filt(col):
  1245. new_data, result = f(col, c)
  1246. if result:
  1247. c = new_data
  1248. needs_new_obj = True
  1249. new_obj[i] = c
  1250. if needs_new_obj:
  1251. # possibly handle dup columns
  1252. new_frame = DataFrame(new_obj, index=obj.index)
  1253. new_frame.columns = obj.columns
  1254. self.obj = new_frame
  1255. def _try_convert_types(self) -> None:
  1256. if self.obj is None:
  1257. return
  1258. if self.convert_dates:
  1259. self._try_convert_dates()
  1260. self._process_converter(
  1261. lambda col, c: self._try_convert_data(col, c, convert_dates=False)
  1262. )
  1263. def _try_convert_dates(self) -> None:
  1264. if self.obj is None:
  1265. return
  1266. # our columns to parse
  1267. convert_dates_list_bool = self.convert_dates
  1268. if isinstance(convert_dates_list_bool, bool):
  1269. convert_dates_list_bool = []
  1270. convert_dates = set(convert_dates_list_bool)
  1271. def is_ok(col) -> bool:
  1272. """
  1273. Return if this col is ok to try for a date parse.
  1274. """
  1275. if col in convert_dates:
  1276. return True
  1277. if not self.keep_default_dates:
  1278. return False
  1279. if not isinstance(col, str):
  1280. return False
  1281. col_lower = col.lower()
  1282. if (
  1283. col_lower.endswith(("_at", "_time"))
  1284. or col_lower == "modified"
  1285. or col_lower == "date"
  1286. or col_lower == "datetime"
  1287. or col_lower.startswith("timestamp")
  1288. ):
  1289. return True
  1290. return False
  1291. self._process_converter(lambda col, c: self._try_convert_to_date(c), filt=is_ok)