_normalize.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544
  1. # ---------------------------------------------------------------------
  2. # JSON normalization routines
  3. from __future__ import annotations
  4. from collections import (
  5. abc,
  6. defaultdict,
  7. )
  8. import copy
  9. from typing import (
  10. TYPE_CHECKING,
  11. Any,
  12. DefaultDict,
  13. )
  14. import numpy as np
  15. from pandas._libs.writers import convert_json_to_lines
  16. import pandas as pd
  17. from pandas import DataFrame
  18. if TYPE_CHECKING:
  19. from collections.abc import Iterable
  20. from pandas._typing import (
  21. IgnoreRaise,
  22. Scalar,
  23. )
  24. def convert_to_line_delimits(s: str) -> str:
  25. """
  26. Helper function that converts JSON lists to line delimited JSON.
  27. """
  28. # Determine we have a JSON list to turn to lines otherwise just return the
  29. # json object, only lists can
  30. if not s[0] == "[" and s[-1] == "]":
  31. return s
  32. s = s[1:-1]
  33. return convert_json_to_lines(s)
  34. def nested_to_record(
  35. ds,
  36. prefix: str = "",
  37. sep: str = ".",
  38. level: int = 0,
  39. max_level: int | None = None,
  40. ):
  41. """
  42. A simplified json_normalize
  43. Converts a nested dict into a flat dict ("record"), unlike json_normalize,
  44. it does not attempt to extract a subset of the data.
  45. Parameters
  46. ----------
  47. ds : dict or list of dicts
  48. prefix: the prefix, optional, default: ""
  49. sep : str, default '.'
  50. Nested records will generate names separated by sep,
  51. e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
  52. level: int, optional, default: 0
  53. The number of levels in the json string.
  54. max_level: int, optional, default: None
  55. The max depth to normalize.
  56. Returns
  57. -------
  58. d - dict or list of dicts, matching `ds`
  59. Examples
  60. --------
  61. >>> nested_to_record(
  62. ... dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
  63. ... )
  64. {\
  65. 'flat1': 1, \
  66. 'dict1.c': 1, \
  67. 'dict1.d': 2, \
  68. 'nested.e.c': 1, \
  69. 'nested.e.d': 2, \
  70. 'nested.d': 2\
  71. }
  72. """
  73. singleton = False
  74. if isinstance(ds, dict):
  75. ds = [ds]
  76. singleton = True
  77. new_ds = []
  78. for d in ds:
  79. new_d = copy.deepcopy(d)
  80. for k, v in d.items():
  81. # each key gets renamed with prefix
  82. if not isinstance(k, str):
  83. k = str(k)
  84. if level == 0:
  85. newkey = k
  86. else:
  87. newkey = prefix + sep + k
  88. # flatten if type is dict and
  89. # current dict level < maximum level provided and
  90. # only dicts gets recurse-flattened
  91. # only at level>1 do we rename the rest of the keys
  92. if not isinstance(v, dict) or (
  93. max_level is not None and level >= max_level
  94. ):
  95. if level != 0: # so we skip copying for top level, common case
  96. v = new_d.pop(k)
  97. new_d[newkey] = v
  98. continue
  99. v = new_d.pop(k)
  100. new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
  101. new_ds.append(new_d)
  102. if singleton:
  103. return new_ds[0]
  104. return new_ds
  105. def _normalise_json(
  106. data: Any,
  107. key_string: str,
  108. normalized_dict: dict[str, Any],
  109. separator: str,
  110. ) -> dict[str, Any]:
  111. """
  112. Main recursive function
  113. Designed for the most basic use case of pd.json_normalize(data)
  114. intended as a performance improvement, see #15621
  115. Parameters
  116. ----------
  117. data : Any
  118. Type dependent on types contained within nested Json
  119. key_string : str
  120. New key (with separator(s) in) for data
  121. normalized_dict : dict
  122. The new normalized/flattened Json dict
  123. separator : str, default '.'
  124. Nested records will generate names separated by sep,
  125. e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
  126. """
  127. if isinstance(data, dict):
  128. for key, value in data.items():
  129. new_key = f"{key_string}{separator}{key}"
  130. if not key_string:
  131. new_key = new_key.removeprefix(separator)
  132. _normalise_json(
  133. data=value,
  134. key_string=new_key,
  135. normalized_dict=normalized_dict,
  136. separator=separator,
  137. )
  138. else:
  139. normalized_dict[key_string] = data
  140. return normalized_dict
  141. def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
  142. """
  143. Order the top level keys and then recursively go to depth
  144. Parameters
  145. ----------
  146. data : dict or list of dicts
  147. separator : str, default '.'
  148. Nested records will generate names separated by sep,
  149. e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
  150. Returns
  151. -------
  152. dict or list of dicts, matching `normalised_json_object`
  153. """
  154. top_dict_ = {k: v for k, v in data.items() if not isinstance(v, dict)}
  155. nested_dict_ = _normalise_json(
  156. data={k: v for k, v in data.items() if isinstance(v, dict)},
  157. key_string="",
  158. normalized_dict={},
  159. separator=separator,
  160. )
  161. return {**top_dict_, **nested_dict_}
  162. def _simple_json_normalize(
  163. ds: dict | list[dict],
  164. sep: str = ".",
  165. ) -> dict | list[dict] | Any:
  166. """
  167. A optimized basic json_normalize
  168. Converts a nested dict into a flat dict ("record"), unlike
  169. json_normalize and nested_to_record it doesn't do anything clever.
  170. But for the most basic use cases it enhances performance.
  171. E.g. pd.json_normalize(data)
  172. Parameters
  173. ----------
  174. ds : dict or list of dicts
  175. sep : str, default '.'
  176. Nested records will generate names separated by sep,
  177. e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
  178. Returns
  179. -------
  180. frame : DataFrame
  181. d - dict or list of dicts, matching `normalised_json_object`
  182. Examples
  183. --------
  184. >>> _simple_json_normalize(
  185. ... {
  186. ... "flat1": 1,
  187. ... "dict1": {"c": 1, "d": 2},
  188. ... "nested": {"e": {"c": 1, "d": 2}, "d": 2},
  189. ... }
  190. ... )
  191. {\
  192. 'flat1': 1, \
  193. 'dict1.c': 1, \
  194. 'dict1.d': 2, \
  195. 'nested.e.c': 1, \
  196. 'nested.e.d': 2, \
  197. 'nested.d': 2\
  198. }
  199. """
  200. normalised_json_object = {}
  201. # expect a dictionary, as most jsons are. However, lists are perfectly valid
  202. if isinstance(ds, dict):
  203. normalised_json_object = _normalise_json_ordered(data=ds, separator=sep)
  204. elif isinstance(ds, list):
  205. normalised_json_list = [_simple_json_normalize(row, sep=sep) for row in ds]
  206. return normalised_json_list
  207. return normalised_json_object
  208. def json_normalize(
  209. data: dict | list[dict],
  210. record_path: str | list | None = None,
  211. meta: str | list[str | list[str]] | None = None,
  212. meta_prefix: str | None = None,
  213. record_prefix: str | None = None,
  214. errors: IgnoreRaise = "raise",
  215. sep: str = ".",
  216. max_level: int | None = None,
  217. ) -> DataFrame:
  218. """
  219. Normalize semi-structured JSON data into a flat table.
  220. Parameters
  221. ----------
  222. data : dict or list of dicts
  223. Unserialized JSON objects.
  224. record_path : str or list of str, default None
  225. Path in each object to list of records. If not passed, data will be
  226. assumed to be an array of records.
  227. meta : list of paths (str or list of str), default None
  228. Fields to use as metadata for each record in resulting table.
  229. meta_prefix : str, default None
  230. If True, prefix records with dotted (?) path, e.g. foo.bar.field if
  231. meta is ['foo', 'bar'].
  232. record_prefix : str, default None
  233. If True, prefix records with dotted (?) path, e.g. foo.bar.field if
  234. path to records is ['foo', 'bar'].
  235. errors : {'raise', 'ignore'}, default 'raise'
  236. Configures error handling.
  237. * 'ignore' : will ignore KeyError if keys listed in meta are not
  238. always present.
  239. * 'raise' : will raise KeyError if keys listed in meta are not
  240. always present.
  241. sep : str, default '.'
  242. Nested records will generate names separated by sep.
  243. e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
  244. max_level : int, default None
  245. Max number of levels(depth of dict) to normalize.
  246. if None, normalizes all levels.
  247. Returns
  248. -------
  249. frame : DataFrame
  250. Normalize semi-structured JSON data into a flat table.
  251. Examples
  252. --------
  253. >>> data = [
  254. ... {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
  255. ... {"name": {"given": "Mark", "family": "Regner"}},
  256. ... {"id": 2, "name": "Faye Raker"},
  257. ... ]
  258. >>> pd.json_normalize(data)
  259. id name.first name.last name.given name.family name
  260. 0 1.0 Coleen Volk NaN NaN NaN
  261. 1 NaN NaN NaN Mark Regner NaN
  262. 2 2.0 NaN NaN NaN NaN Faye Raker
  263. >>> data = [
  264. ... {
  265. ... "id": 1,
  266. ... "name": "Cole Volk",
  267. ... "fitness": {"height": 130, "weight": 60},
  268. ... },
  269. ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
  270. ... {
  271. ... "id": 2,
  272. ... "name": "Faye Raker",
  273. ... "fitness": {"height": 130, "weight": 60},
  274. ... },
  275. ... ]
  276. >>> pd.json_normalize(data, max_level=0)
  277. id name fitness
  278. 0 1.0 Cole Volk {'height': 130, 'weight': 60}
  279. 1 NaN Mark Reg {'height': 130, 'weight': 60}
  280. 2 2.0 Faye Raker {'height': 130, 'weight': 60}
  281. Normalizes nested data up to level 1.
  282. >>> data = [
  283. ... {
  284. ... "id": 1,
  285. ... "name": "Cole Volk",
  286. ... "fitness": {"height": 130, "weight": 60},
  287. ... },
  288. ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
  289. ... {
  290. ... "id": 2,
  291. ... "name": "Faye Raker",
  292. ... "fitness": {"height": 130, "weight": 60},
  293. ... },
  294. ... ]
  295. >>> pd.json_normalize(data, max_level=1)
  296. id name fitness.height fitness.weight
  297. 0 1.0 Cole Volk 130 60
  298. 1 NaN Mark Reg 130 60
  299. 2 2.0 Faye Raker 130 60
  300. >>> data = [
  301. ... {
  302. ... "state": "Florida",
  303. ... "shortname": "FL",
  304. ... "info": {"governor": "Rick Scott"},
  305. ... "counties": [
  306. ... {"name": "Dade", "population": 12345},
  307. ... {"name": "Broward", "population": 40000},
  308. ... {"name": "Palm Beach", "population": 60000},
  309. ... ],
  310. ... },
  311. ... {
  312. ... "state": "Ohio",
  313. ... "shortname": "OH",
  314. ... "info": {"governor": "John Kasich"},
  315. ... "counties": [
  316. ... {"name": "Summit", "population": 1234},
  317. ... {"name": "Cuyahoga", "population": 1337},
  318. ... ],
  319. ... },
  320. ... ]
  321. >>> result = pd.json_normalize(
  322. ... data, "counties", ["state", "shortname", ["info", "governor"]]
  323. ... )
  324. >>> result
  325. name population state shortname info.governor
  326. 0 Dade 12345 Florida FL Rick Scott
  327. 1 Broward 40000 Florida FL Rick Scott
  328. 2 Palm Beach 60000 Florida FL Rick Scott
  329. 3 Summit 1234 Ohio OH John Kasich
  330. 4 Cuyahoga 1337 Ohio OH John Kasich
  331. >>> data = {"A": [1, 2]}
  332. >>> pd.json_normalize(data, "A", record_prefix="Prefix.")
  333. Prefix.0
  334. 0 1
  335. 1 2
  336. Returns normalized data with columns prefixed with the given string.
  337. """
  338. def _pull_field(
  339. js: dict[str, Any], spec: list | str, extract_record: bool = False
  340. ) -> Scalar | Iterable:
  341. """Internal function to pull field"""
  342. result = js
  343. try:
  344. if isinstance(spec, list):
  345. for field in spec:
  346. if result is None:
  347. raise KeyError(field)
  348. result = result[field]
  349. else:
  350. result = result[spec]
  351. except KeyError as e:
  352. if extract_record:
  353. raise KeyError(
  354. f"Key {e} not found. If specifying a record_path, all elements of "
  355. f"data should have the path."
  356. ) from e
  357. if errors == "ignore":
  358. return np.nan
  359. else:
  360. raise KeyError(
  361. f"Key {e} not found. To replace missing values of {e} with "
  362. f"np.nan, pass in errors='ignore'"
  363. ) from e
  364. return result
  365. def _pull_records(js: dict[str, Any], spec: list | str) -> list:
  366. """
  367. Internal function to pull field for records, and similar to
  368. _pull_field, but require to return list. And will raise error
  369. if has non iterable value.
  370. """
  371. result = _pull_field(js, spec, extract_record=True)
  372. # GH 31507 GH 30145, GH 26284 if result is not list, raise TypeError if not
  373. # null, otherwise return an empty list
  374. if not isinstance(result, list):
  375. if pd.isnull(result):
  376. result = []
  377. else:
  378. raise TypeError(
  379. f"{js} has non list value {result} for path {spec}. "
  380. "Must be list or null."
  381. )
  382. return result
  383. if isinstance(data, list) and not data:
  384. return DataFrame()
  385. elif isinstance(data, dict):
  386. # A bit of a hackjob
  387. data = [data]
  388. elif isinstance(data, abc.Iterable) and not isinstance(data, str):
  389. # GH35923 Fix pd.json_normalize to not skip the first element of a
  390. # generator input
  391. data = list(data)
  392. else:
  393. raise NotImplementedError
  394. # check to see if a simple recursive function is possible to
  395. # improve performance (see #15621) but only for cases such
  396. # as pd.Dataframe(data) or pd.Dataframe(data, sep)
  397. if (
  398. record_path is None
  399. and meta is None
  400. and meta_prefix is None
  401. and record_prefix is None
  402. and max_level is None
  403. ):
  404. return DataFrame(_simple_json_normalize(data, sep=sep))
  405. if record_path is None:
  406. if any([isinstance(x, dict) for x in y.values()] for y in data):
  407. # naive normalization, this is idempotent for flat records
  408. # and potentially will inflate the data considerably for
  409. # deeply nested structures:
  410. # {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
  411. #
  412. # TODO: handle record value which are lists, at least error
  413. # reasonably
  414. data = nested_to_record(data, sep=sep, max_level=max_level)
  415. return DataFrame(data)
  416. elif not isinstance(record_path, list):
  417. record_path = [record_path]
  418. if meta is None:
  419. meta = []
  420. elif not isinstance(meta, list):
  421. meta = [meta]
  422. _meta = [m if isinstance(m, list) else [m] for m in meta]
  423. # Disastrously inefficient for now
  424. records: list = []
  425. lengths = []
  426. meta_vals: DefaultDict = defaultdict(list)
  427. meta_keys = [sep.join(val) for val in _meta]
  428. def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
  429. if isinstance(data, dict):
  430. data = [data]
  431. if len(path) > 1:
  432. for obj in data:
  433. for val, key in zip(_meta, meta_keys):
  434. if level + 1 == len(val):
  435. seen_meta[key] = _pull_field(obj, val[-1])
  436. _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
  437. else:
  438. for obj in data:
  439. recs = _pull_records(obj, path[0])
  440. recs = [
  441. nested_to_record(r, sep=sep, max_level=max_level)
  442. if isinstance(r, dict)
  443. else r
  444. for r in recs
  445. ]
  446. # For repeating the metadata later
  447. lengths.append(len(recs))
  448. for val, key in zip(_meta, meta_keys):
  449. if level + 1 > len(val):
  450. meta_val = seen_meta[key]
  451. else:
  452. meta_val = _pull_field(obj, val[level:])
  453. meta_vals[key].append(meta_val)
  454. records.extend(recs)
  455. _recursive_extract(data, record_path, {}, level=0)
  456. result = DataFrame(records)
  457. if record_prefix is not None:
  458. result = result.rename(columns=lambda x: f"{record_prefix}{x}")
  459. # Data types, a problem
  460. for k, v in meta_vals.items():
  461. if meta_prefix is not None:
  462. k = meta_prefix + k
  463. if k in result:
  464. raise ValueError(
  465. f"Conflicting metadata name {k}, need distinguishing prefix "
  466. )
  467. # GH 37782
  468. values = np.array(v, dtype=object)
  469. if values.ndim > 1:
  470. # GH 37782
  471. values = np.empty((len(v),), dtype=object)
  472. for i, v in enumerate(v):
  473. values[i] = v
  474. result[k] = values.repeat(lengths)
  475. return result