_table_schema.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387
  1. """
  2. Table Schema builders
  3. https://specs.frictionlessdata.io/table-schema/
  4. """
  5. from __future__ import annotations
  6. from typing import (
  7. TYPE_CHECKING,
  8. Any,
  9. cast,
  10. )
  11. import warnings
  12. from pandas._libs import lib
  13. from pandas._libs.json import ujson_loads
  14. from pandas._libs.tslibs import timezones
  15. from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
  16. from pandas.util._exceptions import find_stack_level
  17. from pandas.core.dtypes.base import _registry as registry
  18. from pandas.core.dtypes.common import (
  19. is_bool_dtype,
  20. is_integer_dtype,
  21. is_numeric_dtype,
  22. is_string_dtype,
  23. )
  24. from pandas.core.dtypes.dtypes import (
  25. CategoricalDtype,
  26. DatetimeTZDtype,
  27. ExtensionDtype,
  28. PeriodDtype,
  29. )
  30. from pandas import DataFrame
  31. import pandas.core.common as com
  32. from pandas.tseries.frequencies import to_offset
  33. if TYPE_CHECKING:
  34. from pandas._typing import (
  35. DtypeObj,
  36. JSONSerializable,
  37. )
  38. from pandas import Series
  39. from pandas.core.indexes.multi import MultiIndex
# Version string that build_table_schema writes under the "pandas_version"
# key of the schemas it generates (when its ``version`` flag is truthy).
TABLE_SCHEMA_VERSION = "1.4.0"
  41. def as_json_table_type(x: DtypeObj) -> str:
  42. """
  43. Convert a NumPy / pandas type to its corresponding json_table.
  44. Parameters
  45. ----------
  46. x : np.dtype or ExtensionDtype
  47. Returns
  48. -------
  49. str
  50. the Table Schema data types
  51. Notes
  52. -----
  53. This table shows the relationship between NumPy / pandas dtypes,
  54. and Table Schema dtypes.
  55. ============== =================
  56. Pandas type Table Schema type
  57. ============== =================
  58. int64 integer
  59. float64 number
  60. bool boolean
  61. datetime64[ns] datetime
  62. timedelta64[ns] duration
  63. object str
  64. categorical any
  65. =============== =================
  66. """
  67. if is_integer_dtype(x):
  68. return "integer"
  69. elif is_bool_dtype(x):
  70. return "boolean"
  71. elif is_numeric_dtype(x):
  72. return "number"
  73. elif lib.is_np_dtype(x, "M") or isinstance(x, (DatetimeTZDtype, PeriodDtype)):
  74. return "datetime"
  75. elif lib.is_np_dtype(x, "m"):
  76. return "duration"
  77. elif is_string_dtype(x):
  78. return "string"
  79. else:
  80. return "any"
  81. def set_default_names(data):
  82. """Sets index names to 'index' for regular, or 'level_x' for Multi"""
  83. if com.all_not_none(*data.index.names):
  84. nms = data.index.names
  85. if len(nms) == 1 and data.index.name == "index":
  86. warnings.warn(
  87. "Index name of 'index' is not round-trippable.",
  88. stacklevel=find_stack_level(),
  89. )
  90. elif len(nms) > 1 and any(x.startswith("level_") for x in nms):
  91. warnings.warn(
  92. "Index names beginning with 'level_' are not round-trippable.",
  93. stacklevel=find_stack_level(),
  94. )
  95. return data
  96. data = data.copy()
  97. if data.index.nlevels > 1:
  98. data.index.names = com.fill_missing_names(data.index.names)
  99. else:
  100. data.index.name = data.index.name or "index"
  101. return data
  102. def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
  103. dtype = arr.dtype
  104. name: JSONSerializable
  105. if arr.name is None:
  106. name = "values"
  107. else:
  108. name = arr.name
  109. field: dict[str, JSONSerializable] = {
  110. "name": name,
  111. "type": as_json_table_type(dtype),
  112. }
  113. if isinstance(dtype, CategoricalDtype):
  114. cats = dtype.categories
  115. ordered = dtype.ordered
  116. field["constraints"] = {"enum": list(cats)}
  117. field["ordered"] = ordered
  118. elif isinstance(dtype, PeriodDtype):
  119. field["freq"] = dtype.freq.freqstr
  120. elif isinstance(dtype, DatetimeTZDtype):
  121. if timezones.is_utc(dtype.tz):
  122. # timezone.utc has no "zone" attr
  123. field["tz"] = "UTC"
  124. else:
  125. # error: "tzinfo" has no attribute "zone"
  126. field["tz"] = dtype.tz.zone # type: ignore[attr-defined]
  127. elif isinstance(dtype, ExtensionDtype):
  128. field["extDtype"] = dtype.name
  129. return field
  130. def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
  131. """
  132. Converts a JSON field descriptor into its corresponding NumPy / pandas type
  133. Parameters
  134. ----------
  135. field
  136. A JSON field descriptor
  137. Returns
  138. -------
  139. dtype
  140. Raises
  141. ------
  142. ValueError
  143. If the type of the provided field is unknown or currently unsupported
  144. Examples
  145. --------
  146. >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
  147. 'int64'
  148. >>> convert_json_field_to_pandas_type(
  149. ... {
  150. ... "name": "a_categorical",
  151. ... "type": "any",
  152. ... "constraints": {"enum": ["a", "b", "c"]},
  153. ... "ordered": True,
  154. ... }
  155. ... )
  156. CategoricalDtype(categories=['a', 'b', 'c'], ordered=True, categories_dtype=object)
  157. >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
  158. 'datetime64[ns]'
  159. >>> convert_json_field_to_pandas_type(
  160. ... {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
  161. ... )
  162. 'datetime64[ns, US/Central]'
  163. """
  164. typ = field["type"]
  165. if typ == "string":
  166. return field.get("extDtype", None)
  167. elif typ == "integer":
  168. return field.get("extDtype", "int64")
  169. elif typ == "number":
  170. return field.get("extDtype", "float64")
  171. elif typ == "boolean":
  172. return field.get("extDtype", "bool")
  173. elif typ == "duration":
  174. return "timedelta64"
  175. elif typ == "datetime":
  176. if field.get("tz"):
  177. return f"datetime64[ns, {field['tz']}]"
  178. elif field.get("freq"):
  179. # GH#9586 rename frequency M to ME for offsets
  180. offset = to_offset(field["freq"])
  181. freq_n, freq_name = offset.n, offset.name
  182. freq = freq_to_period_freqstr(freq_n, freq_name)
  183. # GH#47747 using datetime over period to minimize the change surface
  184. return f"period[{freq}]"
  185. else:
  186. return "datetime64[ns]"
  187. elif typ == "any":
  188. if "constraints" in field and "ordered" in field:
  189. return CategoricalDtype(
  190. categories=field["constraints"]["enum"], ordered=field["ordered"]
  191. )
  192. elif "extDtype" in field:
  193. return registry.find(field["extDtype"])
  194. else:
  195. return "object"
  196. raise ValueError(f"Unsupported or invalid field type: {typ}")
  197. def build_table_schema(
  198. data: DataFrame | Series,
  199. index: bool = True,
  200. primary_key: bool | None = None,
  201. version: bool = True,
  202. ) -> dict[str, JSONSerializable]:
  203. """
  204. Create a Table schema from ``data``.
  205. Parameters
  206. ----------
  207. data : Series, DataFrame
  208. index : bool, default True
  209. Whether to include ``data.index`` in the schema.
  210. primary_key : bool or None, default True
  211. Column names to designate as the primary key.
  212. The default `None` will set `'primaryKey'` to the index
  213. level or levels if the index is unique.
  214. version : bool, default True
  215. Whether to include a field `pandas_version` with the version
  216. of pandas that last revised the table schema. This version
  217. can be different from the installed pandas version.
  218. Returns
  219. -------
  220. dict
  221. Notes
  222. -----
  223. See `Table Schema
  224. <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
  225. conversion types.
  226. Timedeltas as converted to ISO8601 duration format with
  227. 9 decimal places after the seconds field for nanosecond precision.
  228. Categoricals are converted to the `any` dtype, and use the `enum` field
  229. constraint to list the allowed values. The `ordered` attribute is included
  230. in an `ordered` field.
  231. Examples
  232. --------
  233. >>> from pandas.io.json._table_schema import build_table_schema
  234. >>> df = pd.DataFrame(
  235. ... {'A': [1, 2, 3],
  236. ... 'B': ['a', 'b', 'c'],
  237. ... 'C': pd.date_range('2016-01-01', freq='d', periods=3),
  238. ... }, index=pd.Index(range(3), name='idx'))
  239. >>> build_table_schema(df)
  240. {'fields': \
  241. [{'name': 'idx', 'type': 'integer'}, \
  242. {'name': 'A', 'type': 'integer'}, \
  243. {'name': 'B', 'type': 'string'}, \
  244. {'name': 'C', 'type': 'datetime'}], \
  245. 'primaryKey': ['idx'], \
  246. 'pandas_version': '1.4.0'}
  247. """
  248. if index is True:
  249. data = set_default_names(data)
  250. schema: dict[str, Any] = {}
  251. fields = []
  252. if index:
  253. if data.index.nlevels > 1:
  254. data.index = cast("MultiIndex", data.index)
  255. for level, name in zip(data.index.levels, data.index.names):
  256. new_field = convert_pandas_type_to_json_field(level)
  257. new_field["name"] = name
  258. fields.append(new_field)
  259. else:
  260. fields.append(convert_pandas_type_to_json_field(data.index))
  261. if data.ndim > 1:
  262. for column, s in data.items():
  263. fields.append(convert_pandas_type_to_json_field(s))
  264. else:
  265. fields.append(convert_pandas_type_to_json_field(data))
  266. schema["fields"] = fields
  267. if index and data.index.is_unique and primary_key is None:
  268. if data.index.nlevels == 1:
  269. schema["primaryKey"] = [data.index.name]
  270. else:
  271. schema["primaryKey"] = data.index.names
  272. elif primary_key is not None:
  273. schema["primaryKey"] = primary_key
  274. if version:
  275. schema["pandas_version"] = TABLE_SCHEMA_VERSION
  276. return schema
  277. def parse_table_schema(json, precise_float: bool) -> DataFrame:
  278. """
  279. Builds a DataFrame from a given schema
  280. Parameters
  281. ----------
  282. json :
  283. A JSON table schema
  284. precise_float : bool
  285. Flag controlling precision when decoding string to double values, as
  286. dictated by ``read_json``
  287. Returns
  288. -------
  289. df : DataFrame
  290. Raises
  291. ------
  292. NotImplementedError
  293. If the JSON table schema contains either timezone or timedelta data
  294. Notes
  295. -----
  296. Because :func:`DataFrame.to_json` uses the string 'index' to denote a
  297. name-less :class:`Index`, this function sets the name of the returned
  298. :class:`DataFrame` to ``None`` when said string is encountered with a
  299. normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
  300. applies to any strings beginning with 'level_'. Therefore, an
  301. :class:`Index` name of 'index' and :class:`MultiIndex` names starting
  302. with 'level_' are not supported.
  303. See Also
  304. --------
  305. build_table_schema : Inverse function.
  306. pandas.read_json
  307. """
  308. table = ujson_loads(json, precise_float=precise_float)
  309. col_order = [field["name"] for field in table["schema"]["fields"]]
  310. df = DataFrame(table["data"], columns=col_order)[col_order]
  311. dtypes = {
  312. field["name"]: convert_json_field_to_pandas_type(field)
  313. for field in table["schema"]["fields"]
  314. }
  315. # No ISO constructor for Timedelta as of yet, so need to raise
  316. if "timedelta64" in dtypes.values():
  317. raise NotImplementedError(
  318. 'table="orient" can not yet read ISO-formatted Timedelta data'
  319. )
  320. df = df.astype(dtypes)
  321. if "primaryKey" in table["schema"]:
  322. df = df.set_index(table["schema"]["primaryKey"])
  323. if len(df.index.names) == 1:
  324. if df.index.name == "index":
  325. df.index.name = None
  326. else:
  327. df.index.names = [
  328. None if x.startswith("level_") else x for x in df.index.names
  329. ]
  330. return df