orc.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. """ orc compat """
  2. from __future__ import annotations
  3. import io
  4. from types import ModuleType
  5. from typing import (
  6. TYPE_CHECKING,
  7. Any,
  8. Literal,
  9. )
  10. from pandas._libs import lib
  11. from pandas.compat._optional import import_optional_dependency
  12. from pandas.util._validators import check_dtype_backend
  13. from pandas.core.indexes.api import default_index
  14. from pandas.io._util import arrow_table_to_pandas
  15. from pandas.io.common import (
  16. get_handle,
  17. is_fsspec_url,
  18. )
  19. if TYPE_CHECKING:
  20. import fsspec
  21. import pyarrow.fs
  22. from pandas._typing import (
  23. DtypeBackend,
  24. FilePath,
  25. ReadBuffer,
  26. WriteBuffer,
  27. )
  28. from pandas.core.frame import DataFrame
  29. def read_orc(
  30. path: FilePath | ReadBuffer[bytes],
  31. columns: list[str] | None = None,
  32. dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
  33. filesystem: pyarrow.fs.FileSystem | fsspec.spec.AbstractFileSystem | None = None,
  34. **kwargs: Any,
  35. ) -> DataFrame:
  36. """
  37. Load an ORC object from the file path, returning a DataFrame.
  38. Parameters
  39. ----------
  40. path : str, path object, or file-like object
  41. String, path object (implementing ``os.PathLike[str]``), or file-like
  42. object implementing a binary ``read()`` function. The string could be a URL.
  43. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
  44. expected. A local file could be:
  45. ``file://localhost/path/to/table.orc``.
  46. columns : list, default None
  47. If not None, only these columns will be read from the file.
  48. Output always follows the ordering of the file and not the columns list.
  49. This mirrors the original behaviour of
  50. :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
  51. dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
  52. Back-end data type applied to the resultant :class:`DataFrame`
  53. (still experimental). Behaviour is as follows:
  54. * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
  55. (default).
  56. * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
  57. DataFrame.
  58. .. versionadded:: 2.0
  59. filesystem : fsspec or pyarrow filesystem, default None
  60. Filesystem object to use when reading the parquet file.
  61. .. versionadded:: 2.1.0
  62. **kwargs
  63. Any additional kwargs are passed to pyarrow.
  64. Returns
  65. -------
  66. DataFrame
  67. Notes
  68. -----
  69. Before using this function you should read the :ref:`user guide about ORC <io.orc>`
  70. and :ref:`install optional dependencies <install.warn_orc>`.
  71. If ``path`` is a URI scheme pointing to a local or remote file (e.g. "s3://"),
  72. a ``pyarrow.fs`` filesystem will be attempted to read the file. You can also pass a
  73. pyarrow or fsspec filesystem object into the filesystem keyword to override this
  74. behavior.
  75. Examples
  76. --------
  77. >>> result = pd.read_orc("example_pa.orc") # doctest: +SKIP
  78. """
  79. # we require a newer version of pyarrow than we support for parquet
  80. orc = import_optional_dependency("pyarrow.orc")
  81. check_dtype_backend(dtype_backend)
  82. with get_handle(path, "rb", is_text=False) as handles:
  83. source = handles.handle
  84. if is_fsspec_url(path) and filesystem is None:
  85. pa = import_optional_dependency("pyarrow")
  86. pa_fs = import_optional_dependency("pyarrow.fs")
  87. try:
  88. filesystem, source = pa_fs.FileSystem.from_uri(path)
  89. except (TypeError, pa.ArrowInvalid):
  90. pass
  91. pa_table = orc.read_table(
  92. source=source, columns=columns, filesystem=filesystem, **kwargs
  93. )
  94. return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
  95. def to_orc(
  96. df: DataFrame,
  97. path: FilePath | WriteBuffer[bytes] | None = None,
  98. *,
  99. engine: Literal["pyarrow"] = "pyarrow",
  100. index: bool | None = None,
  101. engine_kwargs: dict[str, Any] | None = None,
  102. ) -> bytes | None:
  103. """
  104. Write a DataFrame to the ORC format.
  105. .. versionadded:: 1.5.0
  106. Parameters
  107. ----------
  108. df : DataFrame
  109. The dataframe to be written to ORC. Raises NotImplementedError
  110. if dtype of one or more columns is category, unsigned integers,
  111. intervals, periods or sparse.
  112. path : str, file-like object or None, default None
  113. If a string, it will be used as Root Directory path
  114. when writing a partitioned dataset. By file-like object,
  115. we refer to objects with a write() method, such as a file handle
  116. (e.g. via builtin open function). If path is None,
  117. a bytes object is returned.
  118. engine : str, default 'pyarrow'
  119. ORC library to use.
  120. index : bool, optional
  121. If ``True``, include the dataframe's index(es) in the file output. If
  122. ``False``, they will not be written to the file.
  123. If ``None``, similar to ``infer`` the dataframe's index(es)
  124. will be saved. However, instead of being saved as values,
  125. the RangeIndex will be stored as a range in the metadata so it
  126. doesn't require much space and is faster. Other indexes will
  127. be included as columns in the file output.
  128. engine_kwargs : dict[str, Any] or None, default None
  129. Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
  130. Returns
  131. -------
  132. bytes if no path argument is provided else None
  133. Raises
  134. ------
  135. NotImplementedError
  136. Dtype of one or more columns is category, unsigned integers, interval,
  137. period or sparse.
  138. ValueError
  139. engine is not pyarrow.
  140. Notes
  141. -----
  142. * Before using this function you should read the
  143. :ref:`user guide about ORC <io.orc>` and
  144. :ref:`install optional dependencies <install.warn_orc>`.
  145. * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
  146. library.
  147. * For supported dtypes please refer to `supported ORC features in Arrow
  148. <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
  149. * Currently timezones in datetime columns are not preserved when a
  150. dataframe is converted into ORC files.
  151. """
  152. if index is None:
  153. index = df.index.names[0] is not None
  154. if engine_kwargs is None:
  155. engine_kwargs = {}
  156. # validate index
  157. # --------------
  158. # validate that we have only a default index
  159. # raise on anything else as we don't serialize the index
  160. if not df.index.equals(default_index(len(df))):
  161. raise ValueError(
  162. "orc does not support serializing a non-default index for the index; "
  163. "you can .reset_index() to make the index into column(s)"
  164. )
  165. if df.index.name is not None:
  166. raise ValueError("orc does not serialize index meta-data on a default index")
  167. if engine != "pyarrow":
  168. raise ValueError("engine must be 'pyarrow'")
  169. engine = import_optional_dependency(engine, min_version="10.0.1")
  170. pa = import_optional_dependency("pyarrow")
  171. orc = import_optional_dependency("pyarrow.orc")
  172. was_none = path is None
  173. if was_none:
  174. path = io.BytesIO()
  175. assert path is not None # For mypy
  176. with get_handle(path, "wb", is_text=False) as handles:
  177. assert isinstance(engine, ModuleType) # For mypy
  178. try:
  179. orc.write_table(
  180. engine.Table.from_pandas(df, preserve_index=index),
  181. handles.handle,
  182. **engine_kwargs,
  183. )
  184. except (TypeError, pa.ArrowNotImplementedError) as e:
  185. raise NotImplementedError(
  186. "The dtype of one or more columns is not supported yet."
  187. ) from e
  188. if was_none:
  189. assert isinstance(path, io.BytesIO) # For mypy
  190. return path.getvalue()
  191. return None