| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444 |
- """ test orc compat """
- import datetime
- from decimal import Decimal
- from io import BytesIO
- import os
- import pathlib
- import numpy as np
- import pytest
- import pandas as pd
- from pandas import read_orc
- import pandas._testing as tm
- from pandas.core.arrays import StringArray
- pytest.importorskip("pyarrow.orc")
- import pyarrow as pa
# Applied to every test in this module: ignore the DeprecationWarning whose
# message starts with "Passing a BlockManager to DataFrame".
pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@pytest.fixture
def dirpath(datapath):
    """Return the ``io/data/orc`` location as resolved by ``datapath``."""
    orc_data_dir = datapath("io", "data", "orc")
    return orc_data_dir
@pytest.fixture(
    params=[
        np.array([1, 20], dtype="uint64"),
        pd.Series(["a", "b", "a"], dtype="category"),
        [pd.Interval(left=0, right=right) for right in (2, 5)],
        [pd.Period(day, freq="D") for day in ("2022-01-03", "2022-01-04")],
    ]
)
def orc_writer_dtypes_not_supported(request):
    """Return a one-column frame ("unimpl") built from the current param."""
    # The parametrized values cover unsigned integers, category, interval
    # and period data, i.e. dtypes for which conversion to ORC hasn't been
    # implemented yet.
    unsupported_values = request.param
    return pd.DataFrame({"unimpl": unsupported_values})
def test_orc_reader_empty(dirpath, using_infer_string):
    """Read ``TestOrcFile.emptyFile.orc`` and expect an empty, typed frame."""
    # Only the string column's expected dtype depends on the infer_string flag.
    string_dtype = "str" if using_infer_string else "object"
    column_dtypes = {
        "boolean1": "bool",
        "byte1": "int8",
        "short1": "int16",
        "int1": "int32",
        "long1": "int64",
        "float1": "float32",
        "double1": "float64",
        "bytes1": "object",
        "string1": string_dtype,
    }
    columns = list(column_dtypes)

    # Start from a zero-length frame and add one empty Series per column.
    expected = pd.DataFrame(index=pd.RangeIndex(0))
    for name, dtype_name in column_dtypes.items():
        expected[name] = pd.Series(dtype=dtype_name)
    expected.columns = expected.columns.astype("str")

    orc_path = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
    got = read_orc(orc_path, columns=columns)

    tm.assert_equal(expected, got)
def test_orc_reader_basic(dirpath):
    """Read selected columns of ``TestOrcFile.test1.orc`` and compare them.

    The expected frame is built from the literal values below; the same
    column names are requested from ``read_orc``.
    """
    data = {
        "boolean1": np.array([False, True], dtype="bool"),
        "byte1": np.array([1, 100], dtype="int8"),
        "short1": np.array([1024, 2048], dtype="int16"),
        "int1": np.array([65536, 65536], dtype="int32"),
        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
        "float1": np.array([1.0, 2.0], dtype="float32"),
        "double1": np.array([-15.0, -5.0], dtype="float64"),
        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
        "string1": np.array(["hi", "bye"], dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc")
    # ``columns`` is documented as a list, so pass the names as a real list
    # instead of relying on a dict view being accepted.
    got = read_orc(inputfile, columns=list(data))

    tm.assert_equal(expected, got)
def test_orc_reader_decimal(dirpath):
    """Compare the leading rows of ``TestOrcFile.decimal.orc`` to Decimals."""
    # Only testing the first 10 rows of data
    decimal_texts = (
        "-1000.50000",
        "-999.60000",
        "-998.70000",
        "-997.80000",
        "-996.90000",
        "-995.10000",
        "-994.11000",
        "-993.12000",
        "-992.13000",
        "-991.14000",
    )
    values = np.array([Decimal(text) for text in decimal_texts], dtype="object")
    expected = pd.DataFrame.from_dict({"_col0": values})

    orc_path = os.path.join(dirpath, "TestOrcFile.decimal.orc")
    got = read_orc(orc_path).iloc[:10]

    tm.assert_equal(expected, got)
def test_orc_reader_date_low(dirpath):
    """Compare the first ten rows of ``TestOrcFile.testDate1900.orc``."""
    row_count = 10
    # Expected timestamps advance by 100 microseconds per row:
    # ...56.100000, ...56.100100, ..., ...56.100900
    timestamps = [f"1900-05-05 12:34:56.100{step}00" for step in range(row_count)]
    # Every expected row carries the same calendar date.
    dates = [datetime.date(1900, 12, 25)] * row_count

    expected = pd.DataFrame.from_dict(
        {
            "time": np.array(timestamps, dtype="datetime64[ns]"),
            "date": np.array(dates, dtype="object"),
        }
    )

    orc_path = os.path.join(dirpath, "TestOrcFile.testDate1900.orc")
    got = read_orc(orc_path).iloc[:row_count]

    tm.assert_equal(expected, got)
def test_orc_reader_date_high(dirpath):
    """Compare the first ten rows of ``TestOrcFile.testDate2038.orc``."""
    row_count = 10
    # Expected timestamps advance by 100 microseconds per row:
    # ...56.100000, ...56.100100, ..., ...56.100900
    timestamps = [f"2038-05-05 12:34:56.100{step}00" for step in range(row_count)]
    # Every expected row carries the same calendar date.
    dates = [datetime.date(2038, 12, 25)] * row_count

    expected = pd.DataFrame.from_dict(
        {
            "time": np.array(timestamps, dtype="datetime64[ns]"),
            "date": np.array(dates, dtype="object"),
        }
    )

    orc_path = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
    got = read_orc(orc_path).iloc[:row_count]

    tm.assert_equal(expected, got)
def test_orc_reader_snappy_compressed(dirpath):
    """Compare the first ten rows of ``TestOrcFile.testSnappy.orc``."""
    int_values = [
        -1160101563,
        1181413113,
        2065821249,
        -267157795,
        172111193,
        1752363137,
        1406072123,
        1911809390,
        -1308542224,
        -467100286,
    ]
    string_values = [
        "f50dcb8",
        "382fdaaa",
        "90758c6",
        "9e8caf3f",
        "ee97332b",
        "d634da1",
        "2bea4396",
        "d67d89e8",
        "ad71007e",
        "e8c82066",
    ]
    expected = pd.DataFrame.from_dict(
        {
            "int1": np.array(int_values, dtype="int32"),
            "string1": np.array(string_values, dtype="object"),
        }
    )

    orc_path = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
    got = read_orc(orc_path).iloc[:10]

    tm.assert_equal(expected, got)
def test_orc_roundtrip_file(dirpath):
    """Write a frame with ``to_orc`` to a temp path and read it back."""
    # GH44554
    # PyArrow gained ORC write support with the current argument order
    pytest.importorskip("pyarrow")

    max_int64 = 9223372036854775807
    expected = pd.DataFrame.from_dict(
        {
            "boolean1": np.array([False, True], dtype="bool"),
            "byte1": np.array([1, 100], dtype="int8"),
            "short1": np.array([1024, 2048], dtype="int16"),
            "int1": np.array([65536, 65536], dtype="int32"),
            "long1": np.array([max_int64, max_int64], dtype="int64"),
            "float1": np.array([1.0, 2.0], dtype="float32"),
            "double1": np.array([-15.0, -5.0], dtype="float64"),
            "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
            "string1": np.array(["hi", "bye"], dtype="object"),
        }
    )

    # Read back while the temporary path is still managed by ensure_clean.
    with tm.ensure_clean() as orc_path:
        expected.to_orc(orc_path)
        got = read_orc(orc_path)

    tm.assert_equal(expected, got)
def test_orc_roundtrip_bytesio():
    """Round-trip a frame through the bytes from ``to_orc()`` and ``read_orc``.

    ``to_orc()`` is called without a path and its return value is wrapped in
    a ``BytesIO`` before being read back.
    """
    # GH44554
    # PyArrow gained ORC write support with the current argument order
    pytest.importorskip("pyarrow")
    data = {
        "boolean1": np.array([False, True], dtype="bool"),
        "byte1": np.array([1, 100], dtype="int8"),
        "short1": np.array([1024, 2048], dtype="int16"),
        "int1": np.array([65536, 65536], dtype="int32"),
        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
        "float1": np.array([1.0, 2.0], dtype="float32"),
        "double1": np.array([-15.0, -5.0], dtype="float64"),
        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
        "string1": np.array(["hi", "bye"], dtype="object"),
    }
    expected = pd.DataFrame.from_dict(data)

    # Named ``bytes_data`` rather than ``bytes`` so the builtin type is not
    # shadowed inside the test.
    bytes_data = expected.to_orc()
    got = read_orc(BytesIO(bytes_data))

    tm.assert_equal(expected, got)
def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported):
    """Expect ``to_orc`` to raise NotImplementedError for the fixture frame."""
    # GH44554
    # PyArrow gained ORC write support with the current argument order
    pytest.importorskip("pyarrow")

    unsupported_frame = orc_writer_dtypes_not_supported
    expect_not_implemented = pytest.raises(
        NotImplementedError,
        match="The dtype of one or more columns is not supported yet.",
    )
    with expect_not_implemented:
        unsupported_frame.to_orc()
def test_orc_dtype_backend_pyarrow(using_infer_string):
    """Round-trip a mixed frame and read it with ``dtype_backend="pyarrow"``."""
    pytest.importorskip("pyarrow")

    source = {
        "string": ["a", "b", "c"],
        "string_with_nan": ["a", np.nan, "c"],
        "string_with_none": ["a", None, "c"],
        "bytes": [b"foo", b"bar", None],
        "int": [1, 2, 3],
        "float": np.arange(4.0, 7.0, dtype="float64"),
        "float_with_nan": [2.0, np.nan, 3.0],
        "bool": [True, False, True],
        "bool_with_na": [True, False, None],
        "datetime": pd.date_range("20130101", periods=3),
        "datetime_with_nat": [
            pd.Timestamp("20130101"),
            pd.NaT,
            pd.Timestamp("20130103"),
        ],
    }
    df = pd.DataFrame(source)

    bytes_data = df.copy().to_orc()
    result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow")

    # Expected: each original column converted to an Arrow-backed array.
    arrow_columns = {}
    for col in df.columns:
        arrow_values = pa.array(df[col], from_pandas=True)
        arrow_columns[col] = pd.arrays.ArrowExtensionArray(arrow_values)
    expected = pd.DataFrame(arrow_columns)

    if using_infer_string:
        # ORC does not preserve distinction between string and large string
        # -> the default large string comes back as string
        string_dtype = pd.ArrowDtype(pa.string())
        for col in ("string", "string_with_nan", "string_with_none"):
            expected[col] = expected[col].astype(string_dtype)

    tm.assert_frame_equal(result, expected)
def test_orc_dtype_backend_numpy_nullable():
    """Round-trip a frame and read it with ``dtype_backend="numpy_nullable"``."""
    # GH#50503
    pytest.importorskip("pyarrow")

    def string_array(values):
        # Each call wraps a fresh object ndarray in a StringArray.
        return StringArray(np.array(values, dtype=np.object_))

    int_with_nan = [1, pd.NA, 3]
    all_na = [pd.NA, pd.NA, pd.NA]

    df = pd.DataFrame(
        {
            "string": ["a", "b", "c"],
            "string_with_nan": ["a", np.nan, "c"],
            "string_with_none": ["a", None, "c"],
            "int": [1, 2, 3],
            "int_with_nan": pd.Series(int_with_nan, dtype="Int64"),
            "na_only": pd.Series(all_na, dtype="Int64"),
            "float": np.arange(4.0, 7.0, dtype="float64"),
            "float_with_nan": [2.0, np.nan, 3.0],
            "bool": [True, False, True],
            "bool_with_na": [True, False, None],
        }
    )

    bytes_data = df.copy().to_orc()
    result = read_orc(BytesIO(bytes_data), dtype_backend="numpy_nullable")

    expected = pd.DataFrame(
        {
            "string": string_array(["a", "b", "c"]),
            "string_with_nan": string_array(["a", pd.NA, "c"]),
            "string_with_none": string_array(["a", pd.NA, "c"]),
            "int": pd.Series([1, 2, 3], dtype="Int64"),
            "int_with_nan": pd.Series(int_with_nan, dtype="Int64"),
            "na_only": pd.Series(all_na, dtype="Int64"),
            "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"),
            "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"),
            "bool": pd.Series([True, False, True], dtype="boolean"),
            "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"),
        }
    )

    tm.assert_frame_equal(result, expected)
def test_orc_uri_path():
    """Write a frame to a temp file and read it back through its file URI."""
    expected = pd.DataFrame({"int": [1, 2, 3]})

    with tm.ensure_clean("tmp.orc") as orc_path:
        expected.to_orc(orc_path)
        # Convert the filesystem path to a URI before handing it to read_orc.
        file_uri = pathlib.Path(orc_path).as_uri()
        result = read_orc(file_uri)

    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "index",
    [
        pd.RangeIndex(start=2, stop=5, step=1),
        pd.RangeIndex(start=0, stop=3, step=1, name="non-default"),
        pd.Index([1, 2, 3]),
    ],
)
def test_to_orc_non_default_index(index):
    """Expect ``to_orc`` to raise ValueError for each parametrized index."""
    frame = pd.DataFrame({"a": [1, 2, 3]}, index=index)

    # Either message is accepted; they are combined into one regex alternation.
    accepted_messages = [
        "orc does not support serializing a non-default index",
        "orc does not serialize index meta-data",
    ]
    msg = "|".join(accepted_messages)

    with pytest.raises(ValueError, match=msg):
        frame.to_orc()
def test_invalid_dtype_backend():
    """Expect ``read_orc`` to raise ValueError for ``dtype_backend="numpy"``."""
    frame = pd.DataFrame({"int": [1, 2, 3]})
    expected_message = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )

    with tm.ensure_clean("tmp.orc") as orc_path:
        # A real ORC file is written first so the read call has a valid input.
        frame.to_orc(orc_path)
        with pytest.raises(ValueError, match=expected_message):
            read_orc(orc_path, dtype_backend="numpy")
def test_string_inference(tmp_path):
    """Read an ORC file with ``future.infer_string`` on and check the dtypes."""
    # GH#54431
    values = ["x", "y"]
    orc_path = tmp_path / "test_string_inference.p"
    pd.DataFrame(data={"a": values}).to_orc(orc_path)

    # The option is enabled only while reading the file back.
    with pd.option_context("future.infer_string", True):
        result = read_orc(orc_path)

    # Both the column values and the column labels are expected to use the
    # NaN-backed string dtype.
    nan_string_dtype = pd.StringDtype(na_value=np.nan)
    expected = pd.DataFrame(
        data={"a": values},
        dtype=nan_string_dtype,
        columns=pd.Index(["a"], dtype=nan_string_dtype),
    )

    tm.assert_frame_equal(result, expected)
|