test_orc.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444
  1. """ test orc compat """
  2. import datetime
  3. from decimal import Decimal
  4. from io import BytesIO
  5. import os
  6. import pathlib
  7. import numpy as np
  8. import pytest
  9. import pandas as pd
  10. from pandas import read_orc
  11. import pandas._testing as tm
  12. from pandas.core.arrays import StringArray
  13. pytest.importorskip("pyarrow.orc")
  14. import pyarrow as pa
  15. pytestmark = pytest.mark.filterwarnings(
  16. "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
  17. )
  18. @pytest.fixture
  19. def dirpath(datapath):
  20. return datapath("io", "data", "orc")
  21. @pytest.fixture(
  22. params=[
  23. np.array([1, 20], dtype="uint64"),
  24. pd.Series(["a", "b", "a"], dtype="category"),
  25. [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)],
  26. [pd.Period("2022-01-03", freq="D"), pd.Period("2022-01-04", freq="D")],
  27. ]
  28. )
  29. def orc_writer_dtypes_not_supported(request):
  30. # Examples of dataframes with dtypes for which conversion to ORC
  31. # hasn't been implemented yet, that is, Category, unsigned integers,
  32. # interval, period and sparse.
  33. return pd.DataFrame({"unimpl": request.param})
  34. def test_orc_reader_empty(dirpath, using_infer_string):
  35. columns = [
  36. "boolean1",
  37. "byte1",
  38. "short1",
  39. "int1",
  40. "long1",
  41. "float1",
  42. "double1",
  43. "bytes1",
  44. "string1",
  45. ]
  46. dtypes = [
  47. "bool",
  48. "int8",
  49. "int16",
  50. "int32",
  51. "int64",
  52. "float32",
  53. "float64",
  54. "object",
  55. "str" if using_infer_string else "object",
  56. ]
  57. expected = pd.DataFrame(index=pd.RangeIndex(0))
  58. for colname, dtype in zip(columns, dtypes):
  59. expected[colname] = pd.Series(dtype=dtype)
  60. expected.columns = expected.columns.astype("str")
  61. inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
  62. got = read_orc(inputfile, columns=columns)
  63. tm.assert_equal(expected, got)
  64. def test_orc_reader_basic(dirpath):
  65. data = {
  66. "boolean1": np.array([False, True], dtype="bool"),
  67. "byte1": np.array([1, 100], dtype="int8"),
  68. "short1": np.array([1024, 2048], dtype="int16"),
  69. "int1": np.array([65536, 65536], dtype="int32"),
  70. "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
  71. "float1": np.array([1.0, 2.0], dtype="float32"),
  72. "double1": np.array([-15.0, -5.0], dtype="float64"),
  73. "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
  74. "string1": np.array(["hi", "bye"], dtype="object"),
  75. }
  76. expected = pd.DataFrame.from_dict(data)
  77. inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc")
  78. got = read_orc(inputfile, columns=data.keys())
  79. tm.assert_equal(expected, got)
  80. def test_orc_reader_decimal(dirpath):
  81. # Only testing the first 10 rows of data
  82. data = {
  83. "_col0": np.array(
  84. [
  85. Decimal("-1000.50000"),
  86. Decimal("-999.60000"),
  87. Decimal("-998.70000"),
  88. Decimal("-997.80000"),
  89. Decimal("-996.90000"),
  90. Decimal("-995.10000"),
  91. Decimal("-994.11000"),
  92. Decimal("-993.12000"),
  93. Decimal("-992.13000"),
  94. Decimal("-991.14000"),
  95. ],
  96. dtype="object",
  97. )
  98. }
  99. expected = pd.DataFrame.from_dict(data)
  100. inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc")
  101. got = read_orc(inputfile).iloc[:10]
  102. tm.assert_equal(expected, got)
  103. def test_orc_reader_date_low(dirpath):
  104. data = {
  105. "time": np.array(
  106. [
  107. "1900-05-05 12:34:56.100000",
  108. "1900-05-05 12:34:56.100100",
  109. "1900-05-05 12:34:56.100200",
  110. "1900-05-05 12:34:56.100300",
  111. "1900-05-05 12:34:56.100400",
  112. "1900-05-05 12:34:56.100500",
  113. "1900-05-05 12:34:56.100600",
  114. "1900-05-05 12:34:56.100700",
  115. "1900-05-05 12:34:56.100800",
  116. "1900-05-05 12:34:56.100900",
  117. ],
  118. dtype="datetime64[ns]",
  119. ),
  120. "date": np.array(
  121. [
  122. datetime.date(1900, 12, 25),
  123. datetime.date(1900, 12, 25),
  124. datetime.date(1900, 12, 25),
  125. datetime.date(1900, 12, 25),
  126. datetime.date(1900, 12, 25),
  127. datetime.date(1900, 12, 25),
  128. datetime.date(1900, 12, 25),
  129. datetime.date(1900, 12, 25),
  130. datetime.date(1900, 12, 25),
  131. datetime.date(1900, 12, 25),
  132. ],
  133. dtype="object",
  134. ),
  135. }
  136. expected = pd.DataFrame.from_dict(data)
  137. inputfile = os.path.join(dirpath, "TestOrcFile.testDate1900.orc")
  138. got = read_orc(inputfile).iloc[:10]
  139. tm.assert_equal(expected, got)
  140. def test_orc_reader_date_high(dirpath):
  141. data = {
  142. "time": np.array(
  143. [
  144. "2038-05-05 12:34:56.100000",
  145. "2038-05-05 12:34:56.100100",
  146. "2038-05-05 12:34:56.100200",
  147. "2038-05-05 12:34:56.100300",
  148. "2038-05-05 12:34:56.100400",
  149. "2038-05-05 12:34:56.100500",
  150. "2038-05-05 12:34:56.100600",
  151. "2038-05-05 12:34:56.100700",
  152. "2038-05-05 12:34:56.100800",
  153. "2038-05-05 12:34:56.100900",
  154. ],
  155. dtype="datetime64[ns]",
  156. ),
  157. "date": np.array(
  158. [
  159. datetime.date(2038, 12, 25),
  160. datetime.date(2038, 12, 25),
  161. datetime.date(2038, 12, 25),
  162. datetime.date(2038, 12, 25),
  163. datetime.date(2038, 12, 25),
  164. datetime.date(2038, 12, 25),
  165. datetime.date(2038, 12, 25),
  166. datetime.date(2038, 12, 25),
  167. datetime.date(2038, 12, 25),
  168. datetime.date(2038, 12, 25),
  169. ],
  170. dtype="object",
  171. ),
  172. }
  173. expected = pd.DataFrame.from_dict(data)
  174. inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
  175. got = read_orc(inputfile).iloc[:10]
  176. tm.assert_equal(expected, got)
  177. def test_orc_reader_snappy_compressed(dirpath):
  178. data = {
  179. "int1": np.array(
  180. [
  181. -1160101563,
  182. 1181413113,
  183. 2065821249,
  184. -267157795,
  185. 172111193,
  186. 1752363137,
  187. 1406072123,
  188. 1911809390,
  189. -1308542224,
  190. -467100286,
  191. ],
  192. dtype="int32",
  193. ),
  194. "string1": np.array(
  195. [
  196. "f50dcb8",
  197. "382fdaaa",
  198. "90758c6",
  199. "9e8caf3f",
  200. "ee97332b",
  201. "d634da1",
  202. "2bea4396",
  203. "d67d89e8",
  204. "ad71007e",
  205. "e8c82066",
  206. ],
  207. dtype="object",
  208. ),
  209. }
  210. expected = pd.DataFrame.from_dict(data)
  211. inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
  212. got = read_orc(inputfile).iloc[:10]
  213. tm.assert_equal(expected, got)
  214. def test_orc_roundtrip_file(dirpath):
  215. # GH44554
  216. # PyArrow gained ORC write support with the current argument order
  217. pytest.importorskip("pyarrow")
  218. data = {
  219. "boolean1": np.array([False, True], dtype="bool"),
  220. "byte1": np.array([1, 100], dtype="int8"),
  221. "short1": np.array([1024, 2048], dtype="int16"),
  222. "int1": np.array([65536, 65536], dtype="int32"),
  223. "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
  224. "float1": np.array([1.0, 2.0], dtype="float32"),
  225. "double1": np.array([-15.0, -5.0], dtype="float64"),
  226. "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
  227. "string1": np.array(["hi", "bye"], dtype="object"),
  228. }
  229. expected = pd.DataFrame.from_dict(data)
  230. with tm.ensure_clean() as path:
  231. expected.to_orc(path)
  232. got = read_orc(path)
  233. tm.assert_equal(expected, got)
  234. def test_orc_roundtrip_bytesio():
  235. # GH44554
  236. # PyArrow gained ORC write support with the current argument order
  237. pytest.importorskip("pyarrow")
  238. data = {
  239. "boolean1": np.array([False, True], dtype="bool"),
  240. "byte1": np.array([1, 100], dtype="int8"),
  241. "short1": np.array([1024, 2048], dtype="int16"),
  242. "int1": np.array([65536, 65536], dtype="int32"),
  243. "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
  244. "float1": np.array([1.0, 2.0], dtype="float32"),
  245. "double1": np.array([-15.0, -5.0], dtype="float64"),
  246. "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
  247. "string1": np.array(["hi", "bye"], dtype="object"),
  248. }
  249. expected = pd.DataFrame.from_dict(data)
  250. bytes = expected.to_orc()
  251. got = read_orc(BytesIO(bytes))
  252. tm.assert_equal(expected, got)
  253. def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported):
  254. # GH44554
  255. # PyArrow gained ORC write support with the current argument order
  256. pytest.importorskip("pyarrow")
  257. msg = "The dtype of one or more columns is not supported yet."
  258. with pytest.raises(NotImplementedError, match=msg):
  259. orc_writer_dtypes_not_supported.to_orc()
  260. def test_orc_dtype_backend_pyarrow(using_infer_string):
  261. pytest.importorskip("pyarrow")
  262. df = pd.DataFrame(
  263. {
  264. "string": list("abc"),
  265. "string_with_nan": ["a", np.nan, "c"],
  266. "string_with_none": ["a", None, "c"],
  267. "bytes": [b"foo", b"bar", None],
  268. "int": list(range(1, 4)),
  269. "float": np.arange(4.0, 7.0, dtype="float64"),
  270. "float_with_nan": [2.0, np.nan, 3.0],
  271. "bool": [True, False, True],
  272. "bool_with_na": [True, False, None],
  273. "datetime": pd.date_range("20130101", periods=3),
  274. "datetime_with_nat": [
  275. pd.Timestamp("20130101"),
  276. pd.NaT,
  277. pd.Timestamp("20130103"),
  278. ],
  279. }
  280. )
  281. bytes_data = df.copy().to_orc()
  282. result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow")
  283. expected = pd.DataFrame(
  284. {
  285. col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True))
  286. for col in df.columns
  287. }
  288. )
  289. if using_infer_string:
  290. # ORC does not preserve distinction between string and large string
  291. # -> the default large string comes back as string
  292. string_dtype = pd.ArrowDtype(pa.string())
  293. expected["string"] = expected["string"].astype(string_dtype)
  294. expected["string_with_nan"] = expected["string_with_nan"].astype(string_dtype)
  295. expected["string_with_none"] = expected["string_with_none"].astype(string_dtype)
  296. tm.assert_frame_equal(result, expected)
  297. def test_orc_dtype_backend_numpy_nullable():
  298. # GH#50503
  299. pytest.importorskip("pyarrow")
  300. df = pd.DataFrame(
  301. {
  302. "string": list("abc"),
  303. "string_with_nan": ["a", np.nan, "c"],
  304. "string_with_none": ["a", None, "c"],
  305. "int": list(range(1, 4)),
  306. "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
  307. "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
  308. "float": np.arange(4.0, 7.0, dtype="float64"),
  309. "float_with_nan": [2.0, np.nan, 3.0],
  310. "bool": [True, False, True],
  311. "bool_with_na": [True, False, None],
  312. }
  313. )
  314. bytes_data = df.copy().to_orc()
  315. result = read_orc(BytesIO(bytes_data), dtype_backend="numpy_nullable")
  316. expected = pd.DataFrame(
  317. {
  318. "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)),
  319. "string_with_nan": StringArray(
  320. np.array(["a", pd.NA, "c"], dtype=np.object_)
  321. ),
  322. "string_with_none": StringArray(
  323. np.array(["a", pd.NA, "c"], dtype=np.object_)
  324. ),
  325. "int": pd.Series([1, 2, 3], dtype="Int64"),
  326. "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
  327. "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
  328. "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"),
  329. "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"),
  330. "bool": pd.Series([True, False, True], dtype="boolean"),
  331. "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"),
  332. }
  333. )
  334. tm.assert_frame_equal(result, expected)
  335. def test_orc_uri_path():
  336. expected = pd.DataFrame({"int": list(range(1, 4))})
  337. with tm.ensure_clean("tmp.orc") as path:
  338. expected.to_orc(path)
  339. uri = pathlib.Path(path).as_uri()
  340. result = read_orc(uri)
  341. tm.assert_frame_equal(result, expected)
  342. @pytest.mark.parametrize(
  343. "index",
  344. [
  345. pd.RangeIndex(start=2, stop=5, step=1),
  346. pd.RangeIndex(start=0, stop=3, step=1, name="non-default"),
  347. pd.Index([1, 2, 3]),
  348. ],
  349. )
  350. def test_to_orc_non_default_index(index):
  351. df = pd.DataFrame({"a": [1, 2, 3]}, index=index)
  352. msg = (
  353. "orc does not support serializing a non-default index|"
  354. "orc does not serialize index meta-data"
  355. )
  356. with pytest.raises(ValueError, match=msg):
  357. df.to_orc()
  358. def test_invalid_dtype_backend():
  359. msg = (
  360. "dtype_backend numpy is invalid, only 'numpy_nullable' and "
  361. "'pyarrow' are allowed."
  362. )
  363. df = pd.DataFrame({"int": list(range(1, 4))})
  364. with tm.ensure_clean("tmp.orc") as path:
  365. df.to_orc(path)
  366. with pytest.raises(ValueError, match=msg):
  367. read_orc(path, dtype_backend="numpy")
  368. def test_string_inference(tmp_path):
  369. # GH#54431
  370. path = tmp_path / "test_string_inference.p"
  371. df = pd.DataFrame(data={"a": ["x", "y"]})
  372. df.to_orc(path)
  373. with pd.option_context("future.infer_string", True):
  374. result = read_orc(path)
  375. expected = pd.DataFrame(
  376. data={"a": ["x", "y"]},
  377. dtype=pd.StringDtype(na_value=np.nan),
  378. columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
  379. )
  380. tm.assert_frame_equal(result, expected)