# test_feather.py (10.0 KB)
  1. """ test feather-format compat """
  2. import numpy as np
  3. import pytest
  4. from pandas.compat.pyarrow import (
  5. pa_version_under18p0,
  6. pa_version_under19p0,
  7. )
  8. import pandas as pd
  9. import pandas._testing as tm
  10. from pandas.io.feather_format import read_feather, to_feather # isort:skip
  11. pytestmark = pytest.mark.filterwarnings(
  12. "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
  13. )
  14. pa = pytest.importorskip("pyarrow")
  15. @pytest.mark.single_cpu
  16. class TestFeather:
  17. def check_error_on_write(self, df, exc, err_msg):
  18. # check that we are raising the exception
  19. # on writing
  20. with pytest.raises(exc, match=err_msg):
  21. with tm.ensure_clean() as path:
  22. to_feather(df, path)
  23. def check_external_error_on_write(self, df):
  24. # check that we are raising the exception
  25. # on writing
  26. with tm.external_error_raised(Exception):
  27. with tm.ensure_clean() as path:
  28. to_feather(df, path)
  29. def check_round_trip(self, df, expected=None, write_kwargs={}, **read_kwargs):
  30. if expected is None:
  31. expected = df.copy()
  32. with tm.ensure_clean() as path:
  33. to_feather(df, path, **write_kwargs)
  34. result = read_feather(path, **read_kwargs)
  35. tm.assert_frame_equal(result, expected)
  36. def test_error(self):
  37. msg = "feather only support IO with DataFrames"
  38. for obj in [
  39. pd.Series([1, 2, 3]),
  40. 1,
  41. "foo",
  42. pd.Timestamp("20130101"),
  43. np.array([1, 2, 3]),
  44. ]:
  45. self.check_error_on_write(obj, ValueError, msg)
  46. def test_basic(self):
  47. df = pd.DataFrame(
  48. {
  49. "string": list("abc"),
  50. "int": list(range(1, 4)),
  51. "uint": np.arange(3, 6).astype("u1"),
  52. "float": np.arange(4.0, 7.0, dtype="float64"),
  53. "float_with_null": [1.0, np.nan, 3],
  54. "bool": [True, False, True],
  55. "bool_with_null": [True, np.nan, False],
  56. "cat": pd.Categorical(list("abc")),
  57. "dt": pd.DatetimeIndex(
  58. list(pd.date_range("20130101", periods=3)), freq=None
  59. ),
  60. "dttz": pd.DatetimeIndex(
  61. list(pd.date_range("20130101", periods=3, tz="US/Eastern")),
  62. freq=None,
  63. ),
  64. "dt_with_null": [
  65. pd.Timestamp("20130101"),
  66. pd.NaT,
  67. pd.Timestamp("20130103"),
  68. ],
  69. "dtns": pd.DatetimeIndex(
  70. list(pd.date_range("20130101", periods=3, freq="ns")), freq=None
  71. ),
  72. }
  73. )
  74. df["periods"] = pd.period_range("2013", freq="M", periods=3)
  75. df["timedeltas"] = pd.timedelta_range("1 day", periods=3)
  76. df["intervals"] = pd.interval_range(0, 3, 3)
  77. assert df.dttz.dtype.tz.zone == "US/Eastern"
  78. expected = df.copy()
  79. expected.loc[1, "bool_with_null"] = None
  80. self.check_round_trip(df, expected=expected)
  81. def test_duplicate_columns(self):
  82. # https://github.com/wesm/feather/issues/53
  83. # not currently able to handle duplicate columns
  84. df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()
  85. self.check_external_error_on_write(df)
  86. def test_read_columns(self):
  87. # GH 24025
  88. df = pd.DataFrame(
  89. {
  90. "col1": list("abc"),
  91. "col2": list(range(1, 4)),
  92. "col3": list("xyz"),
  93. "col4": list(range(4, 7)),
  94. }
  95. )
  96. columns = ["col1", "col3"]
  97. self.check_round_trip(df, expected=df[columns], columns=columns)
  98. def test_read_columns_different_order(self):
  99. # GH 33878
  100. df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"], "C": [True, False]})
  101. expected = df[["B", "A"]]
  102. self.check_round_trip(df, expected, columns=["B", "A"])
  103. def test_unsupported_other(self):
  104. # mixed python objects
  105. df = pd.DataFrame({"a": ["a", 1, 2.0]})
  106. self.check_external_error_on_write(df)
  107. def test_rw_use_threads(self):
  108. df = pd.DataFrame({"A": np.arange(100000)})
  109. self.check_round_trip(df, use_threads=True)
  110. self.check_round_trip(df, use_threads=False)
  111. def test_path_pathlib(self):
  112. df = pd.DataFrame(
  113. 1.1 * np.arange(120).reshape((30, 4)),
  114. columns=pd.Index(list("ABCD")),
  115. index=pd.Index([f"i-{i}" for i in range(30)]),
  116. ).reset_index()
  117. result = tm.round_trip_pathlib(df.to_feather, read_feather)
  118. tm.assert_frame_equal(df, result)
  119. def test_path_localpath(self):
  120. df = pd.DataFrame(
  121. 1.1 * np.arange(120).reshape((30, 4)),
  122. columns=pd.Index(list("ABCD")),
  123. index=pd.Index([f"i-{i}" for i in range(30)]),
  124. ).reset_index()
  125. result = tm.round_trip_localpath(df.to_feather, read_feather)
  126. tm.assert_frame_equal(df, result)
  127. def test_passthrough_keywords(self):
  128. df = pd.DataFrame(
  129. 1.1 * np.arange(120).reshape((30, 4)),
  130. columns=pd.Index(list("ABCD")),
  131. index=pd.Index([f"i-{i}" for i in range(30)]),
  132. ).reset_index()
  133. self.check_round_trip(df, write_kwargs={"version": 1})
  134. @pytest.mark.network
  135. @pytest.mark.single_cpu
  136. def test_http_path(self, feather_file, httpserver):
  137. # GH 29055
  138. expected = read_feather(feather_file)
  139. with open(feather_file, "rb") as f:
  140. httpserver.serve_content(content=f.read())
  141. res = read_feather(httpserver.url)
  142. tm.assert_frame_equal(expected, res)
  143. def test_read_feather_dtype_backend(
  144. self, string_storage, dtype_backend, using_infer_string
  145. ):
  146. # GH#50765
  147. df = pd.DataFrame(
  148. {
  149. "a": pd.Series([1, np.nan, 3], dtype="Int64"),
  150. "b": pd.Series([1, 2, 3], dtype="Int64"),
  151. "c": pd.Series([1.5, np.nan, 2.5], dtype="Float64"),
  152. "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
  153. "e": [True, False, None],
  154. "f": [True, False, True],
  155. "g": ["a", "b", "c"],
  156. "h": ["a", "b", None],
  157. }
  158. )
  159. with tm.ensure_clean() as path:
  160. to_feather(df, path)
  161. with pd.option_context("mode.string_storage", string_storage):
  162. result = read_feather(path, dtype_backend=dtype_backend)
  163. if dtype_backend == "pyarrow":
  164. pa = pytest.importorskip("pyarrow")
  165. if using_infer_string:
  166. string_dtype = pd.ArrowDtype(pa.large_string())
  167. else:
  168. string_dtype = pd.ArrowDtype(pa.string())
  169. else:
  170. string_dtype = pd.StringDtype(string_storage)
  171. expected = pd.DataFrame(
  172. {
  173. "a": pd.Series([1, np.nan, 3], dtype="Int64"),
  174. "b": pd.Series([1, 2, 3], dtype="Int64"),
  175. "c": pd.Series([1.5, np.nan, 2.5], dtype="Float64"),
  176. "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
  177. "e": pd.Series([True, False, pd.NA], dtype="boolean"),
  178. "f": pd.Series([True, False, True], dtype="boolean"),
  179. "g": pd.Series(["a", "b", "c"], dtype=string_dtype),
  180. "h": pd.Series(["a", "b", None], dtype=string_dtype),
  181. }
  182. )
  183. if dtype_backend == "pyarrow":
  184. from pandas.arrays import ArrowExtensionArray
  185. expected = pd.DataFrame(
  186. {
  187. col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
  188. for col in expected.columns
  189. }
  190. )
  191. if using_infer_string:
  192. expected.columns = expected.columns.astype(
  193. pd.StringDtype(string_storage, na_value=np.nan)
  194. )
  195. tm.assert_frame_equal(result, expected)
  196. def test_int_columns_and_index(self):
  197. df = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index([3, 4, 5], name="test"))
  198. self.check_round_trip(df)
  199. def test_invalid_dtype_backend(self):
  200. msg = (
  201. "dtype_backend numpy is invalid, only 'numpy_nullable' and "
  202. "'pyarrow' are allowed."
  203. )
  204. df = pd.DataFrame({"int": list(range(1, 4))})
  205. with tm.ensure_clean("tmp.feather") as path:
  206. df.to_feather(path)
  207. with pytest.raises(ValueError, match=msg):
  208. read_feather(path, dtype_backend="numpy")
  209. def test_string_inference(self, tmp_path, using_infer_string):
  210. # GH#54431
  211. path = tmp_path / "test_string_inference.p"
  212. df = pd.DataFrame(data={"a": ["x", "y"]})
  213. df.to_feather(path)
  214. with pd.option_context("future.infer_string", True):
  215. result = read_feather(path)
  216. dtype = pd.StringDtype(na_value=np.nan)
  217. expected = pd.DataFrame(
  218. data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan)
  219. )
  220. expected = pd.DataFrame(
  221. data={"a": ["x", "y"]},
  222. dtype=dtype,
  223. columns=pd.Index(
  224. ["a"],
  225. dtype=object
  226. if pa_version_under19p0 and not using_infer_string
  227. else dtype,
  228. ),
  229. )
  230. tm.assert_frame_equal(result, expected)
  231. @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0")
  232. def test_string_inference_string_view_type(self, tmp_path):
  233. # GH#54798
  234. import pyarrow as pa
  235. from pyarrow import feather
  236. path = tmp_path / "string_view.parquet"
  237. table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())})
  238. feather.write_feather(table, path)
  239. with pd.option_context("future.infer_string", True):
  240. result = read_feather(path)
  241. expected = pd.DataFrame(
  242. data={"a": [None, "b", "c"]}, dtype=pd.StringDtype(na_value=np.nan)
  243. )
  244. tm.assert_frame_equal(result, expected)