test_impl.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616
  1. from datetime import (
  2. datetime,
  3. timezone,
  4. )
  5. import numpy as np
  6. import pytest
  7. from pandas._libs.tslibs import iNaT
  8. from pandas.compat import (
  9. is_ci_environment,
  10. is_platform_windows,
  11. )
  12. from pandas.compat.numpy import np_version_lt1p23
  13. import pandas as pd
  14. import pandas._testing as tm
  15. from pandas.core.interchange.column import PandasColumn
  16. from pandas.core.interchange.dataframe_protocol import (
  17. ColumnNullType,
  18. DtypeKind,
  19. )
  20. from pandas.core.interchange.from_dataframe import from_dataframe
  21. from pandas.core.interchange.utils import ArrowCTypes
  22. @pytest.fixture
  23. def data_categorical():
  24. return {
  25. "ordered": pd.Categorical(list("testdata") * 30, ordered=True),
  26. "unordered": pd.Categorical(list("testdata") * 30, ordered=False),
  27. }
  28. @pytest.fixture
  29. def string_data():
  30. return {
  31. "separator data": [
  32. "abC|DeF,Hik",
  33. "234,3245.67",
  34. "gSaf,qWer|Gre",
  35. "asd3,4sad|",
  36. np.nan,
  37. ]
  38. }
  39. @pytest.mark.parametrize("data", [("ordered", True), ("unordered", False)])
  40. def test_categorical_dtype(data, data_categorical):
  41. df = pd.DataFrame({"A": (data_categorical[data[0]])})
  42. col = df.__dataframe__().get_column_by_name("A")
  43. assert col.dtype[0] == DtypeKind.CATEGORICAL
  44. assert col.null_count == 0
  45. assert col.describe_null == (ColumnNullType.USE_SENTINEL, -1)
  46. assert col.num_chunks() == 1
  47. desc_cat = col.describe_categorical
  48. assert desc_cat["is_ordered"] == data[1]
  49. assert desc_cat["is_dictionary"] is True
  50. assert isinstance(desc_cat["categories"], PandasColumn)
  51. tm.assert_series_equal(
  52. desc_cat["categories"]._col, pd.Series(["a", "d", "e", "s", "t"])
  53. )
  54. tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
  55. def test_categorical_pyarrow():
  56. # GH 49889
  57. pa = pytest.importorskip("pyarrow", "11.0.0")
  58. arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]
  59. table = pa.table({"weekday": pa.array(arr).dictionary_encode()})
  60. exchange_df = table.__dataframe__()
  61. result = from_dataframe(exchange_df)
  62. weekday = pd.Categorical(
  63. arr, categories=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
  64. )
  65. expected = pd.DataFrame({"weekday": weekday})
  66. tm.assert_frame_equal(result, expected)
  67. def test_empty_categorical_pyarrow():
  68. # https://github.com/pandas-dev/pandas/issues/53077
  69. pa = pytest.importorskip("pyarrow", "11.0.0")
  70. arr = [None]
  71. table = pa.table({"arr": pa.array(arr, "float64").dictionary_encode()})
  72. exchange_df = table.__dataframe__()
  73. result = pd.api.interchange.from_dataframe(exchange_df)
  74. expected = pd.DataFrame({"arr": pd.Categorical([np.nan])})
  75. tm.assert_frame_equal(result, expected)
  76. def test_large_string_pyarrow():
  77. # GH 52795
  78. pa = pytest.importorskip("pyarrow", "11.0.0")
  79. arr = ["Mon", "Tue"]
  80. table = pa.table({"weekday": pa.array(arr, "large_string")})
  81. exchange_df = table.__dataframe__()
  82. result = from_dataframe(exchange_df)
  83. expected = pd.DataFrame({"weekday": ["Mon", "Tue"]})
  84. tm.assert_frame_equal(result, expected)
  85. # check round-trip
  86. assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
  87. @pytest.mark.parametrize(
  88. ("offset", "length", "expected_values"),
  89. [
  90. (0, None, [3.3, float("nan"), 2.1]),
  91. (1, None, [float("nan"), 2.1]),
  92. (2, None, [2.1]),
  93. (0, 2, [3.3, float("nan")]),
  94. (0, 1, [3.3]),
  95. (1, 1, [float("nan")]),
  96. ],
  97. )
  98. def test_bitmasks_pyarrow(offset, length, expected_values):
  99. # GH 52795
  100. pa = pytest.importorskip("pyarrow", "11.0.0")
  101. arr = [3.3, None, 2.1]
  102. table = pa.table({"arr": arr}).slice(offset, length)
  103. exchange_df = table.__dataframe__()
  104. result = from_dataframe(exchange_df)
  105. expected = pd.DataFrame({"arr": expected_values})
  106. tm.assert_frame_equal(result, expected)
  107. # check round-trip
  108. assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
  109. @pytest.mark.parametrize(
  110. "data",
  111. [
  112. lambda: np.random.default_rng(2).integers(-100, 100),
  113. lambda: np.random.default_rng(2).integers(1, 100),
  114. lambda: np.random.default_rng(2).random(),
  115. lambda: np.random.default_rng(2).choice([True, False]),
  116. lambda: datetime(
  117. year=np.random.default_rng(2).integers(1900, 2100),
  118. month=np.random.default_rng(2).integers(1, 12),
  119. day=np.random.default_rng(2).integers(1, 20),
  120. ),
  121. ],
  122. )
  123. def test_dataframe(data):
  124. NCOLS, NROWS = 10, 20
  125. data = {
  126. f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [data() for _ in range(NROWS)]
  127. for i in range(NCOLS)
  128. }
  129. df = pd.DataFrame(data)
  130. df2 = df.__dataframe__()
  131. assert df2.num_columns() == NCOLS
  132. assert df2.num_rows() == NROWS
  133. assert list(df2.column_names()) == list(data.keys())
  134. indices = (0, 2)
  135. names = tuple(list(data.keys())[idx] for idx in indices)
  136. result = from_dataframe(df2.select_columns(indices))
  137. expected = from_dataframe(df2.select_columns_by_name(names))
  138. tm.assert_frame_equal(result, expected)
  139. assert isinstance(result.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"], list)
  140. assert isinstance(expected.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"], list)
  141. def test_missing_from_masked():
  142. df = pd.DataFrame(
  143. {
  144. "x": np.array([1.0, 2.0, 3.0, 4.0, 0.0]),
  145. "y": np.array([1.5, 2.5, 3.5, 4.5, 0]),
  146. "z": np.array([1.0, 0.0, 1.0, 1.0, 1.0]),
  147. }
  148. )
  149. rng = np.random.default_rng(2)
  150. dict_null = {col: rng.integers(low=0, high=len(df)) for col in df.columns}
  151. for col, num_nulls in dict_null.items():
  152. null_idx = df.index[
  153. rng.choice(np.arange(len(df)), size=num_nulls, replace=False)
  154. ]
  155. df.loc[null_idx, col] = None
  156. df2 = df.__dataframe__()
  157. assert df2.get_column_by_name("x").null_count == dict_null["x"]
  158. assert df2.get_column_by_name("y").null_count == dict_null["y"]
  159. assert df2.get_column_by_name("z").null_count == dict_null["z"]
  160. @pytest.mark.parametrize(
  161. "data",
  162. [
  163. {"x": [1.5, 2.5, 3.5], "y": [9.2, 10.5, 11.8]},
  164. {"x": [1, 2, 0], "y": [9.2, 10.5, 11.8]},
  165. {
  166. "x": np.array([True, True, False]),
  167. "y": np.array([1, 2, 0]),
  168. "z": np.array([9.2, 10.5, 11.8]),
  169. },
  170. ],
  171. )
  172. def test_mixed_data(data):
  173. df = pd.DataFrame(data)
  174. df2 = df.__dataframe__()
  175. for col_name in df.columns:
  176. assert df2.get_column_by_name(col_name).null_count == 0
  177. def test_mixed_missing():
  178. df = pd.DataFrame(
  179. {
  180. "x": np.array([True, None, False, None, True]),
  181. "y": np.array([None, 2, None, 1, 2]),
  182. "z": np.array([9.2, 10.5, None, 11.8, None]),
  183. }
  184. )
  185. df2 = df.__dataframe__()
  186. for col_name in df.columns:
  187. assert df2.get_column_by_name(col_name).null_count == 2
  188. def test_string(string_data):
  189. test_str_data = string_data["separator data"] + [""]
  190. df = pd.DataFrame({"A": test_str_data})
  191. col = df.__dataframe__().get_column_by_name("A")
  192. assert col.size() == 6
  193. assert col.null_count == 1
  194. assert col.dtype[0] == DtypeKind.STRING
  195. assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)
  196. df_sliced = df[1:]
  197. col = df_sliced.__dataframe__().get_column_by_name("A")
  198. assert col.size() == 5
  199. assert col.null_count == 1
  200. assert col.dtype[0] == DtypeKind.STRING
  201. assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)
  202. def test_nonstring_object():
  203. df = pd.DataFrame({"A": ["a", 10, 1.0, ()]})
  204. col = df.__dataframe__().get_column_by_name("A")
  205. with pytest.raises(NotImplementedError, match="not supported yet"):
  206. col.dtype
  207. def test_datetime():
  208. df = pd.DataFrame({"A": [pd.Timestamp("2022-01-01"), pd.NaT]})
  209. col = df.__dataframe__().get_column_by_name("A")
  210. assert col.size() == 2
  211. assert col.null_count == 1
  212. assert col.dtype[0] == DtypeKind.DATETIME
  213. assert col.describe_null == (ColumnNullType.USE_SENTINEL, iNaT)
  214. tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
  215. @pytest.mark.skipif(np_version_lt1p23, reason="Numpy > 1.23 required")
  216. def test_categorical_to_numpy_dlpack():
  217. # https://github.com/pandas-dev/pandas/issues/48393
  218. df = pd.DataFrame({"A": pd.Categorical(["a", "b", "a"])})
  219. col = df.__dataframe__().get_column_by_name("A")
  220. result = np.from_dlpack(col.get_buffers()["data"][0])
  221. expected = np.array([0, 1, 0], dtype="int8")
  222. tm.assert_numpy_array_equal(result, expected)
  223. @pytest.mark.parametrize("data", [{}, {"a": []}])
  224. def test_empty_pyarrow(data):
  225. # GH 53155
  226. pytest.importorskip("pyarrow", "11.0.0")
  227. from pyarrow.interchange import from_dataframe as pa_from_dataframe
  228. expected = pd.DataFrame(data)
  229. arrow_df = pa_from_dataframe(expected)
  230. result = from_dataframe(arrow_df)
  231. tm.assert_frame_equal(result, expected, check_column_type=False)
  232. def test_multi_chunk_pyarrow() -> None:
  233. pa = pytest.importorskip("pyarrow", "11.0.0")
  234. n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
  235. names = ["n_legs"]
  236. table = pa.table([n_legs], names=names)
  237. with pytest.raises(
  238. RuntimeError,
  239. match="Cannot do zero copy conversion into multi-column DataFrame block",
  240. ):
  241. pd.api.interchange.from_dataframe(table, allow_copy=False)
  242. def test_multi_chunk_column() -> None:
  243. pytest.importorskip("pyarrow", "11.0.0")
  244. ser = pd.Series([1, 2, None], dtype="Int64[pyarrow]")
  245. df = pd.concat([ser, ser], ignore_index=True).to_frame("a")
  246. df_orig = df.copy()
  247. with pytest.raises(
  248. RuntimeError, match="Found multi-chunk pyarrow array, but `allow_copy` is False"
  249. ):
  250. pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=False))
  251. result = pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=True))
  252. # Interchange protocol defaults to creating numpy-backed columns, so currently this
  253. # is 'float64'.
  254. expected = pd.DataFrame({"a": [1.0, 2.0, None, 1.0, 2.0, None]}, dtype="float64")
  255. tm.assert_frame_equal(result, expected)
  256. # Check that the rechunking we did didn't modify the original DataFrame.
  257. tm.assert_frame_equal(df, df_orig)
  258. assert len(df["a"].array._pa_array.chunks) == 2
  259. assert len(df_orig["a"].array._pa_array.chunks) == 2
  260. def test_timestamp_ns_pyarrow():
  261. # GH 56712
  262. pytest.importorskip("pyarrow", "11.0.0")
  263. timestamp_args = {
  264. "year": 2000,
  265. "month": 1,
  266. "day": 1,
  267. "hour": 1,
  268. "minute": 1,
  269. "second": 1,
  270. }
  271. df = pd.Series(
  272. [datetime(**timestamp_args)],
  273. dtype="timestamp[ns][pyarrow]",
  274. name="col0",
  275. ).to_frame()
  276. dfi = df.__dataframe__()
  277. result = pd.api.interchange.from_dataframe(dfi)["col0"].item()
  278. expected = pd.Timestamp(**timestamp_args)
  279. assert result == expected
  280. @pytest.mark.parametrize("tz", ["UTC", "US/Pacific"])
  281. @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
  282. def test_datetimetzdtype(tz, unit):
  283. # GH 54239
  284. tz_data = (
  285. pd.date_range("2018-01-01", periods=5, freq="D").tz_localize(tz).as_unit(unit)
  286. )
  287. df = pd.DataFrame({"ts_tz": tz_data})
  288. tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
  289. def test_interchange_from_non_pandas_tz_aware(request):
  290. # GH 54239, 54287
  291. pa = pytest.importorskip("pyarrow", "11.0.0")
  292. import pyarrow.compute as pc
  293. if is_platform_windows() and is_ci_environment():
  294. mark = pytest.mark.xfail(
  295. raises=pa.ArrowInvalid,
  296. reason=(
  297. "TODO: Set ARROW_TIMEZONE_DATABASE environment variable "
  298. "on CI to path to the tzdata for pyarrow."
  299. ),
  300. )
  301. request.applymarker(mark)
  302. arr = pa.array([datetime(2020, 1, 1), None, datetime(2020, 1, 2)])
  303. arr = pc.assume_timezone(arr, "Asia/Kathmandu")
  304. table = pa.table({"arr": arr})
  305. exchange_df = table.__dataframe__()
  306. result = from_dataframe(exchange_df)
  307. expected = pd.DataFrame(
  308. ["2020-01-01 00:00:00+05:45", "NaT", "2020-01-02 00:00:00+05:45"],
  309. columns=["arr"],
  310. dtype="datetime64[us, Asia/Kathmandu]",
  311. )
  312. tm.assert_frame_equal(expected, result)
  313. def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
  314. # https://github.com/pandas-dev/pandas/issues/54781
  315. df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__()
  316. interchange = df.__dataframe__()
  317. column = interchange.get_column_by_name("a")
  318. buffers = column.get_buffers()
  319. buffers_data = buffers["data"]
  320. buffer_dtype = buffers_data[1]
  321. buffer_dtype = (
  322. DtypeKind.UINT,
  323. 8,
  324. ArrowCTypes.UINT8,
  325. buffer_dtype[3],
  326. )
  327. buffers["data"] = (buffers_data[0], buffer_dtype)
  328. column.get_buffers = lambda: buffers
  329. interchange.get_column_by_name = lambda _: column
  330. monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange)
  331. pd.api.interchange.from_dataframe(df)
  332. def test_empty_string_column():
  333. # https://github.com/pandas-dev/pandas/issues/56703
  334. df = pd.DataFrame({"a": []}, dtype=str)
  335. df2 = df.__dataframe__()
  336. result = pd.api.interchange.from_dataframe(df2)
  337. tm.assert_frame_equal(df, result)
  338. def test_large_string():
  339. # GH#56702
  340. pytest.importorskip("pyarrow")
  341. df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
  342. result = pd.api.interchange.from_dataframe(df.__dataframe__())
  343. expected = pd.DataFrame({"a": ["x"]}, dtype="str")
  344. tm.assert_frame_equal(result, expected)
  345. def test_non_str_names():
  346. # https://github.com/pandas-dev/pandas/issues/56701
  347. df = pd.Series([1, 2, 3], name=0).to_frame()
  348. names = df.__dataframe__().column_names()
  349. assert names == ["0"]
  350. def test_non_str_names_w_duplicates():
  351. # https://github.com/pandas-dev/pandas/issues/56701
  352. df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]})
  353. dfi = df.__dataframe__()
  354. with pytest.raises(
  355. TypeError,
  356. match=(
  357. "Expected a Series, got a DataFrame. This likely happened because you "
  358. "called __dataframe__ on a DataFrame which, after converting column "
  359. r"names to string, resulted in duplicated names: Index\(\['0', '0'\], "
  360. r"dtype='(str|object)'\). Please rename these columns before using the "
  361. "interchange protocol."
  362. ),
  363. ):
  364. pd.api.interchange.from_dataframe(dfi, allow_copy=False)
  365. @pytest.mark.parametrize(
  366. ("data", "dtype", "expected_dtype"),
  367. [
  368. ([1, 2, None], "Int64", "int64"),
  369. ([1, 2, None], "Int64[pyarrow]", "int64"),
  370. ([1, 2, None], "Int8", "int8"),
  371. ([1, 2, None], "Int8[pyarrow]", "int8"),
  372. (
  373. [1, 2, None],
  374. "UInt64",
  375. "uint64",
  376. ),
  377. (
  378. [1, 2, None],
  379. "UInt64[pyarrow]",
  380. "uint64",
  381. ),
  382. ([1.0, 2.25, None], "Float32", "float32"),
  383. ([1.0, 2.25, None], "Float32[pyarrow]", "float32"),
  384. ([True, False, None], "boolean", "bool"),
  385. ([True, False, None], "boolean[pyarrow]", "bool"),
  386. (["much ado", "about", None], pd.StringDtype(na_value=np.nan), "large_string"),
  387. (["much ado", "about", None], "string[pyarrow]", "large_string"),
  388. (
  389. [datetime(2020, 1, 1), datetime(2020, 1, 2), None],
  390. "timestamp[ns][pyarrow]",
  391. "timestamp[ns]",
  392. ),
  393. (
  394. [datetime(2020, 1, 1), datetime(2020, 1, 2), None],
  395. "timestamp[us][pyarrow]",
  396. "timestamp[us]",
  397. ),
  398. (
  399. [
  400. datetime(2020, 1, 1, tzinfo=timezone.utc),
  401. datetime(2020, 1, 2, tzinfo=timezone.utc),
  402. None,
  403. ],
  404. "timestamp[us, Asia/Kathmandu][pyarrow]",
  405. "timestamp[us, tz=Asia/Kathmandu]",
  406. ),
  407. ],
  408. )
  409. def test_pandas_nullable_with_missing_values(
  410. data: list, dtype: str, expected_dtype: str
  411. ) -> None:
  412. # https://github.com/pandas-dev/pandas/issues/57643
  413. # https://github.com/pandas-dev/pandas/issues/57664
  414. pa = pytest.importorskip("pyarrow", "11.0.0")
  415. import pyarrow.interchange as pai
  416. if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]":
  417. expected_dtype = pa.timestamp("us", "Asia/Kathmandu")
  418. df = pd.DataFrame({"a": data}, dtype=dtype)
  419. result = pai.from_dataframe(df.__dataframe__())["a"]
  420. assert result.type == expected_dtype
  421. assert result[0].as_py() == data[0]
  422. assert result[1].as_py() == data[1]
  423. assert result[2].as_py() is None
  424. @pytest.mark.parametrize(
  425. ("data", "dtype", "expected_dtype"),
  426. [
  427. ([1, 2, 3], "Int64", "int64"),
  428. ([1, 2, 3], "Int64[pyarrow]", "int64"),
  429. ([1, 2, 3], "Int8", "int8"),
  430. ([1, 2, 3], "Int8[pyarrow]", "int8"),
  431. (
  432. [1, 2, 3],
  433. "UInt64",
  434. "uint64",
  435. ),
  436. (
  437. [1, 2, 3],
  438. "UInt64[pyarrow]",
  439. "uint64",
  440. ),
  441. ([1.0, 2.25, 5.0], "Float32", "float32"),
  442. ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"),
  443. ([True, False, False], "boolean", "bool"),
  444. ([True, False, False], "boolean[pyarrow]", "bool"),
  445. (
  446. ["much ado", "about", "nothing"],
  447. pd.StringDtype(na_value=np.nan),
  448. "large_string",
  449. ),
  450. (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"),
  451. (
  452. [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)],
  453. "timestamp[ns][pyarrow]",
  454. "timestamp[ns]",
  455. ),
  456. (
  457. [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)],
  458. "timestamp[us][pyarrow]",
  459. "timestamp[us]",
  460. ),
  461. (
  462. [
  463. datetime(2020, 1, 1, tzinfo=timezone.utc),
  464. datetime(2020, 1, 2, tzinfo=timezone.utc),
  465. datetime(2020, 1, 3, tzinfo=timezone.utc),
  466. ],
  467. "timestamp[us, Asia/Kathmandu][pyarrow]",
  468. "timestamp[us, tz=Asia/Kathmandu]",
  469. ),
  470. ],
  471. )
  472. def test_pandas_nullable_without_missing_values(
  473. data: list, dtype: str, expected_dtype: str
  474. ) -> None:
  475. # https://github.com/pandas-dev/pandas/issues/57643
  476. pa = pytest.importorskip("pyarrow", "11.0.0")
  477. import pyarrow.interchange as pai
  478. if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]":
  479. expected_dtype = pa.timestamp("us", "Asia/Kathmandu")
  480. df = pd.DataFrame({"a": data}, dtype=dtype)
  481. result = pai.from_dataframe(df.__dataframe__())["a"]
  482. assert result.type == expected_dtype
  483. assert result[0].as_py() == data[0]
  484. assert result[1].as_py() == data[1]
  485. assert result[2].as_py() == data[2]
  486. def test_string_validity_buffer() -> None:
  487. # https://github.com/pandas-dev/pandas/issues/57761
  488. pytest.importorskip("pyarrow", "11.0.0")
  489. df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
  490. result = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"]
  491. assert result is None
  492. def test_string_validity_buffer_no_missing() -> None:
  493. # https://github.com/pandas-dev/pandas/issues/57762
  494. pytest.importorskip("pyarrow", "11.0.0")
  495. df = pd.DataFrame({"a": ["x", None]}, dtype="large_string[pyarrow]")
  496. validity = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"]
  497. assert validity is not None
  498. result = validity[1]
  499. expected = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, "=")
  500. assert result == expected
  501. def test_empty_dataframe():
  502. # https://github.com/pandas-dev/pandas/issues/56700
  503. df = pd.DataFrame({"a": []}, dtype="int8")
  504. dfi = df.__dataframe__()
  505. result = pd.api.interchange.from_dataframe(dfi, allow_copy=False)
  506. expected = pd.DataFrame({"a": []}, dtype="int8")
  507. tm.assert_frame_equal(result, expected)
  508. def test_from_dataframe_list_dtype():
  509. pa = pytest.importorskip("pyarrow", "14.0.0")
  510. data = {"a": [[1, 2], [4, 5, 6]]}
  511. tbl = pa.table(data)
  512. result = from_dataframe(tbl)
  513. expected = pd.DataFrame(data)
  514. tm.assert_frame_equal(result, expected)