test_downstream.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. """
  2. Testing that we work in the downstream packages
  3. """
  4. import array
  5. from functools import partial
  6. import importlib
  7. import subprocess
  8. import sys
  9. import numpy as np
  10. import pytest
  11. from pandas.errors import IntCastingNaNError
  12. import pandas as pd
  13. from pandas import (
  14. DataFrame,
  15. DatetimeIndex,
  16. Series,
  17. TimedeltaIndex,
  18. )
  19. import pandas._testing as tm
  20. from pandas.util.version import Version
  21. @pytest.fixture
  22. def df():
  23. return DataFrame({"A": [1, 2, 3]})
  24. def test_dask(df):
  25. # dask sets "compute.use_numexpr" to False, so catch the current value
  26. # and ensure to reset it afterwards to avoid impacting other tests
  27. olduse = pd.get_option("compute.use_numexpr")
  28. try:
  29. pytest.importorskip("toolz")
  30. dd = pytest.importorskip("dask.dataframe")
  31. ddf = dd.from_pandas(df, npartitions=3)
  32. assert ddf.A is not None
  33. assert ddf.compute() is not None
  34. finally:
  35. pd.set_option("compute.use_numexpr", olduse)
  36. # TODO(CoW) see https://github.com/pandas-dev/pandas/pull/51082
  37. @pytest.mark.skip(reason="not implemented with CoW")
  38. def test_dask_ufunc():
  39. # dask sets "compute.use_numexpr" to False, so catch the current value
  40. # and ensure to reset it afterwards to avoid impacting other tests
  41. olduse = pd.get_option("compute.use_numexpr")
  42. try:
  43. da = pytest.importorskip("dask.array")
  44. dd = pytest.importorskip("dask.dataframe")
  45. s = Series([1.5, 2.3, 3.7, 4.0])
  46. ds = dd.from_pandas(s, npartitions=2)
  47. result = da.log(ds).compute()
  48. expected = np.log(s)
  49. tm.assert_series_equal(result, expected)
  50. finally:
  51. pd.set_option("compute.use_numexpr", olduse)
  52. def test_construct_dask_float_array_int_dtype_match_ndarray():
  53. # GH#40110 make sure we treat a float-dtype dask array with the same
  54. # rules we would for an ndarray
  55. dd = pytest.importorskip("dask.dataframe")
  56. arr = np.array([1, 2.5, 3])
  57. darr = dd.from_array(arr)
  58. res = Series(darr)
  59. expected = Series(arr)
  60. tm.assert_series_equal(res, expected)
  61. # GH#49599 in 2.0 we raise instead of silently ignoring the dtype
  62. msg = "Trying to coerce float values to integers"
  63. with pytest.raises(ValueError, match=msg):
  64. Series(darr, dtype="i8")
  65. msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
  66. arr[2] = np.nan
  67. with pytest.raises(IntCastingNaNError, match=msg):
  68. Series(darr, dtype="i8")
  69. # which is the same as we get with a numpy input
  70. with pytest.raises(IntCastingNaNError, match=msg):
  71. Series(arr, dtype="i8")
  72. def test_xarray(df):
  73. pytest.importorskip("xarray")
  74. assert df.to_xarray() is not None
  75. def test_xarray_cftimeindex_nearest():
  76. # https://github.com/pydata/xarray/issues/3751
  77. cftime = pytest.importorskip("cftime")
  78. xarray = pytest.importorskip("xarray")
  79. times = xarray.date_range("0001", periods=2, use_cftime=True)
  80. key = cftime.DatetimeGregorian(2000, 1, 1)
  81. result = times.get_indexer([key], method="nearest")
  82. expected = 1
  83. assert result == expected
  84. @pytest.mark.single_cpu
  85. def test_oo_optimizable():
  86. # GH 21071
  87. subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"])
  88. @pytest.mark.single_cpu
  89. def test_oo_optimized_datetime_index_unpickle():
  90. # GH 42866
  91. subprocess.check_call(
  92. [
  93. sys.executable,
  94. "-OO",
  95. "-c",
  96. (
  97. "import pandas as pd, pickle; "
  98. "pickle.loads(pickle.dumps(pd.date_range('2021-01-01', periods=1)))"
  99. ),
  100. ]
  101. )
  102. def test_statsmodels():
  103. smf = pytest.importorskip("statsmodels.formula.api")
  104. df = DataFrame(
  105. {"Lottery": range(5), "Literacy": range(5), "Pop1831": range(100, 105)}
  106. )
  107. smf.ols("Lottery ~ Literacy + np.log(Pop1831)", data=df).fit()
  108. def test_scikit_learn():
  109. pytest.importorskip("sklearn")
  110. from sklearn import (
  111. datasets,
  112. svm,
  113. )
  114. digits = datasets.load_digits()
  115. clf = svm.SVC(gamma=0.001, C=100.0)
  116. clf.fit(digits.data[:-1], digits.target[:-1])
  117. clf.predict(digits.data[-1:])
  118. def test_seaborn(mpl_cleanup):
  119. seaborn = pytest.importorskip("seaborn")
  120. tips = DataFrame(
  121. {"day": pd.date_range("2023", freq="D", periods=5), "total_bill": range(5)}
  122. )
  123. seaborn.stripplot(x="day", y="total_bill", data=tips)
  124. @pytest.mark.xfail(reason="pandas_datareader uses old variant of deprecate_kwarg")
  125. def test_pandas_datareader():
  126. # https://github.com/pandas-dev/pandas/pull/61468
  127. # https://github.com/pydata/pandas-datareader/issues/1005
  128. pytest.importorskip("pandas_datareader")
  129. @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
  130. def test_pyarrow(df):
  131. pyarrow = pytest.importorskip("pyarrow")
  132. table = pyarrow.Table.from_pandas(df)
  133. result = table.to_pandas()
  134. tm.assert_frame_equal(result, df)
  135. def test_yaml_dump(df):
  136. # GH#42748
  137. yaml = pytest.importorskip("yaml")
  138. dumped = yaml.dump(df)
  139. loaded = yaml.load(dumped, Loader=yaml.Loader)
  140. tm.assert_frame_equal(df, loaded)
  141. loaded2 = yaml.load(dumped, Loader=yaml.UnsafeLoader)
  142. tm.assert_frame_equal(df, loaded2)
  143. @pytest.mark.parametrize("dependency", ["numpy", "dateutil"])
  144. def test_missing_required_dependency(monkeypatch, dependency):
  145. # GH#61030
  146. original_import = __import__
  147. mock_error = ImportError(f"Mock error for {dependency}")
  148. def mock_import(name, *args, **kwargs):
  149. if name == dependency:
  150. raise mock_error
  151. return original_import(name, *args, **kwargs)
  152. monkeypatch.setattr("builtins.__import__", mock_import)
  153. with pytest.raises(ImportError, match=dependency):
  154. importlib.reload(importlib.import_module("pandas"))
  155. def test_frame_setitem_dask_array_into_new_col(request):
  156. # GH#47128
  157. # dask sets "compute.use_numexpr" to False, so catch the current value
  158. # and ensure to reset it afterwards to avoid impacting other tests
  159. olduse = pd.get_option("compute.use_numexpr")
  160. try:
  161. dask = pytest.importorskip("dask")
  162. da = pytest.importorskip("dask.array")
  163. if Version(dask.__version__) <= Version("2025.1.0") and Version(
  164. np.__version__
  165. ) >= Version("2.1"):
  166. request.applymarker(
  167. pytest.mark.xfail(reason="loc.__setitem__ incorrectly mutated column c")
  168. )
  169. dda = da.array([1, 2])
  170. df = DataFrame({"a": ["a", "b"]})
  171. df["b"] = dda
  172. df["c"] = dda
  173. df.loc[[False, True], "b"] = 100
  174. result = df.loc[[1], :]
  175. expected = DataFrame({"a": ["b"], "b": [100], "c": [2]}, index=[1])
  176. tm.assert_frame_equal(result, expected)
  177. finally:
  178. pd.set_option("compute.use_numexpr", olduse)
  179. def test_pandas_priority():
  180. # GH#48347
  181. class MyClass:
  182. __pandas_priority__ = 5000
  183. def __radd__(self, other):
  184. return self
  185. left = MyClass()
  186. right = Series(range(3))
  187. assert right.__add__(left) is NotImplemented
  188. assert right + left is left
  189. @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
  190. @pytest.mark.parametrize(
  191. "box", [memoryview, partial(array.array, "i"), "dask", "xarray"]
  192. )
  193. def test_from_obscure_array(dtype, box):
  194. # GH#24539 recognize e.g xarray, dask, ...
  195. # Note: we dont do this for PeriodArray bc _from_sequence won't accept
  196. # an array of integers
  197. # TODO: could check with arraylike of Period objects
  198. # GH#24539 recognize e.g xarray, dask, ...
  199. arr = np.array([1, 2, 3], dtype=np.int64)
  200. if box == "dask":
  201. da = pytest.importorskip("dask.array")
  202. data = da.array(arr)
  203. elif box == "xarray":
  204. xr = pytest.importorskip("xarray")
  205. data = xr.DataArray(arr)
  206. else:
  207. data = box(arr)
  208. func = {"M8[ns]": pd.to_datetime, "m8[ns]": pd.to_timedelta}[dtype]
  209. result = func(arr).array
  210. expected = func(data).array
  211. tm.assert_equal(result, expected)
  212. # Let's check the Indexes while we're here
  213. idx_cls = {"M8[ns]": DatetimeIndex, "m8[ns]": TimedeltaIndex}[dtype]
  214. result = idx_cls(arr)
  215. expected = idx_cls(data)
  216. tm.assert_index_equal(result, expected)
  217. def test_xarray_coerce_unit():
  218. # GH44053
  219. xr = pytest.importorskip("xarray")
  220. arr = xr.DataArray([1, 2, 3])
  221. result = pd.to_datetime(arr, unit="ns")
  222. expected = DatetimeIndex(
  223. [
  224. "1970-01-01 00:00:00.000000001",
  225. "1970-01-01 00:00:00.000000002",
  226. "1970-01-01 00:00:00.000000003",
  227. ],
  228. dtype="datetime64[ns]",
  229. freq=None,
  230. )
  231. tm.assert_index_equal(result, expected)