test_downstream.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370
  1. """
  2. Testing that we work in the downstream packages
  3. """
  4. import array
  5. import subprocess
  6. import sys
  7. import numpy as np
  8. import pytest
  9. from pandas.errors import IntCastingNaNError
  10. import pandas.util._test_decorators as td
  11. import pandas as pd
  12. from pandas import (
  13. DataFrame,
  14. DatetimeIndex,
  15. Series,
  16. TimedeltaIndex,
  17. )
  18. import pandas._testing as tm
  19. from pandas.core.arrays import (
  20. DatetimeArray,
  21. TimedeltaArray,
  22. )
  23. from pandas.util.version import Version
  24. @pytest.fixture
  25. def df():
  26. return DataFrame({"A": [1, 2, 3]})
  27. def test_dask(df):
  28. # dask sets "compute.use_numexpr" to False, so catch the current value
  29. # and ensure to reset it afterwards to avoid impacting other tests
  30. olduse = pd.get_option("compute.use_numexpr")
  31. try:
  32. pytest.importorskip("toolz")
  33. dd = pytest.importorskip("dask.dataframe")
  34. ddf = dd.from_pandas(df, npartitions=3)
  35. assert ddf.A is not None
  36. assert ddf.compute() is not None
  37. finally:
  38. pd.set_option("compute.use_numexpr", olduse)
  39. def test_dask_ufunc():
  40. # dask sets "compute.use_numexpr" to False, so catch the current value
  41. # and ensure to reset it afterwards to avoid impacting other tests
  42. olduse = pd.get_option("compute.use_numexpr")
  43. try:
  44. da = pytest.importorskip("dask.array")
  45. dd = pytest.importorskip("dask.dataframe")
  46. s = Series([1.5, 2.3, 3.7, 4.0])
  47. ds = dd.from_pandas(s, npartitions=2)
  48. result = da.fix(ds).compute()
  49. expected = np.fix(s)
  50. tm.assert_series_equal(result, expected)
  51. finally:
  52. pd.set_option("compute.use_numexpr", olduse)
  53. def test_construct_dask_float_array_int_dtype_match_ndarray():
  54. # GH#40110 make sure we treat a float-dtype dask array with the same
  55. # rules we would for an ndarray
  56. dd = pytest.importorskip("dask.dataframe")
  57. arr = np.array([1, 2.5, 3])
  58. darr = dd.from_array(arr)
  59. res = Series(darr)
  60. expected = Series(arr)
  61. tm.assert_series_equal(res, expected)
  62. # GH#49599 in 2.0 we raise instead of silently ignoring the dtype
  63. msg = "Trying to coerce float values to integers"
  64. with pytest.raises(ValueError, match=msg):
  65. Series(darr, dtype="i8")
  66. msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
  67. arr[2] = np.nan
  68. with pytest.raises(IntCastingNaNError, match=msg):
  69. Series(darr, dtype="i8")
  70. # which is the same as we get with a numpy input
  71. with pytest.raises(IntCastingNaNError, match=msg):
  72. Series(arr, dtype="i8")
  73. def test_xarray(df):
  74. pytest.importorskip("xarray")
  75. assert df.to_xarray() is not None
  76. def test_xarray_cftimeindex_nearest():
  77. # https://github.com/pydata/xarray/issues/3751
  78. cftime = pytest.importorskip("cftime")
  79. xarray = pytest.importorskip("xarray")
  80. times = xarray.cftime_range("0001", periods=2)
  81. key = cftime.DatetimeGregorian(2000, 1, 1)
  82. result = times.get_indexer([key], method="nearest")
  83. expected = 1
  84. assert result == expected
  85. @pytest.mark.single_cpu
  86. def test_oo_optimizable():
  87. # GH 21071
  88. subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"])
  89. @pytest.mark.single_cpu
  90. def test_oo_optimized_datetime_index_unpickle():
  91. # GH 42866
  92. subprocess.check_call(
  93. [
  94. sys.executable,
  95. "-OO",
  96. "-c",
  97. (
  98. "import pandas as pd, pickle; "
  99. "pickle.loads(pickle.dumps(pd.date_range('2021-01-01', periods=1)))"
  100. ),
  101. ]
  102. )
  103. def test_statsmodels():
  104. smf = pytest.importorskip("statsmodels.formula.api")
  105. df = DataFrame(
  106. {"Lottery": range(5), "Literacy": range(5), "Pop1831": range(100, 105)}
  107. )
  108. smf.ols("Lottery ~ Literacy + np.log(Pop1831)", data=df).fit()
  109. def test_scikit_learn():
  110. pytest.importorskip("sklearn")
  111. from sklearn import (
  112. datasets,
  113. svm,
  114. )
  115. digits = datasets.load_digits()
  116. clf = svm.SVC(gamma=0.001, C=100.0)
  117. clf.fit(digits.data[:-1], digits.target[:-1])
  118. clf.predict(digits.data[-1:])
  119. def test_seaborn():
  120. seaborn = pytest.importorskip("seaborn")
  121. tips = DataFrame(
  122. {"day": pd.date_range("2023", freq="D", periods=5), "total_bill": range(5)}
  123. )
  124. seaborn.stripplot(x="day", y="total_bill", data=tips)
  125. def test_pandas_datareader():
  126. pytest.importorskip("pandas_datareader")
  127. @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
  128. def test_pyarrow(df):
  129. pyarrow = pytest.importorskip("pyarrow")
  130. table = pyarrow.Table.from_pandas(df)
  131. result = table.to_pandas()
  132. tm.assert_frame_equal(result, df)
  133. def test_yaml_dump(df):
  134. # GH#42748
  135. yaml = pytest.importorskip("yaml")
  136. dumped = yaml.dump(df)
  137. loaded = yaml.load(dumped, Loader=yaml.Loader)
  138. tm.assert_frame_equal(df, loaded)
  139. loaded2 = yaml.load(dumped, Loader=yaml.UnsafeLoader)
  140. tm.assert_frame_equal(df, loaded2)
  141. @pytest.mark.single_cpu
  142. def test_missing_required_dependency():
  143. # GH 23868
  144. # To ensure proper isolation, we pass these flags
  145. # -S : disable site-packages
  146. # -s : disable user site-packages
  147. # -E : disable PYTHON* env vars, especially PYTHONPATH
  148. # https://github.com/MacPython/pandas-wheels/pull/50
  149. pyexe = sys.executable.replace("\\", "/")
  150. # We skip this test if pandas is installed as a site package. We first
  151. # import the package normally and check the path to the module before
  152. # executing the test which imports pandas with site packages disabled.
  153. call = [pyexe, "-c", "import pandas;print(pandas.__file__)"]
  154. output = subprocess.check_output(call).decode()
  155. if "site-packages" in output:
  156. pytest.skip("pandas installed as site package")
  157. # This test will fail if pandas is installed as a site package. The flags
  158. # prevent pandas being imported and the test will report Failed: DID NOT
  159. # RAISE <class 'subprocess.CalledProcessError'>
  160. call = [pyexe, "-sSE", "-c", "import pandas"]
  161. msg = (
  162. rf"Command '\['{pyexe}', '-sSE', '-c', 'import pandas'\]' "
  163. "returned non-zero exit status 1."
  164. )
  165. with pytest.raises(subprocess.CalledProcessError, match=msg) as exc:
  166. subprocess.check_output(call, stderr=subprocess.STDOUT)
  167. output = exc.value.stdout.decode()
  168. for name in ["numpy", "pytz", "dateutil"]:
  169. assert name in output
  170. def test_frame_setitem_dask_array_into_new_col(request):
  171. # GH#47128
  172. # dask sets "compute.use_numexpr" to False, so catch the current value
  173. # and ensure to reset it afterwards to avoid impacting other tests
  174. olduse = pd.get_option("compute.use_numexpr")
  175. try:
  176. dask = pytest.importorskip("dask")
  177. da = pytest.importorskip("dask.array")
  178. if Version(dask.__version__) <= Version("2025.1.0") and Version(
  179. np.__version__
  180. ) >= Version("2.1"):
  181. request.applymarker(
  182. pytest.mark.xfail(reason="loc.__setitem__ incorrectly mutated column c")
  183. )
  184. dda = da.array([1, 2])
  185. df = DataFrame({"a": ["a", "b"]})
  186. df["b"] = dda
  187. df["c"] = dda
  188. df.loc[[False, True], "b"] = 100
  189. result = df.loc[[1], :]
  190. expected = DataFrame({"a": ["b"], "b": [100], "c": [2]}, index=[1])
  191. tm.assert_frame_equal(result, expected)
  192. finally:
  193. pd.set_option("compute.use_numexpr", olduse)
  194. def test_pandas_priority():
  195. # GH#48347
  196. class MyClass:
  197. __pandas_priority__ = 5000
  198. def __radd__(self, other):
  199. return self
  200. left = MyClass()
  201. right = Series(range(3))
  202. assert right.__add__(left) is NotImplemented
  203. assert right + left is left
  204. @pytest.fixture(
  205. params=[
  206. "memoryview",
  207. "array",
  208. pytest.param("dask", marks=td.skip_if_no("dask.array")),
  209. pytest.param("xarray", marks=td.skip_if_no("xarray")),
  210. ]
  211. )
  212. def array_likes(request):
  213. """
  214. Fixture giving a numpy array and a parametrized 'data' object, which can
  215. be a memoryview, array, dask or xarray object created from the numpy array.
  216. """
  217. # GH#24539 recognize e.g xarray, dask, ...
  218. arr = np.array([1, 2, 3], dtype=np.int64)
  219. name = request.param
  220. if name == "memoryview":
  221. data = memoryview(arr)
  222. elif name == "array":
  223. data = array.array("i", arr)
  224. elif name == "dask":
  225. import dask.array
  226. data = dask.array.array(arr)
  227. elif name == "xarray":
  228. import xarray as xr
  229. data = xr.DataArray(arr)
  230. return arr, data
  231. @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
  232. def test_from_obscure_array(dtype, array_likes):
  233. # GH#24539 recognize e.g xarray, dask, ...
  234. # Note: we dont do this for PeriodArray bc _from_sequence won't accept
  235. # an array of integers
  236. # TODO: could check with arraylike of Period objects
  237. arr, data = array_likes
  238. cls = {"M8[ns]": DatetimeArray, "m8[ns]": TimedeltaArray}[dtype]
  239. depr_msg = f"{cls.__name__}.__init__ is deprecated"
  240. with tm.assert_produces_warning(FutureWarning, match=depr_msg):
  241. expected = cls(arr)
  242. result = cls._from_sequence(data, dtype=dtype)
  243. tm.assert_extension_array_equal(result, expected)
  244. if not isinstance(data, memoryview):
  245. # FIXME(GH#44431) these raise on memoryview and attempted fix
  246. # fails on py3.10
  247. func = {"M8[ns]": pd.to_datetime, "m8[ns]": pd.to_timedelta}[dtype]
  248. result = func(arr).array
  249. expected = func(data).array
  250. tm.assert_equal(result, expected)
  251. # Let's check the Indexes while we're here
  252. idx_cls = {"M8[ns]": DatetimeIndex, "m8[ns]": TimedeltaIndex}[dtype]
  253. result = idx_cls(arr)
  254. expected = idx_cls(data)
  255. tm.assert_index_equal(result, expected)
  256. def test_dataframe_consortium() -> None:
  257. """
  258. Test some basic methods of the dataframe consortium standard.
  259. Full testing is done at https://github.com/data-apis/dataframe-api-compat,
  260. this is just to check that the entry point works as expected.
  261. """
  262. pytest.importorskip("dataframe_api_compat")
  263. df_pd = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
  264. df = df_pd.__dataframe_consortium_standard__()
  265. result_1 = df.get_column_names()
  266. expected_1 = ["a", "b"]
  267. assert result_1 == expected_1
  268. ser = Series([1, 2, 3], name="a")
  269. col = ser.__column_consortium_standard__()
  270. assert col.name == "a"
  271. def test_xarray_coerce_unit():
  272. # GH44053
  273. xr = pytest.importorskip("xarray")
  274. arr = xr.DataArray([1, 2, 3])
  275. result = pd.to_datetime(arr, unit="ns")
  276. expected = DatetimeIndex(
  277. [
  278. "1970-01-01 00:00:00.000000001",
  279. "1970-01-01 00:00:00.000000002",
  280. "1970-01-01 00:00:00.000000003",
  281. ],
  282. dtype="datetime64[ns]",
  283. freq=None,
  284. )
  285. tm.assert_index_equal(result, expected)