spss.py 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. from __future__ import annotations
  2. from typing import TYPE_CHECKING
  3. from pandas._libs import lib
  4. from pandas.compat._optional import import_optional_dependency
  5. from pandas.util._validators import check_dtype_backend
  6. from pandas.core.dtypes.inference import is_list_like
  7. from pandas.io.common import stringify_path
  8. if TYPE_CHECKING:
  9. from collections.abc import Sequence
  10. from pathlib import Path
  11. from pandas._typing import DtypeBackend
  12. from pandas import DataFrame
  13. def read_spss(
  14. path: str | Path,
  15. usecols: Sequence[str] | None = None,
  16. convert_categoricals: bool = True,
  17. dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
  18. ) -> DataFrame:
  19. """
  20. Load an SPSS file from the file path, returning a DataFrame.
  21. Parameters
  22. ----------
  23. path : str or Path
  24. File path.
  25. usecols : list-like, optional
  26. Return a subset of the columns. If None, return all columns.
  27. convert_categoricals : bool, default is True
  28. Convert categorical columns into pd.Categorical.
  29. dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
  30. Back-end data type applied to the resultant :class:`DataFrame`
  31. (still experimental). Behaviour is as follows:
  32. * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
  33. (default).
  34. * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
  35. DataFrame.
  36. .. versionadded:: 2.0
  37. Returns
  38. -------
  39. DataFrame
  40. Examples
  41. --------
  42. >>> df = pd.read_spss("spss_data.sav") # doctest: +SKIP
  43. """
  44. pyreadstat = import_optional_dependency("pyreadstat")
  45. check_dtype_backend(dtype_backend)
  46. if usecols is not None:
  47. if not is_list_like(usecols):
  48. raise TypeError("usecols must be list-like.")
  49. usecols = list(usecols) # pyreadstat requires a list
  50. df, metadata = pyreadstat.read_sav(
  51. stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals
  52. )
  53. df.attrs = metadata.__dict__
  54. if dtype_backend is not lib.no_default:
  55. df = df.convert_dtypes(dtype_backend=dtype_backend)
  56. return df