# sasreader.py (4.8 KB)
  1. """
  2. Read SAS sas7bdat or xport files.
  3. """
  4. from __future__ import annotations
  5. from abc import (
  6. ABC,
  7. abstractmethod,
  8. )
  9. from typing import (
  10. TYPE_CHECKING,
  11. overload,
  12. )
  13. from pandas.util._decorators import doc
  14. from pandas.core.shared_docs import _shared_docs
  15. from pandas.io.common import stringify_path
  16. if TYPE_CHECKING:
  17. from collections.abc import Hashable
  18. from types import TracebackType
  19. from pandas._typing import (
  20. CompressionOptions,
  21. FilePath,
  22. ReadBuffer,
  23. Self,
  24. )
  25. from pandas import DataFrame
  26. class ReaderBase(ABC):
  27. """
  28. Protocol for XportReader and SAS7BDATReader classes.
  29. """
  30. @abstractmethod
  31. def read(self, nrows: int | None = None) -> DataFrame:
  32. ...
  33. @abstractmethod
  34. def close(self) -> None:
  35. ...
  36. def __enter__(self) -> Self:
  37. return self
  38. def __exit__(
  39. self,
  40. exc_type: type[BaseException] | None,
  41. exc_value: BaseException | None,
  42. traceback: TracebackType | None,
  43. ) -> None:
  44. self.close()
  45. @overload
  46. def read_sas(
  47. filepath_or_buffer: FilePath | ReadBuffer[bytes],
  48. *,
  49. format: str | None = ...,
  50. index: Hashable | None = ...,
  51. encoding: str | None = ...,
  52. chunksize: int = ...,
  53. iterator: bool = ...,
  54. compression: CompressionOptions = ...,
  55. ) -> ReaderBase:
  56. ...
  57. @overload
  58. def read_sas(
  59. filepath_or_buffer: FilePath | ReadBuffer[bytes],
  60. *,
  61. format: str | None = ...,
  62. index: Hashable | None = ...,
  63. encoding: str | None = ...,
  64. chunksize: None = ...,
  65. iterator: bool = ...,
  66. compression: CompressionOptions = ...,
  67. ) -> DataFrame | ReaderBase:
  68. ...
  69. @doc(decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer")
  70. def read_sas(
  71. filepath_or_buffer: FilePath | ReadBuffer[bytes],
  72. *,
  73. format: str | None = None,
  74. index: Hashable | None = None,
  75. encoding: str | None = None,
  76. chunksize: int | None = None,
  77. iterator: bool = False,
  78. compression: CompressionOptions = "infer",
  79. ) -> DataFrame | ReaderBase:
  80. """
  81. Read SAS files stored as either XPORT or SAS7BDAT format files.
  82. Parameters
  83. ----------
  84. filepath_or_buffer : str, path object, or file-like object
  85. String, path object (implementing ``os.PathLike[str]``), or file-like
  86. object implementing a binary ``read()`` function. The string could be a URL.
  87. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
  88. expected. A local file could be:
  89. ``file://localhost/path/to/table.sas7bdat``.
  90. format : str {{'xport', 'sas7bdat'}} or None
  91. If None, file format is inferred from file extension. If 'xport' or
  92. 'sas7bdat', uses the corresponding format.
  93. index : identifier of index column, defaults to None
  94. Identifier of column that should be used as index of the DataFrame.
  95. encoding : str, default is None
  96. Encoding for text data. If None, text data are stored as raw bytes.
  97. chunksize : int
  98. Read file `chunksize` lines at a time, returns iterator.
  99. iterator : bool, defaults to False
  100. If True, returns an iterator for reading the file incrementally.
  101. {decompression_options}
  102. Returns
  103. -------
  104. DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
  105. or XportReader
  106. Examples
  107. --------
  108. >>> df = pd.read_sas("sas_data.sas7bdat") # doctest: +SKIP
  109. """
  110. if format is None:
  111. buffer_error_msg = (
  112. "If this is a buffer object rather "
  113. "than a string name, you must specify a format string"
  114. )
  115. filepath_or_buffer = stringify_path(filepath_or_buffer)
  116. if not isinstance(filepath_or_buffer, str):
  117. raise ValueError(buffer_error_msg)
  118. fname = filepath_or_buffer.lower()
  119. if ".xpt" in fname:
  120. format = "xport"
  121. elif ".sas7bdat" in fname:
  122. format = "sas7bdat"
  123. else:
  124. raise ValueError(
  125. f"unable to infer format of SAS file from filename: {repr(fname)}"
  126. )
  127. reader: ReaderBase
  128. if format.lower() == "xport":
  129. from pandas.io.sas.sas_xport import XportReader
  130. reader = XportReader(
  131. filepath_or_buffer,
  132. index=index,
  133. encoding=encoding,
  134. chunksize=chunksize,
  135. compression=compression,
  136. )
  137. elif format.lower() == "sas7bdat":
  138. from pandas.io.sas.sas7bdat import SAS7BDATReader
  139. reader = SAS7BDATReader(
  140. filepath_or_buffer,
  141. index=index,
  142. encoding=encoding,
  143. chunksize=chunksize,
  144. compression=compression,
  145. )
  146. else:
  147. raise ValueError("unknown SAS format")
  148. if iterator or chunksize:
  149. return reader
  150. with reader:
  151. return reader.read()