_odfreader.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. from __future__ import annotations
  2. from typing import (
  3. TYPE_CHECKING,
  4. cast,
  5. )
  6. import numpy as np
  7. from pandas._typing import (
  8. FilePath,
  9. ReadBuffer,
  10. Scalar,
  11. StorageOptions,
  12. )
  13. from pandas.compat._optional import import_optional_dependency
  14. from pandas.util._decorators import doc
  15. import pandas as pd
  16. from pandas.core.shared_docs import _shared_docs
  17. from pandas.io.excel._base import BaseExcelReader
  18. if TYPE_CHECKING:
  19. from odf.opendocument import OpenDocument
  20. from pandas._libs.tslibs.nattype import NaTType
  21. @doc(storage_options=_shared_docs["storage_options"])
  22. class ODFReader(BaseExcelReader["OpenDocument"]):
  23. def __init__(
  24. self,
  25. filepath_or_buffer: FilePath | ReadBuffer[bytes],
  26. storage_options: StorageOptions | None = None,
  27. engine_kwargs: dict | None = None,
  28. ) -> None:
  29. """
  30. Read tables out of OpenDocument formatted files.
  31. Parameters
  32. ----------
  33. filepath_or_buffer : str, path to be parsed or
  34. an open readable stream.
  35. {storage_options}
  36. engine_kwargs : dict, optional
  37. Arbitrary keyword arguments passed to excel engine.
  38. """
  39. import_optional_dependency("odf")
  40. super().__init__(
  41. filepath_or_buffer,
  42. storage_options=storage_options,
  43. engine_kwargs=engine_kwargs,
  44. )
  45. @property
  46. def _workbook_class(self) -> type[OpenDocument]:
  47. from odf.opendocument import OpenDocument
  48. return OpenDocument
  49. def load_workbook(
  50. self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs
  51. ) -> OpenDocument:
  52. from odf.opendocument import load
  53. return load(filepath_or_buffer, **engine_kwargs)
  54. @property
  55. def empty_value(self) -> str:
  56. """Property for compat with other readers."""
  57. return ""
  58. @property
  59. def sheet_names(self) -> list[str]:
  60. """Return a list of sheet names present in the document"""
  61. from odf.table import Table
  62. tables = self.book.getElementsByType(Table)
  63. return [t.getAttribute("name") for t in tables]
  64. def get_sheet_by_index(self, index: int):
  65. from odf.table import Table
  66. self.raise_if_bad_sheet_by_index(index)
  67. tables = self.book.getElementsByType(Table)
  68. return tables[index]
  69. def get_sheet_by_name(self, name: str):
  70. from odf.table import Table
  71. self.raise_if_bad_sheet_by_name(name)
  72. tables = self.book.getElementsByType(Table)
  73. for table in tables:
  74. if table.getAttribute("name") == name:
  75. return table
  76. self.close()
  77. raise ValueError(f"sheet {name} not found")
  78. def get_sheet_data(
  79. self, sheet, file_rows_needed: int | None = None
  80. ) -> list[list[Scalar | NaTType]]:
  81. """
  82. Parse an ODF Table into a list of lists
  83. """
  84. from odf.table import (
  85. CoveredTableCell,
  86. TableCell,
  87. TableRow,
  88. )
  89. covered_cell_name = CoveredTableCell().qname
  90. table_cell_name = TableCell().qname
  91. cell_names = {covered_cell_name, table_cell_name}
  92. sheet_rows = sheet.getElementsByType(TableRow)
  93. empty_rows = 0
  94. max_row_len = 0
  95. table: list[list[Scalar | NaTType]] = []
  96. for sheet_row in sheet_rows:
  97. sheet_cells = [
  98. x
  99. for x in sheet_row.childNodes
  100. if hasattr(x, "qname") and x.qname in cell_names
  101. ]
  102. empty_cells = 0
  103. table_row: list[Scalar | NaTType] = []
  104. for sheet_cell in sheet_cells:
  105. if sheet_cell.qname == table_cell_name:
  106. value = self._get_cell_value(sheet_cell)
  107. else:
  108. value = self.empty_value
  109. column_repeat = self._get_column_repeat(sheet_cell)
  110. # Queue up empty values, writing only if content succeeds them
  111. if value == self.empty_value:
  112. empty_cells += column_repeat
  113. else:
  114. table_row.extend([self.empty_value] * empty_cells)
  115. empty_cells = 0
  116. table_row.extend([value] * column_repeat)
  117. if max_row_len < len(table_row):
  118. max_row_len = len(table_row)
  119. row_repeat = self._get_row_repeat(sheet_row)
  120. if len(table_row) == 0:
  121. empty_rows += row_repeat
  122. else:
  123. # add blank rows to our table
  124. table.extend([[self.empty_value]] * empty_rows)
  125. empty_rows = 0
  126. table.extend(table_row for _ in range(row_repeat))
  127. if file_rows_needed is not None and len(table) >= file_rows_needed:
  128. break
  129. # Make our table square
  130. for row in table:
  131. if len(row) < max_row_len:
  132. row.extend([self.empty_value] * (max_row_len - len(row)))
  133. return table
  134. def _get_row_repeat(self, row) -> int:
  135. """
  136. Return number of times this row was repeated
  137. Repeating an empty row appeared to be a common way
  138. of representing sparse rows in the table.
  139. """
  140. from odf.namespaces import TABLENS
  141. return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1))
  142. def _get_column_repeat(self, cell) -> int:
  143. from odf.namespaces import TABLENS
  144. return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1))
  145. def _get_cell_value(self, cell) -> Scalar | NaTType:
  146. from odf.namespaces import OFFICENS
  147. if str(cell) == "#N/A":
  148. return np.nan
  149. cell_type = cell.attributes.get((OFFICENS, "value-type"))
  150. if cell_type == "boolean":
  151. if str(cell) == "TRUE":
  152. return True
  153. return False
  154. if cell_type is None:
  155. return self.empty_value
  156. elif cell_type == "float":
  157. # GH5394
  158. cell_value = float(cell.attributes.get((OFFICENS, "value")))
  159. val = int(cell_value)
  160. if val == cell_value:
  161. return val
  162. return cell_value
  163. elif cell_type == "percentage":
  164. cell_value = cell.attributes.get((OFFICENS, "value"))
  165. return float(cell_value)
  166. elif cell_type == "string":
  167. return self._get_cell_string_value(cell)
  168. elif cell_type == "currency":
  169. cell_value = cell.attributes.get((OFFICENS, "value"))
  170. return float(cell_value)
  171. elif cell_type == "date":
  172. cell_value = cell.attributes.get((OFFICENS, "date-value"))
  173. return pd.Timestamp(cell_value)
  174. elif cell_type == "time":
  175. stamp = pd.Timestamp(str(cell))
  176. # cast needed here because Scalar doesn't include datetime.time
  177. return cast(Scalar, stamp.time())
  178. else:
  179. self.close()
  180. raise ValueError(f"Unrecognized type {cell_type}")
  181. def _get_cell_string_value(self, cell) -> str:
  182. """
  183. Find and decode OpenDocument text:s tags that represent
  184. a run length encoded sequence of space characters.
  185. """
  186. from odf.element import Element
  187. from odf.namespaces import TEXTNS
  188. from odf.office import Annotation
  189. from odf.text import S
  190. office_annotation = Annotation().qname
  191. text_s = S().qname
  192. value = []
  193. for fragment in cell.childNodes:
  194. if isinstance(fragment, Element):
  195. if fragment.qname == text_s:
  196. spaces = int(fragment.attributes.get((TEXTNS, "c"), 1))
  197. value.append(" " * spaces)
  198. elif fragment.qname == office_annotation:
  199. continue
  200. else:
  201. # recursive impl needed in case of nested fragments
  202. # with multiple spaces
  203. # https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704
  204. value.append(self._get_cell_string_value(fragment))
  205. else:
  206. value.append(str(fragment).strip("\n"))
  207. return "".join(value)