_calamine.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. from __future__ import annotations
  2. from datetime import (
  3. date,
  4. datetime,
  5. time,
  6. timedelta,
  7. )
  8. from typing import (
  9. TYPE_CHECKING,
  10. Any,
  11. Union,
  12. )
  13. from pandas.compat._optional import import_optional_dependency
  14. from pandas.util._decorators import doc
  15. import pandas as pd
  16. from pandas.core.shared_docs import _shared_docs
  17. from pandas.io.excel._base import BaseExcelReader
  18. if TYPE_CHECKING:
  19. from python_calamine import (
  20. CalamineSheet,
  21. CalamineWorkbook,
  22. )
  23. from pandas._typing import (
  24. FilePath,
  25. NaTType,
  26. ReadBuffer,
  27. Scalar,
  28. StorageOptions,
  29. )
  30. _CellValue = Union[int, float, str, bool, time, date, datetime, timedelta]
  31. class CalamineReader(BaseExcelReader["CalamineWorkbook"]):
  32. @doc(storage_options=_shared_docs["storage_options"])
  33. def __init__(
  34. self,
  35. filepath_or_buffer: FilePath | ReadBuffer[bytes],
  36. storage_options: StorageOptions | None = None,
  37. engine_kwargs: dict | None = None,
  38. ) -> None:
  39. """
  40. Reader using calamine engine (xlsx/xls/xlsb/ods).
  41. Parameters
  42. ----------
  43. filepath_or_buffer : str, path to be parsed or
  44. an open readable stream.
  45. {storage_options}
  46. engine_kwargs : dict, optional
  47. Arbitrary keyword arguments passed to excel engine.
  48. """
  49. import_optional_dependency("python_calamine")
  50. super().__init__(
  51. filepath_or_buffer,
  52. storage_options=storage_options,
  53. engine_kwargs=engine_kwargs,
  54. )
  55. @property
  56. def _workbook_class(self) -> type[CalamineWorkbook]:
  57. from python_calamine import CalamineWorkbook
  58. return CalamineWorkbook
  59. def load_workbook(
  60. self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs: Any
  61. ) -> CalamineWorkbook:
  62. from python_calamine import load_workbook
  63. return load_workbook(filepath_or_buffer, **engine_kwargs)
  64. @property
  65. def sheet_names(self) -> list[str]:
  66. from python_calamine import SheetTypeEnum
  67. return [
  68. sheet.name
  69. for sheet in self.book.sheets_metadata
  70. if sheet.typ == SheetTypeEnum.WorkSheet
  71. ]
  72. def get_sheet_by_name(self, name: str) -> CalamineSheet:
  73. self.raise_if_bad_sheet_by_name(name)
  74. return self.book.get_sheet_by_name(name)
  75. def get_sheet_by_index(self, index: int) -> CalamineSheet:
  76. self.raise_if_bad_sheet_by_index(index)
  77. return self.book.get_sheet_by_index(index)
  78. def get_sheet_data(
  79. self, sheet: CalamineSheet, file_rows_needed: int | None = None
  80. ) -> list[list[Scalar | NaTType | time]]:
  81. def _convert_cell(value: _CellValue) -> Scalar | NaTType | time:
  82. if isinstance(value, float):
  83. val = int(value)
  84. if val == value:
  85. return val
  86. else:
  87. return value
  88. elif isinstance(value, date):
  89. return pd.Timestamp(value)
  90. elif isinstance(value, timedelta):
  91. return pd.Timedelta(value)
  92. elif isinstance(value, time):
  93. return value
  94. return value
  95. rows: list[list[_CellValue]] = sheet.to_python(
  96. skip_empty_area=False, nrows=file_rows_needed
  97. )
  98. data = [[_convert_cell(cell) for cell in row] for row in rows]
  99. return data