tablepyxl.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. # Do imports like python3 so our package works for 2 and 3
  2. from __future__ import absolute_import
  3. from tablepyxl.style import Table
  4. from paddle.utils import try_import
  5. def string_to_int(s):
  6. if s.isdigit():
  7. return int(s)
  8. return 0
  9. def get_Tables(doc):
  10. try_import("lxml")
  11. from lxml import etree, html
  12. tree = html.fromstring(doc)
  13. comments = tree.xpath("//comment()")
  14. for comment in comments:
  15. comment.drop_tag()
  16. return [Table(table) for table in tree.xpath("//table")]
  17. def write_rows(worksheet, elem, row, column=1):
  18. """
  19. Writes every tr child element of elem to a row in the worksheet
  20. returns the next row after all rows are written
  21. """
  22. try_import("openpyxl")
  23. from openpyxl.cell.cell import MergedCell
  24. from openpyxl.utils import get_column_letter
  25. initial_column = column
  26. for table_row in elem.rows:
  27. for table_cell in table_row.cells:
  28. cell = worksheet.cell(row=row, column=column)
  29. while isinstance(cell, MergedCell):
  30. column += 1
  31. cell = worksheet.cell(row=row, column=column)
  32. colspan = string_to_int(table_cell.element.get("colspan", "1"))
  33. rowspan = string_to_int(table_cell.element.get("rowspan", "1"))
  34. if rowspan > 1 or colspan > 1:
  35. worksheet.merge_cells(
  36. start_row=row,
  37. start_column=column,
  38. end_row=row + rowspan - 1,
  39. end_column=column + colspan - 1,
  40. )
  41. cell.value = table_cell.value
  42. table_cell.format(cell)
  43. min_width = table_cell.get_dimension("min-width")
  44. max_width = table_cell.get_dimension("max-width")
  45. if colspan == 1:
  46. # Initially, when iterating for the first time through the loop, the width of all the cells is None.
  47. # As we start filling in contents, the initial width of the cell (which can be retrieved by:
  48. # worksheet.column_dimensions[get_column_letter(column)].width) is equal to the width of the previous
  49. # cell in the same column (i.e. width of A2 = width of A1)
  50. width = max(
  51. worksheet.column_dimensions[get_column_letter(column)].width or 0,
  52. len(table_cell.value) + 2,
  53. )
  54. if max_width and width > max_width:
  55. width = max_width
  56. elif min_width and width < min_width:
  57. width = min_width
  58. worksheet.column_dimensions[get_column_letter(column)].width = width
  59. column += colspan
  60. row += 1
  61. column = initial_column
  62. return row
  63. def table_to_sheet(table, wb):
  64. """
  65. Takes a table and workbook and writes the table to a new sheet.
  66. The sheet title will be the same as the table attribute name.
  67. """
  68. ws = wb.create_sheet(title=table.element.get("name"))
  69. insert_table(table, ws, 1, 1)
  70. def document_to_workbook(doc, wb=None, base_url=None):
  71. """
  72. Takes a string representation of an html document and writes one sheet for
  73. every table in the document.
  74. The workbook is returned
  75. """
  76. try_import("premailer")
  77. try_import("openpyxl")
  78. from premailer import Premailer
  79. from openpyxl import Workbook
  80. if not wb:
  81. wb = Workbook()
  82. wb.remove(wb.active)
  83. inline_styles_doc = Premailer(
  84. doc, base_url=base_url, remove_classes=False
  85. ).transform()
  86. tables = get_Tables(inline_styles_doc)
  87. for table in tables:
  88. table_to_sheet(table, wb)
  89. return wb
  90. def document_to_xl(doc, filename, base_url=None):
  91. """
  92. Takes a string representation of an html document and writes one sheet for
  93. every table in the document. The workbook is written out to a file called filename
  94. """
  95. wb = document_to_workbook(doc, base_url=base_url)
  96. wb.save(filename)
  97. def insert_table(table, worksheet, column, row):
  98. if table.head:
  99. row = write_rows(worksheet, table.head, row, column)
  100. if table.body:
  101. row = write_rows(worksheet, table.body, row, column)
  102. def insert_table_at_cell(table, cell):
  103. """
  104. Inserts a table at the location of an openpyxl Cell object.
  105. """
  106. ws = cell.parent
  107. column, row = cell.column, cell.row
  108. insert_table(table, ws, column, row)