result.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. import re
  16. from pathlib import Path
  17. from ...common.result import BaseCVResult, LatexMixin, MarkdownMixin, WordMixin
  18. class MarkdownResult(BaseCVResult, MarkdownMixin, WordMixin, LatexMixin):
  19. def __init__(self, data) -> None:
  20. """Initializes a new instance of the class with the specified data."""
  21. super().__init__(data)
  22. MarkdownMixin.__init__(self)
  23. def _get_input_fn(self):
  24. fn = super()._get_input_fn()
  25. if (page_idx := self.get("page_index", None)) is not None:
  26. fp = Path(fn)
  27. stem, suffix = fp.stem, fp.suffix
  28. fn = f"{stem}_{page_idx}{suffix}"
  29. if (language := self.get("language", None)) is not None:
  30. fp = Path(fn)
  31. stem, suffix = fp.stem, fp.suffix
  32. fn = f"{stem}_{language}{suffix}"
  33. return fn
  34. def _to_markdown(self, pretty=True, show_formula_number=False) -> dict:
  35. return self
  36. # in order to make MarkdownResult support save_to_word
  37. def _to_word(self, save_path) -> dict:
  38. from bs4 import BeautifulSoup
  39. md_text = self.get("markdown_texts", "")
  40. def set_paragraph_style(
  41. paragraph, bold=False, align="left", font_size=11, color=None
  42. ):
  43. from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
  44. from docx.oxml.ns import qn
  45. from docx.shared import Pt, RGBColor
  46. # Set paragraph style uniformly
  47. run = paragraph.runs[0] if paragraph.runs else paragraph.add_run()
  48. run.font.name = "Times New Roman"
  49. run._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
  50. run.font.size = Pt(font_size)
  51. run.bold = bold
  52. if color:
  53. run.font.color.rgb = RGBColor(*color)
  54. if align == "center":
  55. paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
  56. elif align == "right":
  57. paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
  58. else:
  59. paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
  60. def add_image(paragraph, src, width_percent):
  61. from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
  62. from docx.shared import Inches
  63. if os.path.exists(src):
  64. try:
  65. width_in_inches = Inches(width_percent / 100 * 6.0)
  66. run = paragraph.add_run()
  67. run.add_picture(src, width=width_in_inches)
  68. paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
  69. except Exception as e:
  70. paragraph.add_run(f"[fail load image: {src}]")
  71. else:
  72. paragraph.add_run(f"[image not exist: {src}]")
  73. def add_table(document, table_html):
  74. """
  75. Parsing HTML table and add to Word
  76. """
  77. soup = BeautifulSoup(table_html, "html.parser")
  78. table_tag = soup.find("table")
  79. if not table_tag:
  80. return
  81. rows = table_tag.find_all("tr")
  82. if not rows:
  83. return
  84. # Calculate the maximum number of columns to avoid out-of-bounds errors
  85. max_cols = max(len(row.find_all(["td", "th"])) for row in rows)
  86. table = document.add_table(rows=len(rows), cols=max_cols)
  87. table.style = "Table Grid"
  88. for i, row in enumerate(rows):
  89. cells = row.find_all(["td", "th"])
  90. for j in range(max_cols):
  91. if j < len(cells):
  92. text = cells[j].get_text(strip=True)
  93. table.cell(i, j).text = text
  94. else:
  95. table.cell(i, j).text = ""
  96. def process_md_page(document, md_text, output_path):
  97. # Process single page conten
  98. lines = md_text.strip().split("\n")
  99. for line in lines:
  100. line = line.strip()
  101. if not line:
  102. continue
  103. title_color = (0, 0, 255)
  104. if line.startswith("##### "):
  105. p = document.add_paragraph(line[6:])
  106. set_paragraph_style(p, bold=True, font_size=10)
  107. elif line.startswith("#### "):
  108. p = document.add_paragraph(line[5:])
  109. set_paragraph_style(p, bold=True, font_size=11)
  110. elif line.startswith("### "):
  111. p = document.add_paragraph(line[4:])
  112. set_paragraph_style(p, bold=True, font_size=12)
  113. elif line.startswith("## "):
  114. p = document.add_paragraph(line[3:])
  115. set_paragraph_style(p, bold=True, font_size=14)
  116. elif line.startswith("# "):
  117. p = document.add_paragraph(line[2:])
  118. set_paragraph_style(p, bold=True, font_size=16)
  119. # Handle centered content
  120. elif line.startswith("<div") and "text-align: center" in line:
  121. soup = BeautifulSoup(line, "html.parser")
  122. div = soup.find("div")
  123. if not div:
  124. continue
  125. if div.img:
  126. img = div.img
  127. src = img.get("src")
  128. width_attr = img.get("width", "100%").replace("%", "")
  129. width_percent = float(width_attr) if width_attr else 100
  130. p = document.add_paragraph()
  131. add_image(p, f"{output_path}/{src}", width_percent)
  132. elif div.table:
  133. add_table(document, str(div))
  134. else:
  135. text = div.get_text(strip=True)
  136. if text:
  137. p = document.add_paragraph(text)
  138. set_paragraph_style(
  139. p, bold=True, align="center", color=title_color
  140. )
  141. # Handle HTML tables
  142. elif "<table" in line:
  143. add_table(document, line)
  144. # Normal paragraph
  145. else:
  146. p = document.add_paragraph(line)
  147. set_paragraph_style(p, font_size=11)
  148. from docx import Document
  149. document = Document()
  150. process_md_page(document, md_text, save_path)
  151. return document
  152. # in order to make MarkdownResult support save_to_latex
  153. def _to_latex(self, save_path) -> str:
  154. from bs4 import BeautifulSoup
  155. def escape_latex_outside_formula(s: str) -> str:
  156. """
  157. Escape LaTeX special characters while preserving formulas.
  158. """
  159. if not s:
  160. return ""
  161. placeholders = []
  162. def repl(m):
  163. placeholders.append(m.group(0))
  164. return f"@@FORMULA{len(placeholders)-1}@@"
  165. # Extract formulas
  166. formula_pat = re.compile(
  167. r"(\$\$.*?\$\$|\$.*?\$|\\\[.*?\\\]|\\\(.*?\\\))", re.DOTALL
  168. )
  169. tmp = formula_pat.sub(repl, s)
  170. tmp = (
  171. tmp.replace("\\", "\\textbackslash{}")
  172. .replace("&", "\\&")
  173. .replace("%", "\\%")
  174. .replace("$", "\\$")
  175. .replace("#", "\\#")
  176. .replace("_", "\\_")
  177. .replace("{", "\\{")
  178. .replace("}", "\\}")
  179. .replace("~", "\\textasciitilde{}")
  180. .replace("^", "\\textasciicircum{}")
  181. )
  182. # Restore formulas
  183. for i, f in enumerate(placeholders):
  184. tmp = tmp.replace(f"@@FORMULA{i}@@", f)
  185. return tmp
  186. #
  187. def get_image_width_from_md_line(line, default_ratio=0.8):
  188. """
  189. Parse the image width attribute.
  190. """
  191. m = re.search(r'width\s*=\s*["\']?(\d+)%?["\']?', line)
  192. if m:
  193. val = int(m.group(1))
  194. return max(0.01, min(val / 100.0, 1.0))
  195. m2 = re.search(r"width\s*:\s*(\d+)%", line)
  196. if m2:
  197. val = int(m2.group(1))
  198. return max(0.01, min(val / 100.0, 1.0))
  199. return default_ratio
  200. def process_table_html(content) -> str:
  201. """
  202. Process table content.
  203. """
  204. if "<table" in content:
  205. soup = BeautifulSoup(content, "html.parser")
  206. rows = []
  207. for tr in soup.find_all("tr"):
  208. row = []
  209. for td in tr.find_all(["td", "th"]):
  210. text = td.get_text(strip=True)
  211. row.append(escape_latex_outside_formula(text))
  212. rows.append(row)
  213. else:
  214. rows = [
  215. [escape_latex_outside_formula(c) for c in row.split("\t")]
  216. for row in content.splitlines()
  217. if row.strip()
  218. ]
  219. if not rows:
  220. return ""
  221. col_count = max(len(r) for r in rows)
  222. norm_rows = [r + [""] * (col_count - len(r)) for r in rows]
  223. col_format = " ".join(
  224. [">{\\raggedright\\arraybackslash}X" for _ in range(col_count)]
  225. )
  226. latex = "\\begin{center}\n\\renewcommand{\\arraystretch}{1.5}\n"
  227. latex += f"\\begin{{tabularx}}{{\\textwidth}}{{{col_format}}}\n\\toprule\n"
  228. for i, row in enumerate(norm_rows):
  229. latex += " & ".join(row) + " \\\\\n"
  230. if i == 0:
  231. latex += "\\midrule\n"
  232. latex += "\\bottomrule\n\\end{tabularx}\n\\end{center}\n\n"
  233. return latex
  234. def process_paragraph(s: str) -> str:
  235. """
  236. Process text paragraphs, preserving formulas.
  237. """
  238. paragraphs = re.split(r"\n\s*\n", s)
  239. processed_paras = []
  240. for p in paragraphs:
  241. p = p.strip()
  242. if not p:
  243. continue
  244. processed_paras.append("\\par " + escape_latex_outside_formula(p))
  245. return "\n\n".join(processed_paras) + "\n\n"
  246. def process_md_line(line: str, save_path) -> str:
  247. """
  248. Process a single line.
  249. """
  250. line = line.strip()
  251. if not line:
  252. return ""
  253. if line.startswith("##### "):
  254. return f"\\paragraph*{{{escape_latex_outside_formula(line[6:].strip())}}}\n\n"
  255. if line.startswith("#### "):
  256. return f"\\subsubsection*{{{escape_latex_outside_formula(line[5:].strip())}}}\n\n"
  257. if line.startswith("### "):
  258. return f"\\subsection*{{{escape_latex_outside_formula(line[4:].strip())}}}\n\n"
  259. if line.startswith("## "):
  260. return f"\\section*{{{escape_latex_outside_formula(line[3:].strip())}}}\n\n"
  261. if line.startswith("# "):
  262. return f"\\section*{{{escape_latex_outside_formula(line[2:].strip())}}}\n\n"
  263. if "<div" in line and "text-align: center" in line:
  264. soup = BeautifulSoup(line, "html.parser")
  265. div = soup.find("div")
  266. if div:
  267. if div.img:
  268. img = div.img
  269. src = img.get("src")
  270. src = f"{save_path}/{src}"
  271. width_ratio = get_image_width_from_md_line(str(img))
  272. return (
  273. f"\\begin{{figure}}[h]\n\\centering\n"
  274. f"\\includegraphics[width={width_ratio:.2f}\\linewidth]{{{src}}}\n"
  275. f"\\end{{figure}}\n\n"
  276. )
  277. if div.table:
  278. return process_table_html(str(div))
  279. text = div.get_text(strip=True)
  280. if text:
  281. return f"\\begin{{center}}{escape_latex_outside_formula(text)}\\end{{center}}\n\n"
  282. if "<table" in line:
  283. return process_table_html(line)
  284. return process_paragraph(line)
  285. latex_lines = [
  286. "\\documentclass[12pt]{article}",
  287. "\\usepackage{xeCJK}",
  288. "\\usepackage{fontspec}",
  289. "\\usepackage{graphicx}",
  290. "\\usepackage{amsmath}",
  291. "\\usepackage{geometry}",
  292. "\\usepackage{fancyhdr}",
  293. "\\usepackage{indentfirst}",
  294. "\\usepackage{caption}",
  295. "\\usepackage{tabularx, booktabs}",
  296. "\\usepackage{amssymb}",
  297. "\\usepackage{amsfonts}",
  298. "\\geometry{a4paper, margin=1in}",
  299. "\\setCJKmainfont{Droid Sans Fallback}",
  300. "\\setmainfont{DejaVu Serif}",
  301. "\\setsansfont{Lato}",
  302. "\\setmonofont{Latin Modern Mono}",
  303. "\\pagestyle{fancy}",
  304. "\\setlength{\\parindent}{2em}",
  305. "\\begin{document}\n",
  306. ]
  307. md_text = self.get("markdown_texts", "")
  308. for line in md_text.splitlines():
  309. latex_lines.append(process_md_line(line, save_path))
  310. latex_lines.append("\\end{document}")
  311. return "\n".join(latex_lines)
  312. class DocumentResult(BaseCVResult, WordMixin):
  313. def __init__(self, data) -> None:
  314. """
  315. Initializes a new instance of the class with the specified data.
  316. """
  317. super().__init__(data)
  318. WordMixin.__init__(self)
  319. def _get_input_fn(self):
  320. fn = super()._get_input_fn()
  321. if (page_idx := self.get("page_index", None)) is not None:
  322. fp = Path(fn)
  323. stem, suffix = fp.stem, fp.suffix
  324. fn = f"{stem}_{page_idx}{suffix}"
  325. if (language := self.get("language", None)) is not None:
  326. fp = Path(fn)
  327. stem, suffix = fp.stem, fp.suffix
  328. fn = f"{stem}_{language}{suffix}"
  329. return fn
  330. def _to_word(self) -> dict:
  331. return self
  332. class LatexResult(BaseCVResult, LatexMixin):
  333. def __init__(self, data) -> None:
  334. super().__init__(data)
  335. LatexMixin.__init__(self)
  336. def _get_input_fn(self):
  337. fn = super()._get_input_fn()
  338. if (page_idx := self.get("page_index", None)) is not None:
  339. fp = Path(fn)
  340. stem, suffix = fp.stem, fp.suffix
  341. fn = f"{stem}_{page_idx}{suffix}"
  342. if (language := self.get("language", None)) is not None:
  343. fp = Path(fn)
  344. stem, suffix = fp.stem, fp.suffix
  345. fn = f"{stem}_{language}{suffix}"
  346. return fn
  347. def _to_latex(self) -> dict:
  348. return self