| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402 |
- # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import os
- import re
- from pathlib import Path
- from ...common.result import BaseCVResult, LatexMixin, MarkdownMixin, WordMixin
class MarkdownResult(BaseCVResult, MarkdownMixin, WordMixin, LatexMixin):
    """Markdown result that can also be rendered to Word and LaTeX.

    The Markdown source is read from the ``"markdown_texts"`` entry of the
    result. ``_to_word`` and ``_to_latex`` both convert that text line by
    line, recognising ATX headings (``#`` to ``#####``), centered ``<div>``
    blocks (image, table or plain text), inline HTML tables and ordinary
    paragraphs.
    """

    def __init__(self, data) -> None:
        """Initializes a new instance of the class with the specified data."""
        super().__init__(data)
        # NOTE(review): only MarkdownMixin is initialised explicitly here;
        # WordMixin/LatexMixin are not. Confirm this is intentional.
        MarkdownMixin.__init__(self)

    def _get_input_fn(self):
        """Return the input file name tagged with page index and language.

        Starting from the parent's file name, ``_<page_index>`` and then
        ``_<language>`` are inserted before the suffix for each of those
        keys that is present (not ``None``). When a tag is applied only the
        stem and suffix of the name are kept.
        """
        fn = super()._get_input_fn()
        if (page_idx := self.get("page_index", None)) is not None:
            fp = Path(fn)
            fn = f"{fp.stem}_{page_idx}{fp.suffix}"
        if (language := self.get("language", None)) is not None:
            fp = Path(fn)
            fn = f"{fp.stem}_{language}{fp.suffix}"
        return fn

    def _to_markdown(self, pretty=True, show_formula_number=False) -> dict:
        """Return the result itself; it already holds the Markdown payload."""
        return self

    # in order to make MarkdownResult support save_to_word
    def _to_word(self, save_path) -> dict:
        """Build a python-docx document from the stored Markdown text.

        Args:
            save_path: Directory prefix that image ``src`` attributes are
                resolved against (joined as ``f"{save_path}/{src}"``).

        Returns:
            The populated ``docx.Document`` object (the ``dict`` annotation
            is kept unchanged for interface compatibility).
        """
        from bs4 import BeautifulSoup
        from docx import Document
        from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
        from docx.oxml.ns import qn
        from docx.shared import Inches, Pt, RGBColor

        # A stored None is treated like a missing/empty text.
        md_text = self.get("markdown_texts", "") or ""

        # Ordered longest prefix first so "## " never shadows "### ".
        heading_sizes = (
            ("##### ", 10),
            ("#### ", 11),
            ("### ", 12),
            ("## ", 14),
            ("# ", 16),
        )
        title_color = (0, 0, 255)

        def set_paragraph_style(
            paragraph, bold=False, align="left", font_size=11, color=None
        ):
            """Apply font, size, weight, colour and alignment to a paragraph.

            Only the first run is styled; one is created if none exists.
            """
            run = paragraph.runs[0] if paragraph.runs else paragraph.add_run()
            run.font.name = "Times New Roman"
            run._element.rPr.rFonts.set(qn("w:eastAsia"), "宋体")
            run.font.size = Pt(font_size)
            run.bold = bold
            if color:
                run.font.color.rgb = RGBColor(*color)
            if align == "center":
                paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
            elif align == "right":
                paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
            else:
                paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT

        def add_image(paragraph, src, width_percent):
            """Insert a centered picture, or a textual marker on failure.

            The width is ``width_percent`` percent of a 6.0 inch text width.
            """
            if not os.path.exists(src):
                paragraph.add_run(f"[image not exist: {src}]")
                return
            try:
                run = paragraph.add_run()
                run.add_picture(src, width=Inches(width_percent / 100 * 6.0))
                paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
            except Exception:
                # Deliberate best effort: an unreadable image must not abort
                # the export, so leave a visible marker in the document.
                paragraph.add_run(f"[fail load image: {src}]")

        def add_table(document, table_html):
            """Parse an HTML table and append it to the Word document."""
            table_tag = BeautifulSoup(table_html, "html.parser").find("table")
            if table_tag is None:
                return
            row_cells = [
                tr.find_all(["td", "th"]) for tr in table_tag.find_all("tr")
            ]
            if not row_cells:
                return
            # Size the grid by the widest row to avoid out-of-bounds access.
            max_cols = max(len(cells) for cells in row_cells)
            if max_cols == 0:
                # Rows without any cell: there is nothing to render.
                return
            table = document.add_table(rows=len(row_cells), cols=max_cols)
            table.style = "Table Grid"
            for i, cells in enumerate(row_cells):
                for j in range(max_cols):
                    # Short rows are padded with empty cells.
                    table.cell(i, j).text = (
                        cells[j].get_text(strip=True) if j < len(cells) else ""
                    )

        def parse_width_percent(img):
            """Read an ``<img>`` width as a percentage, defaulting to 100."""
            width_attr = (img.get("width") or "100%").replace("%", "").strip()
            if not width_attr:
                return 100
            try:
                return float(width_attr)
            except ValueError:
                # e.g. "200px" or "auto": fall back to full width.
                return 100

        def process_md_page(document, md_text, output_path):
            """Convert the Markdown of a single page, one line at a time."""
            for raw_line in md_text.strip().split("\n"):
                line = raw_line.strip()
                if not line:
                    continue

                heading = next(
                    (item for item in heading_sizes if line.startswith(item[0])),
                    None,
                )
                if heading is not None:
                    prefix, size = heading
                    p = document.add_paragraph(line[len(prefix):])
                    set_paragraph_style(p, bold=True, font_size=size)
                # Handle centered content
                elif line.startswith("<div") and "text-align: center" in line:
                    div = BeautifulSoup(line, "html.parser").find("div")
                    if div is None:
                        continue
                    if div.img:
                        img = div.img
                        src = img.get("src")
                        p = document.add_paragraph()
                        add_image(
                            p, f"{output_path}/{src}", parse_width_percent(img)
                        )
                    elif div.table:
                        add_table(document, str(div))
                    else:
                        text = div.get_text(strip=True)
                        if text:
                            p = document.add_paragraph(text)
                            set_paragraph_style(
                                p, bold=True, align="center", color=title_color
                            )
                # Handle HTML tables
                elif "<table" in line:
                    add_table(document, line)
                # Normal paragraph
                else:
                    p = document.add_paragraph(line)
                    set_paragraph_style(p, font_size=11)

        document = Document()
        process_md_page(document, md_text, save_path)
        return document

    # in order to make MarkdownResult support save_to_latex
    def _to_latex(self, save_path) -> str:
        """Render the stored Markdown text as a complete LaTeX document.

        Args:
            save_path: Directory prefix that image ``src`` attributes are
                resolved against (joined as ``f"{save_path}/{src}"``).

        Returns:
            The LaTeX source, from ``\\documentclass`` to ``\\end{document}``.
        """
        from bs4 import BeautifulSoup

        # Math spans that must pass through unescaped: $$..$$, $..$,
        # \[..\] and \(..\).
        formula_pat = re.compile(
            r"(\$\$.*?\$\$|\$.*?\$|\\\[.*?\\\]|\\\(.*?\\\))", re.DOTALL
        )
        # Applied in ONE pass via str.translate. Chained str.replace calls
        # would re-escape the braces of an already inserted
        # "\textbackslash{}", yielding "\textbackslash\{\}".
        escape_table = str.maketrans(
            {
                "\\": "\\textbackslash{}",
                "&": "\\&",
                "%": "\\%",
                "$": "\\$",
                "#": "\\#",
                "_": "\\_",
                "{": "\\{",
                "}": "\\}",
                "~": "\\textasciitilde{}",
                "^": "\\textasciicircum{}",
            }
        )
        # Ordered longest prefix first; "#" and "##" both map to \section*.
        heading_commands = (
            ("##### ", "paragraph*"),
            ("#### ", "subsubsection*"),
            ("### ", "subsection*"),
            ("## ", "section*"),
            ("# ", "section*"),
        )

        def escape_latex_outside_formula(s: str) -> str:
            """Escape LaTeX special characters while preserving formulas."""
            if not s:
                return ""
            placeholders = []

            def repl(m):
                placeholders.append(m.group(0))
                return f"@@FORMULA{len(placeholders)-1}@@"

            # Park formulas behind placeholders (which contain no special
            # characters), escape everything else, then restore them.
            tmp = formula_pat.sub(repl, s).translate(escape_table)
            for i, f in enumerate(placeholders):
                tmp = tmp.replace(f"@@FORMULA{i}@@", f)
            return tmp

        def get_image_width_from_md_line(line, default_ratio=0.8):
            """Parse the image width as a ratio of the line width.

            Looks for a ``width=`` attribute first, then a ``width: N%``
            style; the result is clamped to ``[0.01, 1.0]``.
            """
            for pattern in (
                r'width\s*=\s*["\']?(\d+)%?["\']?',
                r"width\s*:\s*(\d+)%",
            ):
                m = re.search(pattern, line)
                if m:
                    return max(0.01, min(int(m.group(1)) / 100.0, 1.0))
            return default_ratio

        def process_table_html(content) -> str:
            """Convert an HTML (or tab-separated) table to a tabularx block."""
            if "<table" in content:
                soup = BeautifulSoup(content, "html.parser")
                rows = [
                    [
                        escape_latex_outside_formula(td.get_text(strip=True))
                        for td in tr.find_all(["td", "th"])
                    ]
                    for tr in soup.find_all("tr")
                ]
            else:
                rows = [
                    [escape_latex_outside_formula(c) for c in row.split("\t")]
                    for row in content.splitlines()
                    if row.strip()
                ]
            if not rows:
                return ""
            col_count = max(len(r) for r in rows)
            if col_count == 0:
                # Rows without cells would give an empty, invalid column spec.
                return ""
            col_format = " ".join(
                [">{\\raggedright\\arraybackslash}X" for _ in range(col_count)]
            )
            parts = [
                "\\begin{center}\n\\renewcommand{\\arraystretch}{1.5}\n",
                f"\\begin{{tabularx}}{{\\textwidth}}{{{col_format}}}\n\\toprule\n",
            ]
            for i, row in enumerate(rows):
                # Pad short rows so every line has the same number of cells.
                padded = row + [""] * (col_count - len(row))
                parts.append(" & ".join(padded) + " \\\\\n")
                if i == 0:
                    # The first row is treated as the header.
                    parts.append("\\midrule\n")
            parts.append("\\bottomrule\n\\end{tabularx}\n\\end{center}\n\n")
            return "".join(parts)

        def process_paragraph(s: str) -> str:
            """Process text paragraphs, preserving formulas."""
            processed_paras = [
                "\\par " + escape_latex_outside_formula(p.strip())
                for p in re.split(r"\n\s*\n", s)
                if p.strip()
            ]
            return "\n\n".join(processed_paras) + "\n\n"

        def process_md_line(line: str, save_path) -> str:
            """Process a single Markdown line into LaTeX."""
            line = line.strip()
            if not line:
                return ""
            for prefix, command in heading_commands:
                if line.startswith(prefix):
                    title = escape_latex_outside_formula(
                        line[len(prefix):].strip()
                    )
                    return f"\\{command}{{{title}}}\n\n"
            if "<div" in line and "text-align: center" in line:
                div = BeautifulSoup(line, "html.parser").find("div")
                if div:
                    if div.img:
                        img = div.img
                        src = img.get("src")
                        src = f"{save_path}/{src}"
                        width_ratio = get_image_width_from_md_line(str(img))
                        return (
                            f"\\begin{{figure}}[h]\n\\centering\n"
                            f"\\includegraphics[width={width_ratio:.2f}\\linewidth]{{{src}}}\n"
                            f"\\end{{figure}}\n\n"
                        )
                    if div.table:
                        return process_table_html(str(div))
                    text = div.get_text(strip=True)
                    if text:
                        return f"\\begin{{center}}{escape_latex_outside_formula(text)}\\end{{center}}\n\n"
            if "<table" in line:
                return process_table_html(line)
            return process_paragraph(line)

        latex_lines = [
            "\\documentclass[12pt]{article}",
            "\\usepackage{xeCJK}",
            "\\usepackage{fontspec}",
            "\\usepackage{graphicx}",
            "\\usepackage{amsmath}",
            "\\usepackage{geometry}",
            "\\usepackage{fancyhdr}",
            "\\usepackage{indentfirst}",
            "\\usepackage{caption}",
            "\\usepackage{tabularx, booktabs}",
            "\\usepackage{amssymb}",
            "\\usepackage{amsfonts}",
            "\\geometry{a4paper, margin=1in}",
            "\\setCJKmainfont{Droid Sans Fallback}",
            "\\setmainfont{DejaVu Serif}",
            "\\setsansfont{Lato}",
            "\\setmonofont{Latin Modern Mono}",
            "\\pagestyle{fancy}",
            "\\setlength{\\parindent}{2em}",
            "\\begin{document}\n",
        ]
        # A stored None is treated like a missing/empty text.
        md_text = self.get("markdown_texts", "") or ""
        latex_lines.extend(
            process_md_line(line, save_path) for line in md_text.splitlines()
        )
        latex_lines.append("\\end{document}")
        return "\n".join(latex_lines)
class DocumentResult(BaseCVResult, WordMixin):
    """Result wrapper exposing its data through the Word mixin."""

    def __init__(self, data) -> None:
        """
        Initializes a new instance of the class with the specified data.
        """
        super().__init__(data)
        WordMixin.__init__(self)

    def _get_input_fn(self):
        """Return the input file name tagged with page index and language.

        For each of ``"page_index"`` and ``"language"`` (in that order) whose
        value is not ``None``, ``_<value>`` is inserted before the suffix.
        When a tag is applied only the stem and suffix of the name are kept.
        """
        name = super()._get_input_fn()
        for key in ("page_index", "language"):
            tag = self.get(key, None)
            if tag is None:
                continue
            path = Path(name)
            name = f"{path.stem}_{tag}{path.suffix}"
        return name

    def _to_word(self) -> dict:
        """Return the result itself as the Word payload.

        NOTE(review): unlike ``MarkdownResult._to_word`` this takes no
        ``save_path`` argument - confirm against how ``WordMixin`` calls it.
        """
        return self
class LatexResult(BaseCVResult, LatexMixin):
    """Result wrapper exposing its data through the LaTeX mixin."""

    def __init__(self, data) -> None:
        """Initialize the base result with ``data``, then the LaTeX mixin."""
        super().__init__(data)
        LatexMixin.__init__(self)

    def _get_input_fn(self):
        """Return the input file name tagged with page index and language.

        ``_<page_index>`` and then ``_<language>`` are inserted before the
        suffix for each of those keys whose value is not ``None``. When a
        tag is applied only the stem and suffix of the name are kept.
        """
        name = super()._get_input_fn()
        tags = (self.get("page_index", None), self.get("language", None))
        for tag in tags:
            if tag is not None:
                stem, suffix = Path(name).stem, Path(name).suffix
                name = f"{stem}_{tag}{suffix}"
        return name

    def _to_latex(self) -> dict:
        """Return the result itself as the LaTeX payload.

        NOTE(review): unlike ``MarkdownResult._to_latex`` this takes no
        ``save_path`` argument - confirm against how ``LatexMixin`` calls it.
        """
        return self
|