pp_doctranslation.py 40 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945
  1. # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from .._utils.cli import (
  15. get_subcommand_args,
  16. str2bool,
  17. )
  18. from .._utils.logging import logger
  19. from .base import PaddleXPipelineWrapper, PipelineCLISubcommandExecutor
  20. from .utils import create_config_from_structure
  21. class PPDocTranslation(PaddleXPipelineWrapper):
  22. def __init__(
  23. self,
  24. layout_detection_model_name=None,
  25. layout_detection_model_dir=None,
  26. layout_threshold=None,
  27. layout_nms=None,
  28. layout_unclip_ratio=None,
  29. layout_merge_bboxes_mode=None,
  30. chart_recognition_model_name=None,
  31. chart_recognition_model_dir=None,
  32. chart_recognition_batch_size=None,
  33. region_detection_model_name=None,
  34. region_detection_model_dir=None,
  35. doc_orientation_classify_model_name=None,
  36. doc_orientation_classify_model_dir=None,
  37. doc_unwarping_model_name=None,
  38. doc_unwarping_model_dir=None,
  39. text_detection_model_name=None,
  40. text_detection_model_dir=None,
  41. text_det_limit_side_len=None,
  42. text_det_limit_type=None,
  43. text_det_thresh=None,
  44. text_det_box_thresh=None,
  45. text_det_unclip_ratio=None,
  46. textline_orientation_model_name=None,
  47. textline_orientation_model_dir=None,
  48. textline_orientation_batch_size=None,
  49. text_recognition_model_name=None,
  50. text_recognition_model_dir=None,
  51. text_recognition_batch_size=None,
  52. text_rec_score_thresh=None,
  53. table_classification_model_name=None,
  54. table_classification_model_dir=None,
  55. wired_table_structure_recognition_model_name=None,
  56. wired_table_structure_recognition_model_dir=None,
  57. wireless_table_structure_recognition_model_name=None,
  58. wireless_table_structure_recognition_model_dir=None,
  59. wired_table_cells_detection_model_name=None,
  60. wired_table_cells_detection_model_dir=None,
  61. wireless_table_cells_detection_model_name=None,
  62. wireless_table_cells_detection_model_dir=None,
  63. table_orientation_classify_model_name=None,
  64. table_orientation_classify_model_dir=None,
  65. seal_text_detection_model_name=None,
  66. seal_text_detection_model_dir=None,
  67. seal_det_limit_side_len=None,
  68. seal_det_limit_type=None,
  69. seal_det_thresh=None,
  70. seal_det_box_thresh=None,
  71. seal_det_unclip_ratio=None,
  72. seal_text_recognition_model_name=None,
  73. seal_text_recognition_model_dir=None,
  74. seal_text_recognition_batch_size=None,
  75. seal_rec_score_thresh=None,
  76. formula_recognition_model_name=None,
  77. formula_recognition_model_dir=None,
  78. formula_recognition_batch_size=None,
  79. use_doc_orientation_classify=None,
  80. use_doc_unwarping=None,
  81. use_textline_orientation=None,
  82. use_seal_recognition=None,
  83. use_table_recognition=None,
  84. use_formula_recognition=None,
  85. use_chart_recognition=None,
  86. use_region_detection=None,
  87. chat_bot_config=None,
  88. **kwargs,
  89. ):
  90. params = locals().copy()
  91. params.pop("self")
  92. params.pop("kwargs")
  93. self._params = params
  94. super().__init__(**kwargs)
  95. @property
  96. def _paddlex_pipeline_name(self):
  97. return "PP-DocTranslation"
  98. def visual_predict_iter(
  99. self,
  100. input,
  101. *,
  102. use_doc_orientation_classify=None,
  103. use_doc_unwarping=None,
  104. use_textline_orientation=None,
  105. use_seal_recognition=None,
  106. use_table_recognition=None,
  107. use_formula_recognition=None,
  108. use_chart_recognition=None,
  109. use_region_detection=None,
  110. layout_threshold=None,
  111. layout_nms=None,
  112. layout_unclip_ratio=None,
  113. layout_merge_bboxes_mode=None,
  114. text_det_limit_side_len=None,
  115. text_det_limit_type=None,
  116. text_det_thresh=None,
  117. text_det_box_thresh=None,
  118. text_det_unclip_ratio=None,
  119. text_rec_score_thresh=None,
  120. seal_det_limit_side_len=None,
  121. seal_det_limit_type=None,
  122. seal_det_thresh=None,
  123. seal_det_box_thresh=None,
  124. seal_det_unclip_ratio=None,
  125. seal_rec_score_thresh=None,
  126. use_wired_table_cells_trans_to_html=False,
  127. use_wireless_table_cells_trans_to_html=False,
  128. use_table_orientation_classify=True,
  129. use_ocr_results_with_table_cells=True,
  130. use_e2e_wired_table_rec_model=False,
  131. use_e2e_wireless_table_rec_model=True,
  132. **kwargs,
  133. ):
  134. return self.paddlex_pipeline.visual_predict(
  135. input,
  136. use_doc_orientation_classify=use_doc_orientation_classify,
  137. use_doc_unwarping=use_doc_unwarping,
  138. use_textline_orientation=use_textline_orientation,
  139. use_seal_recognition=use_seal_recognition,
  140. use_table_recognition=use_table_recognition,
  141. use_formula_recognition=use_formula_recognition,
  142. use_chart_recognition=use_chart_recognition,
  143. use_region_detection=use_region_detection,
  144. layout_threshold=layout_threshold,
  145. layout_nms=layout_nms,
  146. layout_unclip_ratio=layout_unclip_ratio,
  147. layout_merge_bboxes_mode=layout_merge_bboxes_mode,
  148. text_det_limit_side_len=text_det_limit_side_len,
  149. text_det_limit_type=text_det_limit_type,
  150. text_det_thresh=text_det_thresh,
  151. text_det_box_thresh=text_det_box_thresh,
  152. text_det_unclip_ratio=text_det_unclip_ratio,
  153. text_rec_score_thresh=text_rec_score_thresh,
  154. seal_det_limit_side_len=seal_det_limit_side_len,
  155. seal_det_limit_type=seal_det_limit_type,
  156. seal_det_thresh=seal_det_thresh,
  157. seal_det_box_thresh=seal_det_box_thresh,
  158. seal_det_unclip_ratio=seal_det_unclip_ratio,
  159. seal_rec_score_thresh=seal_rec_score_thresh,
  160. use_wired_table_cells_trans_to_html=use_wired_table_cells_trans_to_html,
  161. use_wireless_table_cells_trans_to_html=use_wireless_table_cells_trans_to_html,
  162. use_table_orientation_classify=use_table_orientation_classify,
  163. use_ocr_results_with_table_cells=use_ocr_results_with_table_cells,
  164. use_e2e_wired_table_rec_model=use_e2e_wired_table_rec_model,
  165. use_e2e_wireless_table_rec_model=use_e2e_wireless_table_rec_model,
  166. **kwargs,
  167. )
  168. def visual_predict(
  169. self,
  170. input,
  171. *,
  172. use_doc_orientation_classify=None,
  173. use_doc_unwarping=None,
  174. use_textline_orientation=None,
  175. use_seal_recognition=None,
  176. use_table_recognition=None,
  177. use_formula_recognition=None,
  178. use_chart_recognition=None,
  179. use_region_detection=None,
  180. layout_threshold=None,
  181. layout_nms=None,
  182. layout_unclip_ratio=None,
  183. layout_merge_bboxes_mode=None,
  184. text_det_limit_side_len=None,
  185. text_det_limit_type=None,
  186. text_det_thresh=None,
  187. text_det_box_thresh=None,
  188. text_det_unclip_ratio=None,
  189. text_rec_score_thresh=None,
  190. seal_det_limit_side_len=None,
  191. seal_det_limit_type=None,
  192. seal_det_thresh=None,
  193. seal_det_box_thresh=None,
  194. seal_det_unclip_ratio=None,
  195. seal_rec_score_thresh=None,
  196. use_wired_table_cells_trans_to_html=False,
  197. use_wireless_table_cells_trans_to_html=False,
  198. use_table_orientation_classify=True,
  199. use_ocr_results_with_table_cells=True,
  200. use_e2e_wired_table_rec_model=False,
  201. use_e2e_wireless_table_rec_model=True,
  202. **kwargs,
  203. ):
  204. return list(
  205. self.visual_predict_iter(
  206. input,
  207. use_doc_orientation_classify=use_doc_orientation_classify,
  208. use_doc_unwarping=use_doc_unwarping,
  209. use_textline_orientation=use_textline_orientation,
  210. use_seal_recognition=use_seal_recognition,
  211. use_table_recognition=use_table_recognition,
  212. use_formula_recognition=use_formula_recognition,
  213. use_chart_recognition=use_chart_recognition,
  214. use_region_detection=use_region_detection,
  215. layout_threshold=layout_threshold,
  216. layout_nms=layout_nms,
  217. layout_unclip_ratio=layout_unclip_ratio,
  218. layout_merge_bboxes_mode=layout_merge_bboxes_mode,
  219. text_det_limit_side_len=text_det_limit_side_len,
  220. text_det_limit_type=text_det_limit_type,
  221. text_det_thresh=text_det_thresh,
  222. text_det_box_thresh=text_det_box_thresh,
  223. text_det_unclip_ratio=text_det_unclip_ratio,
  224. text_rec_score_thresh=text_rec_score_thresh,
  225. seal_det_limit_side_len=seal_det_limit_side_len,
  226. seal_det_limit_type=seal_det_limit_type,
  227. seal_det_thresh=seal_det_thresh,
  228. seal_det_box_thresh=seal_det_box_thresh,
  229. seal_det_unclip_ratio=seal_det_unclip_ratio,
  230. seal_rec_score_thresh=seal_rec_score_thresh,
  231. use_wired_table_cells_trans_to_html=use_wired_table_cells_trans_to_html,
  232. use_wireless_table_cells_trans_to_html=use_wireless_table_cells_trans_to_html,
  233. use_table_orientation_classify=use_table_orientation_classify,
  234. use_ocr_results_with_table_cells=use_ocr_results_with_table_cells,
  235. use_e2e_wired_table_rec_model=use_e2e_wired_table_rec_model,
  236. use_e2e_wireless_table_rec_model=use_e2e_wireless_table_rec_model,
  237. **kwargs,
  238. )
  239. )
  240. def translate_iter(
  241. self,
  242. ori_md_info_list,
  243. *,
  244. target_language="zh",
  245. chunk_size=5000,
  246. task_description=None,
  247. output_format=None,
  248. rules_str=None,
  249. few_shot_demo_text_content=None,
  250. few_shot_demo_key_value_list=None,
  251. glossary=None,
  252. llm_request_interval=0.0,
  253. chat_bot_config=None,
  254. **kwargs,
  255. ):
  256. return self.paddlex_pipeline.translate(
  257. ori_md_info_list,
  258. target_language=target_language,
  259. chunk_size=chunk_size,
  260. task_description=task_description,
  261. output_format=output_format,
  262. rules_str=rules_str,
  263. few_shot_demo_text_content=few_shot_demo_text_content,
  264. few_shot_demo_key_value_list=few_shot_demo_key_value_list,
  265. glossary=glossary,
  266. llm_request_interval=llm_request_interval,
  267. chat_bot_config=chat_bot_config,
  268. **kwargs,
  269. )
  270. def translate(
  271. self,
  272. ori_md_info_list,
  273. *,
  274. target_language="zh",
  275. chunk_size=5000,
  276. task_description=None,
  277. output_format=None,
  278. rules_str=None,
  279. few_shot_demo_text_content=None,
  280. few_shot_demo_key_value_list=None,
  281. glossary=None,
  282. llm_request_interval=0.0,
  283. chat_bot_config=None,
  284. **kwargs,
  285. ):
  286. return list(
  287. self.translate_iter(
  288. ori_md_info_list,
  289. target_language=target_language,
  290. chunk_size=chunk_size,
  291. task_description=task_description,
  292. output_format=output_format,
  293. rules_str=rules_str,
  294. few_shot_demo_text_content=few_shot_demo_text_content,
  295. few_shot_demo_key_value_list=few_shot_demo_key_value_list,
  296. glossary=glossary,
  297. llm_request_interval=llm_request_interval,
  298. chat_bot_config=chat_bot_config,
  299. **kwargs,
  300. )
  301. )
  302. def load_from_markdown(self, input):
  303. return self.paddlex_pipeline.load_from_markdown(input)
  304. def concatenate_markdown_pages(self, markdown_list):
  305. return self.paddlex_pipeline.concatenate_markdown_pages(markdown_list)
  306. @classmethod
  307. def get_cli_subcommand_executor(cls):
  308. return PPDocTranslationCLISubcommandExecutor()
  309. def _get_paddlex_config_overrides(self):
  310. # HACK: We should consider reducing duplication.
  311. STRUCTURE = {
  312. "SubPipelines.LayoutParser.SubPipelines.DocPreprocessor.use_doc_orientation_classify": self._params[
  313. "use_doc_orientation_classify"
  314. ],
  315. "SubPipelines.LayoutParser.SubPipelines.DocPreprocessor.use_doc_unwarping": self._params[
  316. "use_doc_unwarping"
  317. ],
  318. "SubPipelines.LayoutParser.use_doc_preprocessor": self._params[
  319. "use_doc_orientation_classify"
  320. ]
  321. or self._params["use_doc_unwarping"],
  322. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.use_textline_orientation": self._params[
  323. "use_textline_orientation"
  324. ],
  325. "SubPipelines.LayoutParser.use_seal_recognition": self._params[
  326. "use_seal_recognition"
  327. ],
  328. "SubPipelines.LayoutParser.use_table_recognition": self._params[
  329. "use_table_recognition"
  330. ],
  331. "SubPipelines.LayoutParser.use_formula_recognition": self._params[
  332. "use_formula_recognition"
  333. ],
  334. "SubPipelines.LayoutParser.use_chart_recognition": self._params[
  335. "use_chart_recognition"
  336. ],
  337. "SubPipelines.LayoutParser.use_region_detection": self._params[
  338. "use_region_detection"
  339. ],
  340. "SubPipelines.LayoutParser.SubModules.LayoutDetection.model_name": self._params[
  341. "layout_detection_model_name"
  342. ],
  343. "SubPipelines.LayoutParser.SubModules.LayoutDetection.model_dir": self._params[
  344. "layout_detection_model_dir"
  345. ],
  346. "SubPipelines.LayoutParser.SubModules.LayoutDetection.threshold": self._params[
  347. "layout_threshold"
  348. ],
  349. "SubPipelines.LayoutParser.SubModules.LayoutDetection.layout_nms": self._params[
  350. "layout_nms"
  351. ],
  352. "SubPipelines.LayoutParser.SubModules.LayoutDetection.layout_unclip_ratio": self._params[
  353. "layout_unclip_ratio"
  354. ],
  355. "SubPipelines.LayoutParser.SubModules.LayoutDetection.layout_merge_bboxes_mode": self._params[
  356. "layout_merge_bboxes_mode"
  357. ],
  358. "SubPipelines.LayoutParser.SubModules.ChartRecognition.model_name": self._params[
  359. "chart_recognition_model_name"
  360. ],
  361. "SubPipelines.LayoutParser.SubModules.ChartRecognition.model_dir": self._params[
  362. "chart_recognition_model_dir"
  363. ],
  364. "SubPipelines.LayoutParser.SubModules.ChartRecognition.batch_size": self._params[
  365. "chart_recognition_batch_size"
  366. ],
  367. "SubPipelines.LayoutParser.SubModules.RegionDetection.model_name": self._params[
  368. "region_detection_model_name"
  369. ],
  370. "SubPipelines.LayoutParser.SubModules.RegionDetection.model_dir": self._params[
  371. "region_detection_model_dir"
  372. ],
  373. "SubPipelines.LayoutParser.SubPipelines.DocPreprocessor.SubModules.DocOrientationClassify.model_name": self._params[
  374. "doc_orientation_classify_model_name"
  375. ],
  376. "SubPipelines.LayoutParser.SubPipelines.DocPreprocessor.SubModules.DocOrientationClassify.model_dir": self._params[
  377. "doc_orientation_classify_model_dir"
  378. ],
  379. "SubPipelines.LayoutParser.SubPipelines.DocPreprocessor.SubModules.DocUnwarping.model_name": self._params[
  380. "doc_unwarping_model_name"
  381. ],
  382. "SubPipelines.LayoutParser.SubPipelines.DocPreprocessor.SubModules.DocUnwarping.model_dir": self._params[
  383. "doc_unwarping_model_dir"
  384. ],
  385. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.model_name": self._params[
  386. "text_detection_model_name"
  387. ],
  388. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.model_dir": self._params[
  389. "text_detection_model_dir"
  390. ],
  391. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.limit_side_len": self._params[
  392. "text_det_limit_side_len"
  393. ],
  394. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.limit_type": self._params[
  395. "text_det_limit_type"
  396. ],
  397. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.thresh": self._params[
  398. "text_det_thresh"
  399. ],
  400. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.box_thresh": self._params[
  401. "text_det_box_thresh"
  402. ],
  403. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.unclip_ratio": self._params[
  404. "text_det_unclip_ratio"
  405. ],
  406. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.model_name": self._params[
  407. "textline_orientation_model_name"
  408. ],
  409. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.model_dir": self._params[
  410. "textline_orientation_model_dir"
  411. ],
  412. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.batch_size": self._params[
  413. "textline_orientation_batch_size"
  414. ],
  415. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextRecognition.model_name": self._params[
  416. "text_recognition_model_name"
  417. ],
  418. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextRecognition.model_dir": self._params[
  419. "text_recognition_model_dir"
  420. ],
  421. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextRecognition.batch_size": self._params[
  422. "text_recognition_batch_size"
  423. ],
  424. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextRecognition.score_thresh": self._params[
  425. "text_rec_score_thresh"
  426. ],
  427. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubModules.TableClassification.model_name": self._params[
  428. "table_classification_model_name"
  429. ],
  430. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubModules.TableClassification.model_dir": self._params[
  431. "table_classification_model_dir"
  432. ],
  433. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubModules.WiredTableStructureRecognition.model_name": self._params[
  434. "wired_table_structure_recognition_model_name"
  435. ],
  436. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubModules.WiredTableStructureRecognition.model_dir": self._params[
  437. "wired_table_structure_recognition_model_dir"
  438. ],
  439. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubModules.WirelessTableStructureRecognition.model_name": self._params[
  440. "wireless_table_structure_recognition_model_name"
  441. ],
  442. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubModules.WirelessTableStructureRecognition.model_dir": self._params[
  443. "wireless_table_structure_recognition_model_dir"
  444. ],
  445. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubModules.WiredTableCellsDetection.model_name": self._params[
  446. "wired_table_cells_detection_model_name"
  447. ],
  448. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubModules.WiredTableCellsDetection.model_dir": self._params[
  449. "wired_table_cells_detection_model_dir"
  450. ],
  451. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubModules.WirelessTableCellsDetection.model_name": self._params[
  452. "wireless_table_cells_detection_model_name"
  453. ],
  454. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubModules.WirelessTableCellsDetection.model_dir": self._params[
  455. "wireless_table_cells_detection_model_dir"
  456. ],
  457. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubModules.TableOrientationClassify.model_name": self._params[
  458. "table_orientation_classify_model_name"
  459. ],
  460. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubModules.TableOrientationClassify.model_dir": self._params[
  461. "table_orientation_classify_model_dir"
  462. ],
  463. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextDetection.model_name": self._params[
  464. "text_detection_model_name"
  465. ],
  466. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextDetection.model_dir": self._params[
  467. "text_detection_model_dir"
  468. ],
  469. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextDetection.limit_side_len": self._params[
  470. "text_det_limit_side_len"
  471. ],
  472. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextDetection.limit_type": self._params[
  473. "text_det_limit_type"
  474. ],
  475. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextDetection.thresh": self._params[
  476. "text_det_thresh"
  477. ],
  478. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextDetection.box_thresh": self._params[
  479. "text_det_box_thresh"
  480. ],
  481. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextDetection.unclip_ratio": self._params[
  482. "text_det_unclip_ratio"
  483. ],
  484. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.model_name": self._params[
  485. "textline_orientation_model_name"
  486. ],
  487. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.model_dir": self._params[
  488. "textline_orientation_model_dir"
  489. ],
  490. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.batch_size": self._params[
  491. "textline_orientation_batch_size"
  492. ],
  493. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextRecognition.model_name": self._params[
  494. "text_recognition_model_name"
  495. ],
  496. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextRecognition.model_dir": self._params[
  497. "text_recognition_model_dir"
  498. ],
  499. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextRecognition.batch_size": self._params[
  500. "text_recognition_batch_size"
  501. ],
  502. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextRecognition.score_thresh": self._params[
  503. "text_rec_score_thresh"
  504. ],
  505. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.model_name": self._params[
  506. "seal_text_detection_model_name"
  507. ],
  508. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.model_dir": self._params[
  509. "seal_text_detection_model_dir"
  510. ],
  511. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.limit_side_len": self._params[
  512. "text_det_limit_side_len"
  513. ],
  514. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.limit_type": self._params[
  515. "seal_det_limit_type"
  516. ],
  517. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.thresh": self._params[
  518. "seal_det_thresh"
  519. ],
  520. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.box_thresh": self._params[
  521. "seal_det_box_thresh"
  522. ],
  523. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.unclip_ratio": self._params[
  524. "seal_det_unclip_ratio"
  525. ],
  526. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextRecognition.model_name": self._params[
  527. "seal_text_recognition_model_name"
  528. ],
  529. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextRecognition.model_dir": self._params[
  530. "seal_text_recognition_model_dir"
  531. ],
  532. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextRecognition.batch_size": self._params[
  533. "seal_text_recognition_batch_size"
  534. ],
  535. "SubPipelines.LayoutParser.SubPipelines.FormulaRecognition.SubModules.FormulaRecognition.model_name": self._params[
  536. "formula_recognition_model_name"
  537. ],
  538. "SubPipelines.LayoutParser.SubPipelines.FormulaRecognition.SubModules.FormulaRecognition.model_dir": self._params[
  539. "formula_recognition_model_dir"
  540. ],
  541. "SubPipelines.LayoutParser.SubPipelines.FormulaRecognition.SubModules.FormulaRecognition.batch_size": self._params[
  542. "formula_recognition_batch_size"
  543. ],
  544. "SubModules.LLM_Chat": self._params["chat_bot_config"],
  545. }
  546. return create_config_from_structure(STRUCTURE)
  547. class PPDocTranslationCLISubcommandExecutor(PipelineCLISubcommandExecutor):
  548. @property
  549. def subparser_name(self):
  550. return "pp_doctranslation"
  551. def _update_subparser(self, subparser):
  552. subparser.add_argument(
  553. "-i",
  554. "--input",
  555. type=str,
  556. required=True,
  557. help="Input path or URL.",
  558. )
  559. subparser.add_argument(
  560. "--save_path",
  561. type=str,
  562. help="Path to the output directory.",
  563. )
  564. subparser.add_argument(
  565. "--target_language",
  566. type=str,
  567. default="zh",
  568. help="Target language.",
  569. )
  570. subparser.add_argument(
  571. "--layout_detection_model_name",
  572. type=str,
  573. help="Name of the layout detection model.",
  574. )
  575. subparser.add_argument(
  576. "--layout_detection_model_dir",
  577. type=str,
  578. help="Path to the layout detection model directory.",
  579. )
  580. subparser.add_argument(
  581. "--layout_threshold",
  582. type=float,
  583. help="Score threshold for the layout detection model.",
  584. )
  585. subparser.add_argument(
  586. "--layout_nms",
  587. type=str2bool,
  588. help="Whether to use NMS in layout detection.",
  589. )
  590. subparser.add_argument(
  591. "--layout_unclip_ratio",
  592. type=float,
  593. help="Expansion coefficient for layout detection.",
  594. )
  595. subparser.add_argument(
  596. "--layout_merge_bboxes_mode",
  597. type=str,
  598. help="Overlapping box filtering method.",
  599. )
  600. subparser.add_argument(
  601. "--chart_recognition_model_name",
  602. type=str,
  603. help="Name of the chart recognition model.",
  604. )
  605. subparser.add_argument(
  606. "--chart_recognition_model_dir",
  607. type=str,
  608. help="Path to the chart recognition model directory.",
  609. )
  610. subparser.add_argument(
  611. "--chart_recognition_batch_size",
  612. type=int,
  613. help="Batch size for the chart recognition model.",
  614. )
  615. subparser.add_argument(
  616. "--region_detection_model_name",
  617. type=str,
  618. help="Name of the region detection model.",
  619. )
  620. subparser.add_argument(
  621. "--region_detection_model_dir",
  622. type=str,
  623. help="Path to the region detection model directory.",
  624. )
  625. subparser.add_argument(
  626. "--doc_orientation_classify_model_name",
  627. type=str,
  628. help="Name of the document image orientation classification model.",
  629. )
  630. subparser.add_argument(
  631. "--doc_orientation_classify_model_dir",
  632. type=str,
  633. help="Path to the document image orientation classification model directory.",
  634. )
  635. subparser.add_argument(
  636. "--doc_unwarping_model_name",
  637. type=str,
  638. help="Name of the text image unwarping model.",
  639. )
  640. subparser.add_argument(
  641. "--doc_unwarping_model_dir",
  642. type=str,
  643. help="Path to the image unwarping model directory.",
  644. )
  645. subparser.add_argument(
  646. "--text_detection_model_name",
  647. type=str,
  648. help="Name of the text detection model.",
  649. )
  650. subparser.add_argument(
  651. "--text_detection_model_dir",
  652. type=str,
  653. help="Path to the text detection model directory.",
  654. )
  655. subparser.add_argument(
  656. "--text_det_limit_side_len",
  657. type=int,
  658. help="This sets a limit on the side length of the input image for the text detection model.",
  659. )
  660. subparser.add_argument(
  661. "--text_det_limit_type",
  662. type=str,
  663. help="This determines how the side length limit is applied to the input image before feeding it into the text deteciton model.",
  664. )
  665. subparser.add_argument(
  666. "--text_det_thresh",
  667. type=float,
  668. help="Detection pixel threshold for the text detection model. Pixels with scores greater than this threshold in the output probability map are considered text pixels.",
  669. )
  670. subparser.add_argument(
  671. "--text_det_box_thresh",
  672. type=float,
  673. help="Detection box threshold for the text detection model. A detection result is considered a text region if the average score of all pixels within the border of the result is greater than this threshold.",
  674. )
  675. subparser.add_argument(
  676. "--text_det_unclip_ratio",
  677. type=float,
  678. help="Text detection expansion coefficient, which expands the text region using this method. The larger the value, the larger the expansion area.",
  679. )
  680. subparser.add_argument(
  681. "--textline_orientation_model_name",
  682. type=str,
  683. help="Name of the text line orientation classification model.",
  684. )
  685. subparser.add_argument(
  686. "--textline_orientation_model_dir",
  687. type=str,
  688. help="Path to the text line orientation classification directory.",
  689. )
  690. subparser.add_argument(
  691. "--textline_orientation_batch_size",
  692. type=int,
  693. help="Batch size for the text line orientation classification model.",
  694. )
  695. subparser.add_argument(
  696. "--text_recognition_model_name",
  697. type=str,
  698. help="Name of the text recognition model.",
  699. )
  700. subparser.add_argument(
  701. "--text_recognition_model_dir",
  702. type=str,
  703. help="Path to the text recognition model directory.",
  704. )
  705. subparser.add_argument(
  706. "--text_recognition_batch_size",
  707. type=int,
  708. help="Batch size for the text recognition model.",
  709. )
  710. subparser.add_argument(
  711. "--text_rec_score_thresh",
  712. type=float,
  713. help="Text recognition threshold used in general OCR. Text results with scores greater than this threshold are retained.",
  714. )
  715. subparser.add_argument(
  716. "--table_classification_model_name",
  717. type=str,
  718. help="Name of the table classification model.",
  719. )
  720. subparser.add_argument(
  721. "--table_classification_model_dir",
  722. type=str,
  723. help="Path to the table classification model directory.",
  724. )
  725. subparser.add_argument(
  726. "--wired_table_structure_recognition_model_name",
  727. type=str,
  728. help="Name of the wired table structure recognition model.",
  729. )
  730. subparser.add_argument(
  731. "--wired_table_structure_recognition_model_dir",
  732. type=str,
  733. help="Path to the wired table structure recognition model directory.",
  734. )
  735. subparser.add_argument(
  736. "--wireless_table_structure_recognition_model_name",
  737. type=str,
  738. help="Name of the wireless table structure recognition model.",
  739. )
  740. subparser.add_argument(
  741. "--wireless_table_structure_recognition_model_dir",
  742. type=str,
  743. help="Path to the wired table structure recognition model directory.",
  744. )
  745. subparser.add_argument(
  746. "--wired_table_cells_detection_model_name",
  747. type=str,
  748. help="Name of the wired table cells detection model.",
  749. )
  750. subparser.add_argument(
  751. "--wired_table_cells_detection_model_dir",
  752. type=str,
  753. help="Path to the wired table cells detection model directory.",
  754. )
  755. subparser.add_argument(
  756. "--wireless_table_cells_detection_model_name",
  757. type=str,
  758. help="Name of the wireless table cells detection model.",
  759. )
  760. subparser.add_argument(
  761. "--wireless_table_cells_detection_model_dir",
  762. type=str,
  763. help="Path to the wireless table cells detection model directory.",
  764. )
  765. subparser.add_argument(
  766. "--seal_text_detection_model_name",
  767. type=str,
  768. help="Name of the seal text detection model.",
  769. )
  770. subparser.add_argument(
  771. "--seal_text_detection_model_dir",
  772. type=str,
  773. help="Path to the seal text detection model directory.",
  774. )
  775. subparser.add_argument(
  776. "--seal_det_limit_side_len",
  777. type=int,
  778. help="This sets a limit on the side length of the input image for the seal text detection model.",
  779. )
  780. subparser.add_argument(
  781. "--seal_det_limit_type",
  782. type=str,
  783. help="This determines how the side length limit is applied to the input image before feeding it into the seal text deteciton model.",
  784. )
  785. subparser.add_argument(
  786. "--seal_det_thresh",
  787. type=float,
  788. help="Detection pixel threshold for the seal text detection model. Pixels with scores greater than this threshold in the output probability map are considered text pixels.",
  789. )
  790. subparser.add_argument(
  791. "--seal_det_box_thresh",
  792. type=float,
  793. help="Detection box threshold for the seal text detection model. A detection result is considered a text region if the average score of all pixels within the border of the result is greater than this threshold.",
  794. )
  795. subparser.add_argument(
  796. "--seal_det_unclip_ratio",
  797. type=float,
  798. help="Seal text detection expansion coefficient, which expands the text region using this method. The larger the value, the larger the expansion area.",
  799. )
  800. subparser.add_argument(
  801. "--seal_text_recognition_model_name",
  802. type=str,
  803. help="Name of the seal text recognition model.",
  804. )
  805. subparser.add_argument(
  806. "--seal_text_recognition_model_dir",
  807. type=str,
  808. help="Path to the seal text recognition model directory.",
  809. )
  810. subparser.add_argument(
  811. "--seal_text_recognition_batch_size",
  812. type=int,
  813. help="Batch size for the seal text recognition model.",
  814. )
  815. subparser.add_argument(
  816. "--seal_rec_score_thresh",
  817. type=float,
  818. help="Seal text recognition threshold. Text results with scores greater than this threshold are retained.",
  819. )
  820. subparser.add_argument(
  821. "--formula_recognition_model_name",
  822. type=str,
  823. help="Name of the formula recognition model.",
  824. )
  825. subparser.add_argument(
  826. "--formula_recognition_model_dir",
  827. type=str,
  828. help="Path to the formula recognition model directory.",
  829. )
  830. subparser.add_argument(
  831. "--formula_recognition_batch_size",
  832. type=int,
  833. help="Batch size for the formula recognition model.",
  834. )
  835. subparser.add_argument(
  836. "--use_doc_orientation_classify",
  837. type=str2bool,
  838. help="Whether to use document image orientation classification.",
  839. )
  840. subparser.add_argument(
  841. "--use_doc_unwarping",
  842. type=str2bool,
  843. help="Whether to use text image unwarping.",
  844. )
  845. subparser.add_argument(
  846. "--use_textline_orientation",
  847. type=str2bool,
  848. help="Whether to use text line orientation classification.",
  849. )
  850. subparser.add_argument(
  851. "--use_seal_recognition",
  852. type=str2bool,
  853. help="Whether to use seal recognition.",
  854. )
  855. subparser.add_argument(
  856. "--use_table_recognition",
  857. type=str2bool,
  858. help="Whether to use table recognition.",
  859. )
  860. subparser.add_argument(
  861. "--use_formula_recognition",
  862. type=str2bool,
  863. help="Whether to use formula recognition.",
  864. )
  865. subparser.add_argument(
  866. "--use_chart_recognition",
  867. type=str2bool,
  868. help="Whether to use chart recognition.",
  869. )
  870. subparser.add_argument(
  871. "--use_region_detection",
  872. type=str2bool,
  873. help="Whether to use region detection.",
  874. )
  875. # FIXME: Passing API key through CLI is not secure; consider using
  876. # environment variables.
  877. subparser.add_argument(
  878. "--qianfan_api_key",
  879. type=str,
  880. help="Configuration for the embedding model.",
  881. )
  882. def execute_with_args(self, args):
  883. params = get_subcommand_args(args)
  884. input = params.pop("input")
  885. target_language = params.pop("target_language")
  886. save_path = params.pop("save_path")
  887. qianfan_api_key = params.pop("qianfan_api_key")
  888. if qianfan_api_key is not None:
  889. params["chat_bot_config"] = {
  890. "module_name": "chat_bot",
  891. "model_name": "ernie-3.5-8k",
  892. "base_url": "https://qianfan.baidubce.com/v2",
  893. "api_type": "openai",
  894. "api_key": qianfan_api_key,
  895. }
  896. chatocr = PPDocTranslation(**params)
  897. logger.info("Start analyzing images")
  898. result_visual = chatocr.visual_predict_iter(input)
  899. ori_md_info_list = []
  900. for res in result_visual:
  901. ori_md_info_list.append(res["layout_parsing_result"].markdown)
  902. if save_path:
  903. res["layout_parsing_result"].save_all(save_path)
  904. logger.info("Start translation")
  905. result_translate = chatocr.translate_iter(
  906. ori_md_info_list,
  907. target_language=target_language,
  908. )
  909. for res in result_translate:
  910. res.print()
  911. if save_path:
  912. res.save_to_markdown(save_path)