pp_structurev3.py 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005
  1. # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import warnings
  15. from .._utils.cli import (
  16. add_simple_inference_args,
  17. get_subcommand_args,
  18. perform_simple_inference,
  19. str2bool,
  20. )
  21. from .base import PaddleXPipelineWrapper, PipelineCLISubcommandExecutor
  22. from .utils import create_config_from_structure
  # OCR model generations accepted for the ``ocr_version`` constructor argument.
  # Kept as a list: its repr is embedded in the "Invalid OCR version" message.
  _SUPPORTED_OCR_VERSIONS = [
      "PP-OCRv3",
      "PP-OCRv4",
      "PP-OCRv5",
  ]
  24. class PPStructureV3(PaddleXPipelineWrapper):
  def __init__(
      self,
      layout_detection_model_name=None,
      layout_detection_model_dir=None,
      layout_threshold=None,
      layout_nms=None,
      layout_unclip_ratio=None,
      layout_merge_bboxes_mode=None,
      chart_recognition_model_name=None,
      chart_recognition_model_dir=None,
      chart_recognition_batch_size=None,
      region_detection_model_name=None,
      region_detection_model_dir=None,
      doc_orientation_classify_model_name=None,
      doc_orientation_classify_model_dir=None,
      doc_unwarping_model_name=None,
      doc_unwarping_model_dir=None,
      text_detection_model_name=None,
      text_detection_model_dir=None,
      text_det_limit_side_len=None,
      text_det_limit_type=None,
      text_det_thresh=None,
      text_det_box_thresh=None,
      text_det_unclip_ratio=None,
      textline_orientation_model_name=None,
      textline_orientation_model_dir=None,
      textline_orientation_batch_size=None,
      text_recognition_model_name=None,
      text_recognition_model_dir=None,
      text_recognition_batch_size=None,
      text_rec_score_thresh=None,
      table_classification_model_name=None,
      table_classification_model_dir=None,
      wired_table_structure_recognition_model_name=None,
      wired_table_structure_recognition_model_dir=None,
      wireless_table_structure_recognition_model_name=None,
      wireless_table_structure_recognition_model_dir=None,
      wired_table_cells_detection_model_name=None,
      wired_table_cells_detection_model_dir=None,
      wireless_table_cells_detection_model_name=None,
      wireless_table_cells_detection_model_dir=None,
      table_orientation_classify_model_name=None,
      table_orientation_classify_model_dir=None,
      seal_text_detection_model_name=None,
      seal_text_detection_model_dir=None,
      seal_det_limit_side_len=None,
      seal_det_limit_type=None,
      seal_det_thresh=None,
      seal_det_box_thresh=None,
      seal_det_unclip_ratio=None,
      seal_text_recognition_model_name=None,
      seal_text_recognition_model_dir=None,
      seal_text_recognition_batch_size=None,
      seal_rec_score_thresh=None,
      formula_recognition_model_name=None,
      formula_recognition_model_dir=None,
      formula_recognition_batch_size=None,
      use_doc_orientation_classify=None,
      use_doc_unwarping=None,
      use_textline_orientation=None,
      use_seal_recognition=None,
      use_table_recognition=None,
      use_formula_recognition=None,
      use_chart_recognition=None,
      use_region_detection=None,
      lang=None,
      ocr_version=None,
      **kwargs,
  ):
      """Validate the OCR selection arguments and record every setting.

      When none of the four text detection / recognition model names and
      directories is given, ``lang`` and/or ``ocr_version`` (if provided)
      are resolved to model names through ``_get_ocr_model_names``. When
      any of the four is given, ``lang`` / ``ocr_version`` are ignored and
      a warning is emitted if either was passed.

      The function's local variables (minus ``self`` and ``kwargs``) are
      stored in ``self._params``; ``**kwargs`` is forwarded to the base
      class initializer.

      Raises:
          ValueError: If ``ocr_version`` is not listed in
              ``_SUPPORTED_OCR_VERSIONS``, or if no detection/recognition
              model pair is available for ``lang`` and ``ocr_version``.
      """
      if not (ocr_version is None or ocr_version in _SUPPORTED_OCR_VERSIONS):
          raise ValueError(
              f"Invalid OCR version: {ocr_version}. Supported values are {_SUPPORTED_OCR_VERSIONS}."
          )

      if all(
          explicit is None
          for explicit in (
              text_detection_model_name,
              text_detection_model_dir,
              text_recognition_model_name,
              text_recognition_model_dir,
          )
      ):
          if not (lang is None and ocr_version is None):
              det_model_name, rec_model_name = self._get_ocr_model_names(
                  lang, ocr_version
              )
              if det_model_name is None or rec_model_name is None:
                  raise ValueError(
                      f"No models are available for the language {repr(lang)} and OCR version {repr(ocr_version)}."
                  )
              text_detection_model_name = det_model_name
              text_recognition_model_name = rec_model_name
      elif not (lang is None and ocr_version is None):
          # stacklevel=2 attributes the warning to the caller of __init__.
          warnings.warn(
              "`lang` and `ocr_version` will be ignored when model names or model directories are not `None`.",
              stacklevel=2,
          )

      # The snapshot below captures every local of this function, so no
      # additional local variable may be introduced above this line. It
      # already reflects the resolved text detection / recognition names.
      params = dict(locals())
      del params["self"]
      del params["kwargs"]
      self._params = params
      super().__init__(**kwargs)
  132. @property
  133. def _paddlex_pipeline_name(self):
  134. return "PP-StructureV3"
  def predict_iter(
      self,
      input,
      *,
      use_doc_orientation_classify=None,
      use_doc_unwarping=None,
      use_textline_orientation=None,
      use_seal_recognition=None,
      use_table_recognition=None,
      use_formula_recognition=None,
      use_chart_recognition=None,
      use_region_detection=None,
      layout_threshold=None,
      layout_nms=None,
      layout_unclip_ratio=None,
      layout_merge_bboxes_mode=None,
      text_det_limit_side_len=None,
      text_det_limit_type=None,
      text_det_thresh=None,
      text_det_box_thresh=None,
      text_det_unclip_ratio=None,
      text_rec_score_thresh=None,
      seal_det_limit_side_len=None,
      seal_det_limit_type=None,
      seal_det_thresh=None,
      seal_det_box_thresh=None,
      seal_det_unclip_ratio=None,
      seal_rec_score_thresh=None,
      use_wired_table_cells_trans_to_html=False,
      use_wireless_table_cells_trans_to_html=False,
      use_table_orientation_classify=True,
      use_ocr_results_with_table_cells=True,
      use_e2e_wired_table_rec_model=False,
      use_e2e_wireless_table_rec_model=True,
      **kwargs,
  ):
      """Call ``self.paddlex_pipeline.predict`` with the given options.

      ``input`` is passed positionally; every named option and any extra
      ``**kwargs`` are forwarded unchanged as keyword arguments. The return
      value is whatever the wrapped pipeline's ``predict`` returns
      (presumably an iterable of results, given that ``predict`` wraps
      this method in ``list`` -- confirm against the PaddleX pipeline).
      """
      options = dict(
          use_doc_orientation_classify=use_doc_orientation_classify,
          use_doc_unwarping=use_doc_unwarping,
          use_textline_orientation=use_textline_orientation,
          use_seal_recognition=use_seal_recognition,
          use_table_recognition=use_table_recognition,
          use_formula_recognition=use_formula_recognition,
          use_chart_recognition=use_chart_recognition,
          use_region_detection=use_region_detection,
          layout_threshold=layout_threshold,
          layout_nms=layout_nms,
          layout_unclip_ratio=layout_unclip_ratio,
          layout_merge_bboxes_mode=layout_merge_bboxes_mode,
          text_det_limit_side_len=text_det_limit_side_len,
          text_det_limit_type=text_det_limit_type,
          text_det_thresh=text_det_thresh,
          text_det_box_thresh=text_det_box_thresh,
          text_det_unclip_ratio=text_det_unclip_ratio,
          text_rec_score_thresh=text_rec_score_thresh,
          seal_det_limit_side_len=seal_det_limit_side_len,
          seal_det_limit_type=seal_det_limit_type,
          seal_det_thresh=seal_det_thresh,
          seal_det_box_thresh=seal_det_box_thresh,
          seal_det_unclip_ratio=seal_det_unclip_ratio,
          seal_rec_score_thresh=seal_rec_score_thresh,
          use_wired_table_cells_trans_to_html=use_wired_table_cells_trans_to_html,
          use_wireless_table_cells_trans_to_html=use_wireless_table_cells_trans_to_html,
          use_table_orientation_classify=use_table_orientation_classify,
          use_ocr_results_with_table_cells=use_ocr_results_with_table_cells,
          use_e2e_wired_table_rec_model=use_e2e_wired_table_rec_model,
          use_e2e_wireless_table_rec_model=use_e2e_wireless_table_rec_model,
      )
      return self.paddlex_pipeline.predict(input, **options, **kwargs)
  def predict(
      self,
      input,
      *,
      use_doc_orientation_classify=None,
      use_doc_unwarping=None,
      use_textline_orientation=None,
      use_seal_recognition=None,
      use_table_recognition=None,
      use_formula_recognition=None,
      use_chart_recognition=None,
      use_region_detection=None,
      layout_threshold=None,
      layout_nms=None,
      layout_unclip_ratio=None,
      layout_merge_bboxes_mode=None,
      text_det_limit_side_len=None,
      text_det_limit_type=None,
      text_det_thresh=None,
      text_det_box_thresh=None,
      text_det_unclip_ratio=None,
      text_rec_score_thresh=None,
      seal_det_limit_side_len=None,
      seal_det_limit_type=None,
      seal_det_thresh=None,
      seal_det_box_thresh=None,
      seal_det_unclip_ratio=None,
      seal_rec_score_thresh=None,
      use_wired_table_cells_trans_to_html=False,
      use_wireless_table_cells_trans_to_html=False,
      use_table_orientation_classify=True,
      use_ocr_results_with_table_cells=True,
      use_e2e_wired_table_rec_model=False,
      use_e2e_wireless_table_rec_model=True,
      **kwargs,
  ):
      """Run ``predict_iter`` with the same arguments and return a list.

      Every option (and any extra ``**kwargs``) is forwarded unchanged to
      ``predict_iter``; its result is materialized with ``list``.
      """
      options = dict(
          use_doc_orientation_classify=use_doc_orientation_classify,
          use_doc_unwarping=use_doc_unwarping,
          use_textline_orientation=use_textline_orientation,
          use_seal_recognition=use_seal_recognition,
          use_table_recognition=use_table_recognition,
          use_formula_recognition=use_formula_recognition,
          use_chart_recognition=use_chart_recognition,
          use_region_detection=use_region_detection,
          layout_threshold=layout_threshold,
          layout_nms=layout_nms,
          layout_unclip_ratio=layout_unclip_ratio,
          layout_merge_bboxes_mode=layout_merge_bboxes_mode,
          text_det_limit_side_len=text_det_limit_side_len,
          text_det_limit_type=text_det_limit_type,
          text_det_thresh=text_det_thresh,
          text_det_box_thresh=text_det_box_thresh,
          text_det_unclip_ratio=text_det_unclip_ratio,
          text_rec_score_thresh=text_rec_score_thresh,
          seal_det_limit_side_len=seal_det_limit_side_len,
          seal_det_limit_type=seal_det_limit_type,
          seal_det_thresh=seal_det_thresh,
          seal_det_box_thresh=seal_det_box_thresh,
          seal_det_unclip_ratio=seal_det_unclip_ratio,
          seal_rec_score_thresh=seal_rec_score_thresh,
          use_wired_table_cells_trans_to_html=use_wired_table_cells_trans_to_html,
          use_wireless_table_cells_trans_to_html=use_wireless_table_cells_trans_to_html,
          use_table_orientation_classify=use_table_orientation_classify,
          use_ocr_results_with_table_cells=use_ocr_results_with_table_cells,
          use_e2e_wired_table_rec_model=use_e2e_wired_table_rec_model,
          use_e2e_wireless_table_rec_model=use_e2e_wireless_table_rec_model,
      )
      results = self.predict_iter(input, **options, **kwargs)
      return list(results)
  def concatenate_markdown_pages(self, markdown_list):
      """Delegate to ``self.paddlex_pipeline.concatenate_markdown_pages``.

      ``markdown_list`` is passed through unchanged and the wrapped
      pipeline's return value is returned as-is.
      """
      pipeline = self.paddlex_pipeline
      return pipeline.concatenate_markdown_pages(markdown_list)
  @classmethod
  def get_cli_subcommand_executor(cls):
      """Return a new ``PPStructureV3CLISubcommandExecutor`` instance."""
      executor = PPStructureV3CLISubcommandExecutor()
      return executor
  def _get_paddlex_config_overrides(self):
      """Build the PaddleX configuration overrides from ``self._params``.

      Each constructor argument recorded in ``self._params`` is placed under
      a dotted configuration path, and the resulting flat mapping is handed
      to ``create_config_from_structure``. The general OCR settings are
      written twice: once for the top-level ``GeneralOCR`` sub-pipeline and
      once for the ``GeneralOCR`` sub-pipeline nested in ``TableRecognition``.

      Returns:
          The value produced by ``create_config_from_structure`` for the
          mapping.
      """
      p = self._params
      structure = {
          # --- Feature switches -------------------------------------------
          "SubPipelines.DocPreprocessor.use_doc_orientation_classify": p[
              "use_doc_orientation_classify"
          ],
          "SubPipelines.DocPreprocessor.use_doc_unwarping": p["use_doc_unwarping"],
          # Truthy as soon as either preprocessing step is requested.
          "use_doc_preprocessor": p["use_doc_orientation_classify"]
          or p["use_doc_unwarping"],
          "SubPipelines.GeneralOCR.use_textline_orientation": p[
              "use_textline_orientation"
          ],
          "use_seal_recognition": p["use_seal_recognition"],
          "use_table_recognition": p["use_table_recognition"],
          "use_formula_recognition": p["use_formula_recognition"],
          "use_chart_recognition": p["use_chart_recognition"],
          "use_region_detection": p["use_region_detection"],
          # --- Layout / chart / region modules ----------------------------
          "SubModules.LayoutDetection.model_name": p["layout_detection_model_name"],
          "SubModules.LayoutDetection.model_dir": p["layout_detection_model_dir"],
          "SubModules.LayoutDetection.threshold": p["layout_threshold"],
          "SubModules.LayoutDetection.layout_nms": p["layout_nms"],
          "SubModules.LayoutDetection.layout_unclip_ratio": p["layout_unclip_ratio"],
          "SubModules.LayoutDetection.layout_merge_bboxes_mode": p[
              "layout_merge_bboxes_mode"
          ],
          "SubModules.ChartRecognition.model_name": p["chart_recognition_model_name"],
          "SubModules.ChartRecognition.model_dir": p["chart_recognition_model_dir"],
          "SubModules.ChartRecognition.batch_size": p["chart_recognition_batch_size"],
          "SubModules.RegionDetection.model_name": p["region_detection_model_name"],
          "SubModules.RegionDetection.model_dir": p["region_detection_model_dir"],
          # --- Document preprocessor --------------------------------------
          "SubPipelines.DocPreprocessor.SubModules.DocOrientationClassify.model_name": p[
              "doc_orientation_classify_model_name"
          ],
          "SubPipelines.DocPreprocessor.SubModules.DocOrientationClassify.model_dir": p[
              "doc_orientation_classify_model_dir"
          ],
          "SubPipelines.DocPreprocessor.SubModules.DocUnwarping.model_name": p[
              "doc_unwarping_model_name"
          ],
          "SubPipelines.DocPreprocessor.SubModules.DocUnwarping.model_dir": p[
              "doc_unwarping_model_dir"
          ],
          # --- General OCR (top level) ------------------------------------
          "SubPipelines.GeneralOCR.SubModules.TextDetection.model_name": p[
              "text_detection_model_name"
          ],
          "SubPipelines.GeneralOCR.SubModules.TextDetection.model_dir": p[
              "text_detection_model_dir"
          ],
          "SubPipelines.GeneralOCR.SubModules.TextDetection.limit_side_len": p[
              "text_det_limit_side_len"
          ],
          "SubPipelines.GeneralOCR.SubModules.TextDetection.limit_type": p[
              "text_det_limit_type"
          ],
          "SubPipelines.GeneralOCR.SubModules.TextDetection.thresh": p[
              "text_det_thresh"
          ],
          "SubPipelines.GeneralOCR.SubModules.TextDetection.box_thresh": p[
              "text_det_box_thresh"
          ],
          "SubPipelines.GeneralOCR.SubModules.TextDetection.unclip_ratio": p[
              "text_det_unclip_ratio"
          ],
          "SubPipelines.GeneralOCR.SubModules.TextLineOrientation.model_name": p[
              "textline_orientation_model_name"
          ],
          "SubPipelines.GeneralOCR.SubModules.TextLineOrientation.model_dir": p[
              "textline_orientation_model_dir"
          ],
          "SubPipelines.GeneralOCR.SubModules.TextLineOrientation.batch_size": p[
              "textline_orientation_batch_size"
          ],
          "SubPipelines.GeneralOCR.SubModules.TextRecognition.model_name": p[
              "text_recognition_model_name"
          ],
          "SubPipelines.GeneralOCR.SubModules.TextRecognition.model_dir": p[
              "text_recognition_model_dir"
          ],
          "SubPipelines.GeneralOCR.SubModules.TextRecognition.batch_size": p[
              "text_recognition_batch_size"
          ],
          "SubPipelines.GeneralOCR.SubModules.TextRecognition.score_thresh": p[
              "text_rec_score_thresh"
          ],
          # --- Table recognition modules ----------------------------------
          "SubPipelines.TableRecognition.SubModules.TableClassification.model_name": p[
              "table_classification_model_name"
          ],
          "SubPipelines.TableRecognition.SubModules.TableClassification.model_dir": p[
              "table_classification_model_dir"
          ],
          "SubPipelines.TableRecognition.SubModules.WiredTableStructureRecognition.model_name": p[
              "wired_table_structure_recognition_model_name"
          ],
          "SubPipelines.TableRecognition.SubModules.WiredTableStructureRecognition.model_dir": p[
              "wired_table_structure_recognition_model_dir"
          ],
          "SubPipelines.TableRecognition.SubModules.WirelessTableStructureRecognition.model_name": p[
              "wireless_table_structure_recognition_model_name"
          ],
          "SubPipelines.TableRecognition.SubModules.WirelessTableStructureRecognition.model_dir": p[
              "wireless_table_structure_recognition_model_dir"
          ],
          "SubPipelines.TableRecognition.SubModules.WiredTableCellsDetection.model_name": p[
              "wired_table_cells_detection_model_name"
          ],
          "SubPipelines.TableRecognition.SubModules.WiredTableCellsDetection.model_dir": p[
              "wired_table_cells_detection_model_dir"
          ],
          "SubPipelines.TableRecognition.SubModules.WirelessTableCellsDetection.model_name": p[
              "wireless_table_cells_detection_model_name"
          ],
          "SubPipelines.TableRecognition.SubModules.WirelessTableCellsDetection.model_dir": p[
              "wireless_table_cells_detection_model_dir"
          ],
          "SubPipelines.TableRecognition.SubModules.TableOrientationClassify.model_name": p[
              "table_orientation_classify_model_name"
          ],
          "SubPipelines.TableRecognition.SubModules.TableOrientationClassify.model_dir": p[
              "table_orientation_classify_model_dir"
          ],
          # --- General OCR nested in table recognition (same values) ------
          "SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextDetection.model_name": p[
              "text_detection_model_name"
          ],
          "SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextDetection.model_dir": p[
              "text_detection_model_dir"
          ],
          "SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextDetection.limit_side_len": p[
              "text_det_limit_side_len"
          ],
          "SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextDetection.limit_type": p[
              "text_det_limit_type"
          ],
          "SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextDetection.thresh": p[
              "text_det_thresh"
          ],
          "SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextDetection.box_thresh": p[
              "text_det_box_thresh"
          ],
          "SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextDetection.unclip_ratio": p[
              "text_det_unclip_ratio"
          ],
          "SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.model_name": p[
              "textline_orientation_model_name"
          ],
          "SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.model_dir": p[
              "textline_orientation_model_dir"
          ],
          "SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.batch_size": p[
              "textline_orientation_batch_size"
          ],
          "SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextRecognition.model_name": p[
              "text_recognition_model_name"
          ],
          "SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextRecognition.model_dir": p[
              "text_recognition_model_dir"
          ],
          "SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextRecognition.batch_size": p[
              "text_recognition_batch_size"
          ],
          "SubPipelines.TableRecognition.SubPipelines.GeneralOCR.SubModules.TextRecognition.score_thresh": p[
              "text_rec_score_thresh"
          ],
          # --- Seal recognition -------------------------------------------
          "SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.model_name": p[
              "seal_text_detection_model_name"
          ],
          "SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.model_dir": p[
              "seal_text_detection_model_dir"
          ],
          # Fixed: this entry used to read "text_det_limit_side_len", so the
          # seal-specific limit was ignored and the general OCR one leaked in.
          "SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.limit_side_len": p[
              "seal_det_limit_side_len"
          ],
          "SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.limit_type": p[
              "seal_det_limit_type"
          ],
          "SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.thresh": p[
              "seal_det_thresh"
          ],
          "SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.box_thresh": p[
              "seal_det_box_thresh"
          ],
          "SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.unclip_ratio": p[
              "seal_det_unclip_ratio"
          ],
          "SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextRecognition.model_name": p[
              "seal_text_recognition_model_name"
          ],
          "SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextRecognition.model_dir": p[
              "seal_text_recognition_model_dir"
          ],
          "SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextRecognition.batch_size": p[
              "seal_text_recognition_batch_size"
          ],
          # Added: `seal_rec_score_thresh` was accepted by __init__ but never
          # forwarded. The path mirrors the GeneralOCR TextRecognition entry.
          # NOTE(review): confirm the key against the PP-StructureV3 config.
          "SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextRecognition.score_thresh": p[
              "seal_rec_score_thresh"
          ],
          # --- Formula recognition ----------------------------------------
          "SubPipelines.FormulaRecognition.SubModules.FormulaRecognition.model_name": p[
              "formula_recognition_model_name"
          ],
          "SubPipelines.FormulaRecognition.SubModules.FormulaRecognition.model_dir": p[
              "formula_recognition_model_dir"
          ],
          "SubPipelines.FormulaRecognition.SubModules.FormulaRecognition.batch_size": p[
              "formula_recognition_batch_size"
          ],
      }
      return create_config_from_structure(structure)
  502. def _get_ocr_model_names(self, lang, ppocr_version):
  503. LATIN_LANGS = [
  504. "af",
  505. "az",
  506. "bs",
  507. "cs",
  508. "cy",
  509. "da",
  510. "de",
  511. "es",
  512. "et",
  513. "fr",
  514. "ga",
  515. "hr",
  516. "hu",
  517. "id",
  518. "is",
  519. "it",
  520. "ku",
  521. "la",
  522. "lt",
  523. "lv",
  524. "mi",
  525. "ms",
  526. "mt",
  527. "nl",
  528. "no",
  529. "oc",
  530. "pi",
  531. "pl",
  532. "pt",
  533. "ro",
  534. "rs_latin",
  535. "sk",
  536. "sl",
  537. "sq",
  538. "sv",
  539. "sw",
  540. "tl",
  541. "tr",
  542. "uz",
  543. "vi",
  544. "french",
  545. "german",
  546. ]
  547. ARABIC_LANGS = ["ar", "fa", "ug", "ur"]
  548. ESLAV_LANGS = ["ru", "be", "uk"]
  549. CYRILLIC_LANGS = [
  550. "ru",
  551. "rs_cyrillic",
  552. "be",
  553. "bg",
  554. "uk",
  555. "mn",
  556. "abq",
  557. "ady",
  558. "kbd",
  559. "ava",
  560. "dar",
  561. "inh",
  562. "che",
  563. "lbe",
  564. "lez",
  565. "tab",
  566. ]
  567. DEVANAGARI_LANGS = [
  568. "hi",
  569. "mr",
  570. "ne",
  571. "bh",
  572. "mai",
  573. "ang",
  574. "bho",
  575. "mah",
  576. "sck",
  577. "new",
  578. "gom",
  579. "sa",
  580. "bgc",
  581. ]
  582. SPECIFIC_LANGS = [
  583. "ch",
  584. "en",
  585. "korean",
  586. "japan",
  587. "chinese_cht",
  588. "te",
  589. "ka",
  590. "ta",
  591. ]
  592. if lang is None:
  593. lang = "ch"
  594. if ppocr_version is None:
  595. if (
  596. lang
  597. in ["ch", "chinese_cht", "en", "japan", "korean", "th", "el"]
  598. + LATIN_LANGS
  599. + ESLAV_LANGS
  600. ):
  601. ppocr_version = "PP-OCRv5"
  602. elif lang in (
  603. LATIN_LANGS
  604. + ARABIC_LANGS
  605. + CYRILLIC_LANGS
  606. + DEVANAGARI_LANGS
  607. + SPECIFIC_LANGS
  608. ):
  609. ppocr_version = "PP-OCRv3"
  610. else:
  611. # Unknown language specified
  612. return None, None
  613. if ppocr_version == "PP-OCRv5":
  614. rec_lang, rec_model_name = None, None
  615. if lang in ("ch", "chinese_cht", "en", "japan"):
  616. rec_model_name = "PP-OCRv5_server_rec"
  617. elif lang in LATIN_LANGS:
  618. rec_lang = "latin"
  619. elif lang in ESLAV_LANGS:
  620. rec_lang = "eslav"
  621. elif lang == "korean":
  622. rec_lang = "korean"
  623. elif lang == "th":
  624. rec_lang = "th"
  625. elif lang == "el":
  626. rec_lang = "el"
  627. if rec_lang is not None:
  628. rec_model_name = f"{rec_lang}_PP-OCRv5_mobile_rec"
  629. return "PP-OCRv5_server_det", rec_model_name
  630. elif ppocr_version == "PP-OCRv4":
  631. if lang == "ch":
  632. return "PP-OCRv4_mobile_det", "PP-OCRv4_mobile_rec"
  633. elif lang == "en":
  634. return "PP-OCRv4_mobile_det", "en_PP-OCRv4_mobile_rec"
  635. else:
  636. return None, None
  637. else:
  638. # PP-OCRv3
  639. rec_lang = None
  640. if lang in LATIN_LANGS:
  641. rec_lang = "latin"
  642. elif lang in ARABIC_LANGS:
  643. rec_lang = "arabic"
  644. elif lang in CYRILLIC_LANGS:
  645. rec_lang = "cyrillic"
  646. elif lang in DEVANAGARI_LANGS:
  647. rec_lang = "devanagari"
  648. else:
  649. if lang in SPECIFIC_LANGS:
  650. rec_lang = lang
  651. rec_model_name = None
  652. if rec_lang == "ch":
  653. rec_model_name = "PP-OCRv3_mobile_rec"
  654. elif rec_lang is not None:
  655. rec_model_name = f"{rec_lang}_PP-OCRv3_mobile_rec"
  656. return "PP-OCRv3_mobile_det", rec_model_name
  657. class PPStructureV3CLISubcommandExecutor(PipelineCLISubcommandExecutor):
  658. @property
  659. def subparser_name(self):
  660. return "pp_structurev3"
  661. def _update_subparser(self, subparser):
  662. add_simple_inference_args(subparser)
  663. subparser.add_argument(
  664. "--layout_detection_model_name",
  665. type=str,
  666. help="Name of the layout detection model.",
  667. )
  668. subparser.add_argument(
  669. "--layout_detection_model_dir",
  670. type=str,
  671. help="Path to the layout detection model directory.",
  672. )
  673. subparser.add_argument(
  674. "--layout_threshold",
  675. type=float,
  676. help="Score threshold for the layout detection model.",
  677. )
  678. subparser.add_argument(
  679. "--layout_nms",
  680. type=str2bool,
  681. help="Whether to use NMS in layout detection.",
  682. )
  683. subparser.add_argument(
  684. "--layout_unclip_ratio",
  685. type=float,
  686. help="Expansion coefficient for layout detection.",
  687. )
  688. subparser.add_argument(
  689. "--layout_merge_bboxes_mode",
  690. type=str,
  691. help="Overlapping box filtering method.",
  692. )
  693. subparser.add_argument(
  694. "--chart_recognition_model_name",
  695. type=str,
  696. help="Name of the chart recognition model.",
  697. )
  698. subparser.add_argument(
  699. "--chart_recognition_model_dir",
  700. type=str,
  701. help="Path to the chart recognition model directory.",
  702. )
  703. subparser.add_argument(
  704. "--chart_recognition_batch_size",
  705. type=int,
  706. help="Batch size for the chart recognition model.",
  707. )
  708. subparser.add_argument(
  709. "--region_detection_model_name",
  710. type=str,
  711. help="Name of the region detection model.",
  712. )
  713. subparser.add_argument(
  714. "--region_detection_model_dir",
  715. type=str,
  716. help="Path to the region detection model directory.",
  717. )
  718. subparser.add_argument(
  719. "--doc_orientation_classify_model_name",
  720. type=str,
  721. help="Name of the document image orientation classification model.",
  722. )
  723. subparser.add_argument(
  724. "--doc_orientation_classify_model_dir",
  725. type=str,
  726. help="Path to the document image orientation classification model directory.",
  727. )
  728. subparser.add_argument(
  729. "--doc_unwarping_model_name",
  730. type=str,
  731. help="Name of the text image unwarping model.",
  732. )
  733. subparser.add_argument(
  734. "--doc_unwarping_model_dir",
  735. type=str,
  736. help="Path to the image unwarping model directory.",
  737. )
  738. subparser.add_argument(
  739. "--text_detection_model_name",
  740. type=str,
  741. help="Name of the text detection model.",
  742. )
  743. subparser.add_argument(
  744. "--text_detection_model_dir",
  745. type=str,
  746. help="Path to the text detection model directory.",
  747. )
  748. subparser.add_argument(
  749. "--text_det_limit_side_len",
  750. type=int,
  751. help="This sets a limit on the side length of the input image for the text detection model.",
  752. )
  753. subparser.add_argument(
  754. "--text_det_limit_type",
  755. type=str,
  756. help="This determines how the side length limit is applied to the input image before feeding it into the text deteciton model.",
  757. )
  758. subparser.add_argument(
  759. "--text_det_thresh",
  760. type=float,
  761. help="Detection pixel threshold for the text detection model. Pixels with scores greater than this threshold in the output probability map are considered text pixels.",
  762. )
  763. subparser.add_argument(
  764. "--text_det_box_thresh",
  765. type=float,
  766. help="Detection box threshold for the text detection model. A detection result is considered a text region if the average score of all pixels within the border of the result is greater than this threshold.",
  767. )
  768. subparser.add_argument(
  769. "--text_det_unclip_ratio",
  770. type=float,
  771. help="Text detection expansion coefficient, which expands the text region using this method. The larger the value, the larger the expansion area.",
  772. )
  773. subparser.add_argument(
  774. "--textline_orientation_model_name",
  775. type=str,
  776. help="Name of the text line orientation classification model.",
  777. )
  778. subparser.add_argument(
  779. "--textline_orientation_model_dir",
  780. type=str,
  781. help="Path to the text line orientation classification directory.",
  782. )
  783. subparser.add_argument(
  784. "--textline_orientation_batch_size",
  785. type=int,
  786. help="Batch size for the text line orientation classification model.",
  787. )
  788. subparser.add_argument(
  789. "--text_recognition_model_name",
  790. type=str,
  791. help="Name of the text recognition model.",
  792. )
  793. subparser.add_argument(
  794. "--text_recognition_model_dir",
  795. type=str,
  796. help="Path to the text recognition model directory.",
  797. )
  798. subparser.add_argument(
  799. "--text_recognition_batch_size",
  800. type=int,
  801. help="Batch size for the text recognition model.",
  802. )
  803. subparser.add_argument(
  804. "--text_rec_score_thresh",
  805. type=float,
  806. help="Text recognition threshold used in general OCR. Text results with scores greater than this threshold are retained.",
  807. )
  808. subparser.add_argument(
  809. "--table_classification_model_name",
  810. type=str,
  811. help="Name of the table classification model.",
  812. )
  813. subparser.add_argument(
  814. "--table_classification_model_dir",
  815. type=str,
  816. help="Path to the table classification model directory.",
  817. )
  818. subparser.add_argument(
  819. "--wired_table_structure_recognition_model_name",
  820. type=str,
  821. help="Name of the wired table structure recognition model.",
  822. )
  823. subparser.add_argument(
  824. "--wired_table_structure_recognition_model_dir",
  825. type=str,
  826. help="Path to the wired table structure recognition model directory.",
  827. )
  828. subparser.add_argument(
  829. "--wireless_table_structure_recognition_model_name",
  830. type=str,
  831. help="Name of the wireless table structure recognition model.",
  832. )
  833. subparser.add_argument(
  834. "--wireless_table_structure_recognition_model_dir",
  835. type=str,
  836. help="Path to the wired table structure recognition model directory.",
  837. )
  838. subparser.add_argument(
  839. "--wired_table_cells_detection_model_name",
  840. type=str,
  841. help="Name of the wired table cells detection model.",
  842. )
  843. subparser.add_argument(
  844. "--wired_table_cells_detection_model_dir",
  845. type=str,
  846. help="Path to the wired table cells detection model directory.",
  847. )
  848. subparser.add_argument(
  849. "--wireless_table_cells_detection_model_name",
  850. type=str,
  851. help="Name of the wireless table cells detection model.",
  852. )
  853. subparser.add_argument(
  854. "--wireless_table_cells_detection_model_dir",
  855. type=str,
  856. help="Path to the wireless table cells detection model directory.",
  857. )
  858. subparser.add_argument(
  859. "--seal_text_detection_model_name",
  860. type=str,
  861. help="Name of the seal text detection model.",
  862. )
  863. subparser.add_argument(
  864. "--seal_text_detection_model_dir",
  865. type=str,
  866. help="Path to the seal text detection model directory.",
  867. )
  868. subparser.add_argument(
  869. "--seal_det_limit_side_len",
  870. type=int,
  871. help="This sets a limit on the side length of the input image for the seal text detection model.",
  872. )
  873. subparser.add_argument(
  874. "--seal_det_limit_type",
  875. type=str,
  876. help="This determines how the side length limit is applied to the input image before feeding it into the seal text deteciton model.",
  877. )
  878. subparser.add_argument(
  879. "--seal_det_thresh",
  880. type=float,
  881. help="Detection pixel threshold for the seal text detection model. Pixels with scores greater than this threshold in the output probability map are considered text pixels.",
  882. )
  883. subparser.add_argument(
  884. "--seal_det_box_thresh",
  885. type=float,
  886. help="Detection box threshold for the seal text detection model. A detection result is considered a text region if the average score of all pixels within the border of the result is greater than this threshold.",
  887. )
  888. subparser.add_argument(
  889. "--seal_det_unclip_ratio",
  890. type=float,
  891. help="Seal text detection expansion coefficient, which expands the text region using this method. The larger the value, the larger the expansion area.",
  892. )
  893. subparser.add_argument(
  894. "--seal_text_recognition_model_name",
  895. type=str,
  896. help="Name of the seal text recognition model.",
  897. )
  898. subparser.add_argument(
  899. "--seal_text_recognition_model_dir",
  900. type=str,
  901. help="Path to the seal text recognition model directory.",
  902. )
  903. subparser.add_argument(
  904. "--seal_text_recognition_batch_size",
  905. type=int,
  906. help="Batch size for the seal text recognition model.",
  907. )
  908. subparser.add_argument(
  909. "--seal_rec_score_thresh",
  910. type=float,
  911. help="Seal text recognition threshold. Text results with scores greater than this threshold are retained.",
  912. )
  913. subparser.add_argument(
  914. "--formula_recognition_model_name",
  915. type=str,
  916. help="Name of the formula recognition model.",
  917. )
  918. subparser.add_argument(
  919. "--formula_recognition_model_dir",
  920. type=str,
  921. help="Path to the formula recognition model directory.",
  922. )
  923. subparser.add_argument(
  924. "--formula_recognition_batch_size",
  925. type=int,
  926. help="Batch size for the formula recognition model.",
  927. )
  928. subparser.add_argument(
  929. "--use_doc_orientation_classify",
  930. type=str2bool,
  931. help="Whether to use document image orientation classification.",
  932. )
  933. subparser.add_argument(
  934. "--use_doc_unwarping",
  935. type=str2bool,
  936. help="Whether to use text image unwarping.",
  937. )
  938. subparser.add_argument(
  939. "--use_textline_orientation",
  940. type=str2bool,
  941. help="Whether to use text line orientation classification.",
  942. )
  943. subparser.add_argument(
  944. "--use_seal_recognition",
  945. type=str2bool,
  946. help="Whether to use seal recognition.",
  947. )
  948. subparser.add_argument(
  949. "--use_table_recognition",
  950. type=str2bool,
  951. help="Whether to use table recognition.",
  952. )
  953. subparser.add_argument(
  954. "--use_formula_recognition",
  955. type=str2bool,
  956. help="Whether to use formula recognition.",
  957. )
  958. subparser.add_argument(
  959. "--use_chart_recognition",
  960. type=str2bool,
  961. help="Whether to use chart recognition.",
  962. )
  963. subparser.add_argument(
  964. "--use_region_detection",
  965. type=str2bool,
  966. help="Whether to use region detection.",
  967. )
  def execute_with_args(self, args):
      """Run simple inference with ``PPStructureV3`` for the given CLI args.

      Args:
          args: Parsed command-line arguments (presumably an argparse
              namespace -- confirm against the caller). They are forwarded
              to ``get_subcommand_args``; its result is then handed to
              ``perform_simple_inference`` together with ``PPStructureV3``.
      """
      # Resolve the parameters first, then dispatch in a single call.
      subcommand_params = get_subcommand_args(args)
      perform_simple_inference(PPStructureV3, subcommand_params)