pp_chatocrv4_doc.py 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745
  1. # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from .._utils.cli import (
  15. get_subcommand_args,
  16. str2bool,
  17. )
  18. from .base import PaddleXPipelineWrapper, PipelineCLISubcommandExecutor
  19. from .utils import create_config_from_structure
  20. class PPChatOCRv4Doc(PaddleXPipelineWrapper):
  21. def __init__(
  22. self,
  23. layout_detection_model_name=None,
  24. layout_detection_model_dir=None,
  25. doc_orientation_classify_model_name=None,
  26. doc_orientation_classify_model_dir=None,
  27. doc_unwarping_model_name=None,
  28. doc_unwarping_model_dir=None,
  29. text_detection_model_name=None,
  30. text_detection_model_dir=None,
  31. textline_orientation_model_name=None,
  32. textline_orientation_model_dir=None,
  33. textline_orientation_batch_size=None,
  34. text_recognition_model_name=None,
  35. text_recognition_model_dir=None,
  36. text_recognition_batch_size=None,
  37. table_structure_recognition_model_name=None,
  38. table_structure_recognition_model_dir=None,
  39. seal_text_detection_model_name=None,
  40. seal_text_detection_model_dir=None,
  41. seal_text_recognition_model_name=None,
  42. seal_text_recognition_model_dir=None,
  43. seal_text_recognition_batch_size=None,
  44. use_doc_orientation_classify=None,
  45. use_doc_unwarping=None,
  46. use_textline_orientation=None,
  47. use_seal_recognition=None,
  48. use_table_recognition=None,
  49. layout_threshold=None,
  50. layout_nms=None,
  51. layout_unclip_ratio=None,
  52. layout_merge_bboxes_mode=None,
  53. text_det_limit_side_len=None,
  54. text_det_limit_type=None,
  55. text_det_thresh=None,
  56. text_det_box_thresh=None,
  57. text_det_unclip_ratio=None,
  58. text_rec_score_thresh=None,
  59. seal_det_limit_side_len=None,
  60. seal_det_limit_type=None,
  61. seal_det_thresh=None,
  62. seal_det_box_thresh=None,
  63. seal_det_unclip_ratio=None,
  64. seal_rec_score_thresh=None,
  65. retriever_config=None,
  66. mllm_chat_bot_config=None,
  67. chat_bot_config=None,
  68. **kwargs,
  69. ):
  70. params = locals().copy()
  71. params.pop("self")
  72. params.pop("kwargs")
  73. self._params = params
  74. super().__init__(**kwargs)
  75. @property
  76. def _paddlex_pipeline_name(self):
  77. return "PP-ChatOCRv4-doc"
  78. def save_vector(self, vector_info, save_path, retriever_config=None):
  79. return self.paddlex_pipeline.save_vector(
  80. vector_info=vector_info,
  81. save_path=save_path,
  82. retriever_config=retriever_config,
  83. )
  84. def load_vector(self, data_path, retriever_config=None):
  85. return self.paddlex_pipeline.load_vector(
  86. data_path=data_path, retriever_config=retriever_config
  87. )
  88. def load_visual_info_list(self, data_path):
  89. return self.paddlex_pipeline.load_visual_info_list(data_path=data_path)
  90. def save_visual_info_list(self, visual_info, save_path):
  91. return self.paddlex_pipeline.save_visual_info_list(
  92. visual_info=visual_info, save_path=save_path
  93. )
  94. def visual_predict_iter(
  95. self,
  96. input,
  97. *,
  98. use_doc_orientation_classify=None,
  99. use_doc_unwarping=None,
  100. use_textline_orientation=None,
  101. use_seal_recognition=None,
  102. use_table_recognition=None,
  103. layout_threshold=None,
  104. layout_nms=None,
  105. layout_unclip_ratio=None,
  106. layout_merge_bboxes_mode=None,
  107. text_det_limit_side_len=None,
  108. text_det_limit_type=None,
  109. text_det_thresh=None,
  110. text_det_box_thresh=None,
  111. text_det_unclip_ratio=None,
  112. text_rec_score_thresh=None,
  113. seal_det_limit_side_len=None,
  114. seal_det_limit_type=None,
  115. seal_det_thresh=None,
  116. seal_det_box_thresh=None,
  117. seal_det_unclip_ratio=None,
  118. seal_rec_score_thresh=None,
  119. **kwargs,
  120. ):
  121. return self.paddlex_pipeline.visual_predict(
  122. input,
  123. use_doc_orientation_classify=use_doc_orientation_classify,
  124. use_doc_unwarping=use_doc_unwarping,
  125. use_textline_orientation=use_textline_orientation,
  126. use_seal_recognition=use_seal_recognition,
  127. use_table_recognition=use_table_recognition,
  128. layout_threshold=layout_threshold,
  129. layout_nms=layout_nms,
  130. layout_unclip_ratio=layout_unclip_ratio,
  131. layout_merge_bboxes_mode=layout_merge_bboxes_mode,
  132. text_det_limit_side_len=text_det_limit_side_len,
  133. text_det_limit_type=text_det_limit_type,
  134. text_det_thresh=text_det_thresh,
  135. text_det_box_thresh=text_det_box_thresh,
  136. text_det_unclip_ratio=text_det_unclip_ratio,
  137. text_rec_score_thresh=text_rec_score_thresh,
  138. seal_det_limit_side_len=seal_det_limit_side_len,
  139. seal_det_limit_type=seal_det_limit_type,
  140. seal_det_thresh=seal_det_thresh,
  141. seal_det_box_thresh=seal_det_box_thresh,
  142. seal_det_unclip_ratio=seal_det_unclip_ratio,
  143. seal_rec_score_thresh=seal_rec_score_thresh,
  144. **kwargs,
  145. )
  146. def visual_predict(
  147. self,
  148. input,
  149. *,
  150. use_doc_orientation_classify=None,
  151. use_doc_unwarping=None,
  152. use_textline_orientation=None,
  153. use_seal_recognition=None,
  154. use_table_recognition=None,
  155. layout_threshold=None,
  156. layout_nms=None,
  157. layout_unclip_ratio=None,
  158. layout_merge_bboxes_mode=None,
  159. text_det_limit_side_len=None,
  160. text_det_limit_type=None,
  161. text_det_thresh=None,
  162. text_det_box_thresh=None,
  163. text_det_unclip_ratio=None,
  164. text_rec_score_thresh=None,
  165. seal_det_limit_side_len=None,
  166. seal_det_limit_type=None,
  167. seal_det_thresh=None,
  168. seal_det_box_thresh=None,
  169. seal_det_unclip_ratio=None,
  170. seal_rec_score_thresh=None,
  171. **kwargs,
  172. ):
  173. return list(
  174. self.visual_predict_iter(
  175. input,
  176. use_doc_orientation_classify=use_doc_orientation_classify,
  177. use_doc_unwarping=use_doc_unwarping,
  178. use_textline_orientation=use_textline_orientation,
  179. use_seal_recognition=use_seal_recognition,
  180. use_table_recognition=use_table_recognition,
  181. layout_threshold=layout_threshold,
  182. layout_nms=layout_nms,
  183. layout_unclip_ratio=layout_unclip_ratio,
  184. layout_merge_bboxes_mode=layout_merge_bboxes_mode,
  185. text_det_limit_side_len=text_det_limit_side_len,
  186. text_det_limit_type=text_det_limit_type,
  187. text_det_thresh=text_det_thresh,
  188. text_det_box_thresh=text_det_box_thresh,
  189. text_det_unclip_ratio=text_det_unclip_ratio,
  190. text_rec_score_thresh=text_rec_score_thresh,
  191. seal_det_limit_side_len=seal_det_limit_side_len,
  192. seal_det_limit_type=seal_det_limit_type,
  193. seal_det_thresh=seal_det_thresh,
  194. seal_det_box_thresh=seal_det_box_thresh,
  195. seal_det_unclip_ratio=seal_det_unclip_ratio,
  196. seal_rec_score_thresh=seal_rec_score_thresh,
  197. **kwargs,
  198. )
  199. )
  200. def build_vector(
  201. self,
  202. visual_info,
  203. *,
  204. min_characters=3500,
  205. block_size=300,
  206. flag_save_bytes_vector=False,
  207. retriever_config=None,
  208. ):
  209. return self.paddlex_pipeline.build_vector(
  210. visual_info,
  211. min_characters=min_characters,
  212. block_size=block_size,
  213. flag_save_bytes_vector=flag_save_bytes_vector,
  214. retriever_config=retriever_config,
  215. )
  216. def mllm_pred(self, input, key_list, *, mllm_chat_bot_config=None):
  217. return self.paddlex_pipeline.mllm_pred(
  218. input,
  219. key_list,
  220. mllm_chat_bot_config=mllm_chat_bot_config,
  221. )
  222. def chat(
  223. self,
  224. key_list,
  225. visual_info,
  226. *,
  227. use_vector_retrieval=True,
  228. vector_info=None,
  229. min_characters=3500,
  230. text_task_description=None,
  231. text_output_format=None,
  232. text_rules_str=None,
  233. text_few_shot_demo_text_content=None,
  234. text_few_shot_demo_key_value_list=None,
  235. table_task_description=None,
  236. table_output_format=None,
  237. table_rules_str=None,
  238. table_few_shot_demo_text_content=None,
  239. table_few_shot_demo_key_value_list=None,
  240. mllm_predict_info=None,
  241. mllm_integration_strategy="integration",
  242. chat_bot_config=None,
  243. retriever_config=None,
  244. ):
  245. return self.paddlex_pipeline.chat(
  246. key_list,
  247. visual_info,
  248. use_vector_retrieval=use_vector_retrieval,
  249. vector_info=vector_info,
  250. min_characters=min_characters,
  251. text_task_description=text_task_description,
  252. text_output_format=text_output_format,
  253. text_rules_str=text_rules_str,
  254. text_few_shot_demo_text_content=text_few_shot_demo_text_content,
  255. text_few_shot_demo_key_value_list=text_few_shot_demo_key_value_list,
  256. table_task_description=table_task_description,
  257. table_output_format=table_output_format,
  258. table_rules_str=table_rules_str,
  259. table_few_shot_demo_text_content=table_few_shot_demo_text_content,
  260. table_few_shot_demo_key_value_list=table_few_shot_demo_key_value_list,
  261. mllm_predict_info=mllm_predict_info,
  262. mllm_integration_strategy=mllm_integration_strategy,
  263. chat_bot_config=chat_bot_config,
  264. retriever_config=retriever_config,
  265. )
  266. @classmethod
  267. def get_cli_subcommand_executor(cls):
  268. return PPChatOCRv4DocCLISubcommandExecutor()
  269. def _get_paddlex_config_overrides(self):
  270. STRUCTURE = {
  271. "SubPipelines.LayoutParser.SubModules.LayoutDetection.model_name": self._params[
  272. "layout_detection_model_name"
  273. ],
  274. "SubPipelines.LayoutParser.SubModules.LayoutDetection.model_dir": self._params[
  275. "layout_detection_model_dir"
  276. ],
  277. "SubPipelines.LayoutParser.SubPipelines.DocPreprocessor.SubModules.DocOrientationClassify.model_name": self._params[
  278. "doc_orientation_classify_model_name"
  279. ],
  280. "SubPipelines.LayoutParser.SubPipelines.DocPreprocessor.SubModules.DocOrientationClassify.model_dir": self._params[
  281. "doc_orientation_classify_model_dir"
  282. ],
  283. "SubPipelines.LayoutParser.SubPipelines.DocPreprocessor.SubModules.DocUnwarping.model_name": self._params[
  284. "doc_unwarping_model_name"
  285. ],
  286. "SubPipelines.LayoutParser.SubPipelines.DocPreprocessor.SubModules.DocUnwarping.model_dir": self._params[
  287. "doc_unwarping_model_dir"
  288. ],
  289. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.model_name": self._params[
  290. "text_detection_model_name"
  291. ],
  292. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.model_dir": self._params[
  293. "text_detection_model_dir"
  294. ],
  295. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.model_name": self._params[
  296. "textline_orientation_model_name"
  297. ],
  298. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.model_dir": self._params[
  299. "textline_orientation_model_dir"
  300. ],
  301. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextLineOrientation.batch_size": self._params[
  302. "textline_orientation_batch_size"
  303. ],
  304. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextRecognition.model_name": self._params[
  305. "text_recognition_model_name"
  306. ],
  307. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextRecognition.model_dir": self._params[
  308. "text_recognition_model_dir"
  309. ],
  310. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextRecognition.batch_size": self._params[
  311. "text_recognition_batch_size"
  312. ],
  313. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubModules.TableStructureRecognition.model_name": self._params[
  314. "table_structure_recognition_model_name"
  315. ],
  316. "SubPipelines.LayoutParser.SubPipelines.TableRecognition.SubModules.TableStructureRecognition.model_dir": self._params[
  317. "table_structure_recognition_model_dir"
  318. ],
  319. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.model_name": self._params[
  320. "seal_text_detection_model_name"
  321. ],
  322. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.model_dir": self._params[
  323. "seal_text_detection_model_dir"
  324. ],
  325. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextRecognition.model_name": self._params[
  326. "seal_text_recognition_model_name"
  327. ],
  328. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextRecognition.model_dir": self._params[
  329. "seal_text_recognition_model_dir"
  330. ],
  331. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextRecognition.batch_size": self._params[
  332. "seal_text_recognition_batch_size"
  333. ],
  334. "SubPipelines.LayoutParser.SubPipelines.DocPreprocessor.use_doc_orientation_classify": self._params[
  335. "use_doc_orientation_classify"
  336. ],
  337. "SubPipelines.LayoutParser.SubPipelines.DocPreprocessor.use_doc_unwarping": self._params[
  338. "use_doc_unwarping"
  339. ],
  340. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.use_textline_orientation": self._params[
  341. "use_textline_orientation"
  342. ],
  343. "SubPipelines.LayoutParser.use_doc_preprocessor": self._params[
  344. "use_doc_orientation_classify"
  345. ]
  346. or self._params["use_doc_unwarping"],
  347. "SubPipelines.LayoutParser.use_seal_recognition": self._params[
  348. "use_seal_recognition"
  349. ],
  350. "SubPipelines.LayoutParser.use_table_recognition": self._params[
  351. "use_table_recognition"
  352. ],
  353. "SubPipelines.LayoutParser.SubModules.LayoutDetection.threshold": self._params[
  354. "layout_threshold"
  355. ],
  356. "SubPipelines.LayoutParser.SubModules.LayoutDetection.nms": self._params[
  357. "layout_nms"
  358. ],
  359. "SubPipelines.LayoutParser.SubModules.LayoutDetection.unclip_ratio": self._params[
  360. "layout_unclip_ratio"
  361. ],
  362. "SubPipelines.LayoutParser.SubModules.LayoutDetection.merge_bboxes_mode": self._params[
  363. "layout_merge_bboxes_mode"
  364. ],
  365. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.limit_side_len": self._params[
  366. "text_det_limit_side_len"
  367. ],
  368. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.limit_type": self._params[
  369. "text_det_limit_type"
  370. ],
  371. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.thresh": self._params[
  372. "text_det_thresh"
  373. ],
  374. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.box_thresh": self._params[
  375. "text_det_box_thresh"
  376. ],
  377. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextDetection.unclip_ratio": self._params[
  378. "text_det_unclip_ratio"
  379. ],
  380. "SubPipelines.LayoutParser.SubPipelines.GeneralOCR.SubModules.TextRecognition.score_thresh": self._params[
  381. "text_rec_score_thresh"
  382. ],
  383. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.limit_side_len": self._params[
  384. "text_det_limit_side_len"
  385. ],
  386. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.limit_type": self._params[
  387. "seal_det_limit_type"
  388. ],
  389. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.thresh": self._params[
  390. "seal_det_thresh"
  391. ],
  392. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.box_thresh": self._params[
  393. "seal_det_box_thresh"
  394. ],
  395. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextDetection.unclip_ratio": self._params[
  396. "seal_det_unclip_ratio"
  397. ],
  398. "SubPipelines.LayoutParser.SubPipelines.SealRecognition.SubPipelines.SealOCR.SubModules.TextRecognition.score_thresh": self._params[
  399. "seal_rec_score_thresh"
  400. ],
  401. "SubModules.LLM_Retriever": self._params["retriever_config"],
  402. "SubModules.MLLM_Chat": self._params["mllm_chat_bot_config"],
  403. "SubModules.LLM_Chat": self._params["chat_bot_config"],
  404. }
  405. return create_config_from_structure(STRUCTURE)
  406. class PPChatOCRv4DocCLISubcommandExecutor(PipelineCLISubcommandExecutor):
  407. @property
  408. def subparser_name(self):
  409. return "pp_chatocrv4_doc"
  410. def _update_subparser(self, subparser):
  411. subparser.add_argument(
  412. "-i",
  413. "--input",
  414. type=str,
  415. required=True,
  416. help="Input path or URL.",
  417. )
  418. subparser.add_argument(
  419. "-k",
  420. "--keys",
  421. type=str,
  422. nargs="+",
  423. required=True,
  424. metavar="KEY",
  425. help="Keys use for information extraction.",
  426. )
  427. subparser.add_argument(
  428. "--save_path",
  429. type=str,
  430. help="Path to the output directory.",
  431. )
  432. subparser.add_argument(
  433. "--invoke_mllm",
  434. type=str2bool,
  435. default=False,
  436. help="Whether to invoke the multimodal large language model.",
  437. )
  438. subparser.add_argument(
  439. "--layout_detection_model_name",
  440. type=str,
  441. help="Name of the layout detection model.",
  442. )
  443. subparser.add_argument(
  444. "--layout_detection_model_dir",
  445. type=str,
  446. help="Path to the layout detection model directory.",
  447. )
  448. subparser.add_argument(
  449. "--doc_orientation_classify_model_name",
  450. type=str,
  451. help="Name of the document image orientation classification model.",
  452. )
  453. subparser.add_argument(
  454. "--doc_orientation_classify_model_dir",
  455. type=str,
  456. help="Path to the document image orientation classification model directory.",
  457. )
  458. subparser.add_argument(
  459. "--doc_unwarping_model_name",
  460. type=str,
  461. help="Name of the text image unwarping model.",
  462. )
  463. subparser.add_argument(
  464. "--doc_unwarping_model_dir",
  465. type=str,
  466. help="Path to the image unwarping model directory.",
  467. )
  468. subparser.add_argument(
  469. "--text_detection_model_name",
  470. type=str,
  471. help="Name of the text detection model.",
  472. )
  473. subparser.add_argument(
  474. "--text_detection_model_dir",
  475. type=str,
  476. help="Path to the text detection model directory.",
  477. )
  478. subparser.add_argument(
  479. "--textline_orientation_model_name",
  480. type=str,
  481. help="Name of the text line orientation classification model.",
  482. )
  483. subparser.add_argument(
  484. "--textline_orientation_model_dir",
  485. type=str,
  486. help="Path to the text line orientation classification model directory.",
  487. )
  488. subparser.add_argument(
  489. "--textline_orientation_batch_size",
  490. type=int,
  491. help="Batch size for the text line orientation classification model.",
  492. )
  493. subparser.add_argument(
  494. "--text_recognition_model_name",
  495. type=str,
  496. help="Name of the text recognition model.",
  497. )
  498. subparser.add_argument(
  499. "--text_recognition_model_dir",
  500. type=str,
  501. help="Path to the text recognition model directory.",
  502. )
  503. subparser.add_argument(
  504. "--text_recognition_batch_size",
  505. type=int,
  506. help="Batch size for the text recognition model.",
  507. )
  508. subparser.add_argument(
  509. "--table_structure_recognition_model_name",
  510. type=str,
  511. help="Name of the table structure recognition model.",
  512. )
  513. subparser.add_argument(
  514. "--table_structure_recognition_model_dir",
  515. type=str,
  516. help="Path to the table structure recognition model directory.",
  517. )
  518. subparser.add_argument(
  519. "--seal_text_detection_model_name",
  520. type=str,
  521. help="Name of the seal text detection model.",
  522. )
  523. subparser.add_argument(
  524. "--seal_text_detection_model_dir",
  525. type=str,
  526. help="Path to the seal text detection model directory.",
  527. )
  528. subparser.add_argument(
  529. "--seal_text_recognition_model_name",
  530. type=str,
  531. help="Name of the seal text recognition model.",
  532. )
  533. subparser.add_argument(
  534. "--seal_text_recognition_model_dir",
  535. type=str,
  536. help="Path to the seal text recognition model directory.",
  537. )
  538. subparser.add_argument(
  539. "--seal_text_recognition_batch_size",
  540. type=int,
  541. help="Batch size for the seal text recognition model.",
  542. )
  543. subparser.add_argument(
  544. "--use_doc_orientation_classify",
  545. type=str2bool,
  546. help="Whether to use document image orientation classification.",
  547. )
  548. subparser.add_argument(
  549. "--use_doc_unwarping",
  550. type=str2bool,
  551. help="Whether to use text image unwarping.",
  552. )
  553. subparser.add_argument(
  554. "--use_textline_orientation",
  555. type=str2bool,
  556. help="Whether to use text line orientation classification.",
  557. )
  558. subparser.add_argument(
  559. "--use_seal_recognition",
  560. type=str2bool,
  561. help="Whether to use seal recognition.",
  562. )
  563. subparser.add_argument(
  564. "--use_table_recognition",
  565. type=str2bool,
  566. help="Whether to use table recognition.",
  567. )
  568. # TODO: Support dict and list types
  569. subparser.add_argument(
  570. "--layout_threshold",
  571. type=float,
  572. help="Score threshold for the layout detection model.",
  573. )
  574. subparser.add_argument(
  575. "--layout_nms",
  576. type=str2bool,
  577. help="Whether to use NMS in layout detection.",
  578. )
  579. subparser.add_argument(
  580. "--layout_unclip_ratio",
  581. type=float,
  582. help="Expansion coefficient for layout detection.",
  583. )
  584. subparser.add_argument(
  585. "--layout_merge_bboxes_mode",
  586. type=str,
  587. help="Overlapping box filtering method.",
  588. )
  589. subparser.add_argument(
  590. "--text_det_limit_side_len",
  591. type=int,
  592. help="This sets a limit on the side length of the input image for the text detection model.",
  593. )
  594. subparser.add_argument(
  595. "--text_det_limit_type",
  596. type=str,
  597. help="This determines how the side length limit is applied to the input image before feeding it into the text deteciton model.",
  598. )
  599. subparser.add_argument(
  600. "--text_det_thresh",
  601. type=float,
  602. help="Detection pixel threshold for the text detection model. Pixels with scores greater than this threshold in the output probability map are considered text pixels.",
  603. )
  604. subparser.add_argument(
  605. "--text_det_box_thresh",
  606. type=float,
  607. help="Detection box threshold for the text detection model. A detection result is considered a text region if the average score of all pixels within the border of the result is greater than this threshold.",
  608. )
  609. subparser.add_argument(
  610. "--text_det_unclip_ratio",
  611. type=float,
  612. help="Text detection expansion coefficient, which expands the text region using this method. The larger the value, the larger the expansion area.",
  613. )
  614. subparser.add_argument(
  615. "--text_rec_score_thresh",
  616. type=float,
  617. help="Text recognition threshold used in general OCR. Text results with scores greater than this threshold are retained.",
  618. )
  619. subparser.add_argument(
  620. "--seal_det_limit_side_len",
  621. type=int,
  622. help="This sets a limit on the side length of the input image for the seal text detection model.",
  623. )
  624. subparser.add_argument(
  625. "--seal_det_limit_type",
  626. type=str,
  627. help="This determines how the side length limit is applied to the input image before feeding it into the seal text deteciton model.",
  628. )
  629. subparser.add_argument(
  630. "--seal_det_thresh",
  631. type=float,
  632. help="Detection pixel threshold for the seal text detection model. Pixels with scores greater than this threshold in the output probability map are considered text pixels.",
  633. )
  634. subparser.add_argument(
  635. "--seal_det_box_thresh",
  636. type=float,
  637. help="Detection box threshold for the seal text detection model. A detection result is considered a text region if the average score of all pixels within the border of the result is greater than this threshold.",
  638. )
  639. subparser.add_argument(
  640. "--seal_det_unclip_ratio",
  641. type=float,
  642. help="Seal text detection expansion coefficient, which expands the text region using this method. The larger the value, the larger the expansion area.",
  643. )
  644. subparser.add_argument(
  645. "--seal_rec_score_thresh",
  646. type=float,
  647. help="Seal text recognition threshold. Text results with scores greater than this threshold are retained.",
  648. )
  649. # FIXME: Passing API key through CLI is not secure; consider using
  650. # environment variables.
  651. subparser.add_argument(
  652. "--qianfan_api_key",
  653. type=str,
  654. help="Configuration for the embedding model.",
  655. )
  656. subparser.add_argument(
  657. "--pp_docbee_base_url",
  658. type=str,
  659. help="Configuration for the multimodal large language model.",
  660. )
  661. def execute_with_args(self, args):
  662. params = get_subcommand_args(args)
  663. input = params.pop("input")
  664. keys = params.pop("keys")
  665. save_path = params.pop("save_path")
  666. invoke_mllm = params.pop("invoke_mllm")
  667. qianfan_api_key = params.pop("qianfan_api_key")
  668. if qianfan_api_key is not None:
  669. params["retriever_config"] = {
  670. "module_name": "retriever",
  671. "model_name": "embedding-v1",
  672. "base_url": "https://qianfan.baidubce.com/v2",
  673. "api_type": "qianfan",
  674. "api_key": qianfan_api_key,
  675. }
  676. params["chat_bot_config"] = {
  677. "module_name": "chat_bot",
  678. "model_name": "ernie-3.5-8k",
  679. "base_url": "https://qianfan.baidubce.com/v2",
  680. "api_type": "openai",
  681. "api_key": qianfan_api_key,
  682. }
  683. pp_docbee_base_url = params.pop("pp_docbee_base_url")
  684. if pp_docbee_base_url is not None:
  685. params["mllm_chat_bot_config"] = {
  686. "module_name": "chat_bot",
  687. "model_name": "PP-DocBee",
  688. # PaddleX requires endpoints such as ".../chat/completions",
  689. # which, as the parameter name suggests, are not base URLs.
  690. "base_url": pp_docbee_base_url,
  691. "api_type": "openai",
  692. "api_key": "fake_key",
  693. }
  694. chatocr = PPChatOCRv4Doc(**params)
  695. result_visual = chatocr.visual_predict_iter(input)
  696. visual_info_list = []
  697. for res in result_visual:
  698. visual_info_list.append(res["visual_info"])
  699. if save_path:
  700. res["layout_parsing_result"].save_all(save_path)
  701. vector_info = chatocr.build_vector(visual_info_list)
  702. if invoke_mllm:
  703. result_mllm = chatocr.mllm_pred(input, keys)
  704. mllm_predict_info = result_mllm["mllm_res"]
  705. else:
  706. mllm_predict_info = None
  707. result_chat = chatocr.chat(
  708. keys,
  709. visual_info_list,
  710. vector_info=vector_info,
  711. mllm_predict_info=mllm_predict_info,
  712. )
  713. # Print the result to stdout
  714. for k, v in result_chat["chat_res"].items():
  715. print(f"{k} {v}")