# pipeline_inputs.py (15 KB)
  1. # Copyright (c) Alibaba, Inc. and its affiliates.
  2. import numpy as np
  3. from PIL import Image
  4. from modelscope.utils.constant import Tasks
  5. class InputKeys(object):
  6. IMAGE = 'image'
  7. TEXT = 'text'
  8. VIDEO = 'video'
  9. class InputType(object):
  10. IMAGE = 'image'
  11. TEXT = 'text'
  12. AUDIO = 'audio'
  13. VIDEO = 'video'
  14. BOX = 'box'
  15. DICT = 'dict'
  16. LIST = 'list'
  17. NUMBER = 'number'
  18. INPUT_TYPE = {
  19. InputType.IMAGE: (str, np.ndarray, Image.Image),
  20. InputType.TEXT: str,
  21. InputType.AUDIO: (str, bytes, np.ndarray),
  22. InputType.VIDEO: (str, np.ndarray, 'cv2.VideoCapture'),
  23. InputType.BOX: (list, np.ndarray),
  24. InputType.DICT: (dict, type(None)),
  25. InputType.LIST: (list, type(None)),
  26. InputType.NUMBER: int,
  27. }
  28. INPUT_TYPE_SCHEMA = {
  29. InputType.IMAGE: {
  30. 'type': 'string',
  31. 'description': 'Base64 encoded image file or url string.'
  32. }, # support url or base64 encoded file.
  33. InputType.AUDIO: {
  34. 'type': 'string',
  35. 'description': 'Base64 encoded audio file or url string..'
  36. }, # support url or base64 encoded file.
  37. InputType.VIDEO: {
  38. 'type': 'string',
  39. 'description': 'Base64 encoded video file or url string..'
  40. }, # support url or base64 encoded file.
  41. InputType.TEXT: {
  42. 'type': 'string',
  43. 'description': 'The input text.'
  44. },
  45. InputType.BOX: {
  46. 'type': 'array',
  47. 'description': 'Box coordinate, should be int.',
  48. 'items': {
  49. 'type': 'number'
  50. }
  51. },
  52. InputType.DICT: { # unknown properties
  53. 'type': 'object',
  54. },
  55. InputType.LIST: {
  56. 'type': 'array'
  57. }, # unknown item type.
  58. InputType.NUMBER: {
  59. 'type': 'integer'
  60. },
  61. }
  62. def check_input_type(input_type, input):
  63. expected_type = INPUT_TYPE[input_type]
  64. if input_type == InputType.VIDEO:
  65. # special type checking using class name, to avoid introduction of opencv dependency into fundamental framework.
  66. assert type(input).__name__ == 'VideoCapture' or isinstance(input, expected_type),\
  67. f'invalid input type for {input_type}, expected {expected_type} but got {type(input)}\n {input}'
  68. else:
  69. assert isinstance(input, expected_type), \
  70. f'invalid input type for {input_type}, expected {expected_type} but got {type(input)}\n {input}'
  71. TASK_INPUTS = {
  72. # if task input is single var, value is InputType
  73. # if task input is a tuple, value is tuple of InputType
  74. # if task input is a dict, value is a dict of InputType, where key
  75. # equals the one needed in pipeline input dict
  76. # if task input is a list, value is a set of input format, in which
  77. # each element corresponds to one input format as described above and
  78. # must include a dict format.
  79. Tasks.task_template: {
  80. 'image': InputType.IMAGE,
  81. 'text': InputType.TEXT
  82. },
  83. # ============ vision tasks ===================
  84. Tasks.image_text_retrieval: {
  85. InputKeys.IMAGE: InputType.IMAGE,
  86. InputKeys.TEXT: InputType.TEXT
  87. },
  88. Tasks.general_recognition:
  89. InputType.IMAGE,
  90. Tasks.video_depth_estimation: {
  91. InputKeys.IMAGE: InputType.IMAGE,
  92. InputKeys.TEXT: InputType.TEXT
  93. },
  94. Tasks.indoor_layout_estimation:
  95. InputType.IMAGE,
  96. Tasks.image_demoireing:
  97. InputType.IMAGE,
  98. Tasks.panorama_depth_estimation:
  99. InputType.IMAGE,
  100. Tasks.video_depth_estimation:
  101. InputType.VIDEO,
  102. Tasks.animal_recognition:
  103. InputType.IMAGE,
  104. Tasks.motion_generation:
  105. InputType.TEXT,
  106. Tasks.video_panoptic_segmentation:
  107. InputType.VIDEO,
  108. Tasks.ocr_detection:
  109. InputType.IMAGE,
  110. Tasks.ocr_recognition:
  111. InputType.IMAGE,
  112. Tasks.face_2d_keypoints:
  113. InputType.IMAGE,
  114. Tasks.face_liveness:
  115. InputType.IMAGE,
  116. Tasks.face_quality_assessment:
  117. InputType.IMAGE,
  118. Tasks.card_detection:
  119. InputType.IMAGE,
  120. Tasks.license_plate_detection:
  121. InputType.IMAGE,
  122. Tasks.card_detection_correction:
  123. InputType.IMAGE,
  124. Tasks.lineless_table_recognition:
  125. InputType.IMAGE,
  126. Tasks.table_recognition:
  127. InputType.IMAGE,
  128. Tasks.face_detection:
  129. InputType.IMAGE,
  130. Tasks.facial_expression_recognition:
  131. InputType.IMAGE,
  132. Tasks.face_attribute_recognition:
  133. InputType.IMAGE,
  134. Tasks.face_recognition:
  135. InputType.IMAGE,
  136. Tasks.face_reconstruction:
  137. InputType.IMAGE,
  138. Tasks.head_reconstruction:
  139. InputType.IMAGE,
  140. Tasks.text_to_head:
  141. InputType.TEXT,
  142. Tasks.human_detection:
  143. InputType.IMAGE,
  144. Tasks.face_image_generation:
  145. InputType.NUMBER,
  146. Tasks.image_classification:
  147. InputType.IMAGE,
  148. Tasks.image_quality_assessment_mos:
  149. InputType.IMAGE,
  150. Tasks.image_quality_assessment_degradation:
  151. InputType.IMAGE,
  152. Tasks.image_object_detection:
  153. InputType.IMAGE,
  154. Tasks.domain_specific_object_detection:
  155. InputType.IMAGE,
  156. Tasks.human_wholebody_keypoint:
  157. InputType.IMAGE,
  158. Tasks.image_segmentation:
  159. InputType.IMAGE,
  160. Tasks.portrait_matting:
  161. InputType.IMAGE,
  162. Tasks.universal_matting:
  163. InputType.IMAGE,
  164. Tasks.product_segmentation:
  165. InputType.IMAGE,
  166. Tasks.semantic_segmentation:
  167. InputType.IMAGE,
  168. Tasks.face_human_hand_detection:
  169. InputType.IMAGE,
  170. Tasks.hand_static:
  171. InputType.IMAGE,
  172. Tasks.image_fewshot_detection:
  173. InputType.IMAGE,
  174. Tasks.open_vocabulary_detection: {
  175. 'img': InputType.IMAGE,
  176. 'category_names': InputType.TEXT
  177. },
  178. Tasks.image_driving_perception:
  179. InputType.IMAGE,
  180. Tasks.vision_efficient_tuning:
  181. InputType.IMAGE,
  182. # image editing task result for a single image
  183. Tasks.skin_retouching:
  184. InputType.IMAGE,
  185. Tasks.image_super_resolution:
  186. InputType.IMAGE,
  187. Tasks.image_colorization:
  188. InputType.IMAGE,
  189. Tasks.image_color_enhancement:
  190. InputType.IMAGE,
  191. Tasks.image_denoising:
  192. InputType.IMAGE,
  193. Tasks.image_body_reshaping:
  194. InputType.IMAGE,
  195. Tasks.image_portrait_enhancement:
  196. InputType.IMAGE,
  197. Tasks.crowd_counting:
  198. InputType.IMAGE,
  199. Tasks.image_super_resolution_pasd: {
  200. 'image': InputType.IMAGE,
  201. 'prompt': InputType.TEXT,
  202. },
  203. Tasks.image_inpainting: {
  204. 'img': InputType.IMAGE,
  205. 'mask': InputType.IMAGE,
  206. },
  207. Tasks.image_paintbyexample: {
  208. 'img': InputType.IMAGE,
  209. 'mask': InputType.IMAGE,
  210. 'reference': InputType.IMAGE,
  211. },
  212. Tasks.image_skychange: {
  213. 'sky_image': InputType.IMAGE,
  214. 'scene_image': InputType.IMAGE,
  215. },
  216. Tasks.controllable_image_generation: {
  217. 'image': InputType.IMAGE,
  218. 'prompt': InputType.TEXT,
  219. },
  220. Tasks.image_face_fusion: {
  221. 'template': InputType.IMAGE,
  222. 'user': InputType.IMAGE,
  223. },
  224. Tasks.image_deblurring:
  225. InputType.IMAGE,
  226. Tasks.video_colorization:
  227. InputType.VIDEO,
  228. # image generation task result for a single image
  229. Tasks.image_to_image_generation: [
  230. InputType.IMAGE,
  231. (InputType.IMAGE, InputType.IMAGE, InputType.IMAGE, InputType.IMAGE)
  232. ],
  233. Tasks.image_to_image_translation:
  234. InputType.IMAGE,
  235. Tasks.image_style_transfer: {
  236. 'content': InputType.IMAGE,
  237. 'style': InputType.IMAGE,
  238. },
  239. Tasks.image_portrait_stylization:
  240. InputType.IMAGE,
  241. Tasks.live_category:
  242. InputType.VIDEO,
  243. Tasks.action_recognition:
  244. InputType.VIDEO,
  245. Tasks.body_2d_keypoints:
  246. InputType.IMAGE,
  247. Tasks.body_3d_keypoints:
  248. InputType.VIDEO,
  249. Tasks.hand_2d_keypoints:
  250. InputType.IMAGE,
  251. Tasks.pedestrian_attribute_recognition:
  252. InputType.IMAGE,
  253. Tasks.video_single_object_tracking: (InputType.VIDEO, InputType.BOX),
  254. Tasks.video_multi_object_tracking:
  255. InputType.VIDEO,
  256. Tasks.video_category:
  257. InputType.VIDEO,
  258. Tasks.product_retrieval_embedding:
  259. InputType.IMAGE,
  260. Tasks.video_embedding:
  261. InputType.VIDEO,
  262. Tasks.virtual_try_on: [
  263. (InputType.IMAGE, InputType.IMAGE, InputType.IMAGE),
  264. {
  265. 'masked_model': InputType.IMAGE,
  266. 'pose': InputType.IMAGE,
  267. 'cloth': InputType.IMAGE,
  268. }
  269. ],
  270. Tasks.text_driven_segmentation: {
  271. InputKeys.IMAGE: InputType.IMAGE,
  272. InputKeys.TEXT: InputType.TEXT
  273. },
  274. Tasks.shop_segmentation:
  275. InputType.IMAGE,
  276. Tasks.movie_scene_segmentation:
  277. InputType.VIDEO,
  278. Tasks.bad_image_detecting:
  279. InputType.IMAGE,
  280. Tasks.image_try_on: {
  281. InputKeys.IMAGE: InputType.IMAGE,
  282. InputKeys.IMAGE: InputType.IMAGE,
  283. InputKeys.IMAGE: InputType.IMAGE
  284. },
  285. Tasks.human_image_generation: {
  286. InputKeys.IMAGE: InputType.IMAGE,
  287. 'target_pose_path': InputType.TEXT
  288. },
  289. Tasks.human3d_render: {
  290. 'dataset_id': InputType.TEXT,
  291. 'case_id': InputType.TEXT,
  292. },
  293. Tasks.human3d_animation: {
  294. 'dataset_id': InputType.TEXT,
  295. 'case_id': InputType.TEXT,
  296. 'action_dataset': InputType.TEXT,
  297. 'action': InputType.TEXT
  298. },
  299. Tasks.image_view_transform: {
  300. InputKeys.IMAGE: InputType.IMAGE,
  301. 'target_view': InputType.LIST
  302. },
  303. Tasks.image_control_3d_portrait: {
  304. InputKeys.IMAGE: InputType.IMAGE,
  305. 'save_dir': InputType.TEXT
  306. },
  307. # ============ nlp tasks ===================
  308. Tasks.chat: {
  309. # An input example for `messages` format (Dict[str, List[Dict[str, str]]]):
  310. # {'messages': [{
  311. # 'role': 'system',
  312. # 'content': 'You are a helpful assistant.'
  313. # }, {
  314. # 'role': 'user',
  315. # 'content': 'Hello! Where is the capital of Zhejiang?'
  316. # }, {
  317. # 'role': 'assistant',
  318. # 'content': 'Hangzhou is the capital of Zhejiang.'
  319. # }, {
  320. # 'role': 'user',
  321. # 'content': 'Tell me something about HangZhou?'
  322. # }]}
  323. 'messages': InputType.LIST
  324. },
  325. Tasks.text_classification: [
  326. InputType.TEXT,
  327. (InputType.TEXT, InputType.TEXT),
  328. {
  329. 'text': InputType.TEXT,
  330. 'text2': InputType.TEXT
  331. },
  332. ],
  333. Tasks.sentence_similarity: [
  334. (InputType.TEXT, InputType.TEXT),
  335. {
  336. 'source_text': InputType.TEXT,
  337. 'target_text': InputType.TEXT,
  338. },
  339. ],
  340. Tasks.nli: (InputType.TEXT, InputType.TEXT),
  341. Tasks.sentiment_classification:
  342. InputType.TEXT,
  343. Tasks.zero_shot_classification:
  344. InputType.TEXT,
  345. Tasks.relation_extraction:
  346. InputType.TEXT,
  347. Tasks.translation:
  348. InputType.TEXT,
  349. Tasks.text_summarization: [InputType.TEXT, {
  350. 'text': InputType.TEXT,
  351. }],
  352. Tasks.competency_aware_translation:
  353. InputType.TEXT,
  354. Tasks.word_segmentation: [InputType.TEXT, {
  355. 'text': InputType.TEXT,
  356. }],
  357. Tasks.part_of_speech:
  358. InputType.TEXT,
  359. Tasks.named_entity_recognition:
  360. InputType.TEXT,
  361. Tasks.text_error_correction:
  362. InputType.TEXT,
  363. Tasks.sentence_embedding: {
  364. 'source_sentence': InputType.LIST,
  365. 'sentences_to_compare': InputType.LIST,
  366. },
  367. Tasks.text_ranking: [
  368. (InputType.TEXT, InputType.TEXT),
  369. {
  370. 'source_sentence': InputType.LIST,
  371. 'sentences_to_compare': InputType.LIST
  372. }
  373. ],
  374. Tasks.text_generation:
  375. InputType.TEXT,
  376. Tasks.fid_dialogue: {
  377. 'history': InputType.TEXT,
  378. 'knowledge': InputType.TEXT,
  379. 'bot_profile': InputType.TEXT,
  380. 'user_profile': InputType.TEXT,
  381. },
  382. Tasks.fill_mask:
  383. InputType.TEXT,
  384. Tasks.table_question_answering: {
  385. 'question': InputType.TEXT,
  386. 'history_sql': InputType.DICT,
  387. },
  388. Tasks.faq_question_answering: {
  389. 'query_set': InputType.LIST,
  390. 'support_set': InputType.LIST,
  391. },
  392. Tasks.translation_evaluation: {
  393. 'hyp': InputType.LIST,
  394. 'src': InputType.LIST,
  395. 'ref': InputType.LIST,
  396. },
  397. Tasks.sudoku:
  398. InputType.TEXT,
  399. Tasks.text2sql: {
  400. 'text': InputType.TEXT,
  401. 'database': InputType.TEXT
  402. },
  403. Tasks.document_grounded_dialog_generate: {
  404. 'query': InputType.LIST,
  405. 'context': InputType.LIST,
  406. 'label': InputType.LIST,
  407. },
  408. Tasks.document_grounded_dialog_rerank: {
  409. 'dataset': InputType.LIST
  410. },
  411. Tasks.document_grounded_dialog_retrieval: {
  412. 'query': InputType.LIST,
  413. 'positive': InputType.LIST,
  414. 'negative': InputType.LIST
  415. },
  416. Tasks.machine_reading_comprehension:
  417. InputType.TEXT,
  418. Tasks.siamese_uie: InputType.TEXT,
  419. # ============ audio tasks ===================
  420. Tasks.auto_speech_recognition: # input can be audio, or audio and text.
  421. [InputType.AUDIO, {
  422. 'wav': InputType.AUDIO,
  423. 'text': InputType.TEXT
  424. }],
  425. Tasks.speech_signal_process:
  426. InputType.AUDIO,
  427. Tasks.acoustic_echo_cancellation: {
  428. 'nearend_mic': InputType.AUDIO,
  429. 'farend_speech': InputType.AUDIO
  430. },
  431. Tasks.speech_separation:
  432. InputType.AUDIO,
  433. Tasks.acoustic_noise_suppression:
  434. InputType.AUDIO,
  435. Tasks.text_to_speech:
  436. InputType.TEXT,
  437. Tasks.keyword_spotting:
  438. InputType.AUDIO,
  439. Tasks.speaker_diarization_dialogue_detection:
  440. InputType.TEXT,
  441. Tasks.language_score_prediction:
  442. InputType.TEXT,
  443. Tasks.punctuation:
  444. InputType.TEXT,
  445. Tasks.speech_language_recognition:
  446. InputType.AUDIO,
  447. Tasks.speaker_diarization_semantic_speaker_turn_detection:
  448. InputType.TEXT,
  449. Tasks.inverse_text_processing:
  450. InputType.TEXT,
  451. Tasks.speaker_verification: [InputType.AUDIO, InputType.AUDIO],
  452. # ============ multi-modal tasks ===================
  453. Tasks.image_captioning: [InputType.IMAGE, {
  454. 'image': InputType.IMAGE,
  455. }],
  456. Tasks.video_captioning: [InputType.VIDEO, {
  457. 'video': InputType.VIDEO,
  458. }],
  459. Tasks.multimodal_dialogue: {
  460. 'messages': InputType.LIST,
  461. },
  462. Tasks.visual_grounding: {
  463. 'image': InputType.IMAGE,
  464. 'text': InputType.TEXT
  465. },
  466. Tasks.text_to_image_synthesis: {
  467. 'text': InputType.TEXT,
  468. },
  469. Tasks.multi_modal_embedding: {
  470. 'img': InputType.IMAGE,
  471. 'text': InputType.TEXT
  472. },
  473. Tasks.generative_multi_modal_embedding: {
  474. 'image': InputType.IMAGE,
  475. 'text': InputType.TEXT
  476. },
  477. Tasks.multi_modal_similarity: {
  478. 'img': InputType.IMAGE,
  479. 'text': InputType.TEXT
  480. },
  481. Tasks.text_video_retrieval: {
  482. 'video': InputType.VIDEO,
  483. 'text': InputType.TEXT
  484. },
  485. Tasks.visual_question_answering: {
  486. 'image': InputType.IMAGE,
  487. 'text': InputType.TEXT
  488. },
  489. Tasks.video_question_answering: {
  490. 'video': InputType.VIDEO,
  491. 'text': InputType.TEXT
  492. },
  493. Tasks.visual_entailment: {
  494. 'image': InputType.IMAGE,
  495. 'text': InputType.TEXT,
  496. 'text2': InputType.TEXT,
  497. },
  498. Tasks.action_detection:
  499. InputType.VIDEO,
  500. Tasks.human_reconstruction:
  501. InputType.IMAGE,
  502. Tasks.text_texture_generation: {
  503. 'mesh_path': InputType.TEXT,
  504. 'texture_path': InputType.TEXT,
  505. 'prompt': InputType.TEXT,
  506. 'uvsize': InputType.NUMBER,
  507. 'image_size': InputType.NUMBER,
  508. 'output_dir': InputType.NUMBER,
  509. },
  510. Tasks.image_reid_person:
  511. InputType.IMAGE,
  512. Tasks.video_inpainting: {
  513. 'video_input_path': InputType.TEXT,
  514. 'video_output_path': InputType.TEXT,
  515. 'mask_path': InputType.TEXT,
  516. },
  517. Tasks.text_to_video_synthesis: {
  518. 'text': InputType.TEXT
  519. },
  520. Tasks.video_summarization: InputType.TEXT,
  521. Tasks.text_to_360panorama_image: {
  522. 'prompt': InputType.TEXT,
  523. },
  524. Tasks.image_editing: {
  525. 'img': InputType.IMAGE,
  526. 'prompts': InputType.LIST
  527. }
  528. }