outputs.py 55 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710
  1. # Copyright (c) Alibaba, Inc. and its affiliates.
  2. from collections import OrderedDict, namedtuple
  3. from dataclasses import dataclass, fields
  4. from typing import Dict, List, Tuple
  5. import numpy as np
  6. import torch
  7. from modelscope.utils.constant import Tasks
  8. class OutputKeys(object):
  9. LOSS = 'loss'
  10. LOGITS = 'logits'
  11. SCORES = 'scores'
  12. SCORE = 'score'
  13. LABEL = 'label'
  14. LABELS = 'labels'
  15. INPUT_IDS = 'input_ids'
  16. LABEL_POS = 'label_pos'
  17. POSES = 'poses'
  18. CAPTION = 'caption'
  19. BOXES = 'boxes'
  20. KEYPOINTS = 'keypoints'
  21. MASKS = 'masks'
  22. DEPTHS = 'depths'
  23. DEPTHS_COLOR = 'depths_color'
  24. FLOWS = 'flows'
  25. FLOWS_COLOR = 'flows_color'
  26. NORMALS = 'normals'
  27. NORMALS_COLOR = 'normals_color'
  28. LAYOUT = 'layout'
  29. TEXT = 'text'
  30. POLYGONS = 'polygons'
  31. OUTPUT = 'output'
  32. OUTPUT_IMG = 'output_img'
  33. OUTPUT_IMGS = 'output_imgs'
  34. OUTPUT_VIDEO = 'output_video'
  35. OUTPUT_PCM = 'output_pcm'
  36. OUTPUT_PCM_LIST = 'output_pcm_list'
  37. OUTPUT_WAV = 'output_wav'
  38. OUTPUT_OBJ = 'output_obj'
  39. OUTPUT_MESH = 'output_mesh'
  40. IMG_EMBEDDING = 'img_embedding'
  41. SPK_EMBEDDING = 'spk_embedding'
  42. SPO_LIST = 'spo_list'
  43. TEXT_EMBEDDING = 'text_embedding'
  44. TRANSLATION = 'translation'
  45. RESPONSE = 'response'
  46. PREDICTION = 'prediction'
  47. PREDICTIONS = 'predictions'
  48. PROBABILITIES = 'probabilities'
  49. DIALOG_STATES = 'dialog_states'
  50. VIDEO_EMBEDDING = 'video_embedding'
  51. PHRASE_PROTOTYPE = 'phrase_prototype'
  52. OBJECT_PROTOTYPE = 'object_prototype'
  53. SENTENCE_PROTOTYPE = 'sentence_prototype'
  54. EVENT_PROTOTYPE = 'event_prototype'
  55. TEXTVIDEO_SIM = 'textvideo_sim'
  56. UUID = 'uuid'
  57. WORD = 'word'
  58. KWS_LIST = 'kws_list'
  59. SQL_STRING = 'sql_string'
  60. SQL_QUERY = 'sql_query'
  61. HISTORY = 'history'
  62. QUERY_RESULT = 'query_result'
  63. TIMESTAMPS = 'timestamps'
  64. SHOT_NUM = 'shot_num'
  65. SCENE_NUM = 'scene_num'
  66. SCENE_META_LIST = 'scene_meta_list'
  67. SHOT_META_LIST = 'shot_meta_list'
  68. MATCHES = 'matches'
  69. PCD12 = 'pcd12'
  70. PCD12_ALIGN = 'pcd12_align'
  71. TBOUNDS = 'tbounds'
  72. MV_IMGS = 'MViews'
  73. OutputTypes = {
  74. OutputKeys.LOSS: float, # checked
  75. OutputKeys.LOGITS: np.ndarray, # checked.
  76. OutputKeys.SCORES: List[float], # checked
  77. OutputKeys.SCORE: float, # checked
  78. OutputKeys.LABEL: str, # checked
  79. OutputKeys.LABELS: List[str], # checked
  80. OutputKeys.INPUT_IDS: np.ndarray, # checked
  81. OutputKeys.LABEL_POS: np.ndarray, # checked
  82. OutputKeys.POSES:
  83. List[np.ndarray], # [Tuple(np.ndarray, np.ndarray)] # checked doubtful
  84. OutputKeys.CAPTION: str,
  85. OutputKeys.BOXES: np.ndarray, # checked
  86. OutputKeys.KEYPOINTS: np.ndarray, # checked
  87. OutputKeys.MASKS: np.ndarray, # checked
  88. OutputKeys.DEPTHS: List[np.ndarray], # checked
  89. OutputKeys.DEPTHS_COLOR: List[np.ndarray], # checked
  90. OutputKeys.LAYOUT: np.ndarray, # checked
  91. OutputKeys.TEXT: str, # checked
  92. OutputKeys.POLYGONS: np.array, # checked
  93. OutputKeys.OUTPUT: Dict,
  94. OutputKeys.OUTPUT_IMG: 'image', # checked
  95. OutputKeys.OUTPUT_IMGS: List[np.ndarray], # checked
  96. OutputKeys.OUTPUT_VIDEO: 'bytes',
  97. OutputKeys.OUTPUT_PCM: 'pcm',
  98. OutputKeys.OUTPUT_PCM_LIST: List[np.ndarray],
  99. OutputKeys.OUTPUT_WAV: 'pcm',
  100. OutputKeys.OUTPUT_OBJ: Dict,
  101. OutputKeys.OUTPUT_MESH: np.ndarray,
  102. OutputKeys.IMG_EMBEDDING: np.ndarray,
  103. OutputKeys.SPK_EMBEDDING: np.ndarray,
  104. OutputKeys.SPO_LIST: List[float],
  105. OutputKeys.TEXT_EMBEDDING: np.ndarray,
  106. OutputKeys.TRANSLATION: str,
  107. OutputKeys.RESPONSE: Dict,
  108. OutputKeys.PREDICTION: np.ndarray, # checked
  109. OutputKeys.PREDICTIONS: List[np.ndarray],
  110. OutputKeys.PROBABILITIES: np.ndarray,
  111. OutputKeys.DIALOG_STATES: object,
  112. OutputKeys.VIDEO_EMBEDDING: np.ndarray,
  113. OutputKeys.PHRASE_PROTOTYPE: np.ndarray,
  114. OutputKeys.OBJECT_PROTOTYPE: np.ndarray,
  115. OutputKeys.SENTENCE_PROTOTYPE: np.ndarray,
  116. OutputKeys.EVENT_PROTOTYPE: np.ndarray,
  117. OutputKeys.TEXTVIDEO_SIM: np.ndarray,
  118. OutputKeys.UUID: str,
  119. OutputKeys.WORD: str,
  120. OutputKeys.KWS_LIST: List[str],
  121. OutputKeys.SQL_STRING: str, # checked
  122. OutputKeys.SQL_QUERY: str, # checked
  123. OutputKeys.HISTORY: Dict, # checked
  124. OutputKeys.QUERY_RESULT: Dict, # checked
  125. OutputKeys.TIMESTAMPS: str,
  126. OutputKeys.SHOT_NUM: int,
  127. OutputKeys.SCENE_NUM: int,
  128. OutputKeys.SCENE_META_LIST: List[int],
  129. OutputKeys.SHOT_META_LIST: List[int],
  130. OutputKeys.MATCHES: List[np.ndarray],
  131. OutputKeys.PCD12: np.ndarray,
  132. OutputKeys.PCD12_ALIGN: np.ndarray,
  133. OutputKeys.TBOUNDS: Dict,
  134. OutputKeys.MV_IMGS: List[np.ndarray],
  135. }
  136. OutputTypeSchema = {
  137. OutputKeys.LOSS: {
  138. 'type': 'number'
  139. }, # checked
  140. OutputKeys.LOGITS: {
  141. 'type': 'array',
  142. 'items': {
  143. 'type': 'number'
  144. }
  145. }, # checked.
  146. OutputKeys.SCORES: {
  147. 'type': 'array',
  148. 'items': {
  149. 'type': 'number'
  150. }
  151. }, # checked
  152. OutputKeys.SCORE: {
  153. 'type': 'number'
  154. }, # checked
  155. OutputKeys.LABEL: {
  156. 'type': 'string'
  157. }, # checked
  158. OutputKeys.LABELS: {
  159. 'type': 'array',
  160. 'items': {
  161. 'type': 'string'
  162. }
  163. }, # checked
  164. OutputKeys.INPUT_IDS: {
  165. 'type': 'array',
  166. 'items': {
  167. 'type': 'number'
  168. }
  169. }, # checked
  170. OutputKeys.LABEL_POS: {
  171. 'type': 'array',
  172. 'items': {
  173. 'type': 'number'
  174. }
  175. }, # checked
  176. OutputKeys.POSES: {
  177. 'type': 'array',
  178. 'items': {
  179. 'type': 'array',
  180. 'items': {
  181. 'type': 'number'
  182. }
  183. }
  184. }, # [Tuple(np.ndarray, np.ndarray)] # checked doubtful
  185. OutputKeys.CAPTION: {
  186. 'type': 'string'
  187. },
  188. OutputKeys.BOXES: {
  189. 'type': 'array',
  190. 'items': {
  191. 'type': 'number'
  192. }
  193. }, # checked
  194. OutputKeys.KEYPOINTS: {
  195. 'type': 'array',
  196. 'items': {
  197. 'type': 'number'
  198. }
  199. }, # checked
  200. OutputKeys.MASKS: {
  201. 'type': 'array',
  202. 'items': {
  203. 'type': 'number'
  204. }
  205. }, # checked
  206. OutputKeys.DEPTHS: {
  207. 'type': 'array',
  208. 'items': {
  209. 'type': 'array',
  210. 'items': {
  211. 'type': 'number'
  212. }
  213. }
  214. }, # checked
  215. OutputKeys.DEPTHS_COLOR: {
  216. 'type': 'array',
  217. 'items': {
  218. 'type': 'array',
  219. 'items': {
  220. 'type': 'number'
  221. }
  222. }
  223. }, # checked
  224. OutputKeys.LAYOUT: {
  225. 'type': 'array',
  226. 'items': {
  227. 'type': 'number'
  228. }
  229. }, # checked
  230. OutputKeys.TEXT: {
  231. 'type': 'string'
  232. }, # checked
  233. OutputKeys.POLYGONS: {
  234. 'type': 'array',
  235. 'items': {
  236. 'type': 'number'
  237. }
  238. }, # checked
  239. OutputKeys.OUTPUT: {
  240. 'type': 'object'
  241. },
  242. OutputKeys.OUTPUT_IMG: {
  243. 'type': 'string',
  244. 'description': 'The base64 encoded image.',
  245. }, # checked
  246. OutputKeys.OUTPUT_IMGS: {
  247. 'type': 'array',
  248. 'items': {
  249. 'type': 'string',
  250. 'description': 'The base64 encoded image.',
  251. }
  252. }, # checked
  253. OutputKeys.OUTPUT_VIDEO: {
  254. 'type': 'string',
  255. 'description': 'The base64 encoded video.',
  256. },
  257. OutputKeys.OUTPUT_PCM: {
  258. 'type': 'string',
  259. 'description': 'The base64 encoded PCM.',
  260. },
  261. OutputKeys.OUTPUT_PCM_LIST: {
  262. 'type': 'array',
  263. 'items': {
  264. 'type': 'string',
  265. 'description': 'The base64 encoded PCM.',
  266. }
  267. },
  268. OutputKeys.OUTPUT_WAV: {
  269. 'type': 'string',
  270. 'description': 'The base64 encoded WAV.',
  271. },
  272. OutputKeys.OUTPUT_OBJ: {
  273. 'type': 'object'
  274. },
  275. OutputKeys.OUTPUT_MESH: {
  276. 'type': 'array',
  277. 'items': {
  278. 'type': 'number'
  279. }
  280. },
  281. OutputKeys.IMG_EMBEDDING: {
  282. 'type': 'array',
  283. 'items': {
  284. 'type': 'number'
  285. }
  286. },
  287. OutputKeys.SPK_EMBEDDING: {
  288. 'type': 'array',
  289. 'items': {
  290. 'type': 'number'
  291. }
  292. },
  293. OutputKeys.SPO_LIST: {
  294. 'type': 'array',
  295. 'items': {
  296. 'type': 'number'
  297. }
  298. },
  299. OutputKeys.TEXT_EMBEDDING: {
  300. 'type': 'array',
  301. 'items': {
  302. 'type': 'number'
  303. }
  304. },
  305. OutputKeys.TRANSLATION: {
  306. 'type': 'string'
  307. },
  308. OutputKeys.RESPONSE: {
  309. 'type': 'object'
  310. },
  311. OutputKeys.PREDICTION: {
  312. 'type': 'array',
  313. 'items': {
  314. 'type': 'number'
  315. }
  316. }, # checked
  317. OutputKeys.PREDICTIONS: {
  318. 'type': 'array',
  319. 'items': {
  320. 'type': 'array',
  321. 'items': {
  322. 'type': 'number'
  323. }
  324. }
  325. },
  326. OutputKeys.PROBABILITIES: {
  327. 'type': 'array',
  328. 'items': {
  329. 'type': 'number'
  330. }
  331. },
  332. OutputKeys.DIALOG_STATES: {
  333. 'type': 'object'
  334. },
  335. OutputKeys.VIDEO_EMBEDDING: {
  336. 'type': 'array',
  337. 'items': {
  338. 'type': 'number'
  339. }
  340. },
  341. OutputKeys.PHRASE_PROTOTYPE: {
  342. 'type': 'array',
  343. 'items': {
  344. 'type': 'number'
  345. }
  346. },
  347. OutputKeys.OBJECT_PROTOTYPE: {
  348. 'type': 'array',
  349. 'items': {
  350. 'type': 'number'
  351. }
  352. },
  353. OutputKeys.TEXTVIDEO_SIM: {
  354. 'type': 'array',
  355. 'items': {
  356. 'type': 'number'
  357. }
  358. },
  359. OutputKeys.UUID: {
  360. 'type': 'string'
  361. },
  362. OutputKeys.WORD: {
  363. 'type': 'string'
  364. },
  365. OutputKeys.KWS_LIST: {
  366. 'type': 'array',
  367. 'items': {
  368. 'type': 'string'
  369. }
  370. },
  371. OutputKeys.SQL_STRING: {
  372. 'type': 'string'
  373. }, # checked
  374. OutputKeys.SQL_QUERY: {
  375. 'type': 'string'
  376. }, # checked
  377. OutputKeys.HISTORY: {
  378. 'type': 'object'
  379. }, # checked
  380. OutputKeys.QUERY_RESULT: {
  381. 'type': 'object'
  382. }, # checked
  383. OutputKeys.TIMESTAMPS: {
  384. 'type': 'string'
  385. },
  386. OutputKeys.SHOT_NUM: {
  387. 'type': 'integer'
  388. },
  389. OutputKeys.SCENE_NUM: {
  390. 'type': 'integer'
  391. },
  392. OutputKeys.SCENE_META_LIST: {
  393. 'type': 'array',
  394. 'items': {
  395. 'type': 'integer'
  396. }
  397. },
  398. OutputKeys.SHOT_META_LIST: {
  399. 'type': 'array',
  400. 'items': {
  401. 'type': 'integer'
  402. }
  403. },
  404. OutputKeys.MATCHES: {
  405. 'type': 'array',
  406. 'items': {
  407. 'type': 'array',
  408. 'items': {
  409. 'type': 'number'
  410. }
  411. }
  412. },
  413. OutputKeys.PCD12: {
  414. 'type': 'array',
  415. 'items': {
  416. 'type': 'number'
  417. }
  418. },
  419. OutputKeys.PCD12_ALIGN: {
  420. 'type': 'array',
  421. 'items': {
  422. 'type': 'number'
  423. }
  424. },
  425. OutputKeys.TBOUNDS: {
  426. 'type': 'object'
  427. },
  428. OutputKeys.MV_IMGS: {
  429. 'type': 'array',
  430. 'items': {
  431. 'type': 'array',
  432. 'items': {
  433. 'type': 'number'
  434. }
  435. }
  436. },
  437. }
  438. TASK_OUTPUTS = {
  439. Tasks.task_template:
  440. [OutputKeys.BOXES, OutputKeys.OUTPUT_IMG, OutputKeys.TEXT_EMBEDDING],
  441. # ============ vision tasks ===================
  442. # ocr detection result for single sample
  443. # {
  444. # "polygons": np.array with shape [num_text, 8], each polygon is
  445. # [x1, y1, x2, y2, x3, y3, x4, y4]
  446. # }
  447. Tasks.ocr_detection: [OutputKeys.POLYGONS],
  448. Tasks.table_recognition: [OutputKeys.POLYGONS],
  449. Tasks.lineless_table_recognition: [OutputKeys.POLYGONS, OutputKeys.BOXES],
  450. Tasks.license_plate_detection: [OutputKeys.POLYGONS, OutputKeys.TEXT],
  451. Tasks.card_detection_correction: [
  452. OutputKeys.POLYGONS, OutputKeys.SCORES, OutputKeys.OUTPUT_IMGS,
  453. OutputKeys.LABELS, OutputKeys.LAYOUT
  454. ],
  455. # ocr recognition result for single sample
  456. # {
  457. # "text": "电子元器件提供BOM配单"
  458. # }
  459. Tasks.ocr_recognition: [OutputKeys.TEXT],
  460. Tasks.sudoku: [OutputKeys.TEXT],
  461. Tasks.text2sql: [OutputKeys.TEXT],
  462. # document vl embedding for single sample
  463. # {
  464. # "img_embedding": np.array with shape [M, D],
  465. # "text_embedding": np.array with shape [N, D]
  466. # }
  467. Tasks.document_vl_embedding:
  468. [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING],
  469. # face 2d keypoint result for single sample
  470. # {
  471. # "keypoints": [
  472. # [[x, y]*106],
  473. # [[x, y]*106],
  474. # [[x, y]*106],
  475. # ],
  476. # "poses": [
  477. # [pitch, roll, yaw],
  478. # [pitch, roll, yaw],
  479. # [pitch, roll, yaw],
  480. # ],
  481. # "boxes": [
  482. # [x1, y1, x2, y2],
  483. # [x1, y1, x2, y2],
  484. # [x1, y1, x2, y2],
  485. # ]
  486. # }
  487. Tasks.face_2d_keypoints:
  488. [OutputKeys.KEYPOINTS, OutputKeys.POSES, OutputKeys.BOXES],
  489. # face detection result for single sample
  490. # {
  491. # "scores": [0.9, 0.1, 0.05, 0.05]
  492. # "boxes": [
  493. # [x1, y1, x2, y2],
  494. # [x1, y1, x2, y2],
  495. # [x1, y1, x2, y2],
  496. # [x1, y1, x2, y2],
  497. # ],
  498. # "keypoints": [
  499. # [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5],
  500. # [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5],
  501. # [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5],
  502. # [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5],
  503. # ],
  504. # }
  505. Tasks.face_detection:
  506. [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS],
  507. # card detection result for single sample
  508. # {
  509. # "scores": [0.9, 0.1, 0.05, 0.05]
  510. # "boxes": [
  511. # [x1, y1, x2, y2],
  512. # [x1, y1, x2, y2],
  513. # [x1, y1, x2, y2],
  514. # [x1, y1, x2, y2],
  515. # ],
  516. # "keypoints": [
  517. # [x1, y1, x2, y2, x3, y3, x4, y4],
  518. # [x1, y1, x2, y2, x3, y3, x4, y4],
  519. # [x1, y1, x2, y2, x3, y3, x4, y4],
  520. # [x1, y1, x2, y2, x3, y3, x4, y4],
  521. # ],
  522. # }
  523. Tasks.card_detection:
  524. [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS],
  525. # content check result for single sample
  526. # {
  527. # "scores": [0.9] # non sexy probability
  528. # }
  529. Tasks.content_check: [OutputKeys.SCORES],
  530. # image driving perception result for single sample
  531. # {
  532. # "boxes": [
  533. # [x1, y1, x2, y2],
  534. # [x1, y1, x2, y2],
  535. # [x1, y1, x2, y2],
  536. # [x1, y1, x2, y2],
  537. # ],
  538. # "masks": [
  539. # [np.array], # with fixed shape(h=720, w=1280, 3) containing only 0, 1
  540. # [np.array], # with fixed shape(h=720, w=1280, 3) containing only 0, 1
  541. # ]
  542. # }
  543. Tasks.image_driving_perception: [OutputKeys.BOXES, OutputKeys.MASKS],
  544. # face liveness result for single sample
  545. # {
  546. # "scores": [0.9]
  547. # "boxes": [x1, y1, x2, y2]
  548. # }
  549. Tasks.face_liveness: [OutputKeys.SCORES, OutputKeys.BOXES],
  550. # face quality assessment for single sample
  551. # {
  552. # "scores": [0.9]
  553. # "boxes": [x1, y1, x2, y2]
  554. # }
  555. Tasks.face_quality_assessment: [OutputKeys.SCORES, OutputKeys.BOXES],
  556. # facial expression recognition result for single sample
  557. # {
  558. # "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02],
  559. # "labels": ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
  560. # }
  561. Tasks.facial_expression_recognition:
  562. [OutputKeys.SCORES, OutputKeys.LABELS],
  563. Tasks.general_recognition: [OutputKeys.SCORES, OutputKeys.LABELS],
  564. # face processing base result for single img
  565. # {
  566. # "scores": [0.85]
  567. # "boxes": [x1, y1, x2, y2]
  568. # "keypoints": [x1, y1, x2, y2, x3, y3, x4, y4]
  569. # }
  570. Tasks.face_processing_base: [
  571. OutputKeys.OUTPUT_IMG, OutputKeys.SCORES, OutputKeys.BOXES,
  572. OutputKeys.KEYPOINTS
  573. ],
  574. # face attribute recognition result for single sample
  575. # {
  576. # "scores": [[0.9, 0.1], [0.92, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]]
  577. # "labels": [['Male', 'Female'], [0-2, 3-9, 10-19, 20-29, 30-39, 40-49, 50-59, 60-69, 70+]]
  578. # }
  579. Tasks.face_attribute_recognition: [OutputKeys.SCORES, OutputKeys.LABELS],
  580. # face recognition result for single sample
  581. # {
  582. # "img_embedding": np.array with shape [1, D],
  583. # }
  584. Tasks.face_recognition: [OutputKeys.IMG_EMBEDDING],
  585. # human detection result for single sample
  586. # {
  587. # "scores": [0.9, 0.1, 0.05, 0.05]
  588. # "labels": ["person", "person", "person", "person"],
  589. # "boxes": [
  590. # [x1, y1, x2, y2],
  591. # [x1, y1, x2, y2],
  592. # [x1, y1, x2, y2],
  593. # ],
  594. # }
  595. #
  596. Tasks.human_detection:
  597. [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
  598. # face generation result for single sample
  599. # {
  600. # "output_img": np.array with shape(h, w, 3)
  601. # }
  602. Tasks.face_image_generation: [OutputKeys.OUTPUT_IMG],
  603. # image classification result for single sample
  604. # {
  605. # "scores": [0.9, 0.1, 0.05, 0.05]
  606. # "labels": ["dog", "horse", "cow", "cat"],
  607. # }
  608. Tasks.image_classification: [OutputKeys.SCORES, OutputKeys.LABELS],
  609. # object detection result for single sample
  610. # {
  611. # "scores": [0.9, 0.1, 0.05, 0.05]
  612. # "labels": ["dog", "horse", "cow", "cat"],
  613. # "boxes": [
  614. # [x1, y1, x2, y2],
  615. # [x1, y1, x2, y2],
  616. # [x1, y1, x2, y2],
  617. # ],
  618. # }
  619. Tasks.image_object_detection:
  620. [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
  621. Tasks.domain_specific_object_detection:
  622. [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
  623. Tasks.open_vocabulary_detection:
  624. [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
  625. # video object detection result for single sample
  626. # {
  627. # "scores": [[0.8, 0.25, 0.05, 0.05], [0.9, 0.1, 0.05, 0.05]]
  628. # "labels": [["person", "traffic light", "car", "bus"],
  629. # ["person", "traffic light", "car", "bus"]]
  630. # "boxes":
  631. # [
  632. # [
  633. # [x1, y1, x2, y2],
  634. # [x1, y1, x2, y2],
  635. # [x1, y1, x2, y2],
  636. # [x1, y1, x2, y2],
  637. # ],
  638. # [
  639. # [x1, y1, x2, y2],
  640. # [x1, y1, x2, y2],
  641. # [x1, y1, x2, y2],
  642. # [x1, y1, x2, y2],
  643. # ]
  644. # ],
  645. # }
  646. Tasks.video_object_detection:
  647. [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
  648. # 3d object detection result for single sample
  649. # {
  650. # "output_img": np.array with shape(h, w, 3)
  651. # }
  652. Tasks.object_detection_3d: [OutputKeys.OUTPUT_IMG],
  653. # instance segmentation result for single sample
  654. # {
  655. # "scores": [0.9, 0.1, 0.05, 0.05],
  656. # "labels": ["dog", "horse", "cow", "cat"],
  657. # "masks": [
  658. # np.array # 2D array containing only 0, 1
  659. # ]
  660. # }
  661. Tasks.image_segmentation:
  662. [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.MASKS],
  663. # video panoptic segmentation result for single sample
  664. # "scores": [[0.8, 0.25, 0.05, 0.05], [0.9, 0.1, 0.05, 0.05]]
  665. # "labels": [["person", "traffic light", "car", "bus"],
  666. # ["person", "traffic light", "car", "bus"]]
  667. # "masks": [ #array containing only 0, 1
  668. # [np.array, np.array, np.array, np.array],
  669. # [np.array, np.array, np.array, np.array],
  670. # ]
  671. # "boxes":
  672. # [
  673. # [
  674. # [x1, y1, x2, y2],
  675. # [x1, y1, x2, y2],
  676. # [x1, y1, x2, y2],
  677. # [x1, y1, x2, y2],
  678. # ],
  679. # [
  680. # [x1, y1, x2, y2],
  681. # [x1, y1, x2, y2],
  682. # [x1, y1, x2, y2],
  683. # [x1, y1, x2, y2],
  684. # ]
  685. # ],
  686. # "uuid": [[0, 1, 2, 3],[0, 1, 2, 3]]
  687. # }
  688. Tasks.video_panoptic_segmentation: [
  689. OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.MASKS,
  690. OutputKeys.BOXES, OutputKeys.UUID
  691. ],
  692. # semantic segmentation result for single sample
  693. # {
  694. # "masks": [np.array # 2D array with shape [height, width]]
  695. # }
  696. Tasks.semantic_segmentation: [OutputKeys.MASKS],
  697. # image matting result for single sample
  698. # {
  699. # "output_img": np.array with shape(h, w, 4)
  700. # for matting or (h, w, 3) for general purpose
  701. # , shape(h, w) for crowd counting
  702. # }
  703. Tasks.portrait_matting: [OutputKeys.OUTPUT_IMG],
  704. Tasks.universal_matting: [OutputKeys.OUTPUT_IMG],
  705. Tasks.image_deblurring: [OutputKeys.OUTPUT_IMG],
  706. Tasks.image_face_fusion: [OutputKeys.OUTPUT_IMG],
  707. # image_quality_assessment_mos result for a single image is a score in range [0, 1]
  708. # {0.5}
  709. Tasks.image_quality_assessment_mos: [OutputKeys.SCORE],
  710. # image editing task result for a single image
  711. # {"output_img": np.array with shape (h, w, 3)}
  712. Tasks.skin_retouching: [OutputKeys.OUTPUT_IMG],
  713. Tasks.image_super_resolution: [OutputKeys.OUTPUT_IMG],
  714. Tasks.image_super_resolution_pasd: [OutputKeys.OUTPUT_IMG],
  715. Tasks.image_colorization: [OutputKeys.OUTPUT_IMG],
  716. Tasks.image_color_enhancement: [OutputKeys.OUTPUT_IMG],
  717. Tasks.image_denoising: [OutputKeys.OUTPUT_IMG],
  718. Tasks.image_editing: [OutputKeys.OUTPUT_IMG],
  719. Tasks.image_portrait_enhancement: [OutputKeys.OUTPUT_IMG],
  720. Tasks.crowd_counting: [OutputKeys.SCORES, OutputKeys.OUTPUT_IMG],
  721. Tasks.image_inpainting: [OutputKeys.OUTPUT_IMG],
  722. Tasks.image_paintbyexample: [OutputKeys.OUTPUT_IMG],
  723. Tasks.controllable_image_generation: [OutputKeys.OUTPUT_IMG],
  724. # image generation task result for a single image
  725. # {"output_img": np.array with shape (h, w, 3)}
  726. Tasks.image_to_image_generation: [OutputKeys.OUTPUT_IMG],
  727. Tasks.image_to_image_translation: [OutputKeys.OUTPUT_IMG],
  728. Tasks.image_style_transfer: [OutputKeys.OUTPUT_IMG],
  729. Tasks.image_portrait_stylization: [OutputKeys.OUTPUT_IMG],
  730. Tasks.image_body_reshaping: [OutputKeys.OUTPUT_IMG],
  731. # video editing task result for a single video
  732. # {"output_video": "path_to_rendered_video"}
  733. Tasks.video_frame_interpolation: [OutputKeys.OUTPUT_VIDEO],
  734. Tasks.video_super_resolution: [OutputKeys.OUTPUT_VIDEO],
  735. Tasks.video_deinterlace: [OutputKeys.OUTPUT_VIDEO],
  736. Tasks.nerf_recon_acc: [OutputKeys.OUTPUT],
  737. Tasks.nerf_recon_vq_compression: [OutputKeys.OUTPUT],
  738. Tasks.surface_recon_common: [OutputKeys.OUTPUT],
  739. Tasks.video_colorization: [OutputKeys.OUTPUT_VIDEO],
  740. Tasks.image_control_3d_portrait: [OutputKeys.OUTPUT],
  741. Tasks.self_supervised_depth_completion: [OutputKeys.OUTPUT_IMG],
  742. # image quality assessment degradation result for single image
  743. # {
  744. # "scores": [0.885272, 0.014790631, 0.014558001]
  745. # "labels": ['噪声强度', '模糊程度', '压缩强度'],
  746. # }
  747. Tasks.image_quality_assessment_degradation: [
  748. OutputKeys.SCORES, OutputKeys.LABELS
  749. ],
  750. # live category recognition result for single video
  751. # {
  752. # "scores": [0.885272, 0.014790631, 0.014558001]
  753. # "labels": ['女装/女士精品>>棉衣/棉服', '女装/女士精品>>牛仔裤', '女装/女士精品>>裤子>>休闲裤'],
  754. # }
  755. Tasks.live_category: [OutputKeys.SCORES, OutputKeys.LABELS],
  756. # action recognition result for single video
  757. # {
  758. # "labels": "abseiling"
  759. # }
  760. Tasks.action_recognition: [OutputKeys.LABELS],
  761. # human body keypoints detection result for single sample
  762. # {
  763. # "keypoints": [
  764. # [[x, y]*15],
  765. # [[x, y]*15],
  766. # [[x, y]*15]
  767. # ]
  768. # "scores": [
  769. # [[score]*15],
  770. # [[score]*15],
  771. # [[score]*15]
  772. # ]
  773. # "boxes": [
  774. # [x1, y1, x2, y2],
  775. # [x1, y1, x2, y2],
  776. # [x1, y1, x2, y2],
  777. # ]
  778. # }
  779. Tasks.body_2d_keypoints: [
  780. OutputKeys.KEYPOINTS, OutputKeys.SCORES, OutputKeys.BOXES
  781. ],
  782. # 3D human body keypoints detection result for single sample
  783. # {
  784. # "keypoints": [ # 3d pose coordinate in camera coordinate
  785. # [[x, y, z]*17], # joints of per image
  786. # [[x, y, z]*17],
  787. # ...
  788. # ],
  789. # "timestamps": [ # timestamps of all frames
  790. # "00:00:0.230",
  791. # "00:00:0.560",
  792. # "00:00:0.690",
  793. # ],
  794. # "output_video": "path_to_rendered_video" , this is optional
  795. # and is only available when the "render" option is enabled.
  796. # }
  797. Tasks.body_3d_keypoints: [
  798. OutputKeys.KEYPOINTS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO
  799. ],
  800. # pedestrian attribute recognition result for single sample
  801. # {
  802. # "boxes": [
  803. # [x1, y1, x2, y2],
  804. # [x1, y1, x2, y2],
  805. # [x1, y1, x2, y2],
  806. # ]
  807. # "labels": [
  808. # ['Female', 'AgeOver60', 'Front', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes',
  809. # 'LongSleeve', 'Black', 'Trousers', 'Black' ],
  810. # ['Female', 'AgeOver60', 'Front', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes',
  811. # 'LongSleeve', 'Black', 'Trousers', 'Black' ],
  812. # ['Female', 'AgeOver60', 'Front', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes',
  813. # 'LongSleeve', 'Black', 'Trousers', 'Black' ],
  814. # ]
  815. # }
  816. Tasks.pedestrian_attribute_recognition: [
  817. OutputKeys.BOXES, OutputKeys.LABELS
  818. ],
  819. # 3D face reconstruction result for single sample
  820. # {
  821. # "output_obj": io.BytesIO,
  822. # "output_img": np.array with shape(h, w, 3),
  823. # "output": {
  824. # "mesh": {
  825. # "vertices": np.array with shape(n, 3),
  826. # "faces": np.array with shape(n, 3),
  827. # "faces_uv": np.array with shape(n, 3),
  828. # "faces_normal": np.array with shape(n, 3),
  829. # "UVs": np.array with shape(n, 2),
  830. # "normals": np.array with shape(n, 3),
  831. # },
  832. # "vis_image": np.array with shape(h, w, 3),
  833. # "frame_list", [np.array with shape(h, w, 3), ...],
  834. # }
  835. # }
  836. Tasks.face_reconstruction: [OutputKeys.OUTPUT],
  837. Tasks.human3d_render: [OutputKeys.OUTPUT],
  838. Tasks.human3d_animation: [OutputKeys.OUTPUT],
  839. # 3D head reconstruction result for single sample
  840. # {
  841. # "output_obj": io.BytesIO,
  842. # "output_img": np.array with shape(h, w, 3),
  843. # "output": {
  844. # "mesh": {
  845. # "vertices": np.array with shape(n, 3),
  846. # "faces": np.array with shape(n, 3),
  847. # "faces_uv": np.array with shape(n, 3),
  848. # "faces_normal": np.array with shape(n, 3),
  849. # "UVs": np.array with shape(n, 2),
  850. # "normals": np.array with shape(n, 3),
  851. # },
  852. # }
  853. # }
  854. Tasks.head_reconstruction: [OutputKeys.OUTPUT],
  855. # text to head result for text input
  856. # {
  857. # "output_obj": io.BytesIO,
  858. # "output_img": np.array with shape(h, w, 3),
  859. # "output": {
  860. # "mesh": {
  861. # "vertices": np.array with shape(n, 3),
  862. # "faces": np.array with shape(n, 3),
  863. # "faces_uv": np.array with shape(n, 3),
  864. # "faces_normal": np.array with shape(n, 3),
  865. # "UVs": np.array with shape(n, 2),
  866. # "normals": np.array with shape(n, 3),
  867. # },
  868. # },
  869. # "image": np.array with shape(h, w, 3),
  870. # }
  871. Tasks.text_to_head: [OutputKeys.OUTPUT],
  872. # 3D human reconstruction result for single sample
  873. # {
  874. # "output": {
  875. # "vertices": np.array with shape(n, 3),
  876. # "faces": np.array with shape(n, 3),
  877. # "colors": np.array with shape(n, 3),
  878. # }
  879. # }
  880. Tasks.human_reconstruction: [OutputKeys.OUTPUT],
  881. # 3D text 2 texture generation result
  882. # {
  883. # "output": {
  884. # "Done"
  885. # }
  886. # }
  887. Tasks.text_texture_generation: [OutputKeys.OUTPUT],
  888. # 2D hand keypoints result for single sample
  889. # {
  890. # "keypoints": [
  891. # [[x, y, score] * 21],
  892. # [[x, y, score] * 21],
  893. # [[x, y, score] * 21],
  894. # ],
  895. # "boxes": [
  896. # [x1, y1, x2, y2],
  897. # [x1, y1, x2, y2],
  898. # [x1, y1, x2, y2],
  899. # ]
  900. # }
  901. Tasks.hand_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.BOXES],
  902. # video single object tracking result for single video
  903. # {
  904. # "boxes": [
  905. # [x1, y1, x2, y2],
  906. # [x1, y1, x2, y2],
  907. # [x1, y1, x2, y2],
  908. # ],
  909. # "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"]
  910. # }
  911. Tasks.video_single_object_tracking: [
  912. OutputKeys.BOXES, OutputKeys.TIMESTAMPS
  913. ],
  914. # video multi object tracking result for single video
  915. # {
  916. # "boxes": [
  917. # [
  918. # [x1, y1, x2, y2],
  919. # [x1, y1, x2, y2],
  920. # ...
  921. # ],
  922. # [
  923. # [x1, y1, x2, y2],
  924. # [x1, y1, x2, y2],
  925. # ...
  926. # ],
  927. # [
  928. # [x1, y1, x2, y2]
  929. # ...
  930. # ]
  931. # ],
  932. # "labels": [[obj_id0, obj_id1, ...], [obj_id1, obj_id2, ...], [obj_id3, ...]],
  933. # "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"]
  934. # }
  935. Tasks.video_multi_object_tracking: [
  936. OutputKeys.BOXES, OutputKeys.LABELS, OutputKeys.TIMESTAMPS
  937. ],
  938. # live category recognition result for single video
  939. # {
  940. # "scores": [0.885272, 0.014790631, 0.014558001],
  941. # 'labels': ['修身型棉衣', '高腰牛仔裤', '休闲连体裤']
  942. # }
  943. Tasks.live_category: [OutputKeys.SCORES, OutputKeys.LABELS],
  944. # video category recognition result for single video
  945. # {
  946. # "scores": [0.7716429233551025],
  947. # "labels": ['生活>>好物推荐']
  948. # }
  949. Tasks.video_category: [OutputKeys.SCORES, OutputKeys.LABELS],
  950. # image embedding result for a single image
  951. # {
  952. # "img_embedding": np.array with shape [D]
  953. # }
  954. Tasks.product_retrieval_embedding: [OutputKeys.IMG_EMBEDDING],
  955. # video embedding result for single video
  956. # {
  957. # "video_embedding": np.array with shape [D],
  958. # }
  959. Tasks.video_embedding: [OutputKeys.VIDEO_EMBEDDING],
  960. # phrase prototype result for single sentence
  961. # {
  962. # "phrase_prototype": np.array with shape [K*D],
  963. # }
  964. # sentence prototype result for single sentence
  965. # {
  966. # "sentence_prototype": np.array with shape [1*D],
  967. # }
  968. # object prototype result for single video
  969. # {
  970. # "object_prototype": np.array with shape [N*K*D],
  971. # }
  972. # event prototype result for single video
  973. # {
  974. # "event_prototype": np.array with shape [N*M*D],
  975. # }
  976. # text search video result for single sentence
  977. # {
  978. # "textvideo_sim": np.array with shape [N*N],
  979. # }
  980. Tasks.text_video_retrieval: [
  981. OutputKeys.PHRASE_PROTOTYPE, OutputKeys.SENTENCE_PROTOTYPE,
  982. OutputKeys.OBJECT_PROTOTYPE, OutputKeys.EVENT_PROTOTYPE,
  983. OutputKeys.TEXTVIDEO_SIM
  984. ],
  985. # video stabilization task result for a single video
  986. # {"output_video": "path_to_rendered_video"}
  987. Tasks.video_stabilization: [OutputKeys.OUTPUT_VIDEO],
  988. # virtual_try_on result for a single sample
  989. # {
  990. # "output_img": np.ndarray with shape [height, width, 3]
  991. # }
  992. Tasks.virtual_try_on: [OutputKeys.OUTPUT_IMG],
  993. # text driven segmentation result for single sample
  994. # {
  995. # "masks": [
  996. # np.array # 2D array containing only 0, 255
  997. # ]
  998. # }
  999. Tasks.text_driven_segmentation: [OutputKeys.MASKS],
  1000. # shop segmentation result for single sample
  1001. # {
  1002. # "masks": [
  1003. # np.array # 2D array containing only 0, 255
  1004. # ]
  1005. # }
  1006. Tasks.shop_segmentation: [OutputKeys.MASKS],
  1007. # movie scene segmentation result for a single video
  1008. # {
  1009. # "shot_num":15,
  1010. # "shot_meta_list":
  1011. # [
  1012. # {
  1013. # "frame": [start_frame, end_frame],
  1014. # "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
  1015. #
  1016. # }
  1017. # ]
  1018. # "scene_num":3,
  1019. # "scene_meta_list":
  1020. # [
  1021. # {
  1022. # "shot": [0,1,2],
  1023. # "frame": [start_frame, end_frame],
  1024. # "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
  1025. # }
  1026. # ]
  1027. #
  1028. # }
  1029. Tasks.movie_scene_segmentation: [
  1030. OutputKeys.SHOT_NUM, OutputKeys.SHOT_META_LIST, OutputKeys.SCENE_NUM,
  1031. OutputKeys.SCENE_META_LIST
  1032. ],
  1033. # human whole body keypoints detection result for single sample
  1034. # {
  1035. # "keypoints": [
  1036. # [[x, y]*133],
  1037. # [[x, y]*133],
  1038. # [[x, y]*133]
  1039. # ]
  1040. # "boxes": [
  1041. # [x1, y1, x2, y2],
  1042. # [x1, y1, x2, y2],
  1043. # [x1, y1, x2, y2],
  1044. # ]
  1045. # }
  1046. Tasks.human_wholebody_keypoint: [OutputKeys.KEYPOINTS, OutputKeys.BOXES],
  1047. # video summarization result for a single video
  1048. # {
  1049. # "output":
  1050. # [
  1051. # {
  1052. # "frame": [start_frame, end_frame]
  1053. # "timestamps": [start_time, end_time]
  1054. # },
  1055. # {
  1056. # "frame": [start_frame, end_frame]
  1057. # "timestamps": [start_time, end_time]
  1058. # }
  1059. # ]
  1060. # }
  1061. Tasks.video_summarization: [OutputKeys.OUTPUT],
  1062. # referring video object segmentation result for a single video
  1063. # {
  1064. # "masks": [np.array # 3D array with shape [frame_num, height, width]]
  1065. # "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"]
  1066. # "output_video": "path_to_rendered_video" , this is optional
  1067. # and is only available when the "render" option is enabled.
  1068. # }
  1069. Tasks.referring_video_object_segmentation: [
  1070. OutputKeys.MASKS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO
  1071. ],
  1072. # video human matting result for a single video
  1073. # {
  1074. # "masks": [np.array # 2D array with shape [height, width]]
  1075. # "output_video": "path_to_matting_video"
  1076. # }
  1077. Tasks.video_human_matting: [OutputKeys.MASKS, OutputKeys.OUTPUT_VIDEO],
  1078. # ============ nlp tasks ===================
  1079. # text classification result for single sample
  1080. # {
  1081. # "scores": [0.9, 0.1, 0.05, 0.05]
  1082. # "labels": ["happy", "sad", "calm", "angry"],
  1083. # }
  1084. Tasks.text_classification: [OutputKeys.SCORES, OutputKeys.LABELS],
  1085. # sentence similarity result for single sample
  1086. # {
  1087. # "scores": 0.9
  1088. # "labels": "1",
  1089. # }
  1090. Tasks.sentence_similarity: [OutputKeys.SCORES, OutputKeys.LABELS],
  1091. # nli result for single sample
  1092. # {
  1093. # "labels": ["happy", "sad", "calm", "angry"],
  1094. # "scores": [0.9, 0.1, 0.05, 0.05]
  1095. # }
  1096. Tasks.nli: [OutputKeys.SCORES, OutputKeys.LABELS],
  1097. # sentiment classification result for single sample
  1098. # {
  1099. # 'scores': [0.07183828949928284, 0.9281617403030396],
  1100. # 'labels': ['1', '0']
  1101. # }
  1102. Tasks.sentiment_classification: [OutputKeys.SCORES, OutputKeys.LABELS],
  1103. # zero-shot classification result for single sample
  1104. # {
  1105. # "scores": [0.9, 0.1, 0.05, 0.05]
  1106. # "labels": ["happy", "sad", "calm", "angry"],
  1107. # }
  1108. Tasks.zero_shot_classification: [OutputKeys.SCORES, OutputKeys.LABELS],
  1109. # relation extraction result for a single sample
  1110. # {
  1111. # "uuid": "人生信息-1",
  1112. # "text": "《父老乡亲》是由是由由中国人民解放军海政文工团创作的军旅歌曲,石顺义作词,王锡仁作曲,范琳琳演唱",
  1113. # "spo_list": [{"subject": "石顺义", "predicate": "国籍", "object": "中国"}]
  1114. # }
  1115. Tasks.relation_extraction: [OutputKeys.SPO_LIST],
  1116. # translation result for a source sentence
  1117. # {
  1118. # "translation": "北京是中国的首都"
  1119. # }
  1120. Tasks.translation: [OutputKeys.TRANSLATION],
  1121. # word segmentation result for single sample
  1122. # {
  1123. # "output": ["今天", "天气", "不错", ",", "适合", "出去", "游玩"]
  1124. # }
  1125. # {
  1126. # 'output': ['รถ', 'คัน', 'เก่า', 'ก็', 'ยัง', 'เก็บ', 'เอา']
  1127. # }
  1128. Tasks.word_segmentation: [OutputKeys.OUTPUT],
  1129. # TODO @wenmeng.zwm support list of result check
  1130. # named entity recognition result for single sample
  1131. # {
  1132. # "output": [
  1133. # {"type": "LOC", "start": 2, "end": 5, "span": "温岭市"},
  1134. # {"type": "LOC", "start": 5, "end": 8, "span": "新河镇"}
  1135. # ]
  1136. # }
  1137. Tasks.named_entity_recognition: [OutputKeys.OUTPUT],
  1138. Tasks.part_of_speech: [OutputKeys.OUTPUT],
  1139. # text_error_correction result for a single sample
  1140. # {
  1141. # "output": "我想吃苹果"
  1142. # }
  1143. Tasks.text_error_correction: [OutputKeys.OUTPUT],
  1144. # word_alignment result for a single sample
  1145. # {
  1146. # "output": "0-0 1-3 2-4 3-1 4-2 5-5"
  1147. # }
  1148. Tasks.word_alignment: [OutputKeys.OUTPUT],
  1149. Tasks.sentence_embedding: [OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES],
  1150. Tasks.text_ranking: [OutputKeys.SCORES],
  1151. # text generation result for single sample
  1152. # {
  1153. # "text": "this is the text generated by a model."
  1154. # }
  1155. Tasks.text_generation: [OutputKeys.TEXT],
  1156. # chat task result for single sample
  1157. # {
  1158. # "response": "this is the chat response generated by a model.",
  1159. # "history": [("hi", "nice to meet you"),("I felt happy, and you", "me too")]
  1160. # }
  1161. Tasks.chat: [OutputKeys.RESPONSE, OutputKeys.HISTORY],
  1162. # fid dialogue result for single sample
  1163. # {
  1164. # "text": "My name is Mike"
  1165. # }
  1166. Tasks.fid_dialogue: [OutputKeys.TEXT],
  1167. # summarization result for single sample
  1168. # {
  1169. # "text": "this is the text generated by a model."
  1170. # }
  1171. Tasks.text_summarization: [OutputKeys.TEXT],
  1172. # text generation result for single sample
  1173. # {
  1174. # "text": "北京"
  1175. # }
  1176. Tasks.text2text_generation: [OutputKeys.TEXT],
  1177. # fill mask result for single sample
  1178. # {
  1179. # "text": "this is the text which masks filled by model."
  1180. # }
  1181. Tasks.fill_mask: [OutputKeys.TEXT],
  1182. # feature extraction result for single sample
  1183. # {
  1184. # "text_embedding": [[
  1185. # [1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04],
  1186. # [6.45841064e-05, 1.15997791e-04, 5.11605394e-05, 9.87020373e-01],
  1187. # [2.66957268e-05, 4.72324500e-05, 9.74208378e-05, 4.18022355e-05]
  1188. # ],
  1189. # [
  1190. # [2.97343540e-05, 5.81317654e-05, 5.44203431e-05, 6.28319322e-05],
  1191. # [8.24327726e-05, 4.66077945e-05, 5.32869453e-05, 4.16190960e-05],
  1192. # [3.61441926e-05, 3.38475402e-05, 3.44323053e-05, 5.70138109e-05]
  1193. # ]
  1194. # ]
  1195. # }
  1196. Tasks.feature_extraction: [OutputKeys.TEXT_EMBEDDING],
  1197. # (Deprecated) dialog intent prediction result for single sample
  1198. # {'output': {'prediction': array([2.62349960e-03, 4.12110658e-03, 4.12748595e-05, 3.77560973e-05,
  1199. # 1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04,
  1200. # 6.45841064e-05, 1.15997791e-04, 5.11605394e-05, 9.87020373e-01,
  1201. # 2.66957268e-05, 4.72324500e-05, 9.74208378e-05, 4.18022355e-05,
  1202. # 2.97343540e-05, 5.81317654e-05, 5.44203431e-05, 6.28319322e-05,
  1203. # 7.34537680e-05, 6.61411541e-05, 3.62534920e-05, 8.58885178e-05,
  1204. # 8.24327726e-05, 4.66077945e-05, 5.32869453e-05, 4.16190960e-05,
  1205. # 5.97518992e-05, 3.92273068e-05, 3.44069012e-05, 9.92335918e-05,
  1206. # 9.25978165e-05, 6.26462061e-05, 3.32317031e-05, 1.32061413e-03,
  1207. # 2.01607945e-05, 3.36636294e-05, 3.99156743e-05, 5.84108493e-05,
  1208. # 2.53432900e-05, 4.95731190e-04, 2.64443643e-05, 4.46992999e-05,
  1209. # 2.42672231e-05, 4.75615161e-05, 2.66230145e-05, 4.00083954e-05,
  1210. # 2.90536875e-04, 4.23891543e-05, 8.63691166e-05, 4.98188965e-05,
  1211. # 3.47019341e-05, 4.52718523e-05, 4.20905781e-05, 5.50173208e-05,
  1212. # 4.92360487e-05, 3.56021264e-05, 2.13957210e-05, 6.17428886e-05,
  1213. # 1.43893281e-04, 7.32152112e-05, 2.91354867e-04, 2.46623786e-05,
  1214. # 3.61441926e-05, 3.38475402e-05, 3.44323053e-05, 5.70138109e-05,
  1215. # 4.31488479e-05, 4.94503947e-05, 4.30105974e-05, 1.00963116e-04,
  1216. # 2.82062047e-05, 1.15582036e-04, 4.48261271e-05, 3.99339879e-05,
  1217. # 7.27692823e-05], dtype=float32), 'label_pos': array([11]), 'label': 'lost_or_stolen_card'}}
  1218. # (Deprecated) dialog modeling prediction result for single sample
  1219. # {'output' : ['you', 'are', 'welcome', '.', 'have', 'a', 'great', 'day', '!']}
  1220. # (Deprecated) dialog state tracking result for single sample
  1221. # {
  1222. # "output":{
  1223. # "dialog_states": {
  1224. # "taxi-leaveAt": "none",
  1225. # "taxi-destination": "none",
  1226. # "taxi-departure": "none",
  1227. # "taxi-arriveBy": "none",
  1228. # "restaurant-book_people": "none",
  1229. # "restaurant-book_day": "none",
  1230. # "restaurant-book_time": "none",
  1231. # "restaurant-food": "none",
  1232. # "restaurant-pricerange": "none",
  1233. # "restaurant-name": "none",
  1234. # "restaurant-area": "none",
  1235. # "hotel-book_people": "none",
  1236. # "hotel-book_day": "none",
  1237. # "hotel-book_stay": "none",
  1238. # "hotel-name": "none",
  1239. # "hotel-area": "none",
  1240. # "hotel-parking": "none",
  1241. # "hotel-pricerange": "cheap",
  1242. # "hotel-stars": "none",
  1243. # "hotel-internet": "none",
  1244. # "hotel-type": "true",
  1245. # "attraction-type": "none",
  1246. # "attraction-name": "none",
  1247. # "attraction-area": "none",
  1248. # "train-book_people": "none",
  1249. # "train-leaveAt": "none",
  1250. # "train-destination": "none",
  1251. # "train-day": "none",
  1252. # "train-arriveBy": "none",
  1253. # "train-departure": "none"
  1254. # }
  1255. # }
  1256. # }
  1257. Tasks.task_oriented_conversation: [OutputKeys.OUTPUT],
  1258. # table-question-answering result for single sample
  1259. # {
  1260. # "sql": "SELECT shop.Name FROM shop."
  1261. # "sql_history": {sel: 0, agg: 0, conds: [[0, 0, 'val']]}
  1262. # }
  1263. Tasks.table_question_answering: [OutputKeys.OUTPUT],
  1264. # ============ audio tasks ===================
  1265. # asr result for single sample
  1266. # { "text": "每一天都要快乐喔"}
  1267. Tasks.auto_speech_recognition: [OutputKeys.TEXT],
  1268. # itn result for single sample
  1269. # {"text": "123"}
  1270. Tasks.inverse_text_processing: [OutputKeys.TEXT],
  1271. # speaker verification for single compare task
  1272. # {'score': 84.2332}
  1273. Tasks.speaker_verification: [OutputKeys.SCORES],
  1274. # speaker diarization dialogue detection for binary results: dialogue or non_dialogue
  1275. # {
  1276. # "scores": [0.98, 0.02],
  1277. # "labels": ["dialogue", "non_dialogue"],
  1278. # }
  1279. Tasks.speaker_diarization_dialogue_detection: [
  1280. OutputKeys.SCORES, OutputKeys.LABELS
  1281. ],
  1282. Tasks.speech_language_recognition: [OutputKeys.TEXT],
  1283. # punctuation result for single sample
  1284. # { "text": "你好,明天!"}
  1285. Tasks.punctuation: [OutputKeys.TEXT],
  1286. # speaker diarization semantic speaker-turn detection
  1287. # {
  1288. # "logits": [[0.7, 0.3], ..., [0.88, 0.12]],
  1289. # "text": "您好。您好,初次见面请多指教。",
  1290. # "prediction": [-100, -100, -100, 1, -100,..., -100, 0]
  1291. # }
  1292. Tasks.speaker_diarization_semantic_speaker_turn_detection: [
  1293. OutputKeys.LOGITS, OutputKeys.TEXT, OutputKeys.PREDICTION
  1294. ],
  1295. # language model result for single sample
  1296. # { "text": " hel@@ lo 大 家 好 呀 </s>
  1297. # p( hel@@ | <s> ) = 0.00057767 [ -7.45650959 ]
  1298. # p( lo | hel@@ ) = 0.99832278 [ -0.00167861 ]
  1299. # p( 大 | lo ) = 0.49116334 [ -0.71097857 ]
  1300. # p( 家 | 大 ) = 0.99691027 [ -0.00309453 ]
  1301. # p( 好 | 家 ) = 0.97999156 [ -0.02021134 ]
  1302. # p( 呀 | 好 ) = 0.00461205 [ -5.37908363 ]
  1303. # p( </s> | 呀 ) = 0.01524554 [ -4.18346834 ]
  1304. # logprob= -17.755 ppl= 12.6345
  1305. # "}
  1306. Tasks.language_score_prediction: [OutputKeys.TEXT],
  1307. # speech timestamp result for single sample
  1308. # {
  1309. # 'text': '<sil> 0.000 0.376;一 0.376 0.556;个 0.556 0.796;东 0.796 0.976;
  1310. # 太 0.976 1.136;平 1.136 1.256;洋 1.256 1.436;国 1.436 1.676;
  1311. # <sil> 1.676 1.676;家 1.676 1.916;<sil> 1.916 2.036;为 2.036 2.196;
  1312. # 什 2.196 2.316;么 2.316 2.496;跑 2.496 2.676;到 2.676 2.856;
  1313. # 西 2.856 3.036;太 3.036 3.196;平 3.196 3.376;洋 3.376 3.496;
  1314. # 来 3.496 3.636;了 3.636 3.796;呢 3.796 4.148;<sil> 4.148 4.440;',
  1315. # 'timestamp': [[0, 376], [376, 556], [556, 795], [795, 976],
  1316. # [976, 1136], [1136, 1256], [1256, 1436], [1436, 1676],
  1317. # [1676, 1676], [1676, 1916], [1916, 2036], [2036, 2196],
  1318. # [2196, 2316], [2316, 2496], [2496, 2676], [2676, 2856],
  1319. # [2856, 3036], [3036, 3196], [3196, 3376], [3376, 3496]]
  1320. # }
  1321. Tasks.speech_timestamp: [OutputKeys.TEXT],
  1322. # audio processed for single file in PCM format
  1323. # {
  1324. # "output_pcm": pcm encoded audio bytes
  1325. # }
  1326. Tasks.speech_signal_process: [OutputKeys.OUTPUT_PCM],
  1327. Tasks.acoustic_echo_cancellation: [OutputKeys.OUTPUT_PCM],
  1328. Tasks.acoustic_noise_suppression: [OutputKeys.OUTPUT_PCM],
  1329. Tasks.speech_separation: [OutputKeys.OUTPUT_PCM_LIST],
  1330. # text_to_speech result for a single sample
  1331. # {
  1332. # "output_wav": {"input_label" : bytes}
  1333. # }
  1334. Tasks.text_to_speech: [OutputKeys.OUTPUT_WAV],
  1335. # {
  1336. # "kws_list": [
  1337. # {
  1338. # 'keyword': '', # the keyword spotted
  1339. # 'offset': 19.4, # the keyword start time in second
  1340. # 'length': 0.68, # the keyword length in second
  1341. # 'confidence': 0.85 # the possibility if it is the keyword
  1342. # },
  1343. # ...
  1344. # ]
  1345. # }
  1346. Tasks.keyword_spotting: [OutputKeys.KWS_LIST],
  1347. # ============ multi-modal tasks ===================
  1348. # image caption result for single sample
  1349. # {
  1350. # "caption": "this is an image caption text."
  1351. # }
  1352. Tasks.image_captioning: [OutputKeys.CAPTION],
  1353. # video caption result for single sample
  1354. # {
  1355. # "caption": "this is a video caption text."
  1356. # }
  1357. Tasks.video_captioning: [OutputKeys.CAPTION],
  1358. Tasks.ocr_recognition: [OutputKeys.TEXT],
  1359. # visual grounding result for single sample
  1360. # {
  1361. # "boxes": [
  1362. # [x1, y1, x2, y2],
  1363. # [x1, y1, x2, y2],
  1364. # [x1, y1, x2, y2],
  1365. # ],
  1366. # "scores": [0.9, 0.1, 0.05, 0.05]
  1367. # }
  1368. Tasks.visual_grounding: [OutputKeys.BOXES, OutputKeys.SCORES],
  1369. # text_to_image result for samples
  1370. # {
  1371. # "output_imgs": np.ndarray list with shape [[height, width, 3], ...]
  1372. # }
  1373. Tasks.text_to_image_synthesis: [OutputKeys.OUTPUT_IMGS],
  1374. # text_to_speech result for a single sample
  1375. # {
  1376. # "output_wav": {"input_label" : bytes}
  1377. # }
  1378. Tasks.text_to_speech: [OutputKeys.OUTPUT_WAV],
  1379. # multi-modal embedding result for single sample
  1380. # {
  1381. # "img_embedding": np.array with shape [1, D],
  1382. # "text_embedding": np.array with shape [1, D]
  1383. # }
  1384. Tasks.multi_modal_embedding: [
  1385. OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING
  1386. ],
  1387. # generative multi-modal embedding result for single sample
  1388. # {
  1389. # "img_embedding": np.array with shape [1, D],
  1390. # "text_embedding": np.array with shape [1, D],
  1391. # "caption": "this is an image caption text."
  1392. # }
  1393. Tasks.generative_multi_modal_embedding: [
  1394. OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.CAPTION
  1395. ],
  1396. # multi-modal similarity result for single sample
  1397. # {
  1398. # "img_embedding": np.array with shape [1, D],
  1399. # "text_embedding": np.array with shape [1, D],
  1400. # "similarity": float
  1401. # }
  1402. Tasks.multi_modal_similarity: [
  1403. OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES
  1404. ],
  1405. # VQA result for a sample
  1406. # {"text": "this is a text answer. "}
  1407. Tasks.visual_question_answering: [OutputKeys.TEXT],
  1408. # VideoQA result for a sample
  1409. # {"text": "this is a text answer. "}
  1410. Tasks.video_question_answering: [OutputKeys.TEXT],
  1411. # Multimodal Dialogue result for a sample
  1412. # {"text": "this is a text response. "}
  1413. Tasks.multimodal_dialogue: [OutputKeys.TEXT],
  1414. # auto_speech_recognition result for a single sample
  1415. # {
  1416. # "text": "每天都要快乐喔"
  1417. # }
  1418. Tasks.auto_speech_recognition: [OutputKeys.TEXT],
  1419. # {
  1420. # "scores": [0.9, 0.1, 0.1],
  1421. # "labels": ["entailment", "contradiction", "neutral"]
  1422. # }
  1423. Tasks.visual_entailment: [OutputKeys.SCORES, OutputKeys.LABELS],
  1424. # {
  1425. # 'labels': ['吸烟', '打电话', '吸烟'],
  1426. # 'scores': [0.7527753114700317, 0.753358006477356, 0.6880350708961487],
  1427. # 'boxes': [[547, 2, 1225, 719], [529, 8, 1255, 719], [584, 0, 1269, 719]],
  1428. # 'timestamps': [1, 3, 5]
  1429. # }
  1430. Tasks.action_detection: [
  1431. OutputKeys.TIMESTAMPS,
  1432. OutputKeys.LABELS,
  1433. OutputKeys.SCORES,
  1434. OutputKeys.BOXES,
  1435. ],
  1436. # {
  1437. # 'output': [
  1438. # [{'label': '6527856', 'score': 0.9942756295204163}, {'label': '1000012000', 'score': 0.0379515215754509},
  1439. # {'label': '13421097', 'score': 2.2825044965202324e-08}],
  1440. # [{'label': '1000012000', 'score': 0.910681426525116}, {'label': '6527856', 'score': 0.0005046309670433402},
  1441. # {'label': '13421097', 'score': 2.75914817393641e-06}],
  1442. # [{'label': '1000012000', 'score': 0.910681426525116}, {'label': '6527856', 'score': 0.0005046309670433402},
  1443. # {'label': '13421097', 'score': 2.75914817393641e-06}]]
  1444. # }
  1445. Tasks.faq_question_answering: [OutputKeys.OUTPUT],
  1446. # image person reid result for single sample
  1447. # {
  1448. # "img_embedding": np.array with shape [1, D],
  1449. # }
  1450. Tasks.image_reid_person: [OutputKeys.IMG_EMBEDDING],
  1451. # {
  1452. # 'output': ['Done' / 'Decode_Error']
  1453. # }
  1454. Tasks.video_inpainting: [OutputKeys.OUTPUT],
  1455. # {
  1456. # 'output': ['bixin']
  1457. # }
  1458. Tasks.hand_static: [OutputKeys.OUTPUT],
  1459. # { 'labels': [2, 1, 0],
  1460. # 'boxes':[[[78, 282, 240, 504], [127, 87, 332, 370], [0, 0, 367, 639]]
  1461. # 'scores':[0.8202137351036072, 0.8987470269203186, 0.9679114818572998]
  1462. # }
  1463. Tasks.face_human_hand_detection: [
  1464. OutputKeys.LABELS, OutputKeys.BOXES, OutputKeys.SCORES
  1465. ],
  1466. # {
  1467. # {'output': 'Happiness', 'boxes': (203, 104, 663, 564)}
  1468. # }
  1469. Tasks.face_emotion: [OutputKeys.OUTPUT, OutputKeys.BOXES],
  1470. # {
  1471. # "masks": [
  1472. # np.array # 2D array containing only 0, 255
  1473. # ]
  1474. # }
  1475. Tasks.product_segmentation: [OutputKeys.MASKS],
  1476. # image_skychange result for a single sample
  1477. # {
  1478. # "output_img": np.ndarray with shape [height, width, 3]
  1479. # }
  1480. Tasks.image_skychange: [OutputKeys.OUTPUT_IMG],
  1481. # {
  1482. # 'score': [0.1, 0.2, 0.3, ...]
  1483. # }
  1484. Tasks.translation_evaluation: [OutputKeys.SCORE],
  1485. # video object segmentation result for a single video
  1486. # {
  1487. # "masks": [np.array # 3D array with shape [frame_num, height, width]]
  1488. # }
  1489. Tasks.video_object_segmentation: [OutputKeys.MASKS],
  1490. # motion generation result for a single input
  1491. # {
  1492. # "keypoints": [np.array # 3D array with shape [frame_num, joint_num, 3]]
  1493. # "output_video": "path_to_rendered_video"
  1494. # }
  1495. Tasks.motion_generation: [OutputKeys.KEYPOINTS, OutputKeys.OUTPUT_VIDEO],
  1496. # bad image detecting for a single input
  1497. # {
  1498. # "scores": [0.8, 0.1, 0.1]
  1499. # "labels": ["正常", "花屏", "绿屏"],
  1500. Tasks.bad_image_detecting: [OutputKeys.SCORES, OutputKeys.LABELS],
  1501. # vision efficient tuning result for single sample
  1502. # {
  1503. # "scores": [0.9, 0.1, 0.05, 0.05]
  1504. # "labels": ["dog", "horse", "cow", "cat"],
  1505. # }
  1506. Tasks.vision_efficient_tuning: [OutputKeys.SCORES, OutputKeys.LABELS],
  1507. Tasks.document_grounded_dialog_generate: [OutputKeys.TEXT],
  1508. Tasks.document_grounded_dialog_rerank: [OutputKeys.OUTPUT],
  1509. Tasks.document_grounded_dialog_retrieval: [OutputKeys.OUTPUT],
  1510. Tasks.video_temporal_grounding: [OutputKeys.SCORES, OutputKeys.TBOUNDS],
  1511. Tasks.text_to_video_synthesis: [OutputKeys.OUTPUT_VIDEO],
  1512. Tasks.text_to_360panorama_image: [OutputKeys.OUTPUT_IMG],
  1513. # Tasks.image_try_on result for a single sample
  1514. # {
  1515. # "output_img": np.ndarray with shape [height, width, 3]
  1516. # }
  1517. Tasks.image_try_on: [OutputKeys.OUTPUT_IMG],
  1518. # Tasks.human_image_generation result for a single sample
  1519. # {
  1520. # "output_img": np.ndarray with shape [height, width, 3]
  1521. # }
  1522. Tasks.human_image_generation: [OutputKeys.OUTPUT_IMG],
  1523. # Tasks.image_view_transform result for a single sample
  1524. # {
  1525. # "output_imgs": np.ndarray list with shape [[height, width, 3], ...]
  1526. # }
  1527. Tasks.image_view_transform: [OutputKeys.OUTPUT_IMGS],
  1528. Tasks.image_to_3d: [OutputKeys.MV_IMGS],
  1529. Tasks.siamese_uie: [OutputKeys.OUTPUT],
  1530. }
  1531. class ModelOutputBase(list):
  1532. def __post_init__(self):
  1533. self.reconstruct()
  1534. self.post_init = True
  1535. def reconstruct(self):
  1536. # Low performance, but low frequency.
  1537. self.clear()
  1538. for idx, key in enumerate(self.keys()):
  1539. self.append(getattr(self, key))
  1540. def __getitem__(self, item):
  1541. if isinstance(item, str):
  1542. if hasattr(self, item):
  1543. return getattr(self, item)
  1544. elif isinstance(item, (int, slice)):
  1545. return super().__getitem__(item)
  1546. raise IndexError(f'No Index {item} found in the dataclass.')
  1547. def __setitem__(self, key, value):
  1548. if isinstance(key, str):
  1549. if key in [f.name for f in fields(self)]:
  1550. if key not in self.keys():
  1551. super().__setattr__(key, value)
  1552. self.reconstruct()
  1553. elif id(getattr(self, key)) != id(value):
  1554. super().__setattr__(key, value)
  1555. super().__setitem__(self.keys().index(key), value)
  1556. else:
  1557. super().__setattr__(key, value)
  1558. elif isinstance(key, int):
  1559. super().__setitem__(key, value)
  1560. key_name = self.keys()[key]
  1561. super().__setattr__(key_name, value)
  1562. def __setattr__(self, key, value):
  1563. if getattr(self, 'post_init', False):
  1564. return self.__setitem__(key, value)
  1565. else:
  1566. return super().__setattr__(key, value)
  1567. def keys(self):
  1568. return [
  1569. f.name for f in fields(self) if getattr(self, f.name) is not None
  1570. ]
  1571. def items(self):
  1572. return self.to_dict().items()
  1573. def to_dict(self):
  1574. output = OrderedDict()
  1575. for key in self.keys():
  1576. output[key] = getattr(self, key)
  1577. return output