| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710 |
- # Copyright (c) Alibaba, Inc. and its affiliates.
- from collections import OrderedDict, namedtuple
- from dataclasses import dataclass, fields
- from typing import Dict, List, Tuple
- import numpy as np
- import torch
- from modelscope.utils.constant import Tasks
- class OutputKeys(object):
- LOSS = 'loss'
- LOGITS = 'logits'
- SCORES = 'scores'
- SCORE = 'score'
- LABEL = 'label'
- LABELS = 'labels'
- INPUT_IDS = 'input_ids'
- LABEL_POS = 'label_pos'
- POSES = 'poses'
- CAPTION = 'caption'
- BOXES = 'boxes'
- KEYPOINTS = 'keypoints'
- MASKS = 'masks'
- DEPTHS = 'depths'
- DEPTHS_COLOR = 'depths_color'
- FLOWS = 'flows'
- FLOWS_COLOR = 'flows_color'
- NORMALS = 'normals'
- NORMALS_COLOR = 'normals_color'
- LAYOUT = 'layout'
- TEXT = 'text'
- POLYGONS = 'polygons'
- OUTPUT = 'output'
- OUTPUT_IMG = 'output_img'
- OUTPUT_IMGS = 'output_imgs'
- OUTPUT_VIDEO = 'output_video'
- OUTPUT_PCM = 'output_pcm'
- OUTPUT_PCM_LIST = 'output_pcm_list'
- OUTPUT_WAV = 'output_wav'
- OUTPUT_OBJ = 'output_obj'
- OUTPUT_MESH = 'output_mesh'
- IMG_EMBEDDING = 'img_embedding'
- SPK_EMBEDDING = 'spk_embedding'
- SPO_LIST = 'spo_list'
- TEXT_EMBEDDING = 'text_embedding'
- TRANSLATION = 'translation'
- RESPONSE = 'response'
- PREDICTION = 'prediction'
- PREDICTIONS = 'predictions'
- PROBABILITIES = 'probabilities'
- DIALOG_STATES = 'dialog_states'
- VIDEO_EMBEDDING = 'video_embedding'
- PHRASE_PROTOTYPE = 'phrase_prototype'
- OBJECT_PROTOTYPE = 'object_prototype'
- SENTENCE_PROTOTYPE = 'sentence_prototype'
- EVENT_PROTOTYPE = 'event_prototype'
- TEXTVIDEO_SIM = 'textvideo_sim'
- UUID = 'uuid'
- WORD = 'word'
- KWS_LIST = 'kws_list'
- SQL_STRING = 'sql_string'
- SQL_QUERY = 'sql_query'
- HISTORY = 'history'
- QUERY_RESULT = 'query_result'
- TIMESTAMPS = 'timestamps'
- SHOT_NUM = 'shot_num'
- SCENE_NUM = 'scene_num'
- SCENE_META_LIST = 'scene_meta_list'
- SHOT_META_LIST = 'shot_meta_list'
- MATCHES = 'matches'
- PCD12 = 'pcd12'
- PCD12_ALIGN = 'pcd12_align'
- TBOUNDS = 'tbounds'
- MV_IMGS = 'MViews'
- OutputTypes = {
- OutputKeys.LOSS: float, # checked
- OutputKeys.LOGITS: np.ndarray, # checked.
- OutputKeys.SCORES: List[float], # checked
- OutputKeys.SCORE: float, # checked
- OutputKeys.LABEL: str, # checked
- OutputKeys.LABELS: List[str], # checked
- OutputKeys.INPUT_IDS: np.ndarray, # checked
- OutputKeys.LABEL_POS: np.ndarray, # checked
- OutputKeys.POSES:
- List[np.ndarray], # [Tuple(np.ndarray, np.ndarray)] # checked doubtful
- OutputKeys.CAPTION: str,
- OutputKeys.BOXES: np.ndarray, # checked
- OutputKeys.KEYPOINTS: np.ndarray, # checked
- OutputKeys.MASKS: np.ndarray, # checked
- OutputKeys.DEPTHS: List[np.ndarray], # checked
- OutputKeys.DEPTHS_COLOR: List[np.ndarray], # checked
- OutputKeys.LAYOUT: np.ndarray, # checked
- OutputKeys.TEXT: str, # checked
- OutputKeys.POLYGONS: np.array, # checked
- OutputKeys.OUTPUT: Dict,
- OutputKeys.OUTPUT_IMG: 'image', # checked
- OutputKeys.OUTPUT_IMGS: List[np.ndarray], # checked
- OutputKeys.OUTPUT_VIDEO: 'bytes',
- OutputKeys.OUTPUT_PCM: 'pcm',
- OutputKeys.OUTPUT_PCM_LIST: List[np.ndarray],
- OutputKeys.OUTPUT_WAV: 'pcm',
- OutputKeys.OUTPUT_OBJ: Dict,
- OutputKeys.OUTPUT_MESH: np.ndarray,
- OutputKeys.IMG_EMBEDDING: np.ndarray,
- OutputKeys.SPK_EMBEDDING: np.ndarray,
- OutputKeys.SPO_LIST: List[float],
- OutputKeys.TEXT_EMBEDDING: np.ndarray,
- OutputKeys.TRANSLATION: str,
- OutputKeys.RESPONSE: Dict,
- OutputKeys.PREDICTION: np.ndarray, # checked
- OutputKeys.PREDICTIONS: List[np.ndarray],
- OutputKeys.PROBABILITIES: np.ndarray,
- OutputKeys.DIALOG_STATES: object,
- OutputKeys.VIDEO_EMBEDDING: np.ndarray,
- OutputKeys.PHRASE_PROTOTYPE: np.ndarray,
- OutputKeys.OBJECT_PROTOTYPE: np.ndarray,
- OutputKeys.SENTENCE_PROTOTYPE: np.ndarray,
- OutputKeys.EVENT_PROTOTYPE: np.ndarray,
- OutputKeys.TEXTVIDEO_SIM: np.ndarray,
- OutputKeys.UUID: str,
- OutputKeys.WORD: str,
- OutputKeys.KWS_LIST: List[str],
- OutputKeys.SQL_STRING: str, # checked
- OutputKeys.SQL_QUERY: str, # checked
- OutputKeys.HISTORY: Dict, # checked
- OutputKeys.QUERY_RESULT: Dict, # checked
- OutputKeys.TIMESTAMPS: str,
- OutputKeys.SHOT_NUM: int,
- OutputKeys.SCENE_NUM: int,
- OutputKeys.SCENE_META_LIST: List[int],
- OutputKeys.SHOT_META_LIST: List[int],
- OutputKeys.MATCHES: List[np.ndarray],
- OutputKeys.PCD12: np.ndarray,
- OutputKeys.PCD12_ALIGN: np.ndarray,
- OutputKeys.TBOUNDS: Dict,
- OutputKeys.MV_IMGS: List[np.ndarray],
- }
- OutputTypeSchema = {
- OutputKeys.LOSS: {
- 'type': 'number'
- }, # checked
- OutputKeys.LOGITS: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }, # checked.
- OutputKeys.SCORES: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }, # checked
- OutputKeys.SCORE: {
- 'type': 'number'
- }, # checked
- OutputKeys.LABEL: {
- 'type': 'string'
- }, # checked
- OutputKeys.LABELS: {
- 'type': 'array',
- 'items': {
- 'type': 'string'
- }
- }, # checked
- OutputKeys.INPUT_IDS: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }, # checked
- OutputKeys.LABEL_POS: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }, # checked
- OutputKeys.POSES: {
- 'type': 'array',
- 'items': {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }
- }, # [Tuple(np.ndarray, np.ndarray)] # checked doubtful
- OutputKeys.CAPTION: {
- 'type': 'string'
- },
- OutputKeys.BOXES: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }, # checked
- OutputKeys.KEYPOINTS: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }, # checked
- OutputKeys.MASKS: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }, # checked
- OutputKeys.DEPTHS: {
- 'type': 'array',
- 'items': {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }
- }, # checked
- OutputKeys.DEPTHS_COLOR: {
- 'type': 'array',
- 'items': {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }
- }, # checked
- OutputKeys.LAYOUT: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }, # checked
- OutputKeys.TEXT: {
- 'type': 'string'
- }, # checked
- OutputKeys.POLYGONS: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }, # checked
- OutputKeys.OUTPUT: {
- 'type': 'object'
- },
- OutputKeys.OUTPUT_IMG: {
- 'type': 'string',
- 'description': 'The base64 encoded image.',
- }, # checked
- OutputKeys.OUTPUT_IMGS: {
- 'type': 'array',
- 'items': {
- 'type': 'string',
- 'description': 'The base64 encoded image.',
- }
- }, # checked
- OutputKeys.OUTPUT_VIDEO: {
- 'type': 'string',
- 'description': 'The base64 encoded video.',
- },
- OutputKeys.OUTPUT_PCM: {
- 'type': 'string',
- 'description': 'The base64 encoded PCM.',
- },
- OutputKeys.OUTPUT_PCM_LIST: {
- 'type': 'array',
- 'items': {
- 'type': 'string',
- 'description': 'The base64 encoded PCM.',
- }
- },
- OutputKeys.OUTPUT_WAV: {
- 'type': 'string',
- 'description': 'The base64 encoded WAV.',
- },
- OutputKeys.OUTPUT_OBJ: {
- 'type': 'object'
- },
- OutputKeys.OUTPUT_MESH: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- },
- OutputKeys.IMG_EMBEDDING: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- },
- OutputKeys.SPK_EMBEDDING: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- },
- OutputKeys.SPO_LIST: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- },
- OutputKeys.TEXT_EMBEDDING: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- },
- OutputKeys.TRANSLATION: {
- 'type': 'string'
- },
- OutputKeys.RESPONSE: {
- 'type': 'object'
- },
- OutputKeys.PREDICTION: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }, # checked
- OutputKeys.PREDICTIONS: {
- 'type': 'array',
- 'items': {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }
- },
- OutputKeys.PROBABILITIES: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- },
- OutputKeys.DIALOG_STATES: {
- 'type': 'object'
- },
- OutputKeys.VIDEO_EMBEDDING: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- },
- OutputKeys.PHRASE_PROTOTYPE: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- },
- OutputKeys.OBJECT_PROTOTYPE: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- },
- OutputKeys.TEXTVIDEO_SIM: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- },
- OutputKeys.UUID: {
- 'type': 'string'
- },
- OutputKeys.WORD: {
- 'type': 'string'
- },
- OutputKeys.KWS_LIST: {
- 'type': 'array',
- 'items': {
- 'type': 'string'
- }
- },
- OutputKeys.SQL_STRING: {
- 'type': 'string'
- }, # checked
- OutputKeys.SQL_QUERY: {
- 'type': 'string'
- }, # checked
- OutputKeys.HISTORY: {
- 'type': 'object'
- }, # checked
- OutputKeys.QUERY_RESULT: {
- 'type': 'object'
- }, # checked
- OutputKeys.TIMESTAMPS: {
- 'type': 'string'
- },
- OutputKeys.SHOT_NUM: {
- 'type': 'integer'
- },
- OutputKeys.SCENE_NUM: {
- 'type': 'integer'
- },
- OutputKeys.SCENE_META_LIST: {
- 'type': 'array',
- 'items': {
- 'type': 'integer'
- }
- },
- OutputKeys.SHOT_META_LIST: {
- 'type': 'array',
- 'items': {
- 'type': 'integer'
- }
- },
- OutputKeys.MATCHES: {
- 'type': 'array',
- 'items': {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }
- },
- OutputKeys.PCD12: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- },
- OutputKeys.PCD12_ALIGN: {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- },
- OutputKeys.TBOUNDS: {
- 'type': 'object'
- },
- OutputKeys.MV_IMGS: {
- 'type': 'array',
- 'items': {
- 'type': 'array',
- 'items': {
- 'type': 'number'
- }
- }
- },
- }
- TASK_OUTPUTS = {
- Tasks.task_template:
- [OutputKeys.BOXES, OutputKeys.OUTPUT_IMG, OutputKeys.TEXT_EMBEDDING],
- # ============ vision tasks ===================
- # ocr detection result for single sample
- # {
- # "polygons": np.array with shape [num_text, 8], each polygon is
- # [x1, y1, x2, y2, x3, y3, x4, y4]
- # }
- Tasks.ocr_detection: [OutputKeys.POLYGONS],
- Tasks.table_recognition: [OutputKeys.POLYGONS],
- Tasks.lineless_table_recognition: [OutputKeys.POLYGONS, OutputKeys.BOXES],
- Tasks.license_plate_detection: [OutputKeys.POLYGONS, OutputKeys.TEXT],
- Tasks.card_detection_correction: [
- OutputKeys.POLYGONS, OutputKeys.SCORES, OutputKeys.OUTPUT_IMGS,
- OutputKeys.LABELS, OutputKeys.LAYOUT
- ],
- # ocr recognition result for single sample
- # {
- # "text": "电子元器件提供BOM配单"
- # }
- Tasks.ocr_recognition: [OutputKeys.TEXT],
- Tasks.sudoku: [OutputKeys.TEXT],
- Tasks.text2sql: [OutputKeys.TEXT],
- # document vl embedding for single sample
- # {
- # "img_embedding": np.array with shape [M, D],
- # "text_embedding": np.array with shape [N, D]
- # }
- Tasks.document_vl_embedding:
- [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING],
- # face 2d keypoint result for single sample
- # {
- # "keypoints": [
- # [[x, y]*106],
- # [[x, y]*106],
- # [[x, y]*106],
- # ],
- # "poses": [
- # [pitch, roll, yaw],
- # [pitch, roll, yaw],
- # [pitch, roll, yaw],
- # ],
- # "boxes": [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ]
- # }
- Tasks.face_2d_keypoints:
- [OutputKeys.KEYPOINTS, OutputKeys.POSES, OutputKeys.BOXES],
- # face detection result for single sample
- # {
- # "scores": [0.9, 0.1, 0.05, 0.05]
- # "boxes": [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ],
- # "keypoints": [
- # [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5],
- # [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5],
- # [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5],
- # [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5],
- # ],
- # }
- Tasks.face_detection:
- [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS],
- # card detection result for single sample
- # {
- # "scores": [0.9, 0.1, 0.05, 0.05]
- # "boxes": [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ],
- # "keypoints": [
- # [x1, y1, x2, y2, x3, y3, x4, y4],
- # [x1, y1, x2, y2, x3, y3, x4, y4],
- # [x1, y1, x2, y2, x3, y3, x4, y4],
- # [x1, y1, x2, y2, x3, y3, x4, y4],
- # ],
- # }
- Tasks.card_detection:
- [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS],
- # content check result for single sample
- # {
- # "scores": [0.9] # non sexy probability
- # }
- Tasks.content_check: [OutputKeys.SCORES],
- # image driving perception result for single sample
- # {
- # "boxes": [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ],
- # "masks": [
- # [np.array], # with fixed shape(h=720, w=1280, 3) containing only 0, 1
- # [np.array], # with fixed shape(h=720, w=1280, 3) containing only 0, 1
- # ]
- # }
- Tasks.image_driving_perception: [OutputKeys.BOXES, OutputKeys.MASKS],
- # facial expression recognition result for single sample
- # {
- # "scores": [0.9]
- # "boxes": [x1, y1, x2, y2]
- # }
- Tasks.face_liveness: [OutputKeys.SCORES, OutputKeys.BOXES],
- # face quality assessment for single sample
- # {
- # "scores": [0.9]
- # "boxes": [x1, y1, x2, y2]
- # }
- Tasks.face_quality_assessment: [OutputKeys.SCORES, OutputKeys.BOXES],
- # facial expression recognition result for single sample
- # {
- # "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02],
- # "labels": ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
- # }
- Tasks.facial_expression_recognition:
- [OutputKeys.SCORES, OutputKeys.LABELS],
- Tasks.general_recognition: [OutputKeys.SCORES, OutputKeys.LABELS],
- # face processing base result for single img
- # {
- # "scores": [0.85]
- # "boxes": [x1, y1, x2, y2]
- # "keypoints": [x1, y1, x2, y2, x3, y3, x4, y4]
- # }
- Tasks.face_processing_base: [
- OutputKeys.OUTPUT_IMG, OutputKeys.SCORES, OutputKeys.BOXES,
- OutputKeys.KEYPOINTS
- ],
- # face attribute recognition result for single sample
- # {
- # "scores": [[0.9, 0.1], [0.92, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]
- # "labels": [['Male', 'Female'], [0-2, 3-9, 10-19, 20-29, 30-39, 40-49, 50-59, 60-69, 70+]]
- # }
- Tasks.face_attribute_recognition: [OutputKeys.SCORES, OutputKeys.LABELS],
- # face recognition result for single sample
- # {
- # "img_embedding": np.array with shape [1, D],
- # }
- Tasks.face_recognition: [OutputKeys.IMG_EMBEDDING],
- # human detection result for single sample
- # {
- # "scores": [0.9, 0.1, 0.05, 0.05]
- # "labels": ["person", "person", "person", "person"],
- # "boxes": [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ],
- # }
- #
- Tasks.human_detection:
- [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
- # face generation result for single sample
- # {
- # "output_img": np.array with shape(h, w, 3)
- # }
- Tasks.face_image_generation: [OutputKeys.OUTPUT_IMG],
- # image classification result for single sample
- # {
- # "scores": [0.9, 0.1, 0.05, 0.05]
- # "labels": ["dog", "horse", "cow", "cat"],
- # }
- Tasks.image_classification: [OutputKeys.SCORES, OutputKeys.LABELS],
- # object detection result for single sample
- # {
- # "scores": [0.9, 0.1, 0.05, 0.05]
- # "labels": ["dog", "horse", "cow", "cat"],
- # "boxes": [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ],
- # }
- Tasks.image_object_detection:
- [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
- Tasks.domain_specific_object_detection:
- [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
- Tasks.open_vocabulary_detection:
- [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
- # video object detection result for single sample
- # {
- # "scores": [[0.8, 0.25, 0.05, 0.05], [0.9, 0.1, 0.05, 0.05]]
- # "labels": [["person", "traffic light", "car", "bus"],
- # ["person", "traffic light", "car", "bus"]]
- # "boxes":
- # [
- # [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ],
- # [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ]
- # ],
- # }
- Tasks.video_object_detection:
- [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],
- # 3d object detection result for single sample
- # {
- # "output_img": np.array with shape(h, w, 3)
- # }
- Tasks.object_detection_3d: [OutputKeys.OUTPUT_IMG],
- # instance segmentation result for single sample
- # {
- # "scores": [0.9, 0.1, 0.05, 0.05],
- # "labels": ["dog", "horse", "cow", "cat"],
- # "masks": [
- # np.array # 2D array containing only 0, 1
- # ]
- # }
- Tasks.image_segmentation:
- [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.MASKS],
- # video panoptic segmentation result for single sample
- # "scores": [[0.8, 0.25, 0.05, 0.05], [0.9, 0.1, 0.05, 0.05]]
- # "labels": [["person", "traffic light", "car", "bus"],
- # ["person", "traffic light", "car", "bus"]]
- # "masks": [ #array containing only 0, 1
- # [np.array, np.array, np.array, np.array],
- # [np.array, np.array, np.array, np.array],
- # ]
- # "boxes":
- # [
- # [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ],
- # [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ]
- # ],
- # "uuid": [[0, 1, 2, 3],[0, 1, 2, 3]]
- # }
- Tasks.video_panoptic_segmentation: [
- OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.MASKS,
- OutputKeys.BOXES, OutputKeys.UUID
- ],
- # semantic segmentation result for single sample
- # {
- # "masks": [np.array # 2D array with shape [height, width]]
- # }
- Tasks.semantic_segmentation: [OutputKeys.MASKS],
- # image matting result for single sample
- # {
- # "output_img": np.array with shape(h, w, 4)
- # for matting or (h, w, 3) for general purpose
- # , shape(h, w) for crowd counting
- # }
- Tasks.portrait_matting: [OutputKeys.OUTPUT_IMG],
- Tasks.universal_matting: [OutputKeys.OUTPUT_IMG],
- Tasks.image_deblurring: [OutputKeys.OUTPUT_IMG],
- Tasks.image_face_fusion: [OutputKeys.OUTPUT_IMG],
- # image_quality_assessment_mos result for a single image is a score in range [0, 1]
- # {0.5}
- Tasks.image_quality_assessment_mos: [OutputKeys.SCORE],
- # image editing task result for a single image
- # {"output_img": np.array with shape (h, w, 3)}
- Tasks.skin_retouching: [OutputKeys.OUTPUT_IMG],
- Tasks.image_super_resolution: [OutputKeys.OUTPUT_IMG],
- Tasks.image_super_resolution_pasd: [OutputKeys.OUTPUT_IMG],
- Tasks.image_colorization: [OutputKeys.OUTPUT_IMG],
- Tasks.image_color_enhancement: [OutputKeys.OUTPUT_IMG],
- Tasks.image_denoising: [OutputKeys.OUTPUT_IMG],
- Tasks.image_editing: [OutputKeys.OUTPUT_IMG],
- Tasks.image_portrait_enhancement: [OutputKeys.OUTPUT_IMG],
- Tasks.crowd_counting: [OutputKeys.SCORES, OutputKeys.OUTPUT_IMG],
- Tasks.image_inpainting: [OutputKeys.OUTPUT_IMG],
- Tasks.image_paintbyexample: [OutputKeys.OUTPUT_IMG],
- Tasks.controllable_image_generation: [OutputKeys.OUTPUT_IMG],
- # image generation task result for a single image
- # {"output_img": np.array with shape (h, w, 3)}
- Tasks.image_to_image_generation: [OutputKeys.OUTPUT_IMG],
- Tasks.image_to_image_translation: [OutputKeys.OUTPUT_IMG],
- Tasks.image_style_transfer: [OutputKeys.OUTPUT_IMG],
- Tasks.image_portrait_stylization: [OutputKeys.OUTPUT_IMG],
- Tasks.image_body_reshaping: [OutputKeys.OUTPUT_IMG],
- # video editing task result for a single video
- # {"output_video": "path_to_rendered_video"}
- Tasks.video_frame_interpolation: [OutputKeys.OUTPUT_VIDEO],
- Tasks.video_super_resolution: [OutputKeys.OUTPUT_VIDEO],
- Tasks.video_deinterlace: [OutputKeys.OUTPUT_VIDEO],
- Tasks.nerf_recon_acc: [OutputKeys.OUTPUT],
- Tasks.nerf_recon_vq_compression: [OutputKeys.OUTPUT],
- Tasks.surface_recon_common: [OutputKeys.OUTPUT],
- Tasks.video_colorization: [OutputKeys.OUTPUT_VIDEO],
- Tasks.image_control_3d_portrait: [OutputKeys.OUTPUT],
- Tasks.self_supervised_depth_completion: [OutputKeys.OUTPUT_IMG],
- # image quality assessment degradation result for single image
- # {
- # "scores": [0.885272, 0.014790631, 0.014558001]
- # "labels": ['噪声强度', '模糊程度', '压缩强度'],
- # }
- Tasks.image_quality_assessment_degradation: [
- OutputKeys.SCORES, OutputKeys.LABELS
- ],
- # live category recognition result for single video
- # {
- # "scores": [0.885272, 0.014790631, 0.014558001]
- # "labels": ['女装/女士精品>>棉衣/棉服', '女装/女士精品>>牛仔裤', '女装/女士精品>>裤子>>休闲裤'],
- # }
- Tasks.live_category: [OutputKeys.SCORES, OutputKeys.LABELS],
- # action recognition result for single video
- # {
- # "output_label": "abseiling"
- # }
- Tasks.action_recognition: [OutputKeys.LABELS],
- # human body keypoints detection result for single sample
- # {
- # "keypoints": [
- # [[x, y]*15],
- # [[x, y]*15],
- # [[x, y]*15]
- # ]
- # "scores": [
- # [[score]*15],
- # [[score]*15],
- # [[score]*15]
- # ]
- # "boxes": [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ]
- # }
- Tasks.body_2d_keypoints: [
- OutputKeys.KEYPOINTS, OutputKeys.SCORES, OutputKeys.BOXES
- ],
- # 3D human body keypoints detection result for single sample
- # {
- # "keypoints": [ # 3d pose coordinate in camera coordinate
- # [[x, y, z]*17], # joints of per image
- # [[x, y, z]*17],
- # ...
- # ],
- # "timestamps": [ # timestamps of all frames
- # "00:00:0.230",
- # "00:00:0.560",
- # "00:00:0.690",
- # ],
- # "output_video": "path_to_rendered_video" , this is optional
- # and is only available when the "render" option is enabled.
- # }
- Tasks.body_3d_keypoints: [
- OutputKeys.KEYPOINTS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO
- ],
- # pedestrian attribute recognition result for single sample
- # {
- # "boxes": [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ]
- # "labels": [
- # ['Female', 'AgeOver60', 'Front', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes',
- # 'LongSleeve', 'Black', 'Trousers', 'Black' ],
- # ['Female', 'AgeOver60', 'Front', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes',
- # 'LongSleeve', 'Black', 'Trousers', 'Black' ],
- # ['Female', 'AgeOver60', 'Front', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes',
- # 'LongSleeve', 'Black', 'Trousers', 'Black' ],
- # ]
- # }
- Tasks.pedestrian_attribute_recognition: [
- OutputKeys.BOXES, OutputKeys.LABELS
- ],
- # 3D face reconstruction result for single sample
- # {
- # "output_obj": io.BytesIO,
- # "output_img": np.array with shape(h, w, 3),
- # "output": {
- # "mesh": {
- # "vertices": np.array with shape(n, 3),
- # "faces": np.array with shape(n, 3),
- # "faces_uv": np.array with shape(n, 3),
- # "faces_normal": np.array with shape(n, 3),
- # "UVs": np.array with shape(n, 2),
- # "normals": np.array with shape(n, 3),
- # },
- # "vis_image": np.array with shape(h, w, 3),
- # "frame_list", [np.array with shape(h, w, 3), ...],
- # }
- # }
- Tasks.face_reconstruction: [OutputKeys.OUTPUT],
- Tasks.human3d_render: [OutputKeys.OUTPUT],
- Tasks.human3d_animation: [OutputKeys.OUTPUT],
- # 3D head reconstruction result for single sample
- # {
- # "output_obj": io.BytesIO,
- # "output_img": np.array with shape(h, w, 3),
- # "output": {
- # "mesh": {
- # "vertices": np.array with shape(n, 3),
- # "faces": np.array with shape(n, 3),
- # "faces_uv": np.array with shape(n, 3),
- # "faces_normal": np.array with shape(n, 3),
- # "UVs": np.array with shape(n, 2),
- # "normals": np.array with shape(n, 3),
- # },
- # }
- # }
- Tasks.head_reconstruction: [OutputKeys.OUTPUT],
- # text to head result for text input
- # {
- # "output_obj": io.BytesIO,
- # "output_img": np.array with shape(h, w, 3),
- # "output": {
- # "mesh": {
- # "vertices": np.array with shape(n, 3),
- # "faces": np.array with shape(n, 3),
- # "faces_uv": np.array with shape(n, 3),
- # "faces_normal": np.array with shape(n, 3),
- # "UVs": np.array with shape(n, 2),
- # "normals": np.array with shape(n, 3),
- # },
- # },
- # "image": np.array with shape(h, w, 3),
- # }
- Tasks.text_to_head: [OutputKeys.OUTPUT],
- # 3D human reconstruction result for single sample
- # {
- # "output": {
- # "vertices": np.array with shape(n, 3),
- # "faces": np.array with shape(n, 3),
- # "colors": np.array with shape(n, 3),
- # }
- # }
- Tasks.human_reconstruction: [OutputKeys.OUTPUT],
- # 3D text 2 texture generation result
- # {
- # "output": {
- # "Done"
- # }
- # }
- Tasks.text_texture_generation: [OutputKeys.OUTPUT],
- # 2D hand keypoints result for single sample
- # {
- # "keypoints": [
- # [[x, y, score] * 21],
- # [[x, y, score] * 21],
- # [[x, y, score] * 21],
- # ],
- # "boxes": [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ]
- # }
- Tasks.hand_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.BOXES],
- # video single object tracking result for single video
- # {
- # "boxes": [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ],
- # "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"]
- # }
- Tasks.video_single_object_tracking: [
- OutputKeys.BOXES, OutputKeys.TIMESTAMPS
- ],
- # video multi object tracking result for single video
- # {
- # "boxes": [
- # [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ...
- # ],
- # [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ...
- # ],
- # [
- # [x1, y1, x2, y2]
- # ...
- # ]
- # ],
- # "labels": [[obj_id0, obj_id1, ...], [obj_id1, obj_id2, ...], [obj_id3, ...]],
- # "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"]
- # }
- Tasks.video_multi_object_tracking: [
- OutputKeys.BOXES, OutputKeys.LABELS, OutputKeys.TIMESTAMPS
- ],
- # live category recognition result for single video
- # {
- # "scores": [0.885272, 0.014790631, 0.014558001],
- # 'labels': ['修身型棉衣', '高腰牛仔裤', '休闲连体裤']
- # }
- Tasks.live_category: [OutputKeys.SCORES, OutputKeys.LABELS],
- # video category recognition result for single video
- # {
- # "scores": [0.7716429233551025],
- # "labels": ['生活>>好物推荐']
- # }
- Tasks.video_category: [OutputKeys.SCORES, OutputKeys.LABELS],
- # image embedding result for a single image
- # {
- # "image_bedding": np.array with shape [D]
- # }
- Tasks.product_retrieval_embedding: [OutputKeys.IMG_EMBEDDING],
- # video embedding result for single video
- # {
- # "video_embedding": np.array with shape [D],
- # }
- Tasks.video_embedding: [OutputKeys.VIDEO_EMBEDDING],
- # phrase prototype result for single sentence
- # {
- # "phrase_prototype": np.array with shape [K*D],
- # }
- # sentence prototype result for single sentence
- # {
- # "sentence_prototype": np.array with shape [1*D],
- # }
- # object prototype result for single video
- # {
- # "object_prototype": np.array with shape [N*K*D],
- # }
- # event prototype result for single video
- # {
- # "event_prototype": np.array with shape [N*M*D],
- # }
- # text search video result for single sentence
- # {
- # "textvideo_sim": np.array with shape [N*N],
- # }
- Tasks.text_video_retrieval: [
- OutputKeys.PHRASE_PROTOTYPE, OutputKeys.SENTENCE_PROTOTYPE,
- OutputKeys.OBJECT_PROTOTYPE, OutputKeys.EVENT_PROTOTYPE,
- OutputKeys.TEXTVIDEO_SIM
- ],
- # video stabilization task result for a single video
- # {"output_video": "path_to_rendered_video"}
- Tasks.video_stabilization: [OutputKeys.OUTPUT_VIDEO],
- # virtual_try_on result for a single sample
- # {
- # "output_img": np.ndarray with shape [height, width, 3]
- # }
- Tasks.virtual_try_on: [OutputKeys.OUTPUT_IMG],
- # text driven segmentation result for single sample
- # {
- # "masks": [
- # np.array # 2D array containing only 0, 255
- # ]
- # }
- Tasks.text_driven_segmentation: [OutputKeys.MASKS],
- # shop segmentation result for single sample
- # {
- # "masks": [
- # np.array # 2D array containing only 0, 255
- # ]
- # }
- Tasks.shop_segmentation: [OutputKeys.MASKS],
- # movide scene segmentation result for a single video
- # {
- # "shot_num":15,
- # "shot_meta_list":
- # [
- # {
- # "frame": [start_frame, end_frame],
- # "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
- #
- # }
- # ]
- # "scene_num":3,
- # "scene_meta_list":
- # [
- # {
- # "shot": [0,1,2],
- # "frame": [start_frame, end_frame],
- # "timestamps": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
- # }
- # ]
- #
- # }
- Tasks.movie_scene_segmentation: [
- OutputKeys.SHOT_NUM, OutputKeys.SHOT_META_LIST, OutputKeys.SCENE_NUM,
- OutputKeys.SCENE_META_LIST
- ],
- # human whole body keypoints detection result for single sample
- # {
- # "keypoints": [
- # [[x, y]*133],
- # [[x, y]*133],
- # [[x, y]*133]
- # ]
- # "boxes": [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ]
- # }
- Tasks.human_wholebody_keypoint: [OutputKeys.KEYPOINTS, OutputKeys.BOXES],
- # video summarization result for a single video
- # {
- # "output":
- # [
- # {
- # "frame": [start_frame, end_frame]
- # "timestamps": [start_time, end_time]
- # },
- # {
- # "frame": [start_frame, end_frame]
- # "timestamps": [start_time, end_time]
- # }
- # ]
- # }
- Tasks.video_summarization: [OutputKeys.OUTPUT],
- # referring video object segmentation result for a single video
- # {
- # "masks": [np.array # 3D array with shape [frame_num, height, width]]
- # "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"]
- # "output_video": "path_to_rendered_video" , this is optional
- # and is only available when the "render" option is enabled.
- # }
- Tasks.referring_video_object_segmentation: [
- OutputKeys.MASKS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO
- ],
- # video human matting result for a single video
- # {
- # "masks": [np.array # 2D array with shape [height, width]]
- # "output_video": "path_to_matting_video"
- # }
- Tasks.video_human_matting: [OutputKeys.MASKS, OutputKeys.OUTPUT_VIDEO],
- # ============ nlp tasks ===================
- # text classification result for single sample
- # {
- # "scores": [0.9, 0.1, 0.05, 0.05]
- # "labels": ["happy", "sad", "calm", "angry"],
- # }
- Tasks.text_classification: [OutputKeys.SCORES, OutputKeys.LABELS],
- # sentence similarity result for single sample
- # {
- # "scores": 0.9
- # "labels": "1",
- # }
- Tasks.sentence_similarity: [OutputKeys.SCORES, OutputKeys.LABELS],
- # nli result for single sample
- # {
- # "labels": ["happy", "sad", "calm", "angry"],
- # "scores": [0.9, 0.1, 0.05, 0.05]
- # }
- Tasks.nli: [OutputKeys.SCORES, OutputKeys.LABELS],
- # sentiment classification result for single sample
- # {
- # 'scores': [0.07183828949928284, 0.9281617403030396],
- # 'labels': ['1', '0']
- # }
- Tasks.sentiment_classification: [OutputKeys.SCORES, OutputKeys.LABELS],
- # zero-shot classification result for single sample
- # {
- # "scores": [0.9, 0.1, 0.05, 0.05]
- # "labels": ["happy", "sad", "calm", "angry"],
- # }
- Tasks.zero_shot_classification: [OutputKeys.SCORES, OutputKeys.LABELS],
- # relation extraction result for a single sample
- # {
- # "uuid": "人生信息-1",
- # "text": "《父老乡亲》是由是由由中国人民解放军海政文工团创作的军旅歌曲,石顺义作词,王锡仁作曲,范琳琳演唱",
- # "spo_list": [{"subject": "石顺义", "predicate": "国籍", "object": "中国"}]
- # }
- Tasks.relation_extraction: [OutputKeys.SPO_LIST],
- # translation result for a source sentence
- # {
- # "translation": “北京是中国的首都”
- # }
- Tasks.translation: [OutputKeys.TRANSLATION],
- # word segmentation result for single sample
- # {
- # "output": ["今天", "天气", "不错", ",", "适合", "出去", "游玩"]
- # }
- # {
- # 'output': ['รถ', 'คัน', 'เก่า', 'ก็', 'ยัง', 'เก็บ', 'เอา']
- # }
- Tasks.word_segmentation: [OutputKeys.OUTPUT],
- # TODO @wenmeng.zwm support list of result check
- # named entity recognition result for single sample
- # {
- # "output": [
- # {"type": "LOC", "start": 2, "end": 5, "span": "温岭市"},
- # {"type": "LOC", "start": 5, "end": 8, "span": "新河镇"}
- # ]
- # }
- Tasks.named_entity_recognition: [OutputKeys.OUTPUT],
- Tasks.part_of_speech: [OutputKeys.OUTPUT],
- # text_error_correction result for a single sample
- # {
- # "output": "我想吃苹果"
- # }
- Tasks.text_error_correction: [OutputKeys.OUTPUT],
- # word_alignment result for a single sample
- # {
- # "output": "0-0 1-3 2-4 3-1 4-2 5-5"
- # }
- Tasks.word_alignment: [OutputKeys.OUTPUT],
- Tasks.sentence_embedding: [OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES],
- Tasks.text_ranking: [OutputKeys.SCORES],
- # text generation result for single sample
- # {
- # "text": "this is the text generated by a model."
- # }
- Tasks.text_generation: [OutputKeys.TEXT],
- # chat task result for single sample
- # {
- # "response": "this is the chat response generated by a model.",
- # "history": [("hi", "nice to meet you"),("I felt happy, and you", "me too")]
- # }
- Tasks.chat: [OutputKeys.RESPONSE, OutputKeys.HISTORY],
- # fid dialogue result for single sample
- # {
- # "text": "My name is Mike"
- # }
- Tasks.fid_dialogue: [OutputKeys.TEXT],
- # summarization result for single sample
- # {
- # "text": "this is the text generated by a model."
- # }
- Tasks.text_summarization: [OutputKeys.TEXT],
- # text generation result for single sample
- # {
- # "text": "北京"
- # }
- Tasks.text2text_generation: [OutputKeys.TEXT],
- # fill mask result for single sample
- # {
- # "text": "this is the text which masks filled by model."
- # }
- Tasks.fill_mask: [OutputKeys.TEXT],
- # feature extraction result for single sample
- # {
- # "text_embedding": [[
- # [1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04],
- # [6.45841064e-05, 1.15997791e-04, 5.11605394e-05, 9.87020373e-01],
- # [2.66957268e-05, 4.72324500e-05, 9.74208378e-05, 4.18022355e-05]
- # ],
- # [
- # [2.97343540e-05, 5.81317654e-05, 5.44203431e-05, 6.28319322e-05],
- # [8.24327726e-05, 4.66077945e-05, 5.32869453e-05, 4.16190960e-05],
- # [3.61441926e-05, 3.38475402e-05, 3.44323053e-05, 5.70138109e-05]
- # ]
- # ]
- # }
- Tasks.feature_extraction: [OutputKeys.TEXT_EMBEDDING],
- # (Deprecated) dialog intent prediction result for single sample
- # {'output': {'prediction': array([2.62349960e-03, 4.12110658e-03, 4.12748595e-05, 3.77560973e-05,
- # 1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04,
- # 6.45841064e-05, 1.15997791e-04, 5.11605394e-05, 9.87020373e-01,
- # 2.66957268e-05, 4.72324500e-05, 9.74208378e-05, 4.18022355e-05,
- # 2.97343540e-05, 5.81317654e-05, 5.44203431e-05, 6.28319322e-05,
- # 7.34537680e-05, 6.61411541e-05, 3.62534920e-05, 8.58885178e-05,
- # 8.24327726e-05, 4.66077945e-05, 5.32869453e-05, 4.16190960e-05,
- # 5.97518992e-05, 3.92273068e-05, 3.44069012e-05, 9.92335918e-05,
- # 9.25978165e-05, 6.26462061e-05, 3.32317031e-05, 1.32061413e-03,
- # 2.01607945e-05, 3.36636294e-05, 3.99156743e-05, 5.84108493e-05,
- # 2.53432900e-05, 4.95731190e-04, 2.64443643e-05, 4.46992999e-05,
- # 2.42672231e-05, 4.75615161e-05, 2.66230145e-05, 4.00083954e-05,
- # 2.90536875e-04, 4.23891543e-05, 8.63691166e-05, 4.98188965e-05,
- # 3.47019341e-05, 4.52718523e-05, 4.20905781e-05, 5.50173208e-05,
- # 4.92360487e-05, 3.56021264e-05, 2.13957210e-05, 6.17428886e-05,
- # 1.43893281e-04, 7.32152112e-05, 2.91354867e-04, 2.46623786e-05,
- # 3.61441926e-05, 3.38475402e-05, 3.44323053e-05, 5.70138109e-05,
- # 4.31488479e-05, 4.94503947e-05, 4.30105974e-05, 1.00963116e-04,
- # 2.82062047e-05, 1.15582036e-04, 4.48261271e-05, 3.99339879e-05,
- # 7.27692823e-05], dtype=float32), 'label_pos': array([11]), 'label': 'lost_or_stolen_card'}}
- # (Deprecated) dialog modeling prediction result for single sample
- # {'output' : ['you', 'are', 'welcome', '.', 'have', 'a', 'great', 'day', '!']}
- # (Deprecated) dialog state tracking result for single sample
- # {
- # "output":{
- # "dialog_states": {
- # "taxi-leaveAt": "none",
- # "taxi-destination": "none",
- # "taxi-departure": "none",
- # "taxi-arriveBy": "none",
- # "restaurant-book_people": "none",
- # "restaurant-book_day": "none",
- # "restaurant-book_time": "none",
- # "restaurant-food": "none",
- # "restaurant-pricerange": "none",
- # "restaurant-name": "none",
- # "restaurant-area": "none",
- # "hotel-book_people": "none",
- # "hotel-book_day": "none",
- # "hotel-book_stay": "none",
- # "hotel-name": "none",
- # "hotel-area": "none",
- # "hotel-parking": "none",
- # "hotel-pricerange": "cheap",
- # "hotel-stars": "none",
- # "hotel-internet": "none",
- # "hotel-type": "true",
- # "attraction-type": "none",
- # "attraction-name": "none",
- # "attraction-area": "none",
- # "train-book_people": "none",
- # "train-leaveAt": "none",
- # "train-destination": "none",
- # "train-day": "none",
- # "train-arriveBy": "none",
- # "train-departure": "none"
- # }
- # }
- # }
- Tasks.task_oriented_conversation: [OutputKeys.OUTPUT],
- # table-question-answering result for single sample
- # {
- # "sql": "SELECT shop.Name FROM shop."
- # "sql_history": {sel: 0, agg: 0, conds: [[0, 0, 'val']]}
- # }
- Tasks.table_question_answering: [OutputKeys.OUTPUT],
- # ============ audio tasks ===================
- # asr result for single sample
- # { "text": "每一天都要快乐喔"}
- Tasks.auto_speech_recognition: [OutputKeys.TEXT],
- # itn result for single sample
- # {"text": "123"}
- Tasks.inverse_text_processing: [OutputKeys.TEXT],
- # speaker verification for single compare task
- # {'score': 84.2332}
- Tasks.speaker_verification: [OutputKeys.SCORES],
- # speaker diarization dialogue detection for binary results: dialogue or non_dialogue
- # {
- # "scores": [0.98, 0.02],
- # "labels": ["dialogue", "non_dialogue"],
- # }
- Tasks.speaker_diarization_dialogue_detection: [
- OutputKeys.SCORES, OutputKeys.LABELS
- ],
- Tasks.speech_language_recognition: [OutputKeys.TEXT],
- # punctuation result for single sample
- # { "text": "你好,明天!"}
- Tasks.punctuation: [OutputKeys.TEXT],
- # speaker diarization semantic speaker-turn detection
- # {
- # "logits": [[0.7, 0.3], ..., [0.88, 0.12]],
- # "text": "您好。您好,初次见面请多指教。",
- # "prediction": [-100, -100, -100, 1, -100,..., -100, 0]
- # }
- Tasks.speaker_diarization_semantic_speaker_turn_detection: [
- OutputKeys.LOGITS, OutputKeys.TEXT, OutputKeys.PREDICTION
- ],
- # language model result for single sample
- # { "text": " hel@@ lo 大 家 好 呀 </s>
- # p( hel@@ | <s> ) = 0.00057767 [ -7.45650959 ]
- # p( lo | hel@@ ) = 0.99832278 [ -0.00167861 ]
- # p( 大 | lo ) = 0.49116334 [ -0.71097857 ]
- # p( 家 | 大 ) = 0.99691027 [ -0.00309453 ]
- # p( 好 | 家 ) = 0.97999156 [ -0.02021134 ]
- # p( 呀 | 好 ) = 0.00461205 [ -5.37908363 ]
- # p( </s> | 呀 ) = 0.01524554 [ -4.18346834 ]
- # logprob= -17.755 ppl= 12.6345
- # "}
- Tasks.language_score_prediction: [OutputKeys.TEXT],
- # speech timestamp result for single sample
- # {
- # 'text': '<sil> 0.000 0.376;一 0.376 0.556;个 0.556 0.796;东 0.796 0.976;
- # 太 0.976 1.136;平 1.136 1.256;洋 1.256 1.436;国 1.436 1.676;
- # <sil> 1.676 1.676;家 1.676 1.916;<sil> 1.916 2.036;为 2.036 2.196;
- # 什 2.196 2.316;么 2.316 2.496;跑 2.496 2.676;到 2.676 2.856;
- # 西 2.856 3.036;太 3.036 3.196;平 3.196 3.376;洋 3.376 3.496;
- # 来 3.496 3.636;了 3.636 3.796;呢 3.796 4.148;<sil> 4.148 4.440;',
- # 'timestamp': [[0, 376], [376, 556], [556, 795], [795, 976],
- # [976, 1136], [1136, 1256], [1256, 1436], [1436, 1676],
- # [1676, 1676], [1676, 1916], [1916, 2036], [2036, 2196],
- # [2196, 2316], [2316, 2496], [2496, 2676], [2676, 2856],
- # [2856, 3036], [3036, 3196], [3196, 3376], [3376, 3496]]
- # }
- Tasks.speech_timestamp: [OutputKeys.TEXT],
- # audio processed for single file in PCM format
- # {
- # "output_pcm": pcm encoded audio bytes
- # }
- Tasks.speech_signal_process: [OutputKeys.OUTPUT_PCM],
- Tasks.acoustic_echo_cancellation: [OutputKeys.OUTPUT_PCM],
- Tasks.acoustic_noise_suppression: [OutputKeys.OUTPUT_PCM],
- Tasks.speech_separation: [OutputKeys.OUTPUT_PCM_LIST],
- # text_to_speech result for a single sample
- # {
- # "output_wav": {"input_label" : bytes}
- # }
- Tasks.text_to_speech: [OutputKeys.OUTPUT_WAV],
- # {
- # "kws_list": [
- # {
- # 'keyword': '', # the keyword spotted
- # 'offset': 19.4, # the keyword start time in second
- # 'length': 0.68, # the keyword length in second
- # 'confidence': 0.85 # the possibility if it is the keyword
- # },
- # ...
- # ]
- # }
- Tasks.keyword_spotting: [OutputKeys.KWS_LIST],
- # ============ multi-modal tasks ===================
- # image caption result for single sample
- # {
- # "caption": "this is an image caption text."
- # }
- Tasks.image_captioning: [OutputKeys.CAPTION],
- # video caption result for single sample
- # {
- # "caption": "this is an video caption text."
- # }
- Tasks.video_captioning: [OutputKeys.CAPTION],
- Tasks.ocr_recognition: [OutputKeys.TEXT],
- # visual grounding result for single sample
- # {
- # "boxes": [
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # [x1, y1, x2, y2],
- # ],
- # "scores": [0.9, 0.1, 0.05, 0.05]
- # }
- Tasks.visual_grounding: [OutputKeys.BOXES, OutputKeys.SCORES],
- # text_to_image result for samples
- # {
- # "output_imgs": np.ndarray list with shape [[height, width, 3], ...]
- # }
- Tasks.text_to_image_synthesis: [OutputKeys.OUTPUT_IMGS],
- # text_to_speech result for a single sample
- # {
- # "output_wav": {"input_label" : bytes}
- # }
- Tasks.text_to_speech: [OutputKeys.OUTPUT_WAV],
- # multi-modal embedding result for single sample
- # {
- # "img_embedding": np.array with shape [1, D],
- # "text_embedding": np.array with shape [1, D]
- # }
- Tasks.multi_modal_embedding: [
- OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING
- ],
- # generative multi-modal embedding result for single sample
- # {
- # "img_embedding": np.array with shape [1, D],
- # "text_embedding": np.array with shape [1, D],
- # "caption": "this is an image caption text."
- # }
- Tasks.generative_multi_modal_embedding: [
- OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.CAPTION
- ],
- # multi-modal similarity result for single sample
- # {
- # "img_embedding": np.array with shape [1, D],
- # "text_embedding": np.array with shape [1, D],
- # "similarity": float
- # }
- Tasks.multi_modal_similarity: [
- OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES
- ],
- # VQA result for a sample
- # {"text": "this is a text answser. "}
- Tasks.visual_question_answering: [OutputKeys.TEXT],
- # VideoQA result for a sample
- # {"text": "this is a text answser. "}
- Tasks.video_question_answering: [OutputKeys.TEXT],
- # Multimodal Dialogue result for a sample
- # {"text": "this is a text response. "}
- Tasks.multimodal_dialogue: [OutputKeys.TEXT],
- # auto_speech_recognition result for a single sample
- # {
- # "text": "每天都要快乐喔"
- # }
- Tasks.auto_speech_recognition: [OutputKeys.TEXT],
- # {
- # "scores": [0.9, 0.1, 0.1],
- # "labels": ["entailment", "contradiction", "neutral"]
- # }
- Tasks.visual_entailment: [OutputKeys.SCORES, OutputKeys.LABELS],
- # {
- # 'labels': ['吸烟', '打电话', '吸烟'],
- # 'scores': [0.7527753114700317, 0.753358006477356, 0.6880350708961487],
- # 'boxes': [[547, 2, 1225, 719], [529, 8, 1255, 719], [584, 0, 1269, 719]],
- # 'timestamps': [1, 3, 5]
- # }
- Tasks.action_detection: [
- OutputKeys.TIMESTAMPS,
- OutputKeys.LABELS,
- OutputKeys.SCORES,
- OutputKeys.BOXES,
- ],
- # {
- # 'output': [
- # [{'label': '6527856', 'score': 0.9942756295204163}, {'label': '1000012000', 'score': 0.0379515215754509},
- # {'label': '13421097', 'score': 2.2825044965202324e-08}],
- # [{'label': '1000012000', 'score': 0.910681426525116}, {'label': '6527856', 'score': 0.0005046309670433402},
- # {'label': '13421097', 'score': 2.75914817393641e-06}],
- # [{'label': '1000012000', 'score': 0.910681426525116}, {'label': '6527856', 'score': 0.0005046309670433402},
- # {'label': '13421097', 'score': 2.75914817393641e-06}]]
- # }
- Tasks.faq_question_answering: [OutputKeys.OUTPUT],
- # image person reid result for single sample
- # {
- # "img_embedding": np.array with shape [1, D],
- # }
- Tasks.image_reid_person: [OutputKeys.IMG_EMBEDDING],
- # {
- # 'output': ['Done' / 'Decode_Error']
- # }
- Tasks.video_inpainting: [OutputKeys.OUTPUT],
- # {
- # 'output': ['bixin']
- # }
- Tasks.hand_static: [OutputKeys.OUTPUT],
- # { 'labels': [2, 1, 0],
- # 'boxes':[[[78, 282, 240, 504], [127, 87, 332, 370], [0, 0, 367, 639]]
- # 'scores':[0.8202137351036072, 0.8987470269203186, 0.9679114818572998]
- # }
- Tasks.face_human_hand_detection: [
- OutputKeys.LABELS, OutputKeys.BOXES, OutputKeys.SCORES
- ],
- # {
- # {'output': 'Happiness', 'boxes': (203, 104, 663, 564)}
- # }
- Tasks.face_emotion: [OutputKeys.OUTPUT, OutputKeys.BOXES],
- # {
- # "masks": [
- # np.array # 2D array containing only 0, 255
- # ]
- # }
- Tasks.product_segmentation: [OutputKeys.MASKS],
- # image_skychange result for a single sample
- # {
- # "output_img": np.ndarray with shape [height, width, 3]
- # }
- Tasks.image_skychange: [OutputKeys.OUTPUT_IMG],
- # {
- # 'score': [0.1, 0.2, 0.3, ...]
- # }
- Tasks.translation_evaluation: [OutputKeys.SCORE],
- # video object segmentation result for a single video
- # {
- # "masks": [np.array # 3D array with shape [frame_num, height, width]]
- # }
- Tasks.video_object_segmentation: [OutputKeys.MASKS],
- # motion generation result for a single input
- # {
- # "keypoints": [np.array # 3D array with shape [frame_num, joint_num, 3]]
- # "output_video": "path_to_rendered_video"
- # }
- Tasks.motion_generation: [OutputKeys.KEYPOINTS, OutputKeys.OUTPUT_VIDEO],
- # bad image detecting for a single input
- # {
- # "scores": [0.8, 0.1, 0.1]
- # "labels": ["正常", "花屏", "绿屏"],
- Tasks.bad_image_detecting: [OutputKeys.SCORES, OutputKeys.LABELS],
- # vision efficient tuning result for single sample
- # {
- # "scores": [0.9, 0.1, 0.05, 0.05]
- # "labels": ["dog", "horse", "cow", "cat"],
- # }
- Tasks.vision_efficient_tuning: [OutputKeys.SCORES, OutputKeys.LABELS],
- Tasks.document_grounded_dialog_generate: [OutputKeys.TEXT],
- Tasks.document_grounded_dialog_rerank: [OutputKeys.OUTPUT],
- Tasks.document_grounded_dialog_retrieval: [OutputKeys.OUTPUT],
- Tasks.video_temporal_grounding: [OutputKeys.SCORES, OutputKeys.TBOUNDS],
- Tasks.text_to_video_synthesis: [OutputKeys.OUTPUT_VIDEO],
- Tasks.text_to_360panorama_image: [OutputKeys.OUTPUT_IMG],
- # Tasks.image_try_on result for a single sample
- # {
- # "output_img": np.ndarray with shape [height, width, 3]
- # }
- Tasks.image_try_on: [OutputKeys.OUTPUT_IMG],
- # Tasks.human_image_generation result for a single sample
- # {
- # "output_img": np.ndarray with shape [height, width, 3]
- # }
- Tasks.human_image_generation: [OutputKeys.OUTPUT_IMG],
- # Tasks.image_view_transform result for a single sample
- # {
- # "output_imgs": np.ndarray list with shape [[height, width, 3], ...]
- # }
- Tasks.image_view_transform: [OutputKeys.OUTPUT_IMGS],
- Tasks.image_to_3d: [OutputKeys.MV_IMGS],
- Tasks.siamese_uie: [OutputKeys.OUTPUT],
- }
- class ModelOutputBase(list):
- def __post_init__(self):
- self.reconstruct()
- self.post_init = True
- def reconstruct(self):
- # Low performance, but low frequency.
- self.clear()
- for idx, key in enumerate(self.keys()):
- self.append(getattr(self, key))
- def __getitem__(self, item):
- if isinstance(item, str):
- if hasattr(self, item):
- return getattr(self, item)
- elif isinstance(item, (int, slice)):
- return super().__getitem__(item)
- raise IndexError(f'No Index {item} found in the dataclass.')
- def __setitem__(self, key, value):
- if isinstance(key, str):
- if key in [f.name for f in fields(self)]:
- if key not in self.keys():
- super().__setattr__(key, value)
- self.reconstruct()
- elif id(getattr(self, key)) != id(value):
- super().__setattr__(key, value)
- super().__setitem__(self.keys().index(key), value)
- else:
- super().__setattr__(key, value)
- elif isinstance(key, int):
- super().__setitem__(key, value)
- key_name = self.keys()[key]
- super().__setattr__(key_name, value)
- def __setattr__(self, key, value):
- if getattr(self, 'post_init', False):
- return self.__setitem__(key, value)
- else:
- return super().__setattr__(key, value)
- def keys(self):
- return [
- f.name for f in fields(self) if getattr(self, f.name) is not None
- ]
- def items(self):
- return self.to_dict().items()
- def to_dict(self):
- output = OrderedDict()
- for key in self.keys():
- output[key] = getattr(self, key)
- return output
|