model.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378
  1. # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved.
  2. import os
  3. import os.path as osp
  4. import torch
  5. import torch.nn as nn
  6. import torch.nn.functional as F
  7. from modelscope.metainfo import Models
  8. from modelscope.models.base.base_torch_model import TorchModel
  9. from modelscope.models.builder import MODELS
  10. from modelscope.utils.config import Config
  11. from modelscope.utils.constant import ModelFile, Tasks
  12. from .backbone import load_clip
  13. from .basic_utils import get_state_dict, set_seed
  14. @MODELS.register_module(
  15. Tasks.vop_retrieval, module_name=Models.vop_retrieval_model)
  16. class VoP(TorchModel):
  17. """
  18. The implementation of 'VoP: Text-Video Co-operative Prompt Tuning for Cross-Modal Retrieval'.
  19. This model is dynamically initialized with the following parts:
  20. - clip: the upstream pre-trained backbone model (CLIP in this code)
  21. - pool_frames: the frames pooling method
  22. - visual_prompt_learner: visual prompt
  23. - ImageEncoder: get image encoder
  24. - TextPromptLearner: text prompt
  25. - TextEncoder: get text encoder
  26. """
  27. def __init__(self, model_dir: str, *args, **kwargs):
  28. """
  29. Initialize a VoP Model
  30. Args:
  31. model_dir: model id or path,
  32. """
  33. super(VoP, self).__init__()
  34. model_path = osp.join(model_dir, 'VoP_msrvtt9k.pth')
  35. clip_arch = osp.join(model_dir, 'ViT-B-32.pt')
  36. config_path = osp.join(model_dir, ModelFile.CONFIGURATION)
  37. self.config = Config.from_file(config_path).hyperparam
  38. self.clip = load_clip(name=clip_arch)
  39. self.config.vpt_layers = list(
  40. range(self.clip.visual.transformer.layers))
  41. self.config.tpt_layers = list(range(self.clip.transformer.layers))
  42. self.pool_frames = BaselinePooling(self.config.pooling_type,
  43. self.config)
  44. self.visual_prompt_learner = VisualPromptLearner(
  45. self.clip, self.config)
  46. self.image_encoder = ImageEncoder(self.clip, self.config)
  47. self.text_prompt_learner = TextPromptLearner(self.clip, self.config)
  48. self.text_encoder = TextEncoder(self.clip, self.config)
  49. # load param from pre-train model
  50. self.load_state_dict(get_state_dict(model_path))
  51. self.eval()
  52. # set seed
  53. os.environ['TOKENIZERS_PARALLELISM'] = 'false'
  54. set_seed(self.config.seed)
  55. def get_video_features(self, videos, return_all_frames=False):
  56. """
  57. Get video Features
  58. Args:
  59. videos: the dim is [1, 12, 3, 224, 224]
  60. return_all_frames: default False
  61. """
  62. batch_size = videos.shape[0]
  63. video_data = videos.reshape(-1, 3, self.config.input_res,
  64. self.config.input_res)
  65. visual_prompts = self.visual_prompt_learner()
  66. video_features = self.image_encoder(visual_prompts, video_data)
  67. video_features = video_features / video_features.norm(
  68. dim=-1, keepdim=True)
  69. video_features = video_features.reshape(batch_size,
  70. self.config.num_frames, -1)
  71. video_features_pooled = self.pool_frames(None, video_features)
  72. if return_all_frames:
  73. return video_features, video_features_pooled
  74. return video_features_pooled
  75. def get_text_features(self, text_data):
  76. """
  77. Get Text Features
  78. Args:
  79. text_data: the dim is [1, 69]
  80. """
  81. text_prompts = self.text_prompt_learner()
  82. text_features = self.text_encoder(text_prompts, text_data)
  83. text_features = text_features / text_features.norm(
  84. dim=-1, keepdim=True)
  85. return text_features
  86. def forward(self, data, return_all_frames=False):
  87. """
  88. Dynamic Forward Function of VoP
  89. Args:
  90. data: the input data
  91. return_all_frames: default False
  92. """
  93. batch_size = data['video'].shape[0]
  94. text_data = data['text']
  95. video_data = data['video']
  96. video_data = video_data.reshape(-1, 3, self.config.input_res,
  97. self.config.input_res)
  98. visual_prompts = self.visual_prompt_learner()
  99. video_features = self.image_encoder(visual_prompts, video_data)
  100. text_prompts = self.text_prompt_learner()
  101. text_features = self.text_encoder(text_prompts, text_data)
  102. text_features = text_features / text_features.norm(
  103. dim=-1, keepdim=True)
  104. video_features = video_features / video_features.norm(
  105. dim=-1, keepdim=True)
  106. video_features = video_features.reshape(batch_size,
  107. self.config.num_frames, -1)
  108. video_features_pooled = self.pool_frames(text_features, video_features)
  109. if return_all_frames:
  110. return text_features, video_features, video_features_pooled
  111. return text_features, video_features_pooled
  112. class BaselinePooling(TorchModel):
  113. """
  114. Redefined Pooling Function
  115. """
  116. def __init__(self, pooling_type, config):
  117. super(BaselinePooling, self).__init__()
  118. if pooling_type == 'avg':
  119. self.pooling_func = self._avg_pooling
  120. else:
  121. raise NotImplementedError
  122. def _avg_pooling(self, text_embeds, video_embeds):
  123. """
  124. Pooling mean of frames
  125. Args:
  126. text_embeds: the input text embedding which is None here.
  127. video_embeds: the input video embedding with [1, 12, 512].
  128. Returns:
  129. video_embeds_pooled: num_vids x embed_dim
  130. """
  131. video_embeds_pooled = video_embeds.mean(dim=1)
  132. return video_embeds_pooled
  133. def forward(self, text_embeds, video_embeds):
  134. return self.pooling_func(text_embeds, video_embeds)
  135. class VisualPromptLearner(TorchModel):
  136. """
  137. The implementation of visual prompt.
  138. This module is used to define the learnable prompt parameters:
  139. the number of tokens is 8,
  140. the prompt dimension is 768,
  141. and the initialization weight std used is 0.02.
  142. """
  143. def __init__(self, clip_model, config):
  144. super(VisualPromptLearner, self).__init__()
  145. vp_token_num = config.vp_token_num
  146. vp_dim = clip_model.visual.ln_post.weight.shape[0]
  147. dtype = clip_model.dtype
  148. visual_prompts = torch.empty(
  149. len(config.vpt_layers), 1, vp_token_num, vp_dim, dtype=dtype)
  150. nn.init.normal_(visual_prompts, std=0.02)
  151. self.visual_prompts = nn.Parameter(visual_prompts)
  152. def forward(self):
  153. vp = self.visual_prompts
  154. return vp
  155. class TextPromptLearner(TorchModel):
  156. """
  157. The implementation of visual prompt.
  158. This module is used to define the learnable prompt parameters:
  159. the number of tokens is 4,
  160. the prompt dimension is 512,
  161. and the initialization weight std used is 0.02.
  162. """
  163. def __init__(self, clip_model, config):
  164. super(TextPromptLearner, self).__init__()
  165. tp_prefix_token_num = config.tp_prefix_token_num
  166. tp_suffix_token_num = config.tp_suffix_token_num
  167. assert tp_prefix_token_num >= 0 and tp_suffix_token_num >= 0
  168. tp_dim = clip_model.ln_final.weight.shape[0]
  169. dtype = clip_model.dtype
  170. text_prompts = torch.empty(
  171. len(config.tpt_layers),
  172. tp_prefix_token_num + tp_suffix_token_num,
  173. tp_dim,
  174. dtype=dtype)
  175. nn.init.normal_(text_prompts, std=0.02)
  176. self.text_prompts = nn.Parameter(text_prompts)
  177. self.tp_prefix_token_num = tp_prefix_token_num
  178. self.tp_suffix_token_num = tp_suffix_token_num
  179. def forward(self):
  180. return (self.text_prompts[:, :self.tp_prefix_token_num, :],
  181. self.text_prompts[:, self.tp_prefix_token_num:, :])
  182. class ImageEncoder(TorchModel):
  183. """
  184. The implementation of image encoder.
  185. This module is used to obtain the features of each frame of the video.
  186. """
  187. def __init__(self, clip_model, config):
  188. super(ImageEncoder, self).__init__()
  189. self.config = config
  190. self.vpt_layers = config.vpt_layers
  191. self.vp_token_num = config.vp_token_num
  192. self.num_frames = config.num_frames
  193. self.conv1 = clip_model.visual.conv1
  194. self.class_embedding = clip_model.visual.class_embedding
  195. self.positional_embedding = clip_model.visual.positional_embedding
  196. self.ln_pre = clip_model.visual.ln_pre
  197. self.transformer = clip_model.visual.transformer
  198. self.ln_post = clip_model.visual.ln_post
  199. self.proj = clip_model.visual.proj
  200. def forward(self, visual_prompts, x):
  201. """
  202. The forward function of image encoder.
  203. Args:
  204. visual_prompts: the visual prompt, dim is [12, 1, 8, 768]
  205. x: the input data, dim is [12, 3, 224, 224]
  206. Returns:
  207. x: the output data, dim is [12, 512]
  208. """
  209. batch_size = x.shape[0]
  210. x = self.conv1(x)
  211. x = x.reshape(batch_size, x.shape[1], -1)
  212. x = x.permute(0, 2, 1)
  213. x_1 = self.class_embedding.to(x.dtype)
  214. x_2 = torch.zeros(
  215. batch_size, 1, x.shape[-1], dtype=x.dtype, device=x.device)
  216. x_1 = x_1 + x_2
  217. x = torch.cat([x_1, x], dim=1)
  218. x = x + self.positional_embedding.to(x.dtype)
  219. for i_layer in range(self.transformer.layers):
  220. if i_layer in self.vpt_layers:
  221. i_prompt = self.vpt_layers.index(i_layer)
  222. cur_layer_vp = visual_prompts[i_prompt, :, :, :].repeat(
  223. batch_size, 1, 1)
  224. x = torch.cat([x[:, :1, :], cur_layer_vp, x[:, 1:, :]], dim=1)
  225. if i_layer == 0:
  226. x = self.ln_pre(x)
  227. x = x.permute(1, 0, 2)
  228. x = self.transformer.resblocks[i_layer](x)
  229. x = x.permute(1, 0, 2)
  230. if i_layer + 1 in self.vpt_layers:
  231. x = torch.cat([x[:, :1, :], x[:, 1 + self.vp_token_num:, :]],
  232. dim=1)
  233. x = self.ln_post(x[:, 0, :])
  234. if self.proj is not None:
  235. x = x @ self.proj
  236. return x
  237. class TextEncoder(TorchModel):
  238. """
  239. The implementation of text encoder.
  240. This module is used to obtain the features of each word of the sentence.
  241. """
  242. def __init__(self, clip_model, config):
  243. super(TextEncoder, self).__init__()
  244. self.transformer = clip_model.transformer
  245. self.token_embedding = clip_model.token_embedding
  246. self.positional_embedding = clip_model.positional_embedding
  247. self.ln_final = clip_model.ln_final
  248. self.text_projection = clip_model.text_projection
  249. self.dtype = clip_model.dtype
  250. self.tpt_layers = config.tpt_layers
  251. assert 0 in self.tpt_layers
  252. self.tp_prefix_token_num = config.tp_prefix_token_num
  253. self.tp_suffix_token_num = config.tp_suffix_token_num
  254. self.tp_token_num = config.tp_prefix_token_num + config.tp_suffix_token_num
  255. def forward(self, text_prompts, text):
  256. """
  257. The forward function of text encoder.
  258. Args:
  259. text_prompts: the text prompt, dim is 2 x [12, 4, 512]
  260. text: the input data, dim is [1, 69]
  261. Returns:
  262. x: the output data, dim is [1, 512]
  263. """
  264. x = self.token_embedding(text).type(self.dtype)
  265. batch_size = x.shape[0]
  266. prompt_prefix, prompt_suffix = text_prompts
  267. for i_layer in range(self.transformer.layers):
  268. if i_layer in self.tpt_layers:
  269. i_prompt = self.tpt_layers.index(i_layer)
  270. if self.tp_prefix_token_num > 0:
  271. cur_layer_tp_prefix = prompt_prefix[i_prompt:i_prompt
  272. + 1, :, :].expand(
  273. batch_size, -1, -1)
  274. x = torch.cat(
  275. [x[:, :1, :], cur_layer_tp_prefix, x[:, 1:, :]], dim=1)
  276. if self.tp_suffix_token_num > 0:
  277. cur_layer_tp_suffix = prompt_suffix[i_prompt:i_prompt
  278. + 1, :, :].expand(
  279. batch_size, -1, -1)
  280. x = torch.cat(
  281. [x[:, :-1, :], cur_layer_tp_suffix, x[:, -1:, :]],
  282. dim=1)
  283. if i_layer == 0:
  284. x = x + self.positional_embedding.type(self.dtype)
  285. x = x.permute(1, 0, 2)
  286. x = self.transformer.resblocks[i_layer](x)
  287. x = x.permute(1, 0, 2)
  288. if i_layer + 1 in self.tpt_layers:
  289. temp_1 = x[:, :1, :]
  290. temp_2 = x[:, 1 + self.tp_prefix_token_num:-1
  291. - self.tp_suffix_token_num, :]
  292. temp_3 = x[:, -1:, :]
  293. temp = torch.cat([temp_1, temp_2, temp_3], dim=1)
  294. x = temp
  295. x = self.ln_final(x).type(self.dtype)
  296. x = x[torch.arange(x.shape[0]),
  297. text.argmax(dim=-1) + self.tp_token_num] @ self.text_projection
  298. return x