summarizer.py 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. # Part of the implementation is borrowed and modified from PGL-SUM,
  2. # publicly available at https://github.com/e-apostolidis/PGL-SUM
  3. import os.path as osp
  4. from typing import Dict, Union
  5. import numpy as np
  6. import torch
  7. import torch.nn as nn
  8. from modelscope.metainfo import Models
  9. from modelscope.models.base import Tensor, TorchModel
  10. from modelscope.models.builder import MODELS
  11. from modelscope.models.cv.video_summarization.kts.cpd_auto import cpd_auto
  12. from modelscope.models.cv.video_summarization.pgl_sum import PGL_SUM
  13. from modelscope.utils.constant import ModelFile, Tasks
  14. from modelscope.utils.logger import get_logger
  15. logger = get_logger()
  16. def get_change_points(video_feat, n_frame):
  17. video_feat = np.array(video_feat, np.float32)
  18. K = np.dot(video_feat, video_feat.T)
  19. change_points, _ = cpd_auto(
  20. K, ncp=min(K.shape[0] - 1, 120), vmax=2.2 / 4.0, lmin=1)
  21. change_points = change_points * 15
  22. change_points = np.concatenate(([0], change_points, [n_frame - 1]))
  23. temp_change_points = []
  24. for idx in range(len(change_points) - 1):
  25. segment = [change_points[idx], change_points[idx + 1] - 1]
  26. if idx == len(change_points) - 2:
  27. segment = [change_points[idx], change_points[idx + 1]]
  28. temp_change_points.append(segment)
  29. change_points = np.array(list(temp_change_points))
  30. temp_n_frame_per_seg = []
  31. for change_points_idx in range(len(change_points)):
  32. n_frame = change_points[change_points_idx][1] - change_points[
  33. change_points_idx][0]
  34. temp_n_frame_per_seg.append(n_frame)
  35. n_frame_per_seg = np.array(list(temp_n_frame_per_seg))
  36. return change_points, n_frame_per_seg
  37. def knap_sack(W, wt, val, n):
  38. """ Maximize the value that a knapsack of capacity W can hold. You can either put the item or discard it, there is
  39. no concept of putting some part of item in the knapsack.
  40. :param int W: Maximum capacity -in frames- of the knapsack.
  41. :param list[int] wt: The weights (lengths -in frames-) of each video shot.
  42. :param list[float] val: The values (importance scores) of each video shot.
  43. :param int n: The number of the shots.
  44. :return: A list containing the indices of the selected shots.
  45. """
  46. K = [[0 for _ in range(W + 1)] for _ in range(n + 1)]
  47. # Build table K[][] in bottom up manner
  48. for i in range(n + 1):
  49. for w in range(W + 1):
  50. if i == 0 or w == 0:
  51. K[i][w] = 0
  52. elif wt[i - 1] <= w:
  53. K[i][w] = max(val[i - 1] + K[i - 1][w - wt[i - 1]],
  54. K[i - 1][w])
  55. else:
  56. K[i][w] = K[i - 1][w]
  57. selected = []
  58. w = W
  59. for i in range(n, 0, -1):
  60. if K[i][w] != K[i - 1][w]:
  61. selected.insert(0, i - 1)
  62. w -= wt[i - 1]
  63. return selected
  64. def generate_summary(all_shot_bound, all_scores, all_nframes, all_positions):
  65. """ Generate the automatic machine summary, based on the video shots; the frame importance scores; the number of
  66. frames in the original video and the position of the sub-sampled frames of the original video.
  67. :param list[np.ndarray] all_shot_bound: The video shots for all the -original- testing videos.
  68. :param list[np.ndarray] all_scores: The calculated frame importance scores for all the sub-sampled testing videos.
  69. :param list[np.ndarray] all_nframes: The number of frames for all the -original- testing videos.
  70. :param list[np.ndarray] all_positions: The position of the sub-sampled frames for all the -original- testing videos.
  71. :return: A list containing the indices of the selected frames for all the -original- testing videos.
  72. """
  73. all_summaries = []
  74. for video_index in range(len(all_scores)):
  75. # Get shots' boundaries
  76. shot_bound = all_shot_bound[video_index] # [number_of_shots, 2]
  77. frame_init_scores = all_scores[video_index]
  78. n_frames = all_nframes[video_index]
  79. positions = all_positions[video_index]
  80. # Compute the importance scores for the initial frame sequence (not the sub-sampled one)
  81. frame_scores = np.zeros(n_frames, dtype=np.float32)
  82. if positions.dtype != int:
  83. positions = positions.astype(np.int32)
  84. if positions[-1] != n_frames:
  85. positions = np.concatenate([positions, [n_frames]])
  86. for i in range(len(positions) - 1):
  87. pos_left, pos_right = positions[i], positions[i + 1]
  88. if i == len(frame_init_scores):
  89. frame_scores[pos_left:pos_right] = 0
  90. else:
  91. frame_scores[pos_left:pos_right] = frame_init_scores[i]
  92. # Compute shot-level importance scores by taking the average importance scores of all frames in the shot
  93. shot_imp_scores = []
  94. shot_lengths = []
  95. for shot in shot_bound:
  96. shot_lengths.append(shot[1] - shot[0] + 1)
  97. shot_imp_scores.append(
  98. (frame_scores[shot[0]:shot[1] + 1].mean()).item())
  99. # Select the best shots using the knapsack implementation
  100. final_shot = shot_bound[-1]
  101. final_max_length = int((final_shot[1] + 1) * 0.15)
  102. selected = knap_sack(final_max_length, shot_lengths, shot_imp_scores,
  103. len(shot_lengths))
  104. # Select all frames from each selected shot (by setting their value in the summary vector to 1)
  105. summary = np.zeros(final_shot[1] + 1, dtype=np.int8)
  106. for shot in selected:
  107. summary[shot_bound[shot][0]:shot_bound[shot][1] + 1] = 1
  108. all_summaries.append(summary)
  109. return all_summaries
  110. def transform_time(seconds):
  111. m, s = divmod(seconds, 60)
  112. h, m = divmod(m, 60)
  113. time = '%02d:%02d:%06.3f' % (h, m, s)
  114. return time
  115. def summary_format(summary, fps):
  116. frames_list = []
  117. start_frame = -1
  118. end_frame = -1
  119. is_summary_frame = False
  120. for i, idx in enumerate(summary):
  121. if idx:
  122. if is_summary_frame is False:
  123. start_frame = i
  124. is_summary_frame = True
  125. else:
  126. if is_summary_frame:
  127. end_frame = i - 1
  128. frames_list.append([start_frame, end_frame])
  129. is_summary_frame = False
  130. if is_summary_frame and summary[-1] == 1:
  131. end_frame = len(summary) - 1
  132. frames_list.append([start_frame, end_frame])
  133. output = []
  134. for seg in frames_list:
  135. output.append({
  136. 'frame':
  137. seg,
  138. 'timestamps': [
  139. transform_time(seg[0] / float(fps)),
  140. transform_time(seg[1] / float(fps))
  141. ]
  142. })
  143. return output
  144. @MODELS.register_module(
  145. Tasks.video_summarization, module_name=Models.video_summarization)
  146. class PGLVideoSummarization(TorchModel):
  147. def __init__(self, model_dir: str, *args, **kwargs):
  148. """initialize the video summarization model from the `model_dir` path.
  149. Args:
  150. model_dir (str): the model path.
  151. """
  152. super().__init__(model_dir, *args, **kwargs)
  153. model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
  154. self.loss = nn.MSELoss()
  155. self.model = PGL_SUM(
  156. input_size=1024,
  157. output_size=1024,
  158. num_segments=4,
  159. heads=8,
  160. fusion='add',
  161. pos_enc='absolute')
  162. if torch.cuda.is_available():
  163. self._device = torch.device('cuda')
  164. else:
  165. self._device = torch.device('cpu')
  166. self.model = self.model.to(self._device)
  167. self.model = self._load_pretrained(self.model, model_path)
  168. if self.training:
  169. self.model.train()
  170. else:
  171. self.model.eval()
  172. def _train_forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
  173. frame_features = input['frame_features']
  174. gtscore = input['gtscore']
  175. preds, attn_weights = self.model(frame_features)
  176. return {'loss': self.loss(preds, gtscore)}
  177. def _inference_forward(self, input: Dict[str,
  178. Tensor]) -> Dict[str, Tensor]:
  179. frame_features = input['frame_features']
  180. y, attn_weights = self.model(frame_features)
  181. return {'scores': y}
  182. def forward(self, input: Dict[str,
  183. Tensor]) -> Dict[str, Union[list, Tensor]]:
  184. """return the result by the model
  185. Args:
  186. input (Dict[str, Tensor]): the preprocessed data
  187. Returns:
  188. Dict[str, Union[list, Tensor]]: results
  189. """
  190. for key, value in input.items():
  191. input[key] = input[key].to(self._device)
  192. if self.training:
  193. return self._train_forward(input)
  194. else:
  195. return self._inference_forward(input)