e2e_pg_loss.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. # copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from __future__ import absolute_import
  15. from __future__ import division
  16. from __future__ import print_function
  17. from paddle import nn
  18. import paddle
  19. from .det_basic_loss import DiceLoss
  20. from ppocr.utils.e2e_utils.extract_batchsize import pre_process
  class PGLoss(nn.Layer):
      """Total loss made of a dice score loss, two regression losses and a CTC loss.

      ``forward`` returns ``score_loss + border_loss + direction_loss
      + 5 * ctc_loss`` together with each individual term.
      """

      def __init__(
          self, tcl_bs, max_text_length, max_text_nums, pad_num, eps=1e-6, **kwargs
      ):
          """Store the label pre-processing settings and build the dice loss.

          Args:
              tcl_bs: Forwarded unchanged to ``pre_process`` in ``forward``.
              max_text_length: Forwarded unchanged to ``pre_process``.
              max_text_nums: Forwarded unchanged to ``pre_process``.
              pad_num: Forwarded to ``pre_process``; also used in ``ctcloss`` as
                  the number of foreground character channels and as the CTC
                  blank index.
              eps: Passed to ``DiceLoss``.
              **kwargs: Accepted for configuration compatibility and ignored.
          """
          super(PGLoss, self).__init__()
          self.tcl_bs = tcl_bs
          self.max_text_nums = max_text_nums
          self.max_text_length = max_text_length
          self.pad_num = pad_num
          self.dice_loss = DiceLoss(eps=eps)

      def _weighted_smooth_l1(self, prediction, label, l_score, l_mask, channels):
          """Shared body of ``border_loss`` and ``direction_loss``.

          ``label`` is split along axis 1 into ``channels`` regression targets
          followed by one weight channel. A smooth-L1 term with threshold 1.0
          is computed per element, scaled by the weight channel, and averaged
          over the elements selected by ``l_score * l_mask``.

          Args:
              prediction: Predicted map, subtracted element-wise from the
                  ``channels`` target channels of ``label``.
              label: 4-D map with ``channels + 1`` entries on axis 1.
              l_score: 4-D map expanded to ``channels`` times its axis-1 size.
              l_mask: 4-D map expanded the same way as ``l_score``.
              channels: Number of regression channels (4 for border, 2 for
                  direction).

          Returns:
              The normalised loss tensor.
          """
          target, norm = paddle.split(label, num_or_sections=[channels, 1], axis=1)

          def _expand_channels(tensor):
              # Repeat along axis 1 so the map lines up with the target channels.
              b, c, h, w = tensor.shape
              return paddle.expand(x=tensor, shape=[b, channels * c, h, w])

          norm = _expand_channels(norm)
          score = _expand_channels(l_score)
          mask = _expand_channels(l_mask)

          abs_diff = paddle.abs(target - prediction)
          # 1.0 where the quadratic branch applies, 0.0 where the linear one does.
          # The selector is a constant, so it is excluded from back-propagation.
          is_small = paddle.cast(abs_diff < 1.0, dtype="float32")
          is_small.stop_gradient = True
          smooth_l1 = 0.5 * abs_diff * abs_diff * is_small + (abs_diff - 0.5) * (
              1.0 - is_small
          )

          weighted = norm * smooth_l1
          # 1e-5 keeps the division finite when score * mask selects nothing.
          return paddle.sum(weighted * score * mask) / (
              paddle.sum(score * mask) + 1e-5
          )

      def border_loss(self, f_border, l_border, l_score, l_mask):
          """Weighted smooth-L1 loss over the 4 border channels.

          Args:
              f_border: Predicted border map.
              l_border: Label map whose axis 1 holds 4 targets plus 1 weight.
              l_score: Score map used to select contributing elements.
              l_mask: Mask map used to select contributing elements.

          Returns:
              The border loss tensor.
          """
          return self._weighted_smooth_l1(f_border, l_border, l_score, l_mask, 4)

      def direction_loss(self, f_direction, l_direction, l_score, l_mask):
          """Weighted smooth-L1 loss over the 2 direction channels.

          Args:
              f_direction: Predicted direction map.
              l_direction: Label map whose axis 1 holds 2 targets plus 1 weight.
              l_score: Score map used to select contributing elements.
              l_mask: Mask map used to select contributing elements.

          Returns:
              The direction loss tensor.
          """
          return self._weighted_smooth_l1(
              f_direction, l_direction, l_score, l_mask, 2
          )

      def ctcloss(self, f_char, tcl_pos, tcl_mask, tcl_label, label_t):
          """CTC loss over character features gathered at the given positions.

          Args:
              f_char: 4-D character map; axis 1 is moved to the last axis
                  before gathering.
              tcl_pos: Positions, reshaped to rows of 3 indices into the first
                  three axes of the transposed ``f_char``.
              tcl_mask: 3-D mask multiplied into the gathered features.
              tcl_label: Label sequences passed to ``ctc_loss``.
              label_t: Label lengths passed to ``ctc_loss``.

          Returns:
              The mean of the per-sequence CTC costs.
          """
          f_char = paddle.transpose(f_char, [0, 2, 3, 1])
          tcl_pos = paddle.reshape(tcl_pos, [-1, 3])
          # Index dtype spelled out explicitly; the builtin ``int`` is not a
          # documented dtype value and its width is platform dependent.
          tcl_pos = paddle.cast(tcl_pos, dtype="int64")
          f_tcl_char = paddle.gather_nd(f_char, tcl_pos)
          # Last axis is len(Lexicon_Table) + 1.
          # NOTE(review): 64 is hard-coded; presumably the number of positions
          # per text instance produced by pre_process -- confirm.
          f_tcl_char = paddle.reshape(f_tcl_char, [-1, 64, self.pad_num + 1])
          f_tcl_char_fg, f_tcl_char_bg = paddle.split(
              f_tcl_char, [self.pad_num, 1], axis=2
          )

          # Where the mask is 0, the last channel (the CTC blank index) is set
          # to +20 and every other channel to -20.
          f_tcl_char_bg = f_tcl_char_bg * tcl_mask + (1.0 - tcl_mask) * 20.0
          b, c, mask_width = tcl_mask.shape
          tcl_mask_fg = paddle.expand(
              x=tcl_mask, shape=[b, c, self.pad_num * mask_width]
          )
          tcl_mask_fg.stop_gradient = True
          f_tcl_char_fg = f_tcl_char_fg * tcl_mask_fg + (1.0 - tcl_mask_fg) * (-20.0)

          f_tcl_char_mask = paddle.concat([f_tcl_char_fg, f_tcl_char_bg], axis=2)
          # Swap the first two axes before handing the tensor to ctc_loss.
          f_tcl_char_ld = paddle.transpose(f_tcl_char_mask, (1, 0, 2))
          N, B, _ = f_tcl_char_ld.shape
          # Every sequence is given the full length N.
          input_lengths = paddle.to_tensor([N] * B, dtype="int64")
          cost = paddle.nn.functional.ctc_loss(
              log_probs=f_tcl_char_ld,
              labels=tcl_label,
              input_lengths=input_lengths,
              label_lengths=label_t,
              blank=self.pad_num,
              reduction="none",
          )
          return cost.mean()

      def forward(self, predicts, labels):
          """Compute every loss term and their weighted sum.

          Args:
              predicts: Mapping with the keys ``"f_score"``, ``"f_border"``,
                  ``"f_direction"`` and ``"f_char"``.
              labels: Sequence of nine items, unpacked in the order shown in
                  the body.

          Returns:
              Dict with the keys ``"loss"``, ``"score_loss"``, ``"border_loss"``,
              ``"direction_loss"`` and ``"ctc_loss"``.
          """
          (
              images,  # not used by the loss
              tcl_maps,
              tcl_label_maps,  # not used by the loss
              border_maps,
              direction_maps,
              training_masks,
              label_list,
              pos_list,
              pos_mask,
          ) = labels
          # for all the batch_size
          pos_list, pos_mask, label_list, label_t = pre_process(
              label_list,
              pos_list,
              pos_mask,
              self.max_text_length,
              self.max_text_nums,
              self.pad_num,
              self.tcl_bs,
          )

          f_score = predicts["f_score"]
          f_border = predicts["f_border"]
          f_direction = predicts["f_direction"]
          f_char = predicts["f_char"]

          score_loss = self.dice_loss(f_score, tcl_maps, training_masks)
          border_loss = self.border_loss(
              f_border, border_maps, tcl_maps, training_masks
          )
          direction_loss = self.direction_loss(
              f_direction, direction_maps, tcl_maps, training_masks
          )
          ctc_loss = self.ctcloss(f_char, pos_list, pos_mask, label_list, label_t)

          # The CTC term is weighted 5x relative to the other three terms.
          loss_all = score_loss + border_loss + direction_loss + 5 * ctc_loss
          return {
              "loss": loss_all,
              "score_loss": score_loss,
              "border_loss": border_loss,
              "direction_loss": direction_loss,
              "ctc_loss": ctc_loss,
          }