rec_rfl_head.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """
  15. This code is adapted from:
  16. https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_rcg/models/sequence_heads/counting_head.py
  17. """
  18. import paddle
  19. import paddle.nn as nn
  20. from paddle.nn.initializer import TruncatedNormal, Constant, Normal, KaimingNormal
  21. from .rec_att_head import AttentionLSTM
  # Module-level initializer instances shared by the layers in this file.
  # RFLHead.init_weights applies kaiming_init_ to nn.Linear weights and zeros_
  # to nn.Linear biases; ones_ is not referenced in the visible part of the file.
  22. kaiming_init_ = KaimingNormal()
  23. zeros_ = Constant(value=0.0)
  24. ones_ = Constant(value=1.0)
  class CNTHead(nn.Layer):
      """Counting head: maps a 4-D visual feature map to ``out_channels`` scores.

      The feature map is flattened over its two trailing axes, passed through a
      bias-free linear fusion layer, flattened again per sample and projected
      to ``out_channels`` values by a second linear layer.

      Args:
          embed_size: Size of the channel axis of the incoming feature map;
              also the input and output width of the fusion layer.
          encode_length: Number of flattened positions (``h * w``) the
              prediction layer is sized for.
          out_channels: Width of the prediction layer's output.
          **kwargs: Accepted and ignored.
      """

      def __init__(self, embed_size=512, encode_length=26, out_channels=38, **kwargs):
          super().__init__()
          self.out_channels = out_channels
          # Fusion layer is created first so sublayer/parameter order is unchanged.
          self.Wv_fusion = nn.Linear(embed_size, embed_size, bias_attr=False)
          flattened_width = encode_length * embed_size
          self.Prediction_visual = nn.Linear(flattened_width, self.out_channels)

      def forward(self, visual_feature):
          """Return the prediction for ``visual_feature``.

          Args:
              visual_feature: Tensor whose shape unpacks into four axes
                  ``(b, c, h, w)``; ``h * w * c`` must equal the input width
                  of the prediction layer.

          Returns:
              Output of the prediction layer for the per-sample flattened
              features.
          """
          batch, channels, height, width = visual_feature.shape
          # (b, c, h, w) -> (b, c, h*w) -> (b, h*w, c)
          positions = visual_feature.reshape([batch, channels, height * width])
          positions = positions.transpose([0, 2, 1])
          fused = self.Wv_fusion(positions)  # batch * 26 * 512 with the defaults

          # The prediction is computed directly from the flattened visual
          # feature (the original comment describes this as the text length).
          batch, steps, width = fused.shape
          flattened = fused.reshape([batch, steps * width])
          return self.Prediction_visual(flattened)
  class RFLHead(nn.Layer):
      """Head combining an optional counting branch and an optional sequence branch.

      At least one of the two branches must be enabled. The counting branch is
      a ``CNTHead``; the sequence branch is an ``AttentionLSTM`` imported from
      ``.rec_att_head``.

      Args:
          in_channels: Passed to ``CNTHead`` as ``embed_size`` and to
              ``AttentionLSTM`` as ``in_channels``.
          hidden_size: Passed to ``AttentionLSTM`` as ``hidden_size``.
          batch_max_legnth: (sic; the spelling is part of the public
              interface.) ``CNTHead`` receives ``batch_max_legnth + 1`` as
              ``encode_length``, and the value is forwarded to the sequence
              head on every call.
          out_channels: Output width handed to both branches; also stored as
              ``num_class``.
          use_cnt: Build and run the counting branch.
          use_seq: Build and run the sequence branch.
          **kwargs: Forwarded unchanged to both branch constructors.
      """

      def __init__(
          self,
          in_channels=512,
          hidden_size=256,
          batch_max_legnth=25,
          out_channels=38,
          use_cnt=True,
          use_seq=True,
          **kwargs,
      ):
          super().__init__()
          assert use_cnt or use_seq
          self.use_cnt = use_cnt
          self.use_seq = use_seq
          self.batch_max_legnth = batch_max_legnth
          self.num_class = out_channels

          # Branches are registered in the same order as before: cnt, then seq.
          if use_cnt:
              self.cnt_head = CNTHead(
                  embed_size=in_channels,
                  encode_length=batch_max_legnth + 1,
                  out_channels=out_channels,
                  **kwargs,
              )
          if use_seq:
              self.seq_head = AttentionLSTM(
                  in_channels=in_channels,
                  out_channels=out_channels,
                  hidden_size=hidden_size,
                  **kwargs,
              )

          # Runs init_weights over this layer and the sublayers built above.
          self.apply(self.init_weights)

      def init_weights(self, m):
          """Initialize ``m`` if it is an ``nn.Linear``; leave other layers alone.

          Weights are passed to the Kaiming-normal initializer and the bias,
          when present, to the zero initializer.
          """
          if not isinstance(m, nn.Linear):
              return
          kaiming_init_(m.weight)
          if m.bias is not None:
              zeros_(m.bias)

      def forward(self, x, targets=None):
          """Run the enabled branches.

          Args:
              x: Pair ``(cnt_inputs, seq_inputs)``, one input per branch.
              targets: Indexable whose first element is given to the sequence
                  head while ``self.training`` is true; not read otherwise.

          Returns:
              ``(cnt_outputs, seq_outputs)`` when the sequence branch is
              enabled, with ``cnt_outputs`` being ``None`` if the counting
              branch is disabled; otherwise ``cnt_outputs`` alone.
          """
          cnt_inputs, seq_inputs = x
          cnt_outputs = self.cnt_head(cnt_inputs) if self.use_cnt else None

          if not self.use_seq:
              return cnt_outputs

          # Targets are only handed to the sequence head in training mode.
          seq_targets = targets[0] if self.training else None
          seq_outputs = self.seq_head(seq_inputs, seq_targets, self.batch_max_legnth)
          return cnt_outputs, seq_outputs