# yichael/AIStoryBoard
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is adapted from:
https://github.com/hikopensource/DAVAR-Lab-OCR/blob/main/davarocr/davar_rcg/models/sequence_heads/counting_head.py
"""
import paddle
import paddle.nn as nn
from paddle.nn.initializer import TruncatedNormal, Constant, Normal, KaimingNormal

from .rec_att_head import AttentionLSTM

# Module-level initializer instances shared by the heads defined in this file.
# Kaiming-normal initializer; RFLHead.init_weights applies it to nn.Linear weights.
kaiming_init_ = KaimingNormal()
# Constant initializer that fills with 0.0; RFLHead.init_weights applies it to
# nn.Linear biases when a bias exists.
zeros_ = Constant(value=0.0)
# Constant initializer that fills with 1.0; not referenced by the code visible here.
ones_ = Constant(value=1.0)


class CNTHead(nn.Layer):
    """Counting head that maps a 4-D visual feature map to ``out_channels`` scores.

    The feature map is flattened over its two spatial axes, passed through a
    bias-free linear projection, flattened again per sample, and fed to a
    final linear layer.

    Args:
        embed_size: Channel count of the incoming feature map; also the width
            of the bias-free projection.
        encode_length: Number of spatial positions (``h * w``) the final
            linear layer is sized for.
        out_channels: Size of the prediction vector produced per sample.
        **kwargs: Accepted for signature compatibility; not used here.
    """

    def __init__(self, embed_size=512, encode_length=26, out_channels=38, **kwargs):
        super().__init__()

        self.out_channels = out_channels

        # Per-position projection that keeps the embedding width unchanged.
        self.Wv_fusion = nn.Linear(embed_size, embed_size, bias_attr=False)
        # Consumes all positions of a sample at once, hence the flattened input size.
        flattened_size = encode_length * embed_size
        self.Prediction_visual = nn.Linear(flattened_size, self.out_channels)

    def forward(self, visual_feature):
        """Compute the counting prediction for a batch of feature maps.

        Args:
            visual_feature: 4-D tensor unpacked as ``(b, c, h, w)``. ``c`` must
                equal ``embed_size`` and ``h * w`` must equal ``encode_length``
                for the linear layers to accept it.

        Returns:
            Output of the final linear layer, one ``out_channels``-sized vector
            per sample.
        """
        batch, channels, height, width = visual_feature.shape
        # (b, c, h, w) -> (b, c, h*w) -> (b, h*w, c)
        tokens = visual_feature.reshape([batch, channels, height * width])
        tokens = tokens.transpose([0, 2, 1])

        fused = self.Wv_fusion(tokens)  # defaults give batch * 26 * 512
        batch, length, dim = fused.shape

        # The visual feature is used directly to calculate the text length:
        # collapse every position into one vector per sample.
        flattened = fused.reshape([batch, length * dim])
        return self.Prediction_visual(flattened)


class RFLHead(nn.Layer):
    """Head combining an optional counting branch and an optional sequence branch.

    At least one of the two branches must be enabled. The counting branch is a
    ``CNTHead``; the sequence branch is an ``AttentionLSTM``.

    Args:
        in_channels: Feature width handed to both branches.
        hidden_size: Hidden size forwarded to ``AttentionLSTM``.
        batch_max_legnth: Maximum sequence length passed to the sequence branch
            on every call; the counting branch is sized for
            ``batch_max_legnth + 1`` positions. (The misspelt name is part of
            the public signature and is kept as is.)
        out_channels: Output width of both branches.
        use_cnt: Build and run the counting branch.
        use_seq: Build and run the sequence branch.
        **kwargs: Forwarded unchanged to both ``CNTHead`` and ``AttentionLSTM``.
    """

    def __init__(
        self,
        in_channels=512,
        hidden_size=256,
        batch_max_legnth=25,
        out_channels=38,
        use_cnt=True,
        use_seq=True,
        **kwargs,
    ):
        super().__init__()
        assert use_cnt or use_seq
        self.use_cnt = use_cnt
        self.use_seq = use_seq

        if use_cnt:
            self.cnt_head = CNTHead(
                embed_size=in_channels,
                encode_length=batch_max_legnth + 1,
                out_channels=out_channels,
                **kwargs,
            )
        if use_seq:
            self.seq_head = AttentionLSTM(
                in_channels=in_channels,
                out_channels=out_channels,
                hidden_size=hidden_size,
                **kwargs,
            )

        self.batch_max_legnth = batch_max_legnth
        self.num_class = out_channels
        # Runs after the branches exist so their sublayers are visited as well.
        self.apply(self.init_weights)

    def init_weights(self, m):
        """Initialise one sublayer; only ``nn.Linear`` layers are touched.

        Weights get the Kaiming-normal initializer; a bias, when the layer has
        one, is filled with zeros.
        """
        if not isinstance(m, nn.Linear):
            return
        kaiming_init_(m.weight)
        if m.bias is not None:
            zeros_(m.bias)

    def forward(self, x, targets=None):
        """Run the enabled branches.

        Args:
            x: Pair ``(cnt_inputs, seq_inputs)``, one entry per branch.
            targets: Indexable whose first element is handed to the sequence
                branch in training mode; ignored otherwise.

        Returns:
            ``(cnt_outputs, seq_outputs)`` when the sequence branch is enabled
            (``cnt_outputs`` is ``None`` if the counting branch is disabled);
            otherwise just ``cnt_outputs``.
        """
        cnt_inputs, seq_inputs = x
        cnt_outputs = self.cnt_head(cnt_inputs) if self.use_cnt else None

        if not self.use_seq:
            return cnt_outputs

        # The sequence branch receives targets only while training.
        seq_targets = targets[0] if self.training else None
        seq_outputs = self.seq_head(seq_inputs, seq_targets, self.batch_max_legnth)
        return cnt_outputs, seq_outputs