yichael
/
AIStoryBoard


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
							# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is refer from:
https://github.com/roatienza/deep-text-recognition-benchmark/blob/master/modules/vitstr.py
"""

import numpy as np
import paddle
import paddle.nn as nn
from ppocr.modeling.backbones.rec_svtrnet import (
    Block,
    PatchEmbed,
    zeros_,
    trunc_normal_,
    ones_,
)

scale_dim_heads = {"tiny": [192, 3], "small": [384, 6], "base": [768, 12]}


class ViTSTR(nn.Layer):
    def __init__(
        self,
        img_size=[224, 224],
        in_channels=1,
        scale="tiny",
        seqlen=27,
        patch_size=[16, 16],
        embed_dim=None,
        depth=12,
        num_heads=None,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_path_rate=0.0,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        norm_layer="nn.LayerNorm",
        act_layer="nn.GELU",
        epsilon=1e-6,
        out_channels=None,
        **kwargs,
    ):
        super().__init__()
        self.seqlen = seqlen
        embed_dim = embed_dim if embed_dim is not None else scale_dim_heads[scale][0]
        num_heads = num_heads if num_heads is not None else scale_dim_heads[scale][1]
        out_channels = out_channels if out_channels is not None else embed_dim
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            in_channels=in_channels,
            embed_dim=embed_dim,
            patch_size=patch_size,
            mode="linear",
        )
        num_patches = self.patch_embed.num_patches

        self.pos_embed = self.create_parameter(
            shape=[1, num_patches + 1, embed_dim], default_initializer=zeros_
        )
        self.add_parameter("pos_embed", self.pos_embed)
        self.cls_token = self.create_parameter(
            shape=[1, 1, embed_dim], default_initializer=zeros_
        )
        self.add_parameter("cls_token", self.cls_token)

        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = np.linspace(0, drop_path_rate, depth)
        self.blocks = nn.LayerList(
            [
                Block(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[i],
                    norm_layer=norm_layer,
                    act_layer=eval(act_layer),
                    epsilon=epsilon,
                    prenorm=False,
                )
                for i in range(depth)
            ]
        )
        self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon)

        self.out_channels = out_channels

        trunc_normal_(self.pos_embed)
        trunc_normal_(self.cls_token)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward_features(self, x):
        B = x.shape[0]
        x = self.patch_embed(x)
        cls_tokens = paddle.tile(self.cls_token, repeat_times=[B, 1, 1])
        x = paddle.concat((cls_tokens, x), axis=1)
        x = x + self.pos_embed
        x = self.pos_drop(x)
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)
        return x

    def forward(self, x):
        x = self.forward_features(x)
        x = x[:, : self.seqlen]
        return x.transpose([0, 2, 1]).unsqueeze(2)