rec_svtrv2.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575
  1. # copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from paddle import ParamAttr
  15. from paddle.nn.initializer import KaimingNormal
  16. import numpy as np
  17. import paddle
  18. import paddle.nn as nn
  19. from paddle.nn.initializer import TruncatedNormal, Constant, Normal
# Module-level initializer shortcuts used when initializing parameters below.
trunc_normal_ = TruncatedNormal(std=0.02)  # truncated-normal initializer, std=0.02
normal_ = Normal  # alias of the Normal initializer class itself (not an instance)
zeros_ = Constant(value=0.0)  # constant-fill initializer: all zeros
ones_ = Constant(value=1.0)  # constant-fill initializer: all ones
  24. def drop_path(x, drop_prob=0.0, training=False):
  25. """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
  26. the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
  27. See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
  28. """
  29. if drop_prob == 0.0 or not training:
  30. return x
  31. keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
  32. shape = (paddle.shape(x)[0],) + (1,) * (x.ndim - 1)
  33. random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
  34. random_tensor = paddle.floor(random_tensor) # binarize
  35. output = x.divide(keep_prob) * random_tensor
  36. return output
  37. class DropPath(nn.Layer):
  38. """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
  39. def __init__(self, drop_prob=None):
  40. super(DropPath, self).__init__()
  41. self.drop_prob = drop_prob
  42. def forward(self, x):
  43. return drop_path(x, self.drop_prob, self.training)
  44. class Identity(nn.Layer):
  45. def __init__(self):
  46. super(Identity, self).__init__()
  47. def forward(self, input):
  48. return input
  49. class Mlp(nn.Layer):
  50. def __init__(
  51. self,
  52. in_features,
  53. hidden_features=None,
  54. out_features=None,
  55. act_layer=nn.GELU,
  56. drop=0.0,
  57. ):
  58. super().__init__()
  59. out_features = out_features or in_features
  60. hidden_features = hidden_features or in_features
  61. self.fc1 = nn.Linear(in_features, hidden_features)
  62. self.act = act_layer()
  63. self.fc2 = nn.Linear(hidden_features, out_features)
  64. self.drop = nn.Dropout(drop)
  65. def forward(self, x):
  66. x = self.fc1(x)
  67. x = self.act(x)
  68. x = self.drop(x)
  69. x = self.fc2(x)
  70. x = self.drop(x)
  71. return x
  72. class ConvBNLayer(nn.Layer):
  73. def __init__(
  74. self,
  75. in_channels,
  76. out_channels,
  77. kernel_size=3,
  78. stride=1,
  79. padding=0,
  80. bias_attr=False,
  81. groups=1,
  82. act=nn.GELU,
  83. ):
  84. super().__init__()
  85. self.conv = nn.Conv2D(
  86. in_channels=in_channels,
  87. out_channels=out_channels,
  88. kernel_size=kernel_size,
  89. stride=stride,
  90. padding=padding,
  91. groups=groups,
  92. weight_attr=paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()),
  93. bias_attr=bias_attr,
  94. )
  95. self.norm = nn.BatchNorm2D(out_channels)
  96. self.act = act()
  97. def forward(self, inputs):
  98. out = self.conv(inputs)
  99. out = self.norm(out)
  100. out = self.act(out)
  101. return out
  102. class Attention(nn.Layer):
  103. def __init__(
  104. self,
  105. dim,
  106. num_heads=8,
  107. qkv_bias=False,
  108. qk_scale=None,
  109. attn_drop=0.0,
  110. proj_drop=0.0,
  111. ):
  112. super().__init__()
  113. self.num_heads = num_heads
  114. self.dim = dim
  115. self.head_dim = dim // num_heads
  116. self.scale = qk_scale or self.head_dim**-0.5
  117. self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
  118. self.attn_drop = nn.Dropout(attn_drop)
  119. self.proj = nn.Linear(dim, dim)
  120. self.proj_drop = nn.Dropout(proj_drop)
  121. def forward(self, x):
  122. qkv = (
  123. self.qkv(x)
  124. .reshape((0, -1, 3, self.num_heads, self.head_dim))
  125. .transpose((2, 0, 3, 1, 4))
  126. )
  127. q, k, v = qkv[0], qkv[1], qkv[2]
  128. attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale
  129. attn = nn.functional.softmax(attn, axis=-1)
  130. attn = self.attn_drop(attn)
  131. x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, -1, self.dim))
  132. x = self.proj(x)
  133. x = self.proj_drop(x)
  134. return x
  135. class Block(nn.Layer):
  136. def __init__(
  137. self,
  138. dim,
  139. num_heads,
  140. mlp_ratio=4.0,
  141. qkv_bias=False,
  142. qk_scale=None,
  143. drop=0.0,
  144. attn_drop=0.0,
  145. drop_path=0.0,
  146. act_layer=nn.GELU,
  147. norm_layer=nn.LayerNorm,
  148. epsilon=1e-6,
  149. ):
  150. super().__init__()
  151. self.norm1 = norm_layer(dim, epsilon=epsilon)
  152. self.mixer = Attention(
  153. dim,
  154. num_heads=num_heads,
  155. qkv_bias=qkv_bias,
  156. qk_scale=qk_scale,
  157. attn_drop=attn_drop,
  158. proj_drop=drop,
  159. )
  160. self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
  161. self.norm2 = norm_layer(dim, epsilon=epsilon)
  162. mlp_hidden_dim = int(dim * mlp_ratio)
  163. self.mlp_ratio = mlp_ratio
  164. self.mlp = Mlp(
  165. in_features=dim,
  166. hidden_features=mlp_hidden_dim,
  167. act_layer=act_layer,
  168. drop=drop,
  169. )
  170. def forward(self, x):
  171. x = self.norm1(x + self.drop_path(self.mixer(x)))
  172. x = self.norm2(x + self.drop_path(self.mlp(x)))
  173. return x
  174. class ConvBlock(nn.Layer):
  175. def __init__(
  176. self,
  177. dim,
  178. num_heads,
  179. mlp_ratio=4.0,
  180. drop=0.0,
  181. drop_path=0.0,
  182. act_layer=nn.GELU,
  183. norm_layer=nn.LayerNorm,
  184. epsilon=1e-6,
  185. ):
  186. super().__init__()
  187. mlp_hidden_dim = int(dim * mlp_ratio)
  188. self.norm1 = norm_layer(dim, epsilon=epsilon)
  189. self.mixer = nn.Conv2D(
  190. dim,
  191. dim,
  192. 5,
  193. 1,
  194. 2,
  195. groups=num_heads,
  196. weight_attr=ParamAttr(initializer=KaimingNormal()),
  197. )
  198. self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
  199. self.norm2 = norm_layer(dim, epsilon=epsilon)
  200. self.mlp = Mlp(
  201. in_features=dim,
  202. hidden_features=mlp_hidden_dim,
  203. act_layer=act_layer,
  204. drop=drop,
  205. )
  206. def forward(self, x):
  207. C, H, W = x.shape[1:]
  208. x = x + self.drop_path(self.mixer(x))
  209. x = self.norm1(x.flatten(2).transpose([0, 2, 1]))
  210. x = self.norm2(x + self.drop_path(self.mlp(x)))
  211. x = x.transpose([0, 2, 1]).reshape([0, C, H, W])
  212. return x
  213. class FlattenTranspose(nn.Layer):
  214. def forward(self, x):
  215. return x.flatten(2).transpose([0, 2, 1])
  216. class SubSample2D(nn.Layer):
  217. def __init__(
  218. self,
  219. in_channels,
  220. out_channels,
  221. stride=[2, 1],
  222. ):
  223. super().__init__()
  224. self.conv = nn.Conv2D(
  225. in_channels,
  226. out_channels,
  227. kernel_size=3,
  228. stride=stride,
  229. padding=1,
  230. weight_attr=ParamAttr(initializer=KaimingNormal()),
  231. )
  232. self.norm = nn.LayerNorm(out_channels)
  233. def forward(self, x, sz):
  234. # print(x.shape)
  235. x = self.conv(x)
  236. C, H, W = x.shape[1:]
  237. x = self.norm(x.flatten(2).transpose([0, 2, 1]))
  238. x = x.transpose([0, 2, 1]).reshape([0, C, H, W])
  239. return x, [H, W]
  240. class SubSample1D(nn.Layer):
  241. def __init__(
  242. self,
  243. in_channels,
  244. out_channels,
  245. stride=[2, 1],
  246. ):
  247. super().__init__()
  248. self.conv = nn.Conv2D(
  249. in_channels,
  250. out_channels,
  251. kernel_size=3,
  252. stride=stride,
  253. padding=1,
  254. weight_attr=ParamAttr(initializer=KaimingNormal()),
  255. )
  256. self.norm = nn.LayerNorm(out_channels)
  257. def forward(self, x, sz):
  258. C = x.shape[-1]
  259. x = x.transpose([0, 2, 1]).reshape([0, C, sz[0], sz[1]])
  260. x = self.conv(x)
  261. C, H, W = x.shape[1:]
  262. x = self.norm(x.flatten(2).transpose([0, 2, 1]))
  263. return x, [H, W]
  264. class IdentitySize(nn.Layer):
  265. def forward(self, x, sz):
  266. return x, sz
  267. class SVTRStage(nn.Layer):
  268. def __init__(
  269. self,
  270. dim=64,
  271. out_dim=256,
  272. depth=3,
  273. mixer=["Local"] * 3,
  274. sub_k=[2, 1],
  275. num_heads=2,
  276. mlp_ratio=4,
  277. qkv_bias=True,
  278. qk_scale=None,
  279. drop_rate=0.0,
  280. attn_drop_rate=0.0,
  281. drop_path=[0.1] * 3,
  282. norm_layer=nn.LayerNorm,
  283. act=nn.GELU,
  284. eps=1e-6,
  285. downsample=None,
  286. **kwargs,
  287. ):
  288. super().__init__()
  289. self.dim = dim
  290. conv_block_num = sum([1 if mix == "Conv" else 0 for mix in mixer])
  291. blocks = []
  292. for i in range(depth):
  293. if mixer[i] == "Conv":
  294. blocks.append(
  295. ConvBlock(
  296. dim=dim,
  297. num_heads=num_heads,
  298. mlp_ratio=mlp_ratio,
  299. drop=drop_rate,
  300. act_layer=act,
  301. drop_path=drop_path[i],
  302. norm_layer=norm_layer,
  303. epsilon=eps,
  304. )
  305. )
  306. else:
  307. blocks.append(
  308. Block(
  309. dim=dim,
  310. num_heads=num_heads,
  311. mlp_ratio=mlp_ratio,
  312. qkv_bias=qkv_bias,
  313. qk_scale=qk_scale,
  314. drop=drop_rate,
  315. act_layer=act,
  316. attn_drop=attn_drop_rate,
  317. drop_path=drop_path[i],
  318. norm_layer=norm_layer,
  319. epsilon=eps,
  320. )
  321. )
  322. if i == conv_block_num - 1 and mixer[-1] != "Conv":
  323. blocks.append(FlattenTranspose())
  324. self.blocks = nn.Sequential(*blocks)
  325. if downsample:
  326. if mixer[-1] == "Conv":
  327. self.downsample = SubSample2D(dim, out_dim, stride=sub_k)
  328. elif mixer[-1] == "Global":
  329. self.downsample = SubSample1D(dim, out_dim, stride=sub_k)
  330. else:
  331. self.downsample = IdentitySize()
  332. def forward(self, x, sz):
  333. x = self.blocks(x)
  334. x, sz = self.downsample(x, sz)
  335. return x, sz
  336. class ADDPosEmbed(nn.Layer):
  337. def __init__(self, feat_max_size=[8, 32], embed_dim=768):
  338. super().__init__()
  339. pos_embed = paddle.zeros(
  340. [1, feat_max_size[0] * feat_max_size[1], embed_dim], dtype=paddle.float32
  341. )
  342. trunc_normal_(pos_embed)
  343. pos_embed = pos_embed.transpose([0, 2, 1]).reshape(
  344. [1, embed_dim, feat_max_size[0], feat_max_size[1]]
  345. )
  346. self.pos_embed = self.create_parameter(
  347. [1, embed_dim, feat_max_size[0], feat_max_size[1]]
  348. )
  349. self.add_parameter("pos_embed", self.pos_embed)
  350. self.pos_embed.set_value(pos_embed)
  351. def forward(self, x):
  352. sz = x.shape[2:]
  353. x = x + self.pos_embed[:, :, : sz[0], : sz[1]]
  354. return x
  355. class POPatchEmbed(nn.Layer):
  356. """Image to Patch Embedding"""
  357. def __init__(
  358. self,
  359. in_channels=3,
  360. feat_max_size=[8, 32],
  361. embed_dim=768,
  362. use_pos_embed=False,
  363. flatten=False,
  364. ):
  365. super().__init__()
  366. patch_embed = [
  367. ConvBNLayer(
  368. in_channels=in_channels,
  369. out_channels=embed_dim // 2,
  370. kernel_size=3,
  371. stride=2,
  372. padding=1,
  373. act=nn.GELU,
  374. bias_attr=None,
  375. ),
  376. ConvBNLayer(
  377. in_channels=embed_dim // 2,
  378. out_channels=embed_dim,
  379. kernel_size=3,
  380. stride=2,
  381. padding=1,
  382. act=nn.GELU,
  383. bias_attr=None,
  384. ),
  385. ]
  386. if use_pos_embed:
  387. patch_embed.append(ADDPosEmbed(feat_max_size, embed_dim))
  388. if flatten:
  389. patch_embed.append(FlattenTranspose())
  390. self.patch_embed = nn.Sequential(*patch_embed)
  391. def forward(self, x):
  392. sz = x.shape[2:]
  393. x = self.patch_embed(x)
  394. return x, [sz[0] // 4, sz[1] // 4]
  395. class LastStage(nn.Layer):
  396. def __init__(self, in_channels, out_channels, last_drop, out_char_num):
  397. super().__init__()
  398. self.last_conv = nn.Linear(in_channels, out_channels, bias_attr=False)
  399. self.hardswish = nn.Hardswish()
  400. self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer")
  401. def forward(self, x, sz):
  402. x = x.reshape([0, sz[0], sz[1], x.shape[-1]])
  403. x = x.mean(1)
  404. x = self.last_conv(x)
  405. x = self.hardswish(x)
  406. x = self.dropout(x)
  407. return x, [1, sz[1]]
  408. class OutPool(nn.Layer):
  409. def __init__(self):
  410. super().__init__()
  411. def forward(self, x, sz):
  412. C = x.shape[-1]
  413. x = x.transpose([0, 2, 1]).reshape([0, C, sz[0], sz[1]])
  414. x = nn.functional.avg_pool2d(x, [sz[0], 2])
  415. return x, [1, sz[1] // 2]
  416. class Feat2D(nn.Layer):
  417. def __init__(self):
  418. super().__init__()
  419. def forward(self, x, sz):
  420. C = x.shape[-1]
  421. x = x.transpose([0, 2, 1]).reshape([0, C, sz[0], sz[1]])
  422. return x, sz
  423. class SVTRv2(nn.Layer):
  424. def __init__(
  425. self,
  426. max_sz=[32, 128],
  427. in_channels=3,
  428. out_channels=192,
  429. out_char_num=25,
  430. depths=[3, 6, 3],
  431. dims=[64, 128, 256],
  432. mixer=[["Conv"] * 3, ["Conv"] * 3 + ["Global"] * 3, ["Global"] * 3],
  433. use_pos_embed=False,
  434. sub_k=[[1, 1], [2, 1], [1, 1]],
  435. num_heads=[2, 4, 8],
  436. mlp_ratio=4,
  437. qkv_bias=True,
  438. qk_scale=None,
  439. drop_rate=0.0,
  440. last_drop=0.1,
  441. attn_drop_rate=0.0,
  442. drop_path_rate=0.1,
  443. norm_layer=nn.LayerNorm,
  444. act=nn.GELU,
  445. last_stage=False,
  446. eps=1e-6,
  447. use_pool=False,
  448. feat2d=False,
  449. **kwargs,
  450. ):
  451. super().__init__()
  452. num_stages = len(depths)
  453. self.num_features = dims[-1]
  454. feat_max_size = [max_sz[0] // 4, max_sz[1] // 4]
  455. self.pope = POPatchEmbed(
  456. in_channels=in_channels,
  457. feat_max_size=feat_max_size,
  458. embed_dim=dims[0],
  459. use_pos_embed=use_pos_embed,
  460. flatten=mixer[0][0] != "Conv",
  461. )
  462. dpr = np.linspace(0, drop_path_rate, sum(depths)) # stochastic depth decay rule
  463. self.stages = nn.LayerList()
  464. for i_stage in range(num_stages):
  465. stage = SVTRStage(
  466. dim=dims[i_stage],
  467. out_dim=dims[i_stage + 1] if i_stage < num_stages - 1 else 0,
  468. depth=depths[i_stage],
  469. mixer=mixer[i_stage],
  470. sub_k=sub_k[i_stage],
  471. num_heads=num_heads[i_stage],
  472. mlp_ratio=mlp_ratio,
  473. qkv_bias=qkv_bias,
  474. qk_scale=qk_scale,
  475. drop=drop_rate,
  476. attn_drop=attn_drop_rate,
  477. drop_path=dpr[sum(depths[:i_stage]) : sum(depths[: i_stage + 1])],
  478. norm_layer=norm_layer,
  479. act=act,
  480. downsample=False if i_stage == num_stages - 1 else True,
  481. eps=eps,
  482. )
  483. self.stages.append(stage)
  484. self.out_channels = self.num_features
  485. self.last_stage = last_stage
  486. if last_stage:
  487. self.out_channels = out_channels
  488. self.stages.append(
  489. LastStage(self.num_features, out_channels, last_drop, out_char_num)
  490. )
  491. if use_pool:
  492. self.stages.append(OutPool())
  493. if feat2d:
  494. self.stages.append(Feat2D())
  495. self.apply(self._init_weights)
  496. def _init_weights(self, m):
  497. if isinstance(m, nn.Linear):
  498. trunc_normal_(m.weight)
  499. if isinstance(m, nn.Linear) and m.bias is not None:
  500. zeros_(m.bias)
  501. elif isinstance(m, nn.LayerNorm):
  502. zeros_(m.bias)
  503. ones_(m.weight)
  504. def forward(self, x):
  505. x, sz = self.pope(x)
  506. for stage in self.stages:
  507. x, sz = stage(x, sz)
  508. return x