rec_img_aug.py 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947
  1. # copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import math
  15. import cv2
  16. import numpy as np
  17. import random
  18. import copy
  19. from PIL import Image
  20. import PIL
  21. from .text_image_aug import tia_perspective, tia_stretch, tia_distort
  22. from .abinet_aug import (
  23. CVGeometry,
  24. CVDeterioration,
  25. CVColorJitter,
  26. SVTRGeometry,
  27. SVTRDeterioration,
  28. ParseQDeterioration,
  29. )
  30. from paddle import get_device
  31. from paddle.vision.transforms import Compose
  32. class RecAug(object):
  33. def __init__(
  34. self,
  35. tia_prob=0.4,
  36. crop_prob=0.4,
  37. reverse_prob=0.4,
  38. noise_prob=0.4,
  39. jitter_prob=0.4,
  40. blur_prob=0.4,
  41. hsv_aug_prob=0.4,
  42. **kwargs,
  43. ):
  44. self.tia_prob = tia_prob
  45. self.bda = BaseDataAugmentation(
  46. crop_prob, reverse_prob, noise_prob, jitter_prob, blur_prob, hsv_aug_prob
  47. )
  48. def __call__(self, data):
  49. img = data["image"]
  50. h, w, _ = img.shape
  51. # tia
  52. if random.random() <= self.tia_prob:
  53. if h >= 20 and w >= 20:
  54. img = tia_distort(img, random.randint(3, 6))
  55. img = tia_stretch(img, random.randint(3, 6))
  56. img = tia_perspective(img)
  57. # bda
  58. data["image"] = img
  59. data = self.bda(data)
  60. return data
  61. class BaseDataAugmentation(object):
  62. def __init__(
  63. self,
  64. crop_prob=0.4,
  65. reverse_prob=0.4,
  66. noise_prob=0.4,
  67. jitter_prob=0.4,
  68. blur_prob=0.4,
  69. hsv_aug_prob=0.4,
  70. **kwargs,
  71. ):
  72. self.crop_prob = crop_prob
  73. self.reverse_prob = reverse_prob
  74. self.noise_prob = noise_prob
  75. self.jitter_prob = jitter_prob
  76. self.blur_prob = blur_prob
  77. self.hsv_aug_prob = hsv_aug_prob
  78. # for GaussianBlur
  79. self.fil = cv2.getGaussianKernel(ksize=5, sigma=1, ktype=cv2.CV_32F)
  80. def __call__(self, data):
  81. img = data["image"]
  82. h, w, _ = img.shape
  83. if random.random() <= self.crop_prob and h >= 20 and w >= 20:
  84. img = get_crop(img)
  85. if random.random() <= self.blur_prob:
  86. # GaussianBlur
  87. img = cv2.sepFilter2D(img, -1, self.fil, self.fil)
  88. if random.random() <= self.hsv_aug_prob:
  89. img = hsv_aug(img)
  90. if random.random() <= self.jitter_prob:
  91. img = jitter(img)
  92. if random.random() <= self.noise_prob:
  93. img = add_gaussian_noise(img)
  94. if random.random() <= self.reverse_prob:
  95. img = 255 - img
  96. data["image"] = img
  97. return data
  98. class ABINetRecAug(object):
  99. def __init__(
  100. self, geometry_p=0.5, deterioration_p=0.25, colorjitter_p=0.25, **kwargs
  101. ):
  102. self.transforms = Compose(
  103. [
  104. CVGeometry(
  105. degrees=45,
  106. translate=(0.0, 0.0),
  107. scale=(0.5, 2.0),
  108. shear=(45, 15),
  109. distortion=0.5,
  110. p=geometry_p,
  111. ),
  112. CVDeterioration(var=20, degrees=6, factor=4, p=deterioration_p),
  113. CVColorJitter(
  114. brightness=0.5,
  115. contrast=0.5,
  116. saturation=0.5,
  117. hue=0.1,
  118. p=colorjitter_p,
  119. ),
  120. ]
  121. )
  122. def __call__(self, data):
  123. img = data["image"]
  124. img = self.transforms(img)
  125. data["image"] = img
  126. return data
  127. class RecConAug(object):
  128. def __init__(
  129. self,
  130. prob=0.5,
  131. image_shape=(32, 320, 3),
  132. max_text_length=25,
  133. ext_data_num=1,
  134. **kwargs,
  135. ):
  136. self.ext_data_num = ext_data_num
  137. self.prob = prob
  138. self.max_text_length = max_text_length
  139. self.image_shape = image_shape
  140. self.max_wh_ratio = self.image_shape[1] / self.image_shape[0]
  141. def merge_ext_data(self, data, ext_data):
  142. ori_w = round(
  143. data["image"].shape[1] / data["image"].shape[0] * self.image_shape[0]
  144. )
  145. ext_w = round(
  146. ext_data["image"].shape[1]
  147. / ext_data["image"].shape[0]
  148. * self.image_shape[0]
  149. )
  150. data["image"] = cv2.resize(data["image"], (ori_w, self.image_shape[0]))
  151. ext_data["image"] = cv2.resize(ext_data["image"], (ext_w, self.image_shape[0]))
  152. data["image"] = np.concatenate([data["image"], ext_data["image"]], axis=1)
  153. data["label"] += ext_data["label"]
  154. return data
  155. def __call__(self, data):
  156. rnd_num = random.random()
  157. if rnd_num > self.prob:
  158. return data
  159. for idx, ext_data in enumerate(data["ext_data"]):
  160. if len(data["label"]) + len(ext_data["label"]) > self.max_text_length:
  161. break
  162. concat_ratio = (
  163. data["image"].shape[1] / data["image"].shape[0]
  164. + ext_data["image"].shape[1] / ext_data["image"].shape[0]
  165. )
  166. if concat_ratio > self.max_wh_ratio:
  167. break
  168. data = self.merge_ext_data(data, ext_data)
  169. data.pop("ext_data")
  170. return data
  171. class SVTRRecAug(object):
  172. def __init__(
  173. self,
  174. aug_type=0,
  175. geometry_p=0.5,
  176. deterioration_p=0.25,
  177. colorjitter_p=0.25,
  178. **kwargs,
  179. ):
  180. self.transforms = Compose(
  181. [
  182. SVTRGeometry(
  183. aug_type=aug_type,
  184. degrees=45,
  185. translate=(0.0, 0.0),
  186. scale=(0.5, 2.0),
  187. shear=(45, 15),
  188. distortion=0.5,
  189. p=geometry_p,
  190. ),
  191. SVTRDeterioration(var=20, degrees=6, factor=4, p=deterioration_p),
  192. CVColorJitter(
  193. brightness=0.5,
  194. contrast=0.5,
  195. saturation=0.5,
  196. hue=0.1,
  197. p=colorjitter_p,
  198. ),
  199. ]
  200. )
  201. def __call__(self, data):
  202. img = data["image"]
  203. img = self.transforms(img)
  204. data["image"] = img
  205. return data
  206. class ParseQRecAug(object):
  207. def __init__(
  208. self,
  209. aug_type=0,
  210. geometry_p=0.5,
  211. deterioration_p=0.25,
  212. colorjitter_p=0.25,
  213. **kwargs,
  214. ):
  215. self.transforms = Compose(
  216. [
  217. SVTRGeometry(
  218. aug_type=aug_type,
  219. degrees=45,
  220. translate=(0.0, 0.0),
  221. scale=(0.5, 2.0),
  222. shear=(45, 15),
  223. distortion=0.5,
  224. p=geometry_p,
  225. ),
  226. ParseQDeterioration(
  227. var=20, degrees=6, lam=20, radius=2.0, factor=4, p=deterioration_p
  228. ),
  229. CVColorJitter(
  230. brightness=0.5,
  231. contrast=0.5,
  232. saturation=0.5,
  233. hue=0.1,
  234. p=colorjitter_p,
  235. ),
  236. ]
  237. )
  238. def __call__(self, data):
  239. img = data["image"]
  240. img = self.transforms(img)
  241. data["image"] = img
  242. return data
  243. class ClsResizeImg(object):
  244. def __init__(self, image_shape, **kwargs):
  245. self.image_shape = image_shape
  246. def __call__(self, data):
  247. img = data["image"]
  248. norm_img, _ = resize_norm_img(img, self.image_shape)
  249. data["image"] = norm_img
  250. return data
  251. class RecResizeImg(object):
  252. def __init__(
  253. self,
  254. image_shape,
  255. infer_mode=False,
  256. eval_mode=False,
  257. character_dict_path="./ppocr/utils/ppocr_keys_v1.txt",
  258. padding=True,
  259. **kwargs,
  260. ):
  261. self.image_shape = image_shape
  262. self.infer_mode = infer_mode
  263. self.eval_mode = eval_mode
  264. self.character_dict_path = character_dict_path
  265. self.padding = padding
  266. def __call__(self, data):
  267. img = data["image"]
  268. if self.eval_mode or (self.infer_mode and self.character_dict_path is not None):
  269. norm_img, valid_ratio = resize_norm_img_chinese(img, self.image_shape)
  270. else:
  271. norm_img, valid_ratio = resize_norm_img(img, self.image_shape, self.padding)
  272. data["image"] = norm_img
  273. data["valid_ratio"] = valid_ratio
  274. if "iluvatar_gpu" in get_device():
  275. data["valid_ratio"] = np.float32(valid_ratio)
  276. return data
  277. class VLRecResizeImg(object):
  278. def __init__(
  279. self,
  280. image_shape,
  281. infer_mode=False,
  282. character_dict_path="./ppocr/utils/ppocr_keys_v1.txt",
  283. padding=True,
  284. **kwargs,
  285. ):
  286. self.image_shape = image_shape
  287. self.infer_mode = infer_mode
  288. self.character_dict_path = character_dict_path
  289. self.padding = padding
  290. def __call__(self, data):
  291. img = data["image"]
  292. imgC, imgH, imgW = self.image_shape
  293. resized_image = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
  294. resized_w = imgW
  295. resized_image = resized_image.astype("float32")
  296. if self.image_shape[0] == 1:
  297. resized_image = resized_image / 255
  298. norm_img = resized_image[np.newaxis, :]
  299. else:
  300. norm_img = resized_image.transpose((2, 0, 1)) / 255
  301. valid_ratio = min(1.0, float(resized_w / imgW))
  302. data["image"] = norm_img
  303. data["valid_ratio"] = valid_ratio
  304. if "iluvatar_gpu" in get_device():
  305. data["valid_ratio"] = np.float32(valid_ratio)
  306. return data
  307. class RFLRecResizeImg(object):
  308. def __init__(self, image_shape, padding=True, interpolation=1, **kwargs):
  309. self.image_shape = image_shape
  310. self.padding = padding
  311. self.interpolation = interpolation
  312. if self.interpolation == 0:
  313. self.interpolation = cv2.INTER_NEAREST
  314. elif self.interpolation == 1:
  315. self.interpolation = cv2.INTER_LINEAR
  316. elif self.interpolation == 2:
  317. self.interpolation = cv2.INTER_CUBIC
  318. elif self.interpolation == 3:
  319. self.interpolation = cv2.INTER_AREA
  320. else:
  321. raise Exception("Unsupported interpolation type !!!")
  322. def __call__(self, data):
  323. img = data["image"]
  324. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  325. norm_img, valid_ratio = resize_norm_img(
  326. img, self.image_shape, self.padding, self.interpolation
  327. )
  328. data["image"] = norm_img
  329. data["valid_ratio"] = valid_ratio
  330. if "iluvatar_gpu" in get_device():
  331. data["valid_ratio"] = np.float32(valid_ratio)
  332. return data
  333. class SRNRecResizeImg(object):
  334. def __init__(self, image_shape, num_heads, max_text_length, **kwargs):
  335. self.image_shape = image_shape
  336. self.num_heads = num_heads
  337. self.max_text_length = max_text_length
  338. def __call__(self, data):
  339. img = data["image"]
  340. norm_img = resize_norm_img_srn(img, self.image_shape)
  341. data["image"] = norm_img
  342. [
  343. encoder_word_pos,
  344. gsrm_word_pos,
  345. gsrm_slf_attn_bias1,
  346. gsrm_slf_attn_bias2,
  347. ] = srn_other_inputs(self.image_shape, self.num_heads, self.max_text_length)
  348. data["encoder_word_pos"] = encoder_word_pos
  349. data["gsrm_word_pos"] = gsrm_word_pos
  350. data["gsrm_slf_attn_bias1"] = gsrm_slf_attn_bias1
  351. data["gsrm_slf_attn_bias2"] = gsrm_slf_attn_bias2
  352. return data
  353. class SARRecResizeImg(object):
  354. def __init__(self, image_shape, width_downsample_ratio=0.25, **kwargs):
  355. self.image_shape = image_shape
  356. self.width_downsample_ratio = width_downsample_ratio
  357. def __call__(self, data):
  358. img = data["image"]
  359. norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar(
  360. img, self.image_shape, self.width_downsample_ratio
  361. )
  362. data["image"] = norm_img
  363. data["resized_shape"] = resize_shape
  364. data["pad_shape"] = pad_shape
  365. data["valid_ratio"] = valid_ratio
  366. if "iluvatar_gpu" in get_device():
  367. data["valid_ratio"] = np.float32(valid_ratio)
  368. return data
  369. class PRENResizeImg(object):
  370. def __init__(self, image_shape, **kwargs):
  371. """
  372. According to original paper's realization, it's a hard resize method here.
  373. So maybe you should optimize it to fit for your task better.
  374. """
  375. self.dst_h, self.dst_w = image_shape
  376. def __call__(self, data):
  377. img = data["image"]
  378. resized_img = cv2.resize(
  379. img, (self.dst_w, self.dst_h), interpolation=cv2.INTER_LINEAR
  380. )
  381. resized_img = resized_img.transpose((2, 0, 1)) / 255
  382. resized_img -= 0.5
  383. resized_img /= 0.5
  384. data["image"] = resized_img.astype(np.float32)
  385. return data
  386. class SPINRecResizeImg(object):
  387. def __init__(
  388. self,
  389. image_shape,
  390. interpolation=2,
  391. mean=(127.5, 127.5, 127.5),
  392. std=(127.5, 127.5, 127.5),
  393. **kwargs,
  394. ):
  395. self.image_shape = image_shape
  396. self.mean = np.array(mean, dtype=np.float32)
  397. self.std = np.array(std, dtype=np.float32)
  398. self.interpolation = interpolation
  399. def __call__(self, data):
  400. img = data["image"]
  401. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  402. # different interpolation type corresponding the OpenCV
  403. if self.interpolation == 0:
  404. interpolation = cv2.INTER_NEAREST
  405. elif self.interpolation == 1:
  406. interpolation = cv2.INTER_LINEAR
  407. elif self.interpolation == 2:
  408. interpolation = cv2.INTER_CUBIC
  409. elif self.interpolation == 3:
  410. interpolation = cv2.INTER_AREA
  411. else:
  412. raise Exception("Unsupported interpolation type !!!")
  413. # Deal with the image error during image loading
  414. if img is None:
  415. return None
  416. img = cv2.resize(img, tuple(self.image_shape), interpolation)
  417. img = np.array(img, np.float32)
  418. img = np.expand_dims(img, -1)
  419. img = img.transpose((2, 0, 1))
  420. # normalize the image
  421. img = img.copy().astype(np.float32)
  422. mean = np.float64(self.mean.reshape(1, -1))
  423. stdinv = 1 / np.float64(self.std.reshape(1, -1))
  424. img -= mean
  425. img *= stdinv
  426. data["image"] = img
  427. return data
  428. class GrayRecResizeImg(object):
  429. def __init__(
  430. self,
  431. image_shape,
  432. resize_type,
  433. inter_type="Image.Resampling.LANCZOS",
  434. scale=True,
  435. padding=False,
  436. **kwargs,
  437. ):
  438. self.image_shape = image_shape
  439. self.resize_type = resize_type
  440. self.padding = padding
  441. self.inter_type = eval(inter_type)
  442. self.scale = scale
  443. def __call__(self, data):
  444. img = data["image"]
  445. img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  446. image_shape = self.image_shape
  447. if self.padding:
  448. imgC, imgH, imgW = image_shape
  449. # todo: change to 0 and modified image shape
  450. h = img.shape[0]
  451. w = img.shape[1]
  452. ratio = w / float(h)
  453. if math.ceil(imgH * ratio) > imgW:
  454. resized_w = imgW
  455. else:
  456. resized_w = int(math.ceil(imgH * ratio))
  457. resized_image = cv2.resize(img, (resized_w, imgH))
  458. norm_img = np.expand_dims(resized_image, -1)
  459. norm_img = norm_img.transpose((2, 0, 1))
  460. resized_image = norm_img.astype(np.float32) / 128.0 - 1.0
  461. padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
  462. padding_im[:, :, 0:resized_w] = resized_image
  463. data["image"] = padding_im
  464. return data
  465. if self.resize_type == "PIL":
  466. image_pil = Image.fromarray(np.uint8(img))
  467. img = image_pil.resize(self.image_shape, self.inter_type)
  468. img = np.array(img)
  469. if self.resize_type == "OpenCV":
  470. img = cv2.resize(img, self.image_shape)
  471. norm_img = np.expand_dims(img, -1)
  472. norm_img = norm_img.transpose((2, 0, 1))
  473. if self.scale:
  474. data["image"] = norm_img.astype(np.float32) / 128.0 - 1.0
  475. else:
  476. data["image"] = norm_img.astype(np.float32) / 255.0
  477. return data
  478. class ABINetRecResizeImg(object):
  479. def __init__(self, image_shape, **kwargs):
  480. self.image_shape = image_shape
  481. def __call__(self, data):
  482. img = data["image"]
  483. norm_img, valid_ratio = resize_norm_img_abinet(img, self.image_shape)
  484. data["image"] = norm_img
  485. data["valid_ratio"] = valid_ratio
  486. if "iluvatar_gpu" in get_device():
  487. data["valid_ratio"] = np.float32(valid_ratio)
  488. return data
  489. class SVTRRecResizeImg(object):
  490. def __init__(self, image_shape, padding=True, **kwargs):
  491. self.image_shape = image_shape
  492. self.padding = padding
  493. def __call__(self, data):
  494. img = data["image"]
  495. norm_img, valid_ratio = resize_norm_img(img, self.image_shape, self.padding)
  496. data["image"] = norm_img
  497. data["valid_ratio"] = valid_ratio
  498. if "iluvatar_gpu" in get_device():
  499. data["valid_ratio"] = np.float32(valid_ratio)
  500. return data
  501. class RobustScannerRecResizeImg(object):
  502. def __init__(
  503. self, image_shape, max_text_length, width_downsample_ratio=0.25, **kwargs
  504. ):
  505. self.image_shape = image_shape
  506. self.width_downsample_ratio = width_downsample_ratio
  507. self.max_text_length = max_text_length
  508. def __call__(self, data):
  509. img = data["image"]
  510. norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar(
  511. img, self.image_shape, self.width_downsample_ratio
  512. )
  513. word_positons = np.array(range(0, self.max_text_length)).astype("int64")
  514. data["image"] = norm_img
  515. data["resized_shape"] = resize_shape
  516. data["pad_shape"] = pad_shape
  517. data["valid_ratio"] = valid_ratio
  518. if "iluvatar_gpu" in get_device():
  519. data["valid_ratio"] = np.float32(valid_ratio)
  520. data["word_positons"] = word_positons
  521. return data
  522. def resize_norm_img_sar(img, image_shape, width_downsample_ratio=0.25):
  523. imgC, imgH, imgW_min, imgW_max = image_shape
  524. h = img.shape[0]
  525. w = img.shape[1]
  526. valid_ratio = 1.0
  527. # make sure new_width is an integral multiple of width_divisor.
  528. width_divisor = int(1 / width_downsample_ratio)
  529. # resize
  530. ratio = w / float(h)
  531. resize_w = math.ceil(imgH * ratio)
  532. if resize_w % width_divisor != 0:
  533. resize_w = round(resize_w / width_divisor) * width_divisor
  534. if imgW_min is not None:
  535. resize_w = max(imgW_min, resize_w)
  536. if imgW_max is not None:
  537. valid_ratio = min(1.0, 1.0 * resize_w / imgW_max)
  538. resize_w = min(imgW_max, resize_w)
  539. resized_image = cv2.resize(img, (resize_w, imgH))
  540. resized_image = resized_image.astype("float32")
  541. # norm
  542. if image_shape[0] == 1:
  543. resized_image = resized_image / 255
  544. resized_image = resized_image[np.newaxis, :]
  545. else:
  546. resized_image = resized_image.transpose((2, 0, 1)) / 255
  547. resized_image -= 0.5
  548. resized_image /= 0.5
  549. resize_shape = resized_image.shape
  550. padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32)
  551. padding_im[:, :, 0:resize_w] = resized_image
  552. pad_shape = padding_im.shape
  553. return padding_im, resize_shape, pad_shape, valid_ratio
  554. def resize_norm_img(img, image_shape, padding=True, interpolation=cv2.INTER_LINEAR):
  555. imgC, imgH, imgW = image_shape
  556. h = img.shape[0]
  557. w = img.shape[1]
  558. if not padding:
  559. resized_image = cv2.resize(img, (imgW, imgH), interpolation=interpolation)
  560. resized_w = imgW
  561. else:
  562. ratio = w / float(h)
  563. if math.ceil(imgH * ratio) > imgW:
  564. resized_w = imgW
  565. else:
  566. resized_w = int(math.ceil(imgH * ratio))
  567. resized_image = cv2.resize(img, (resized_w, imgH))
  568. resized_image = resized_image.astype("float32")
  569. if image_shape[0] == 1:
  570. resized_image = resized_image / 255
  571. resized_image = resized_image[np.newaxis, :]
  572. else:
  573. resized_image = resized_image.transpose((2, 0, 1)) / 255
  574. resized_image -= 0.5
  575. resized_image /= 0.5
  576. padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
  577. padding_im[:, :, 0:resized_w] = resized_image
  578. valid_ratio = min(1.0, float(resized_w / imgW))
  579. return padding_im, valid_ratio
  580. def resize_norm_img_chinese(img, image_shape):
  581. imgC, imgH, imgW = image_shape
  582. # todo: change to 0 and modified image shape
  583. max_wh_ratio = imgW * 1.0 / imgH
  584. h, w = img.shape[0], img.shape[1]
  585. ratio = w * 1.0 / h
  586. max_wh_ratio = max(max_wh_ratio, ratio)
  587. imgW = int(imgH * max_wh_ratio)
  588. if math.ceil(imgH * ratio) > imgW:
  589. resized_w = imgW
  590. else:
  591. resized_w = int(math.ceil(imgH * ratio))
  592. resized_image = cv2.resize(img, (resized_w, imgH))
  593. resized_image = resized_image.astype("float32")
  594. if image_shape[0] == 1:
  595. resized_image = resized_image / 255
  596. resized_image = resized_image[np.newaxis, :]
  597. else:
  598. resized_image = resized_image.transpose((2, 0, 1)) / 255
  599. resized_image -= 0.5
  600. resized_image /= 0.5
  601. padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
  602. padding_im[:, :, 0:resized_w] = resized_image
  603. valid_ratio = min(1.0, float(resized_w / imgW))
  604. return padding_im, valid_ratio
  605. def resize_norm_img_srn(img, image_shape):
  606. imgC, imgH, imgW = image_shape
  607. img_black = np.zeros((imgH, imgW))
  608. im_hei = img.shape[0]
  609. im_wid = img.shape[1]
  610. if im_wid <= im_hei * 1:
  611. img_new = cv2.resize(img, (imgH * 1, imgH))
  612. elif im_wid <= im_hei * 2:
  613. img_new = cv2.resize(img, (imgH * 2, imgH))
  614. elif im_wid <= im_hei * 3:
  615. img_new = cv2.resize(img, (imgH * 3, imgH))
  616. else:
  617. img_new = cv2.resize(img, (imgW, imgH))
  618. img_np = np.asarray(img_new)
  619. img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
  620. img_black[:, 0 : img_np.shape[1]] = img_np
  621. img_black = img_black[:, :, np.newaxis]
  622. row, col, c = img_black.shape
  623. c = 1
  624. return np.reshape(img_black, (c, row, col)).astype(np.float32)
  625. def resize_norm_img_abinet(img, image_shape):
  626. imgC, imgH, imgW = image_shape
  627. resized_image = cv2.resize(img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
  628. resized_w = imgW
  629. resized_image = resized_image.astype("float32")
  630. resized_image = resized_image / 255.0
  631. mean = np.array([0.485, 0.456, 0.406])
  632. std = np.array([0.229, 0.224, 0.225])
  633. resized_image = (resized_image - mean[None, None, ...]) / std[None, None, ...]
  634. resized_image = resized_image.transpose((2, 0, 1))
  635. resized_image = resized_image.astype("float32")
  636. valid_ratio = min(1.0, float(resized_w / imgW))
  637. return resized_image, valid_ratio
  638. def srn_other_inputs(image_shape, num_heads, max_text_length):
  639. imgC, imgH, imgW = image_shape
  640. feature_dim = int((imgH / 8) * (imgW / 8))
  641. encoder_word_pos = (
  642. np.array(range(0, feature_dim)).reshape((feature_dim, 1)).astype("int64")
  643. )
  644. gsrm_word_pos = (
  645. np.array(range(0, max_text_length))
  646. .reshape((max_text_length, 1))
  647. .astype("int64")
  648. )
  649. gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
  650. gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
  651. [1, max_text_length, max_text_length]
  652. )
  653. gsrm_slf_attn_bias1 = np.tile(gsrm_slf_attn_bias1, [num_heads, 1, 1]) * [-1e9]
  654. gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
  655. [1, max_text_length, max_text_length]
  656. )
  657. gsrm_slf_attn_bias2 = np.tile(gsrm_slf_attn_bias2, [num_heads, 1, 1]) * [-1e9]
  658. return [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2]
  659. def flag():
  660. """
  661. flag
  662. """
  663. return 1 if random.random() > 0.5000001 else -1
  664. def hsv_aug(img):
  665. """
  666. cvtColor
  667. """
  668. hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
  669. delta = 0.001 * random.random() * flag()
  670. hsv[:, :, 2] = hsv[:, :, 2] * (1 + delta)
  671. new_img = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
  672. return new_img
  673. def blur(img):
  674. """
  675. blur
  676. """
  677. h, w, _ = img.shape
  678. if h > 10 and w > 10:
  679. return cv2.GaussianBlur(img, (5, 5), 1)
  680. else:
  681. return img
  682. def jitter(img):
  683. """
  684. jitter
  685. """
  686. w, h, _ = img.shape
  687. if h > 10 and w > 10:
  688. thres = min(w, h)
  689. s = int(random.random() * thres * 0.01)
  690. src_img = img.copy()
  691. for i in range(s):
  692. img[i:, i:, :] = src_img[: w - i, : h - i, :]
  693. return img
  694. else:
  695. return img
  696. def add_gaussian_noise(image, mean=0, var=0.1):
  697. """
  698. Gaussian noise
  699. """
  700. noise = np.random.normal(mean, var**0.5, image.shape)
  701. out = image + 0.5 * noise
  702. out = np.clip(out, 0, 255)
  703. out = np.uint8(out)
  704. return out
  705. def get_crop(image):
  706. """
  707. random crop
  708. """
  709. h, w, _ = image.shape
  710. top_min = 1
  711. top_max = 8
  712. top_crop = int(random.randint(top_min, top_max))
  713. top_crop = min(top_crop, h - 1)
  714. crop_img = image.copy()
  715. ratio = random.randint(0, 1)
  716. if ratio:
  717. crop_img = crop_img[top_crop:h, :, :]
  718. else:
  719. crop_img = crop_img[0 : h - top_crop, :, :]
  720. return crop_img
  721. def rad(x):
  722. """
  723. rad
  724. """
  725. return x * np.pi / 180
  726. def get_warpR(config):
  727. """
  728. get_warpR
  729. """
  730. anglex, angley, anglez, fov, w, h, r = (
  731. config.anglex,
  732. config.angley,
  733. config.anglez,
  734. config.fov,
  735. config.w,
  736. config.h,
  737. config.r,
  738. )
  739. if w > 69 and w < 112:
  740. anglex = anglex * 1.5
  741. z = np.sqrt(w**2 + h**2) / 2 / np.tan(rad(fov / 2))
  742. # Homogeneous coordinate transformation matrix
  743. rx = np.array(
  744. [
  745. [1, 0, 0, 0],
  746. [0, np.cos(rad(anglex)), -np.sin(rad(anglex)), 0],
  747. [
  748. 0,
  749. -np.sin(rad(anglex)),
  750. np.cos(rad(anglex)),
  751. 0,
  752. ],
  753. [0, 0, 0, 1],
  754. ],
  755. np.float32,
  756. )
  757. ry = np.array(
  758. [
  759. [np.cos(rad(angley)), 0, np.sin(rad(angley)), 0],
  760. [0, 1, 0, 0],
  761. [
  762. -np.sin(rad(angley)),
  763. 0,
  764. np.cos(rad(angley)),
  765. 0,
  766. ],
  767. [0, 0, 0, 1],
  768. ],
  769. np.float32,
  770. )
  771. rz = np.array(
  772. [
  773. [np.cos(rad(anglez)), np.sin(rad(anglez)), 0, 0],
  774. [-np.sin(rad(anglez)), np.cos(rad(anglez)), 0, 0],
  775. [0, 0, 1, 0],
  776. [0, 0, 0, 1],
  777. ],
  778. np.float32,
  779. )
  780. r = rx.dot(ry).dot(rz)
  781. # generate 4 points
  782. pcenter = np.array([h / 2, w / 2, 0, 0], np.float32)
  783. p1 = np.array([0, 0, 0, 0], np.float32) - pcenter
  784. p2 = np.array([w, 0, 0, 0], np.float32) - pcenter
  785. p3 = np.array([0, h, 0, 0], np.float32) - pcenter
  786. p4 = np.array([w, h, 0, 0], np.float32) - pcenter
  787. dst1 = r.dot(p1)
  788. dst2 = r.dot(p2)
  789. dst3 = r.dot(p3)
  790. dst4 = r.dot(p4)
  791. list_dst = np.array([dst1, dst2, dst3, dst4])
  792. org = np.array([[0, 0], [w, 0], [0, h], [w, h]], np.float32)
  793. dst = np.zeros((4, 2), np.float32)
  794. # Project onto the image plane
  795. dst[:, 0] = list_dst[:, 0] * z / (z - list_dst[:, 2]) + pcenter[0]
  796. dst[:, 1] = list_dst[:, 1] * z / (z - list_dst[:, 2]) + pcenter[1]
  797. warpR = cv2.getPerspectiveTransform(org, dst)
  798. dst1, dst2, dst3, dst4 = dst
  799. r1 = int(min(dst1[1], dst2[1]))
  800. r2 = int(max(dst3[1], dst4[1]))
  801. c1 = int(min(dst1[0], dst3[0]))
  802. c2 = int(max(dst2[0], dst4[0]))
  803. try:
  804. ratio = min(1.0 * h / (r2 - r1), 1.0 * w / (c2 - c1))
  805. dx = -c1
  806. dy = -r1
  807. T1 = np.float32([[1.0, 0, dx], [0, 1.0, dy], [0, 0, 1.0 / ratio]])
  808. ret = T1.dot(warpR)
  809. except:
  810. ratio = 1.0
  811. T1 = np.float32([[1.0, 0, 0], [0, 1.0, 0], [0, 0, 1.0]])
  812. ret = T1
  813. return ret, (-r1, -c1), ratio, dst
  814. def get_warpAffine(config):
  815. """
  816. get_warpAffine
  817. """
  818. anglez = config.anglez
  819. rz = np.array(
  820. [
  821. [np.cos(rad(anglez)), np.sin(rad(anglez)), 0],
  822. [-np.sin(rad(anglez)), np.cos(rad(anglez)), 0],
  823. ],
  824. np.float32,
  825. )
  826. return rz