latexocr_dataset.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """
  15. This code is adapted from:
  16. https://github.com/lukas-blecher/LaTeX-OCR/blob/main/pix2tex/dataset/dataset.py
  17. """
  18. import numpy as np
  19. import cv2
  20. import math
  21. import os
  22. import json
  23. import pickle
  24. import random
  25. import traceback
  26. import paddle
  27. from paddle.io import Dataset
  28. from .imaug.label_ops import LatexOCRLabelEncode
  29. from .imaug import transform, create_operators
  30. class LaTeXOCRDataSet(Dataset):
  31. def __init__(self, config, mode, logger, seed=None):
  32. super(LaTeXOCRDataSet, self).__init__()
  33. self.logger = logger
  34. self.mode = mode.lower()
  35. global_config = config["Global"]
  36. dataset_config = config[mode]["dataset"]
  37. loader_config = config[mode]["loader"]
  38. pkl_path = dataset_config.pop("data")
  39. self.data_dir = dataset_config["data_dir"]
  40. self.min_dimensions = dataset_config.pop("min_dimensions")
  41. self.max_dimensions = dataset_config.pop("max_dimensions")
  42. self.batchsize = dataset_config.pop("batch_size_per_pair")
  43. self.keep_smaller_batches = dataset_config.pop("keep_smaller_batches")
  44. self.max_seq_len = global_config.pop("max_seq_len")
  45. self.rec_char_dict_path = global_config.pop("rec_char_dict_path")
  46. self.tokenizer = LatexOCRLabelEncode(self.rec_char_dict_path)
  47. file = open(pkl_path, "rb")
  48. data = pickle.load(file)
  49. temp = {}
  50. for k in data:
  51. if (
  52. self.min_dimensions[0] <= k[0] <= self.max_dimensions[0]
  53. and self.min_dimensions[1] <= k[1] <= self.max_dimensions[1]
  54. ):
  55. temp[k] = data[k]
  56. self.data = temp
  57. self.do_shuffle = loader_config["shuffle"]
  58. self.seed = seed
  59. if self.mode == "train" and self.do_shuffle:
  60. random.seed(self.seed)
  61. self.pairs = []
  62. for k in self.data:
  63. info = np.array(self.data[k], dtype=object)
  64. p = (
  65. paddle.randperm(len(info))
  66. if self.mode == "train" and self.do_shuffle
  67. else paddle.arange(len(info))
  68. )
  69. for i in range(0, len(info), self.batchsize):
  70. batch = info[p[i : i + self.batchsize]]
  71. if len(batch.shape) == 1:
  72. batch = batch[None, :]
  73. if len(batch) < self.batchsize and not self.keep_smaller_batches:
  74. continue
  75. self.pairs.append(batch)
  76. if self.do_shuffle:
  77. self.pairs = np.random.permutation(np.array(self.pairs, dtype=object))
  78. else:
  79. self.pairs = np.array(self.pairs, dtype=object)
  80. self.size = len(self.pairs)
  81. self.set_epoch_as_seed(self.seed, dataset_config)
  82. self.ops = create_operators(dataset_config["transforms"], global_config)
  83. self.ext_op_transform_idx = dataset_config.get("ext_op_transform_idx", 2)
  84. self.need_reset = True
  85. def set_epoch_as_seed(self, seed, dataset_config):
  86. if self.mode == "train":
  87. try:
  88. border_map_id = [
  89. index
  90. for index, dictionary in enumerate(dataset_config["transforms"])
  91. if "MakeBorderMap" in dictionary
  92. ][0]
  93. shrink_map_id = [
  94. index
  95. for index, dictionary in enumerate(dataset_config["transforms"])
  96. if "MakeShrinkMap" in dictionary
  97. ][0]
  98. dataset_config["transforms"][border_map_id]["MakeBorderMap"][
  99. "epoch"
  100. ] = (seed if seed is not None else 0)
  101. dataset_config["transforms"][shrink_map_id]["MakeShrinkMap"][
  102. "epoch"
  103. ] = (seed if seed is not None else 0)
  104. except Exception as E:
  105. print(E)
  106. return
  107. def shuffle_data_random(self):
  108. random.seed(self.seed)
  109. random.shuffle(self.data_lines)
  110. return
  111. def __getitem__(self, idx):
  112. batch = self.pairs[idx]
  113. eqs, ims = batch.T
  114. try:
  115. max_width, max_height, max_length = 0, 0, 0
  116. images_transform = []
  117. for file_name in ims:
  118. img_path = os.path.join(self.data_dir, file_name)
  119. data = {
  120. "img_path": img_path,
  121. }
  122. with open(data["img_path"], "rb") as f:
  123. img = f.read()
  124. data["image"] = img
  125. item = transform(data, self.ops)
  126. images_transform.append(np.array(item[0]))
  127. image_concat = np.concatenate(images_transform, axis=0)[:, np.newaxis, :, :]
  128. images_transform = image_concat.astype(np.float32)
  129. labels, attention_mask, max_length = self.tokenizer(list(eqs))
  130. if self.max_seq_len < max_length:
  131. rnd_idx = (
  132. np.random.randint(self.__len__())
  133. if self.mode == "train"
  134. else (idx + 1) % self.__len__()
  135. )
  136. return self.__getitem__(rnd_idx)
  137. return (images_transform, labels, attention_mask)
  138. except:
  139. self.logger.error(
  140. "When parsing line {}, error happened with msg: {}".format(
  141. data["img_path"], traceback.format_exc()
  142. )
  143. )
  144. outs = None
  145. if outs is None:
  146. # during evaluation, we should fix the idx to get same results for many times of evaluation.
  147. rnd_idx = (
  148. np.random.randint(self.__len__())
  149. if self.mode == "train"
  150. else (idx + 1) % self.__len__()
  151. )
  152. return self.__getitem__(rnd_idx)
  153. return outs
  154. def __len__(self):
  155. return self.size