| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524 |
- from typing import List
- import numpy as np
- from shapely.geometry import Polygon
- import math
- import copy
- from utils.imgproc_utils import union_area, xywh2xyxypoly, rotate_polygons
- import cv2
# Language labels, indexed by the class id used to look them up.
LANG_LIST = ['eng', 'ja', 'unknown']
# Reverse lookup: language label -> its position in LANG_LIST.
LANGCLS2IDX = {lang: idx for idx, lang in enumerate(LANG_LIST)}
class TextBlock(object):
    """A block of text found in an image: a bounding box plus its text lines.

    Geometry (``xyxy``, ``lines``, ``angle``, ``vec``, ``norm``, ``distance``),
    recognised content (``text``, ``translation``) and rendering hints
    (colours, font flags, alignment, line spacing) are stored as plain instance
    attributes, which is what ``to_dict`` exports.
    """

    def __init__(self, xyxy: List,
                 lines: List = None,
                 language: str = 'unknown',
                 vertical: bool = False,
                 font_size: float = -1,
                 distance: List = None,
                 angle: int = 0,
                 vec: List = None,
                 norm: float = -1,
                 merged: bool = False,
                 weight: float = -1,
                 text: List = None,
                 translation: str = "",
                 fg_r = 0,
                 fg_g = 0,
                 fg_b = 0,
                 bg_r = 0,
                 bg_g = 0,
                 bg_b = 0,
                 line_spacing = 1.,
                 font_family: str = "",
                 bold: bool = False,
                 underline: bool = False,
                 italic: bool = False,
                 alignment: int = -1,
                 alpha: float = 255,
                 rich_text: str = "",
                 _bounding_rect: List = None,
                 accumulate_color = True,
                 default_stroke_width = 0.2,
                 target_lang: str = "",
                 **kwargs) -> None:
        """Store the given values; unknown keyword arguments are accepted and ignored.

        ``xyxy`` entries are truncated to ``int``; ``distance`` and ``vec`` are
        converted to float64 arrays when given; ``lines`` and ``text`` default
        to fresh empty lists.
        """
        self.xyxy = [int(num) for num in xyxy]              # bounding box of the text block
        self.lines = [] if lines is None else lines         # polygons of the text lines
        self.vertical = vertical                            # orientation of the text lines
        self.language = language
        self.font_size = font_size                          # font pixel size
        # distance between the text lines and the "origin"
        self.distance = None if distance is None else np.array(distance, np.float64)
        self.angle = angle                                  # rotation angle of the text lines
        # primary vector of the text block
        self.vec = None if vec is None else np.array(vec, np.float64)
        self.norm = norm                                    # norm of the primary vector
        self.merged = merged
        self.weight = weight
        self.text = text if text is not None else []
        self.prob = 1
        self.translation = translation

        # Note: these are the *accumulated* rgb values of the text lines
        # (see set_font_colors / get_font_colors).
        self.fg_r = fg_r
        self.fg_g = fg_g
        self.fg_b = fg_b
        self.bg_r = bg_r
        self.bg_g = bg_g
        self.bg_b = bg_b

        self.font_family: str = font_family
        self.bold: bool = bold
        self.underline: bool = underline
        self.italic: bool = italic
        self.alpha = alpha
        self.rich_text = rich_text
        self.line_spacing = line_spacing
        # Explicit values live in private attributes; the public names
        # ``alignment`` and ``target_lang`` are methods.
        self._alignment = alignment
        self._target_lang = target_lang
        self._bounding_rect = _bounding_rect
        self.default_stroke_width = default_stroke_width
        self.accumulate_color = accumulate_color

    def adjust_bbox(self, with_bbox=False):
        """Refit ``xyxy`` (in place) to the extent of the text lines.

        With ``with_bbox=True`` the result is the union of the current box and
        the lines' extent; otherwise the box is replaced by the lines' extent.
        A block without lines is left untouched.
        """
        lines = self.lines_array().astype(np.int32)
        if lines.size == 0:
            # Nothing to fit to; min()/max() of an empty array would raise.
            return
        # Plain ints keep xyxy free of NumPy scalars, as __init__ establishes.
        min_x, min_y = int(lines[..., 0].min()), int(lines[..., 1].min())
        max_x, max_y = int(lines[..., 0].max()), int(lines[..., 1].max())
        if with_bbox:
            self.xyxy[0] = min(min_x, int(self.xyxy[0]))
            self.xyxy[1] = min(min_y, int(self.xyxy[1]))
            self.xyxy[2] = max(max_x, int(self.xyxy[2]))
            self.xyxy[3] = max(max_y, int(self.xyxy[3]))
        else:
            self.xyxy[0] = min_x
            self.xyxy[1] = min_y
            self.xyxy[2] = max_x
            self.xyxy[3] = max_y

    def sort_lines(self):
        """Reorder ``lines`` and ``distance`` by ascending ``distance`` (no-op if unset)."""
        if self.distance is not None:
            idx = np.argsort(self.distance)
            self.distance = self.distance[idx]
            lines = np.array(self.lines, dtype=np.int32)
            self.lines = lines[idx].tolist()

    def lines_array(self, dtype=np.float64):
        """Return ``lines`` as a NumPy array of the requested dtype."""
        return np.array(self.lines, dtype=dtype)

    def aspect_ratio(self) -> float:
        """Return vertical extent / horizontal extent of the minimum rectangle."""
        min_rect = self.min_rect()
        # Midpoints of the four edges of the rectangle.
        middle_pnts = (min_rect[:, [1, 2, 3, 0]] + min_rect) / 2
        norm_v = np.linalg.norm(middle_pnts[:, 2] - middle_pnts[:, 0])
        norm_h = np.linalg.norm(middle_pnts[:, 1] - middle_pnts[:, 3])
        return norm_v / norm_h

    def center(self):
        """Return the centre point of ``xyxy`` as a length-2 array."""
        xyxy = np.array(self.xyxy)
        return (xyxy[:2] + xyxy[2:]) / 2

    def min_rect(self, rotate_back=True):
        """Return the rectangle enclosing all lines as an int64 array of shape (1, 4, 2).

        For a block with a non-zero ``angle`` the line polygons are first passed
        through ``rotate_polygons`` around the block centre, the axis-aligned
        extent is taken, and (if ``rotate_back``) the rectangle is passed through
        ``rotate_polygons`` again with the negated angle.
        """
        angled = self.angle != 0
        center = self.center()
        polygons = self.lines_array().reshape(-1, 8)
        if angled:
            polygons = rotate_polygons(center, polygons, self.angle)
        min_x = polygons[:, ::2].min()
        min_y = polygons[:, 1::2].min()
        max_x = polygons[:, ::2].max()
        max_y = polygons[:, 1::2].max()
        min_bbox = np.array([[min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y]])
        if angled and rotate_back:
            min_bbox = rotate_polygons(center, min_bbox, -self.angle)
        return min_bbox.reshape(-1, 4, 2).astype(np.int64)

    # equivalent to qt's boundingRect, ignore angle
    def bounding_rect(self):
        """Return ``[x, y, w, h]``: the stored rect if one was given, else one derived from the lines."""
        if self._bounding_rect is None:
            min_bbox = self.min_rect(rotate_back=False)[0]
            x, y = min_bbox[0]
            w, h = min_bbox[2] - min_bbox[0]
            return [x, y, w, h]
        return self._bounding_rect

    def __getattribute__(self, name: str):
        # ``blk.pts`` is a computed alias for ``blk.lines_array()``.
        if name == 'pts':
            return self.lines_array()
        return object.__getattribute__(self, name)

    def __len__(self):
        """Number of text lines."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Return text line ``idx``."""
        return self.lines[idx]

    def to_dict(self):
        """Return a deep copy of the instance attribute dictionary."""
        blk_dict = copy.deepcopy(vars(self))
        return blk_dict

    def get_transformed_region(self, img, idx, textheight) -> np.ndarray :
        """Warp text line ``idx`` of ``img`` into a rectangular strip.

        Horizontal lines are warped to height ``textheight``; vertical lines are
        warped to width ``textheight`` and then rotated 90 degrees
        counter-clockwise. The strip keeps the aspect ratio of the source
        quadrilateral.
        """
        im_h, im_w = img.shape[:2]
        src_pts = np.array(self.lines[idx], dtype=np.float64)

        if self.language == 'eng' or (self.language == 'unknown' and not self.vertical):
            # Grow the quadrilateral by a third of the font size on every
            # side, then keep it inside the image.
            e_size = self.font_size / 3
            src_pts[..., 0] += np.array([-e_size, e_size, e_size, -e_size])
            src_pts[..., 1] += np.array([-e_size, -e_size, e_size, e_size])
            src_pts[..., 0] = np.clip(src_pts[..., 0], 0, im_w)
            src_pts[..., 1] = np.clip(src_pts[..., 1], 0, im_h)

        middle_pnt = (src_pts[[1, 2, 3, 0]] + src_pts) / 2
        vec_v = middle_pnt[2] - middle_pnt[0]   # vertical vector of the text line
        vec_h = middle_pnt[1] - middle_pnt[3]   # horizontal vector of the text line
        ratio = np.linalg.norm(vec_v) / np.linalg.norm(vec_h)

        if self.vertical:
            w = int(textheight)
            h = int(round(textheight * ratio))
        else:
            h = int(textheight)
            w = int(round(textheight / ratio))
        dst_pts = np.array([[0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1]]).astype(np.float32)
        M, _ = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
        region = cv2.warpPerspective(img, M, (w, h))
        if self.vertical:
            region = cv2.rotate(region, cv2.ROTATE_90_COUNTERCLOCKWISE)
        return region

    def get_text(self):
        """Return ``text`` as one string (list entries joined by spaces, then stripped)."""
        if isinstance(self.text, str):
            return self.text
        return ' '.join(self.text).strip()

    def set_font_colors(self, frgb, srgb, accumulate=True):
        """Store font colour ``frgb`` and stroke colour ``srgb``.

        With ``accumulate=True`` and at least one line, each channel is stored
        multiplied by the line count, matching the division done in
        ``get_font_colors``.
        """
        self.accumulate_color = accumulate
        num_lines = len(self.lines) if accumulate and len(self.lines) > 0 else 1
        # set font color
        frgb = np.array(frgb) * num_lines
        self.fg_r, self.fg_g, self.fg_b = frgb
        # set stroke color
        srgb = np.array(srgb) * num_lines
        self.bg_r, self.bg_g, self.bg_b = srgb

    def get_font_colors(self, bgr=False):
        """Return ``(font_colour, stroke_colour)``.

        In accumulate mode the stored sums are divided by the line count (and
        returned as int32 arrays, reversed when ``bgr`` is true); with no lines
        both colours are ``[0, 0, 0]``.
        """
        num_lines = len(self.lines)
        frgb = np.array([self.fg_r, self.fg_g, self.fg_b])
        brgb = np.array([self.bg_r, self.bg_g, self.bg_b])
        if not self.accumulate_color:
            # NOTE(review): ``bgr`` is not honoured on this path; kept as-is
            # because existing callers may depend on it - confirm.
            return frgb, brgb
        if num_lines <= 0:
            return [0, 0, 0], [0, 0, 0]
        frgb = (frgb / num_lines).astype(np.int32)
        brgb = (brgb / num_lines).astype(np.int32)
        if bgr:
            return frgb[::-1], brgb[::-1]
        return frgb, brgb

    def xywh(self):
        """Return the bounding box as ``[x, y, width, height]``."""
        x1, y1, x2, y2 = self.xyxy
        return [x1, y1, x2 - x1, y2 - y1]

    # alignleft: 0, center: 1, right: 2
    def alignment(self):
        """Return the explicit alignment if set (>= 0), otherwise a guess of 0 or 1.

        Vertical blocks and single-line blocks report 0. Otherwise the spread
        of the lines' left edges is compared with the spread of their
        horizontal centres: the smaller spread wins (left -> 0, centre -> 1).
        """
        if self._alignment >= 0:
            return self._alignment
        elif self.vertical:
            return 0
        lines = self.lines_array()
        if len(lines) == 1:
            return 0
        angled = self.angle != 0
        polygons = lines.reshape(-1, 8)
        if angled:
            polygons = rotate_polygons((0, 0), polygons, self.angle)
        polygons = polygons.reshape(-1, 4, 2)

        left_std = np.std(polygons[:, 0, 0])
        center_std = np.std((polygons[:, 0, 0] + polygons[:, 1, 0]) / 2)
        if left_std < center_std:
            return 0
        else:
            return 1

    def target_lang(self):
        """Return the target language passed to the constructor."""
        # The value lives in ``_target_lang``; ``self.target_lang`` is this
        # method itself, so returning it would hand back a bound method.
        return self._target_lang

    @property
    def stroke_width(self):
        """``default_stroke_width`` when font and stroke colours differ enough, else 0.

        "Enough" means the summed absolute per-channel difference of the stored
        colour values exceeds 40.
        """
        var = np.array([self.fg_r, self.fg_g, self.fg_b]) \
            - np.array([self.bg_r, self.bg_g, self.bg_b])
        var = np.abs(var).sum()
        if var > 40:
            return self.default_stroke_width
        return 0
def sort_textblk_list(blk_list: List[TextBlock], im_w: int, im_h: int) -> List[TextBlock]:
    """Sort text blocks in place by a coarse grid position and return the list.

    Each block centre is assigned to a cell of a 3-column x 4-row grid; cells
    are ordered row by row, and inside a cell blocks are ordered by their
    offset from the cell origin (x offset weighted 1.2). When more than half
    of the blocks are Japanese the x axis is mirrored. An image wider than
    tall has its width halved for the grid, and blocks whose column index then
    falls outside the grid are pushed after all in-grid blocks. The computed
    key is stored on each block as ``weight``.
    """
    if len(blk_list) == 0:
        return blk_list

    boxes = np.array([blk.xyxy for blk in blk_list])
    ja_count = sum(1 for blk in blk_list if blk.language == 'ja')
    mirror_x = ja_count > len(blk_list) / 2

    full_w = im_w
    if im_w > im_h:
        im_w /= 2
    width_halved = im_w != full_w
    rows, cols = 4, 3
    page_area = im_h * im_w

    cx = (boxes[:, 0] + boxes[:, 2]) / 2
    if mirror_x:
        # When the width was not halved im_w equals full_w, so a single
        # expression covers both cases.
        cx = full_w - cx
    col = (cx / im_w * cols).astype(np.int32)

    cy = (boxes[:, 1] + boxes[:, 3]) / 2
    row = (cy / im_h * rows).astype(np.int32)

    cell = row * cols + col
    weights = cell * page_area + 1.2 * (cx - col * im_w / cols) + (cy - row * im_h / rows)
    if width_halved:
        weights[col >= cols] += page_area * rows * cols

    for blk, weight in zip(blk_list, weights):
        blk.weight = weight
    blk_list.sort(key=lambda blk: blk.weight)
    return blk_list
def examine_textblk(blk: TextBlock, im_w: int, im_h: int, sort: bool = False) -> None:
    """Derive orientation, angle, font size and per-line distances for ``blk``.

    Updates ``blk`` in place: ``lines`` (cast to int32 lists), ``distance``,
    ``angle``, ``font_size``, ``vertical``, ``vec`` and ``norm``.

    Args:
        blk: block whose ``lines`` are 4-point polygons.
        im_w: image width; used as the x coordinate of the origin for
            vertical text.
        im_h: image height (currently unused).
        sort: when true, call ``blk.sort_lines()`` afterwards.
    """
    lines = blk.lines_array()
    # Midpoints of the four edges of every line polygon.
    middle_pnts = (lines[:, [1, 2, 3, 0]] + lines) / 2
    vec_v = middle_pnts[:, 2] - middle_pnts[:, 0]   # vertical vectors of textlines
    vec_h = middle_pnts[:, 1] - middle_pnts[:, 3]   # horizontal vectors of textlines
    center_pnts = (lines[:, 0] + lines[:, 2]) / 2

    # If the sum of vertical vectors is longer, the text orientation is
    # vertical, and vice versa (non-Japanese text needs a 2x margin).
    v = np.sum(vec_v, axis=0)
    h = np.sum(vec_h, axis=0)
    norm_v, norm_h = np.linalg.norm(v), np.linalg.norm(h)
    if blk.language == 'ja':
        vertical = norm_v > norm_h
    else:
        vertical = norm_v > norm_h * 2

    if vertical:
        primary_vec, primary_norm = v, norm_v
        # vertical manga text is read from right to left, so origin is (im_w, 0)
        origin = np.array([[im_w, 0]], dtype=np.float64)
        font_size = int(round(norm_h / len(lines)))
    else:
        primary_vec, primary_norm = h, norm_h
        origin = np.array([[0, 0]], dtype=np.float64)
        font_size = int(round(norm_v / len(lines)))
    distance_vectors = center_pnts - origin

    # rotation angle of textlines
    rotation_angle = int(math.atan2(primary_vec[1], primary_vec[0]) / math.pi * 180)

    # Perpendicular distance from the origin to the line through each centre
    # along the primary direction: |d x p| / |p|. This equals |d| * |sin(theta)|
    # but, unlike going through arccos, it cannot produce NaN when a centre
    # coincides with the origin or when rounding pushes the cosine past +-1.
    cross = distance_vectors[:, 0] * primary_vec[1] - distance_vectors[:, 1] * primary_vec[0]
    distance = np.abs(cross) / primary_norm

    blk.lines = lines.astype(np.int32).tolist()
    blk.distance = distance
    blk.angle = rotation_angle
    if vertical:
        blk.angle -= 90
    if abs(blk.angle) < 3:
        # Treat near-zero rotations as unrotated.
        blk.angle = 0
    blk.font_size = font_size
    blk.vertical = vertical
    blk.vec = primary_vec
    blk.norm = primary_norm
    if sort:
        blk.sort_lines()
def try_merge_textline(blk: TextBlock, blk2: TextBlock, fntsize_tol=1.3, distance_tol=2) -> bool:
    """Try to append the line of ``blk2`` to ``blk``; return whether it was merged.

    ``blk2`` is skipped if it is already marked as merged. When the last lines
    of the two blocks do not intersect, the merge is rejected if the font
    sizes differ by more than ``fntsize_tol``, if the primary vectors are more
    than about 30 degrees apart, or if the lines are too far apart relative to
    the averaged font size. On success ``blk`` absorbs the line and updates
    its vector, angle, norm, distance and font size, and ``blk2.merged`` is set.
    """
    if blk2.merged:
        return False

    size_ratio = blk.font_size / blk2.font_size
    count_a, count_b = len(blk), len(blk2)
    # Line-count weighted mean of the two font sizes.
    mean_size = (blk.font_size * count_a + blk2.font_size * count_b) / (count_a + count_b)
    dot = blk.vec @ blk2.vec
    combined_vec = blk.vec + blk2.vec
    cos_between = dot / blk.norm / blk2.norm
    gap = blk2.distance[-1] - blk.distance[-1]
    first_corner_gap = np.linalg.norm(np.array(blk2.lines[-1][0]) - np.array(blk.lines[-1][0]))

    poly_a, poly_b = Polygon(blk.lines[-1]), Polygon(blk2.lines[-1])
    if not poly_a.intersects(poly_b):
        # Non-overlapping lines must agree in size, direction and position.
        if size_ratio > fntsize_tol or 1 / size_ratio > fntsize_tol:
            return False
        if abs(cos_between) < 0.866:  # cos30
            return False
        if gap > distance_tol * mean_size or first_corner_gap > mean_size * 2.5:
            return False

    # merge
    blk.lines.append(blk2.lines[0])
    blk.vec = combined_vec
    blk.angle = int(round(np.rad2deg(math.atan2(combined_vec[1], combined_vec[0]))))
    if blk.vertical:
        blk.angle -= 90
    blk.norm = np.linalg.norm(combined_vec)
    blk.distance = np.append(blk.distance, blk2.distance[-1])
    blk.font_size = mean_size
    blk2.merged = True
    return True
def merge_textlines(blk_list: List[TextBlock]) -> List[TextBlock]:
    """Greedily merge single-line blocks and return the surviving blocks.

    A list with fewer than two blocks is returned unchanged. Otherwise the
    list is sorted in place by each block's first distance; every block not
    yet absorbed tries to absorb each later block in turn, and the bounding
    boxes of the survivors are refitted to their lines.
    """
    if len(blk_list) < 2:
        return blk_list

    blk_list.sort(key=lambda blk: blk.distance[0])
    merged_list = []
    for pos, seed in enumerate(blk_list):
        if seed.merged:
            # Already absorbed into an earlier block.
            continue
        for candidate in blk_list[pos + 1:]:
            try_merge_textline(seed, candidate)
        merged_list.append(seed)

    for survivor in merged_list:
        survivor.adjust_bbox(with_bbox=False)
    return merged_list
def split_textblk(blk: TextBlock):
    """Split ``blk`` into sub-blocks wherever consecutive lines are too far apart.

    ``blk.lines`` is first sorted in place by the distance of each line's
    first corner from the first corner of the original first line. Walking the
    sorted lines, a new sub-block starts when two consecutive lines do not
    intersect and either their ``distance`` values differ by more than twice
    the font size, or (vertical blocks with |angle| < 15 only) their first
    corners differ in y by more than the font size.

    Returns:
        ``(was_split, sub_blocks)``. When a split happened, the bounding box
        of every sub-block is refitted to its lines.
    """
    font_size, distance, lines = blk.font_size, blk.distance, blk.lines
    first_line = np.array(blk.lines[0])
    lines.sort(key=lambda poly: np.linalg.norm(np.array(poly[0]) - first_line[0]))
    gap_limit = font_size * 2

    piece = copy.deepcopy(blk)
    piece.lines = [first_line]
    pieces = [piece]

    for jj, (prev_line, line) in enumerate(zip(lines, lines[1:])):
        prev_poly, cur_poly = Polygon(prev_line), Polygon(line)
        start_new = False
        if not prev_poly.intersects(cur_poly):
            gap = abs(distance[jj + 1] - distance[jj])
            if gap > gap_limit:
                start_new = True
            elif blk.vertical and abs(blk.angle) < 15:
                if len(piece.lines) > 1 or gap > font_size:
                    start_new = abs(prev_line[0][1] - line[0][1]) > font_size

        if start_new:
            # The new sub-block inherits every attribute of the current one.
            piece = copy.deepcopy(piece)
            piece.lines = [line]
            pieces.append(piece)
        else:
            piece.lines.append(line)

    was_split = len(pieces) > 1
    if was_split:
        for piece in pieces:
            piece.adjust_bbox(with_bbox=False)
    return was_split, pieces
def group_output(blks, lines, im_w, im_h, mask=None, sort_blklist=True) -> List[TextBlock]:
    """Group detected text lines into text blocks.

    Args:
        blks: three parallel sequences ``(bboxes, class_ids, confidences)``;
            each class id indexes ``LANG_LIST``.
        lines: iterable of line polygons, indexed as ``line[:, 0]`` /
            ``line[:, 1]`` for x / y.
        im_w, im_h: image size, used for clipping and block ordering.
        mask: optional text mask; regions whose mean / 255 is below 0.1 are
            treated as containing no text.
        sort_blklist: when true, the result is ordered by ``sort_textblk_list``.

    Returns:
        The final list of text blocks.
    """
    bbox_score_thresh = 0.4
    mask_score_thresh = 0.1

    def lacks_text_mask(x1, y1, x2, y2):
        # True when a mask is available and covers too little of the region.
        if mask is None:
            return False
        return mask[y1: y2, x1: x2].mean() / 255 < mask_score_thresh

    # cls could give wrong result
    blk_list: List[TextBlock] = [
        TextBlock(bbox, language=LANG_LIST[cls]) for bbox, cls, conf in zip(*blks)
    ]
    scattered_hor, scattered_ver = [], []

    # step1: filter & assign lines to textblocks
    for line in lines:
        bx1, bx2 = line[:, 0].min(), line[:, 0].max()
        by1, by2 = line[:, 1].min(), line[:, 1].max()
        line_area = (by2 - by1) * (bx2 - bx1)

        # Pick the first block with the strictly highest overlap score.
        best_score, best_idx = -1, -1
        for jj, candidate in enumerate(blk_list):
            score = union_area(candidate.xyxy, [bx1, by1, bx2, by2]) / line_area
            if best_score < score:
                best_score, best_idx = score, jj

        if best_score > bbox_score_thresh:
            blk_list[best_idx].lines.append(line)
            continue

        # if no textblock was assigned, check whether there is "enough" textmask
        if lacks_text_mask(bx1, by1, bx2, by2):
            continue
        lone = TextBlock([bx1, by1, bx2, by2], [line])
        examine_textblk(lone, im_w, im_h, sort=False)
        if lone.vertical:
            scattered_ver.append(lone)
        else:
            scattered_hor.append(lone)

    # step2: filter textblocks, sort & split textlines
    final_blk_list = []
    for blk in blk_list:
        if len(blk.lines) == 0:
            # No line was assigned: drop the block if the mask is (almost)
            # empty there, otherwise use its own box as its single line.
            bx1, by1, bx2, by2 = blk.xyxy
            if lacks_text_mask(bx1, by1, bx2, by2):
                continue
            xywh = np.array([[bx1, by1, bx2 - bx1, by2 - by1]])
            blk.lines = xywh2xyxypoly(xywh).reshape(-1, 4, 2).tolist()
        examine_textblk(blk, im_w, im_h, sort=True)

        # split manga text if there is a distance gap
        wants_split = len(blk.lines) > 1 and (blk.language == 'ja' or blk.vertical)
        if wants_split:
            textblock_splitted, sub_blk_list = split_textblk(blk)
        else:
            textblock_splitted, sub_blk_list = False, [blk]

        # modify textblock to fit its textlines
        if not textblock_splitted:
            for sub_blk in sub_blk_list:
                sub_blk.adjust_bbox(with_bbox=True)
        final_blk_list += sub_blk_list

    # step3: merge scattered lines, sort textblocks by "grid"
    final_blk_list += merge_textlines(scattered_hor)
    final_blk_list += merge_textlines(scattered_ver)
    if sort_blklist:
        final_blk_list = sort_textblk_list(final_blk_list, im_w, im_h)

    # Enlarge the lines of horizontal English blocks a little.
    for blk in final_blk_list:
        if blk.language != 'eng' or blk.vertical:
            continue
        if len(blk.lines) == 0:
            continue
        expand_size = max(int(blk.font_size * 0.1), 2)
        rad = np.deg2rad(blk.angle)
        corner_signs = np.array([[[-1, -1], [1, -1], [1, 1], [-1, 1]]])
        shifted_vec = corner_signs * np.array([[[np.sin(rad), np.cos(rad)]]]) * expand_size
        grown = blk.lines_array() + shifted_vec
        grown[..., 0] = np.clip(grown[..., 0], 0, im_w - 1)
        grown[..., 1] = np.clip(grown[..., 1], 0, im_h - 1)
        blk.lines = grown.astype(np.int64).tolist()
        blk.font_size += expand_size

    return final_blk_list
def visualize_textblocks(canvas, blk_list: List[TextBlock]):
    """Draw every block's box, line polygons, minimum rectangle and labels on ``canvas``.

    For each block this draws its bounding box, each line polygon with the
    line index at its first corner, the minimum rectangle, the block angle at
    the box centre and the block index at the box's top-left corner.

    Returns:
        ``canvas`` (drawn on in place).
    """
    lw = max(round(sum(canvas.shape) / 2 * 0.003), 2)  # line width
    for ii, blk in enumerate(blk_list):
        # Plain ints: xyxy may hold NumPy scalars, which some OpenCV builds
        # refuse as point coordinates.
        bx1, by1, bx2, by2 = (int(v) for v in blk.xyxy)
        cv2.rectangle(canvas, (bx1, by1), (bx2, by2), (127, 255, 127), lw)
        lines = blk.lines_array(dtype=np.int32)
        for jj, line in enumerate(lines):
            first_corner = (int(line[0][0]), int(line[0][1]))
            cv2.putText(canvas, str(jj), first_corner, cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255,127,0), 1)
            cv2.polylines(canvas, [line], True, (0,127,255), 2)
        # min_rect() yields int64; polylines needs 32-bit integer points.
        cv2.polylines(canvas, [blk.min_rect().astype(np.int32)], True, (127,127,0), 2)
        center = (int((bx1 + bx2)/2), int((by1 + by2)/2))
        cv2.putText(canvas, str(blk.angle), center, cv2.FONT_HERSHEY_SIMPLEX, 1, (127,127,255), 2)
        cv2.putText(canvas, str(ii), (bx1, by1 + lw + 2), 0, lw / 3, (255,127,127), max(lw-1, 1), cv2.LINE_AA)
    return canvas
|