# yichael / AutoAndroidController
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any, Dict

import torch
import unicodedata2
from torchvision import transforms
from torchvision.transforms import InterpolationMode
from torchvision.transforms import functional as F
from zhconv import convert

from modelscope.utils.constant import ModeKeys
from .base import OfaBasePreprocessor


def ocr_resize(img, patch_image_size, is_document=False):
    r"""
    Resize and pad an image to a square patch for OCR tasks.

    The image is converted to RGB, resized with bicubic interpolation and
    then padded (replicating edge pixels) so that the result measures
    ``patch_image_size`` x ``patch_image_size``.

    - Regular mode: the longer side is scaled to ``patch_image_size`` and the
      shorter side proportionally, with every side clamped to at least 64
      pixels. The padding is split evenly along the shorter axis.
    - Document mode: the image is resized to height 64 and width 1920, cut
      into four chunks along the last tensor dimension and concatenated along
      the second-to-last one, then padded at the top and bottom only.

    Args:
        img: Input image; it must provide ``convert('RGB')`` and ``size``
            (presumably a ``PIL.Image.Image`` - confirm against callers).
        patch_image_size (int): Side length of the square output.
        is_document (bool): Whether to use the document layout described
            above instead of aspect-preserving resizing.

    Returns:
        The resized and padded image.

    Raises:
        AssertionError: If the padded image is not exactly
            ``(patch_image_size, patch_image_size)``.
    """
    rgb = img.convert('RGB')
    src_w, src_h = rgb.size

    # Padding defaults to zero on every side; each branch only fills in the
    # axis that actually needs it.
    pad_left = pad_top = pad_right = pad_bottom = 0

    if is_document:
        resize_hw = (64, 1920)
    elif src_w >= src_h:
        # Landscape (or square): width fills the patch, pad above and below.
        out_w = max(64, patch_image_size)
        out_h = max(64, int(patch_image_size * (src_h / src_w)))
        resize_hw = (out_h, out_w)
        gap = patch_image_size - out_h
        pad_top = gap // 2
        pad_bottom = gap - pad_top
    else:
        # Portrait: height fills the patch, pad left and right.
        out_h = max(64, patch_image_size)
        out_w = max(64, int(patch_image_size * (src_w / src_h)))
        resize_hw = (out_h, out_w)
        gap = patch_image_size - out_w
        pad_left = gap // 2
        pad_right = gap - pad_left

    canvas = F.resize(
        rgb, resize_hw, interpolation=InterpolationMode.BICUBIC)

    if is_document:
        # Cut the long strip into four pieces and stack them before padding.
        strips = transforms.ToTensor()(canvas).chunk(4, dim=-1)
        canvas = transforms.ToPILImage()(torch.cat(strips, dim=-2))
        _, stacked_h = canvas.size
        gap = patch_image_size - stacked_h
        pad_top = gap // 2
        pad_bottom = gap - pad_top

    canvas = F.pad(
        canvas,
        padding=[pad_left, pad_top, pad_right, pad_bottom],
        padding_mode='edge')
    assert canvas.size == (patch_image_size, patch_image_size)

    return canvas


class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor):
    r"""
    OFA preprocessor for OCR recognition tasks.

    Turns a raw data dict into a model-ready sample: the image is resized and
    padded by `ocr_resize`, converted to a normalized tensor, and paired with
    a tokenized prompt. In training mode the label is additionally tokenized
    into `target` and `prev_output_tokens`.
    """

    def __init__(self,
                 cfg,
                 model_dir,
                 mode=ModeKeys.INFERENCE,
                 *args,
                 **kwargs):
        """Set up the base preprocessor and the image transform pipeline.

        Args:
            cfg(modelscope.utils.config.ConfigDict) : model config
            model_dir (str): model path,
            mode: preprocessor mode (model mode)
        """
        super().__init__(cfg, model_dir, mode, *args, **kwargs)

        def _resize_to_patch(image):
            # Patch size and the `is_document` flag are looked up on every
            # call rather than captured once at construction time.
            return ocr_resize(
                image,
                self.patch_image_size,
                is_document=self.cfg.model.get('is_document', False))

        self.patch_resize_transform = transforms.Compose([
            _resize_to_patch,
            transforms.ToTensor(),
            transforms.Normalize(mean=self.mean, std=self.std),
        ])

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        r"""
        Build a training sample in `ModeKeys.TRAIN` mode, otherwise an
        inference sample.
        """
        build_sample = (
            self._build_train_sample
            if self.mode == ModeKeys.TRAIN else self._build_infer_sample)
        return build_sample(data)

    def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
        r"""
        Build a training sample.

        step 1. Preprocess the data with `_build_infer_sample`; the result
            must contain the `label` entry.
        step 2. Preprocess the label:
            - strip it, split it on whitespace and keep at most
              `max_tgt_length` pieces, re-joined with single spaces.
            - tokenize the result as `target`, without the `bos` token.
            - prepend `bos_item` to `target` minus its last element to get
              `prev_output_tokens`.

        Args:
            data (`Dict[str, Any]`): Input data holding the image and the
                label text under the keys configured in `column_map`.
        Return:
            A dict with the inference sample entries plus `target` and
            `prev_output_tokens`.
        """
        sample = self._build_infer_sample(data)

        words = sample['label'].strip().split()
        truncated = ' '.join(words[:self.max_tgt_length])

        target_ids = self.tokenize_text(truncated, add_bos=False)
        sample['target'] = target_ids
        # Decoder input: bos followed by the target minus its last element.
        sample['prev_output_tokens'] = torch.cat(
            [self.bos_item, target_ids[:-1]])
        return sample

    def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
        r"""
        Build an inference sample.

        step 1. Load the image as a pillow image.
        step 2. Apply `patch_resize_transform` (resize and pad, to tensor,
            normalize) to obtain the image input.
        step 3. Tokenize the prompt taken from the model config (falling back
            to the built-in default prompt) as the text input.
        step 4. If a text column is configured and present in `data`, attach
            it as `label` after conversion with `convert(..., 'zh-hans')` and
            NFKC normalization.

        Args:
            data (`Dict[str, Any]`): Input data; the image is read from the
                key `column_map['image']`, the optional label from the key
                `column_map['text']`.
        Return:
            A dict with `source`, `patch_image`, `patch_mask` and, when
            available, `label`.
        """
        columns = self.column_map

        pil_image = self.get_img_pil(data[columns['image']])
        patch_image = self.patch_resize_transform(pil_image)

        prompt_text = self.cfg.model.get('prompt', '图片上的文字是什么?')
        source_ids = self.tokenize_text(prompt_text)

        sample = {
            'source': source_ids,
            'patch_image': patch_image,
            'patch_mask': torch.tensor([True])
        }

        if 'text' in columns and columns['text'] in data:
            raw_label = data[columns['text']]
            converted = convert(raw_label, 'zh-hans')
            sample['label'] = unicodedata2.normalize('NFKC', converted)
        return sample