yichael
/
AutoAndroidController


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
							# Copyright (c) Alibaba, Inc. and its affiliates.
import math
import os
import os.path as osp
from typing import Any, Dict

import cv2
import numpy as np
import torch

from modelscope.metainfo import Pipelines
from modelscope.models.cv.ocr_detection import OCRDetection
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.device import device_placement
from modelscope.utils.logger import get_logger
from .ocr_utils import cal_width, nms_python, rboxes_to_polygons

logger = get_logger()

# constant
RBOX_DIM = 5
OFFSET_DIM = 6
WORD_POLYGON_DIM = 8
OFFSET_VARIANCE = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
TF_NODE_THRESHOLD = 0.4
TF_LINK_THRESHOLD = 0.6


@PIPELINES.register_module(
    Tasks.ocr_detection, module_name=Pipelines.ocr_detection)
class OCRDetectionPipeline(Pipeline):
    """ OCR Detection Pipeline.

    Example:

    ```python
    >>> from modelscope.pipelines import pipeline

    >>> ocr_detection = pipeline('ocr-detection', model='damo/cv_resnet18_ocr-detection-line-level_damo')
    >>> result = ocr_detection('https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/ocr_detection.jpg')

        {'polygons': array([[220,  14, 780,  14, 780,  64, 220,  64],
       [196, 369, 604, 370, 604, 425, 196, 425],
       [ 21, 730, 425, 731, 425, 787,  21, 786],
       [421, 731, 782, 731, 782, 789, 421, 789],
       [  0, 121, 109,   0, 147,  35,  26, 159],
       [697, 160, 773, 160, 773, 197, 697, 198],
       [547, 205, 623, 205, 623, 244, 547, 244],
       [548, 161, 623, 161, 623, 199, 547, 199],
       [698, 206, 772, 206, 772, 244, 698, 244]])}
    ```
    note:
    model = damo/cv_resnet18_ocr-detection-line-level_damo, for general text line detection, based on SegLink++.
    model = damo/cv_resnet18_ocr-detection-word-level_damo, for general text word detection, based on SegLink++.
    model = damo/cv_resnet50_ocr-detection-vlpt, for toaltext dataset, based on VLPT_pretrained DBNet.
    model = damo/cv_resnet18_ocr-detection-db-line-level_damo, for general text line detection, based on DBNet.

    """

    def __init__(self, model: str, **kwargs):
        """
        use `model` to create a OCR detection pipeline for prediction
        Args:
            model: model id on modelscope hub.
        """
        assert isinstance(model, str), 'model must be a single str'
        super().__init__(model=model, **kwargs)
        logger.info(f'loading model from dir {model}')
        cfgs = Config.from_file(os.path.join(model, ModelFile.CONFIGURATION))
        if hasattr(cfgs, 'model') and hasattr(cfgs.model, 'model_type'):
            self.model_type = cfgs.model.model_type
        else:
            self.model_type = 'SegLink++'

        if self.model_type == 'DBNet':
            self.ocr_detector = self.model.to(self.device)
            self.ocr_detector.eval()
            logger.info('loading model done')
        else:
            # for model seglink++
            import tensorflow as tf

            if tf.__version__ >= '2.0':
                tf = tf.compat.v1
            tf.compat.v1.disable_eager_execution()

            tf.app.flags.DEFINE_float('node_threshold', TF_NODE_THRESHOLD,
                                      'Confidence threshold for nodes')
            tf.app.flags.DEFINE_float('link_threshold', TF_LINK_THRESHOLD,
                                      'Confidence threshold for links')
            tf.reset_default_graph()
            model_path = osp.join(
                osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER),
                'checkpoint-80000')
            self._graph = tf.get_default_graph()
            config = tf.ConfigProto(allow_soft_placement=True)
            config.gpu_options.allow_growth = True
            self._session = tf.Session(config=config)

            with self._graph.as_default():
                with device_placement(self.framework, self.device_name):
                    self.input_images = tf.placeholder(
                        tf.float32,
                        shape=[1, 1024, 1024, 3],
                        name='input_images')
                    self.output = {}

                    with tf.variable_scope('', reuse=tf.AUTO_REUSE):
                        global_step = tf.get_variable(
                            'global_step', [],
                            initializer=tf.constant_initializer(0),
                            dtype=tf.int64,
                            trainable=False)
                        variable_averages = tf.train.ExponentialMovingAverage(
                            0.997, global_step)

                        from .ocr_utils import SegLinkDetector, combine_segments_python, decode_segments_links_python
                        # detector
                        detector = SegLinkDetector()
                        all_maps = detector.build_model(
                            self.input_images, is_training=False)

                        # decode local predictions
                        all_nodes, all_links, all_reg = [], [], []
                        for i, maps in enumerate(all_maps):
                            cls_maps, lnk_maps, reg_maps = maps[0], maps[
                                1], maps[2]
                            reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE)

                            cls_prob = tf.nn.softmax(
                                tf.reshape(cls_maps, [-1, 2]))

                            lnk_prob_pos = tf.nn.softmax(
                                tf.reshape(lnk_maps, [-1, 4])[:, :2])
                            lnk_prob_mut = tf.nn.softmax(
                                tf.reshape(lnk_maps, [-1, 4])[:, 2:])
                            lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut],
                                                 axis=1)

                            all_nodes.append(cls_prob)
                            all_links.append(lnk_prob)
                            all_reg.append(reg_maps)

                        # decode segments and links
                        image_size = tf.shape(self.input_images)[1:3]
                        segments, group_indices, segment_counts, _ = decode_segments_links_python(
                            image_size,
                            all_nodes,
                            all_links,
                            all_reg,
                            anchor_sizes=list(detector.anchor_sizes))

                        # combine segments
                        combined_rboxes, combined_counts = combine_segments_python(
                            segments, group_indices, segment_counts)
                        self.output['combined_rboxes'] = combined_rboxes
                        self.output['combined_counts'] = combined_counts

                    with self._session.as_default() as sess:
                        logger.info(f'loading model from {model_path}')
                        # load model
                        model_loader = tf.train.Saver(
                            variable_averages.variables_to_restore())
                        model_loader.restore(sess, model_path)

    def __call__(self, input, **kwargs):
        """
        Detect text instance in the text image.

        Args:
            input (`Image`):
                The pipeline handles three types of images:

                - A string containing an HTTP link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL or opencv directly

                The pipeline currently supports single image input.

        Return:
            An array of contour polygons of detected N text instances in image,
            every row is [x1, y1, x2, y2, x3, y3, x4, y4, ...].
        """
        return super().__call__(input, **kwargs)

    def preprocess(self, input: Input) -> Dict[str, Any]:
        if self.model_type == 'DBNet':
            result = self.preprocessor(input)
            return result
        else:
            # for model seglink++
            img = LoadImage.convert_to_ndarray(input)

            h, w, c = img.shape
            img_pad = np.zeros((max(h, w), max(h, w), 3), dtype=np.float32)
            img_pad[:h, :w, :] = img

            resize_size = 1024
            img_pad_resize = cv2.resize(img_pad, (resize_size, resize_size))
            img_pad_resize = cv2.cvtColor(img_pad_resize, cv2.COLOR_RGB2BGR)
            img_pad_resize = img_pad_resize - np.array(
                [123.68, 116.78, 103.94], dtype=np.float32)

            import tensorflow as tf
            with self._graph.as_default():
                resize_size = tf.stack([resize_size, resize_size])
                orig_size = tf.stack([max(h, w), max(h, w)])
                self.output['orig_size'] = orig_size
                self.output['resize_size'] = resize_size

            result = {'img': np.expand_dims(img_pad_resize, axis=0)}
            return result

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        if self.model_type == 'DBNet':
            outputs = self.ocr_detector(input)
            return outputs
        else:
            with self._graph.as_default():
                with self._session.as_default():
                    feed_dict = {self.input_images: input['img']}
                    sess_outputs = self._session.run(
                        self.output, feed_dict=feed_dict)
                    return sess_outputs

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        if self.model_type == 'DBNet':
            result = {OutputKeys.POLYGONS: inputs['det_polygons']}
            return result
        else:
            rboxes = inputs['combined_rboxes'][0]
            count = inputs['combined_counts'][0]
            if count == 0 or count < rboxes.shape[0]:
                raise Exception('modelscope error: No text detected')
            rboxes = rboxes[:count, :]

            # convert rboxes to polygons and find its coordinates on the original image
            orig_h, orig_w = inputs['orig_size']
            resize_h, resize_w = inputs['resize_size']
            polygons = rboxes_to_polygons(rboxes)
            scale_y = float(orig_h) / float(resize_h)
            scale_x = float(orig_w) / float(resize_w)

            # confine polygons inside image
            polygons[:, ::2] = np.maximum(
                0, np.minimum(polygons[:, ::2] * scale_x, orig_w - 1))
            polygons[:, 1::2] = np.maximum(
                0, np.minimum(polygons[:, 1::2] * scale_y, orig_h - 1))
            polygons = np.round(polygons).astype(np.int32)

            # nms
            dt_n9 = [o + [cal_width(o)] for o in polygons.tolist()]
            dt_nms = nms_python(dt_n9)
            dt_polygons = np.array([o[:8] for o in dt_nms])

            result = {OutputKeys.POLYGONS: dt_polygons}
            return result