yichael
/
AutoAndroidController


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259
							# Copyright (c) Alibaba, Inc. and its affiliates.

import os
import os.path as osp
import re
from typing import Any, Dict

import numpy as np
import tensorflow as tf

from modelscope.metainfo import Pipelines
from modelscope.models.base import Model
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.utils.config import Config, ConfigFields
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

# This pipeline drives a TF1-style graph/session, so on TensorFlow 2 and later
# switch to the v1 compatibility API and turn eager execution off.
# The major version is compared numerically: a plain string comparison is
# lexicographic and would treat e.g. '10.0' as older than '2.0'.
if int(tf.__version__.split('.')[0]) >= 2:
    tf = tf.compat.v1
    tf.disable_eager_execution()

logger = get_logger()

__all__ = ['LanguageIdentificationPipeline']


@PIPELINES.register_module(
    Tasks.text_classification, module_name=Pipelines.language_identification)
class LanguageIdentificationPipeline(Pipeline):
    r""" Language Identification Pipeline.

    Examples:

    >>> from modelscope.pipelines import pipeline
    >>> from modelscope.utils.constant import Tasks

    >>> pipeline_ins = pipeline(Tasks.text_classification, 'damo/nlp_language_identification-classification-base')
    >>> pipeline_ins('Elon Musk, co-founder and chief executive officer of Tesla Motors.\n' \
    >>>              'Gleichzeitig nahm die Legion an der Befriedung Algeriens teil, die von.\n' \
    >>>              '使用pipeline推理及在线体验功能的时候，尽量输入单句文本，如果是多句长文本建议人工分句。')

    >>> {
    >>>    "labels":[
    >>>        "en",
    >>>        "de",
    >>>        "zh"
    >>>    ],
    >>>    "scores":[
    >>>        [('en', 0.99)],
    >>>        [('de', 1.0)],
    >>>        [('zh', 1.0)]
    >>>    ]
    >>> }
    """

    def __init__(self, model: str, **kwargs):
        """Build a language identification pipeline from an exported model directory.

        The directory must contain the pipeline configuration file, the
        vocabulary and label files named in the ``preprocessor`` section of
        that configuration, and a TensorFlow SavedModel tagged for serving.

        Args:
            model: Path of the directory holding the exported model. It is
                used as-is to locate every file read here.
            **kwargs: Forwarded unchanged to the base ``Pipeline`` initializer.
        """
        super().__init__(model=model, **kwargs)
        export_dir = model
        self.debug = False

        self.cfg = Config.from_file(
            os.path.join(export_dir, ModelFile.CONFIGURATION))

        # Vocabulary: one token per line, the 0-based line index is the id.
        joint_vocab_file = os.path.join(
            export_dir, self.cfg[ConfigFields.preprocessor]['vocab'])
        self.vocab = {}
        self.vocab_reverse = {}
        # Read as bytes so that one badly encoded entry does not abort the
        # whole load; such a line is skipped but still consumes its index.
        with open(joint_vocab_file, 'rb') as vocab_fh:
            for i, raw in enumerate(vocab_fh):
                raw = raw.strip()
                try:
                    token = raw.decode('utf-8')
                except UnicodeDecodeError:
                    # [debug] print error info
                    if self.debug:
                        print('error vocab:', raw, i)
                    continue
                self.vocab[token] = i
                self.vocab_reverse[i] = token
        self.unk_id = self.vocab.get('<UNK>', 1)
        self.pad_id = self.vocab.get('</S>', 0)

        # Labels: one label per line, the 0-based line index is the class id.
        joint_label_file = os.path.join(
            export_dir, self.cfg[ConfigFields.preprocessor]['label'])
        with open(joint_label_file, 'r', encoding='utf8') as label_fh:
            self.label = {i: line.strip() for i, line in enumerate(label_fh)}
        self.unk_label = 'unk'

        tf.reset_default_graph()
        tf_config = tf.ConfigProto(allow_soft_placement=True)
        tf_config.gpu_options.allow_growth = True
        self._session = tf.Session(config=tf_config)
        tf.saved_model.loader.load(self._session,
                                   [tf.saved_model.tag_constants.SERVING],
                                   export_dir)
        default_graph = tf.get_default_graph()
        # [debug] print graph ops
        if self.debug:
            for op in default_graph.get_operations():
                print(op.name, op.values())

        # Graph endpoints are looked up by the names used at export time.
        self.input_ids = default_graph.get_tensor_by_name('src_cid:0')
        output_label = default_graph.get_tensor_by_name('output_label:0')
        output_score = default_graph.get_tensor_by_name('predict_score:0')

        self.output = {
            'output_ids': output_label,
            'output_score': output_score
        }
        init = tf.global_variables_initializer()
        local_init = tf.local_variables_initializer()
        self._session.run([init, local_init])
        # NOTE(review): the SavedModel is loaded a second time after the
        # initializers have run - presumably to restore the weights that the
        # initializers just reset. Confirm whether the initializer run and
        # this reload are both needed before simplifying.
        tf.saved_model.loader.load(self._session,
                                   [tf.saved_model.tag_constants.SERVING],
                                   export_dir)

    def _lid_preprocess(self, input: str) -> list:
        sentence = input.lower()
        # HtmlToText
        CLEANR = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
        sentence = re.sub(CLEANR, '', sentence)
        # RemoveLinks
        URLRE = r'\S+[./]\S+\s?'
        sentence = re.sub(URLRE, '', sentence)
        EMAILRE = r'\S*@\S*\s?'
        sentence = re.sub(EMAILRE, '', sentence)

        # SBC2DBC
        def stringpartQ2B(uchar):
            inside_code = ord(uchar)
            if 0xFF00 < inside_code or inside_code > 0xFF5F:
                inside_code -= 0xFEE0
            elif inside_code == 0x3000:
                inside_code = 0x0020
            elif inside_code in [
                    0x301D, 0x301E, 0x201C, 0x201D, 0x201E, 0x201F
            ]:
                inside_code = 0x0022
            elif inside_code in [0x2018, 0x2019, 0x201A, 0x201B]:
                inside_code = 0x0027
            return chr(inside_code)

        # RemoveNoisyChars
        m_noisyChars = ",-+\"\'\\&.!=:;°·$«»|±[]{}_?<>~^*/%#@()，。！《》？、`\xc2\xa0…‼️"
        sentence = ''.join([
            stringpartQ2B(c) if c not in m_noisyChars else ' '
            for c in sentence
        ])
        EMOJIRE = re.compile(
            '['
            u'\U0001F600-\U0001F64F'  # emoticons
            u'\U0001F300-\U0001F5FF'  # symbols & pictographs
            u'\U0001F680-\U0001F6FF'  # transport & map symbols
            u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
            u'\U0001f926-\U0001f937'  # emoji
            u'\U00010000-\U0010ffff'  # char emoji
            u'\U00002702-\U000027B0'  # char emoji
            u'\u2640-\u2642\u2600-\u2B55'
            u'\u200d\u23cf\u23e9\u231a\ufe0f\u3030'  # dingbats
            ']+',
            re.UNICODE)
        sentence = re.sub(EMOJIRE, '', sentence)
        # RemoveDigitalWords
        sentence = ' '.join([
            item for item in sentence.split()
            if (not bool(re.search(r'\d', item))
                or not bool(re.match(r'^[a-z0-9+-_]+$', item)))
        ])
        # replaceBrandWords
        # wordCorrection
        # removeSpaces
        outids = []
        for w in sentence.strip():
            tmp = self.vocab.get(w, self.unk_id)
            if len(outids
                   ) > 0 and tmp == self.unk_id and outids[-1] == self.unk_id:
                continue
            outids.append(tmp)
        if len(outids) > 0 and outids[0] == self.unk_id:
            outids = outids[1:]
        if len(outids) > 0 and outids[-1] == self.unk_id:
            outids = outids[:-1]
        return outids

    def preprocess(self, input: str) -> Dict[str, Any]:
        sentencelt = input.split('\n')
        input_ids_lt = [
            self._lid_preprocess(sentence) for sentence in sentencelt
            if sentence.strip() != ''
        ]

        # [debug] print info example:
        if self.debug:
            for sentence, input_ids in zip(sentencelt, input_ids_lt):
                print('raw:', sentence)
                print(
                    'res:', ''.join([
                        self.vocab_reverse.get(wid, self.unk_id).replace(
                            '<UNK>', ' ') for wid in input_ids
                    ]))
        maxlen = max([len(ids) for ids in input_ids_lt])
        for ids in input_ids_lt:
            ids.extend([self.pad_id] * (maxlen - len(ids)))
        input_ids = np.array(input_ids_lt)

        result = {'input_ids': input_ids}
        return result

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        with self._session.as_default():
            feed_dict = {self.input_ids: input['input_ids']}
            sess_outputs = self._session.run(self.output, feed_dict=feed_dict)
            return sess_outputs

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Convert raw model scores into per-sentence labels and top scores.

        Args:
            inputs: Mapping with an ``'output_score'`` entry that holds, for
                each sentence, one score per label; scores are paired with
                ``self.label`` values in order.

        Returns:
            A dict with ``OutputKeys.LABELS`` (the best supported label of
            each sentence, or ``self.unk_label`` when no supported label is
            available) and ``OutputKeys.SCORES`` (per sentence, up to three
            ``(label, score)`` pairs in descending score order, rounded to two
            decimals and restricted to scores above 0.01).
        """
        output_scores_raw = inputs['output_score']

        # Only labels from this set are ever reported; scores of any other
        # label are ignored.
        supported_104_lang = {
            'af', 'am', 'ar', 'az', 'be', 'bg', 'bn', 'bs', 'ca', 'ce', 'co',
            'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa',
            'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'haw', 'he', 'hi',
            'hmn', 'hr', 'ht', 'hu', 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv',
            'ka', 'kk', 'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lo', 'lt', 'lv',
            'mg', 'mi', 'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my', 'ne', 'nl',
            'no', 'ny', 'pa', 'pl', 'ps', 'pt', 'ro', 'ru', 'sd', 'si', 'sk',
            'sl', 'sm', 'sn', 'so', 'sq', 'sr', 'st', 'su', 'sv', 'sw', 'ta',
            'te', 'tg', 'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'uz', 'vi', 'xh',
            'yi', 'yo', 'zh', 'zh-tw', 'zu'
        }
        labels_scores_lt = []
        output_labels = []
        for output_score in output_scores_raw:
            candidates = [
                (label, score)
                for score, label in zip(output_score, self.label.values())
                if label in supported_104_lang
            ]
            top = sorted(candidates, key=lambda pair: pair[1],
                         reverse=True)[:3]
            if not top:
                # Fall back to the dedicated unknown label so that labels are
                # always strings (this used to be the integer 0).
                top = [(self.unk_label, 1.00)]
            labels_scores_lt.append(top)
            output_labels.append(top[0][0])
        # Convert to a built-in float before rounding so the result holds
        # plain Python numbers rather than NumPy scalars.
        output_scores = [[(label, round(float(score), 2))
                          for label, score in labels_scores if score > 0.01]
                         for labels_scores in labels_scores_lt]

        result = {
            OutputKeys.LABELS: output_labels,
            OutputKeys.SCORES: output_scores
        }
        return result