| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154 |
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import io
- import os
- from typing import Union
- import numpy as np
- import soundfile as sf
- import torch
- import torchaudio
- from modelscope.fileio import File
- from modelscope.metainfo import Pipelines
- from modelscope.outputs import OutputKeys
- from modelscope.pipelines.base import InputModel, Pipeline
- from modelscope.pipelines.builder import PIPELINES
- from modelscope.utils.constant import Tasks
- from modelscope.utils.logger import get_logger
- logger = get_logger()
- __all__ = ['LanguageRecognitionPipeline']
- @PIPELINES.register_module(
- Tasks.speech_language_recognition,
- module_name=Pipelines.speech_language_recognition)
- class LanguageRecognitionPipeline(Pipeline):
- """Language Recognition Inference Pipeline
- use `model` to create a Language Recognition pipeline.
- Args:
- model (LanguageRecognitionPipeline): A model instance, or a model local dir, or a model id in the model hub.
- kwargs (dict, `optional`):
- Extra kwargs passed into the pipeline's constructor.
- Example:
- >>> from modelscope.pipelines import pipeline
- >>> from modelscope.utils.constant import Tasks
- >>> p = pipeline(
- >>> task=Tasks.speech_language_recognition, model='damo/speech_campplus_lre_en-cn_16k')
- >>> print(p(audio_in))
- """
- def __init__(self, model: InputModel, **kwargs):
- """use `model` to create a Language Recognition pipeline for prediction
- Args:
- model (str): a valid official model id
- """
- super().__init__(model=model, **kwargs)
- self.model_config = self.model.model_config
- self.languages = self.model_config['languages']
- def __call__(self,
- in_audios: Union[str, list, np.ndarray],
- out_file: str = None):
- wavs = self.preprocess(in_audios)
- scores, results = self.forward(wavs)
- outputs = self.postprocess(results, scores, in_audios, out_file)
- return outputs
- def forward(self, inputs: list):
- scores = []
- results = []
- for x in inputs:
- score, result = self.model(x)
- scores.append(score.tolist())
- results.append(result.item())
- return scores, results
- def postprocess(self,
- inputs: list,
- scores: list,
- in_audios: Union[str, list, np.ndarray],
- out_file=None):
- if isinstance(in_audios, str):
- output = {
- OutputKeys.TEXT: self.languages[inputs[0]],
- OutputKeys.SCORE: scores
- }
- else:
- output = {
- OutputKeys.TEXT: [self.languages[i] for i in inputs],
- OutputKeys.SCORE: scores
- }
- if out_file is not None:
- out_lines = []
- for i, audio in enumerate(in_audios):
- if isinstance(audio, str):
- audio_id = os.path.basename(audio).rsplit('.', 1)[0]
- else:
- audio_id = i
- out_lines.append('%s %s\n' %
- (audio_id, self.languages[inputs[i]]))
- with open(out_file, 'w') as f:
- for i in out_lines:
- f.write(i)
- return output
- def preprocess(self, inputs: Union[str, list, np.ndarray]):
- output = []
- if isinstance(inputs, str):
- file_bytes = File.read(inputs)
- data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32')
- if len(data.shape) == 2:
- data = data[:, 0]
- data = torch.from_numpy(data).unsqueeze(0)
- if fs != self.model_config['sample_rate']:
- logger.warning(
- 'The sample rate of audio is not %d, resample it.'
- % self.model_config['sample_rate'])
- data, fs = torchaudio.sox_effects.apply_effects_tensor(
- data,
- fs,
- effects=[['rate',
- str(self.model_config['sample_rate'])]])
- data = data.squeeze(0)
- output.append(data)
- else:
- for i in range(len(inputs)):
- if isinstance(inputs[i], str):
- file_bytes = File.read(inputs[i])
- data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32')
- if len(data.shape) == 2:
- data = data[:, 0]
- data = torch.from_numpy(data).unsqueeze(0)
- if fs != self.model_config['sample_rate']:
- logger.warning(
- 'The sample rate of audio is not %d, resample it.'
- % self.model_config['sample_rate'])
- data, fs = torchaudio.sox_effects.apply_effects_tensor(
- data,
- fs,
- effects=[[
- 'rate',
- str(self.model_config['sample_rate'])
- ]])
- data = data.squeeze(0)
- elif isinstance(inputs[i], np.ndarray):
- assert len(
- inputs[i].shape
- ) == 1, 'modelscope error: Input array should be [N, T]'
- data = inputs[i]
- if data.dtype in ['int16', 'int32', 'int64']:
- data = (data / (1 << 15)).astype('float32')
- else:
- data = data.astype('float32')
- data = torch.from_numpy(data)
- else:
- raise ValueError(
- 'modelscope error: The input type is restricted to audio address and nump array.'
- )
- output.append(data)
- return output
|