| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278 |
- # Copyright (c) Alibaba, Inc. and its affiliates.
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import datetime
import json
import os
import shutil
import sys
import wave
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import yaml

from modelscope.metainfo import Models
from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.audio.audio_utils import (TtsCustomParams, TtsTrainType,
                                                ndarray_pcm_to_wav)
from modelscope.utils.audio.tts_exceptions import (
    TtsFrontendInitializeFailedException,
    TtsFrontendLanguageTypeInvalidException, TtsModelConfigurationException,
    TtsModelNotExistsException, TtsTrainingCfgNotExistsException,
    TtsVoiceNotExistsException)
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
from .voice import Voice
# Names exported by ``from <this module> import *``.
__all__ = ['SambertHifigan']
# Module-level logger used by the training and synthesis code below.
logger = get_logger()
@MODELS.register_module(
    Tasks.text_to_speech, module_name=Models.sambert_hifigan)
class SambertHifigan(Model):
    """Text-to-speech model managing a set of voices and a text frontend.

    Voices are ``Voice`` objects, either loaded from ``<model_dir>/voices``
    (listed in ``voices/voices.json``) or built from a custom checkpoint
    description.  Input text is converted to symbol lines by the ``ttsfrd``
    frontend and synthesized line by line with the selected voice.
    """

    def __init__(self, model_dir, *args, **kwargs):
        """Load the voices and initialize the ``ttsfrd`` frontend.

        Args:
            model_dir: Directory containing ``voices/`` and ``resource.zip``.
            *args: Forwarded to the base ``Model`` constructor.
            **kwargs: Forwarded to the base constructor.  Keys read here:
                ``sample_rate`` (default 16000), ``is_train`` (honored only
                when it is a real ``bool``), ``am`` (legacy model card; a
                false ``am['linguistic_unit']['has_mask']`` turns on
                ``ignore_mask``) and ``custom_ckpt`` (see ``load_voice``).

        Raises:
            TtsVoiceNotExistsException: No voice could be loaded.
            ImportError: The interpreter is Python 3.11 or newer.
            TtsFrontendInitializeFailedException: The frontend rejected the
                extracted resource directory.
            TtsFrontendLanguageTypeInvalidException: The frontend rejected
                the language type reported by the voices.
        """
        super().__init__(model_dir, *args, **kwargs)
        self.model_dir = model_dir
        self.sample_rate = kwargs.get('sample_rate', 16000)

        # Non-bool values for ``is_train`` are ignored.
        is_train = kwargs.get('is_train', False)
        self.is_train = is_train if isinstance(is_train, bool) else False

        # check legacy modelcard
        self.ignore_mask = False
        if 'am' in kwargs and 'linguistic_unit' in kwargs['am']:
            self.ignore_mask = not kwargs['am']['linguistic_unit'].get(
                'has_mask', True)

        self.voices, self.voice_cfg, self.lang_type = self.load_voice(
            model_dir, kwargs.get('custom_ckpt', {}))
        if not self.voices or not self.voice_cfg.get('voices'):
            raise TtsVoiceNotExistsException('modelscope error: voices empty')
        # The first configured voice is used when ``forward`` gets no name.
        self.default_voice_name = self.voice_cfg['voices'][0]

        # initialize frontend
        if sys.version_info >= (3, 11):
            raise ImportError('Python version needs to be <= 3.10')
        # Imported here so the version check above runs first.
        import ttsfrd
        frontend = ttsfrd.TtsFrontendEngine()
        zip_file = os.path.join(model_dir, 'resource.zip')
        self.res_path = os.path.join(model_dir, 'resource')
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(model_dir)
        if not frontend.initialize(self.res_path):
            raise TtsFrontendInitializeFailedException(
                'modelscope error: resource invalid: {}'.format(self.res_path))
        if not frontend.set_lang_type(self.lang_type):
            raise TtsFrontendLanguageTypeInvalidException(
                'modelscope error: language type invalid: {}'.format(
                    self.lang_type))
        self.frontend = frontend

    def build_voice_from_custom(self, model_dir, custom_ckpt):
        """Build a single voice from a custom checkpoint description.

        Args:
            model_dir: Passed to ``Voice`` as its ``voice_path``.
            custom_ckpt: Mapping that must contain every key in
                ``TtsCustomParams.{VOICE_NAME, AM_CKPT, VOC_CKPT, AM_CONFIG,
                VOC_CONFIG}``.

        Returns:
            Tuple ``(voices, voices_cfg, lang_type)`` where ``voices`` maps
            the voice name to its ``Voice``, ``voices_cfg`` is
            ``{'voices': [voice_name]}`` and ``lang_type`` is the voice's
            ``lang_type``.

        Raises:
            TtsModelNotExistsException: A required key is missing.
        """
        necessary_files = (TtsCustomParams.VOICE_NAME, TtsCustomParams.AM_CKPT,
                           TtsCustomParams.VOC_CKPT, TtsCustomParams.AM_CONFIG,
                           TtsCustomParams.VOC_CONFIG)
        if any(k not in custom_ckpt for k in necessary_files):
            raise TtsModelNotExistsException(
                f'custom ckpt must have: {necessary_files}')
        voice_name = custom_ckpt[TtsCustomParams.VOICE_NAME]
        voice = Voice(
            voice_name=voice_name,
            voice_path=model_dir,
            custom_ckpt=custom_ckpt,
            ignore_mask=self.ignore_mask,
            is_train=self.is_train)
        return {voice_name: voice}, {'voices': [voice_name]}, voice.lang_type

    def load_voice(self, model_dir, custom_ckpt):
        """Load the voices listed in ``<model_dir>/voices/voices.json``.

        A non-empty ``custom_ckpt`` takes precedence and is delegated to
        ``build_voice_from_custom``.  Voices whose directory is missing are
        skipped.

        Args:
            model_dir: Model root directory.
            custom_ckpt: Custom checkpoint mapping; empty or ``None`` means
                "load from disk".

        Returns:
            Tuple ``(voices, voice_cfg, lang_type)``.  ``voices`` and
            ``voice_cfg`` are empty dicts when ``voices.json`` is missing or
            has no ``'voices'`` key.  ``lang_type`` is that of the last
            loaded voice, or ``'PinYin'`` when none was loaded.
        """
        if custom_ckpt:
            return self.build_voice_from_custom(model_dir, custom_ckpt)

        voices = {}
        lang_type = 'PinYin'
        voices_path = os.path.join(model_dir, 'voices')
        voices_json_path = os.path.join(voices_path, 'voices.json')
        # The json file lives inside ``voices_path``, so one check covers both.
        if not os.path.exists(voices_json_path):
            return voices, {}, lang_type
        with open(voices_json_path, 'r', encoding='utf-8') as f:
            voice_cfg = json.load(f)
        if 'voices' not in voice_cfg:
            return voices, {}, lang_type
        for name in voice_cfg['voices']:
            voice_path = os.path.join(voices_path, name)
            if not os.path.exists(voice_path):
                continue
            voices[name] = Voice(
                name,
                voice_path,
                ignore_mask=self.ignore_mask,
                is_train=self.is_train)
            lang_type = voices[name].lang_type
        return voices, voice_cfg, lang_type

    def save_voices(self):
        """Write the names of the loaded voices to ``voices/voices.json``.

        The file is overwritten; the ``voices`` directory is created when it
        does not exist yet.
        """
        voices_dir = os.path.join(self.model_dir, 'voices')
        os.makedirs(voices_dir, exist_ok=True)
        voices_json_path = os.path.join(voices_dir, 'voices.json')
        # Mode 'w' truncates an existing file, so no explicit removal needed.
        with open(voices_json_path, 'w', encoding='utf-8') as f:
            json.dump({'voices': list(self.voices)}, f)

    def get_voices(self):
        """Return ``(voices, voice_cfg)`` as loaded by ``load_voice``."""
        return self.voices, self.voice_cfg

    def create_empty_voice(self, voice_name, audio_config, am_config_path,
                           voc_config_path):
        """Create a fresh, empty voice directory and register the voice.

        Any existing ``<model_dir>/voices/<voice_name>`` tree is deleted
        first.  The new tree contains ``am/ckpt`` and ``voc/ckpt``; the given
        config files are copied in when they exist.

        Args:
            voice_name: Name (and directory name) of the new voice.
            audio_config: Optional path of an audio config file, copied into
                the voice directory under its own file name.
            am_config_path: Optional path of a config file, copied to
                ``am/config.yaml``.
            voc_config_path: Optional path of a config file, copied to
                ``voc/config.yaml``.
        """
        voice_name_path = os.path.join(self.model_dir, 'voices', voice_name)
        if os.path.exists(voice_name_path):
            shutil.rmtree(voice_name_path)

        voice_am_path = os.path.join(voice_name_path, 'am')
        voice_voc_path = os.path.join(voice_name_path, 'voc')
        # Creating the ckpt leaves first also creates the voice, am and voc
        # directories, so the copies below always have a destination.
        os.makedirs(os.path.join(voice_am_path, 'ckpt'), exist_ok=True)
        os.makedirs(os.path.join(voice_voc_path, 'ckpt'), exist_ok=True)

        if audio_config and os.path.isfile(audio_config):
            shutil.copy(audio_config, voice_name_path)
        if am_config_path and os.path.isfile(am_config_path):
            shutil.copy(am_config_path,
                        os.path.join(voice_am_path, 'config.yaml'))
        # NOTE(review): assumes Voice reads the vocoder config from
        # voc/config.yaml, mirroring am/config.yaml -- confirm in voice.py.
        if voc_config_path and os.path.isfile(voc_config_path):
            shutil.copy(voc_config_path,
                        os.path.join(voice_voc_path, 'config.yaml'))

        self.voices[voice_name] = Voice(
            voice_name=voice_name,
            voice_path=voice_name_path,
            allow_empty=True)

    def get_voice_audio_config_path(self, voice):
        """Return the voice's ``audio_config``, or ``''`` if unknown voice."""
        if voice not in self.voices:
            return ''
        return self.voices[voice].audio_config

    def get_voice_se_model_path(self, voice):
        """Return the voice's ``se_model_path`` when ``se_enable`` is set.

        Returns ``''`` for an unknown voice or when ``se_enable`` is false.
        """
        if voice not in self.voices:
            return ''
        target = self.voices[voice]
        return target.se_model_path if target.se_enable else ''

    def get_voice_lang_path(self, voice):
        """Return the voice's ``lang_dir``, or ``''`` if unknown voice."""
        if voice not in self.voices:
            return ''
        return self.voices[voice].lang_dir

    def synthesis_one_sentences(self, voice_name, text):
        """Run ``Voice.forward`` on ``text`` with the named voice.

        Raises:
            TtsVoiceNotExistsException: ``voice_name`` is not loaded.
        """
        if voice_name not in self.voices:
            raise TtsVoiceNotExistsException(
                f'modelscope error: Voice {voice_name} not exists')
        return self.voices[voice_name].forward(text)

    def train(self,
              voice,
              dirs,
              train_type,
              configs_path_dict=None,
              ignore_pretrain=False,
              create_if_not_exists=False,
              hparam=None):
        """Train the acoustic model and/or the vocoder of one voice.

        Args:
            voice: Name of the voice to train.
            dirs: Mapping with the keys ``'work_dir'``, ``'am_tmp_dir'``,
                ``'voc_tmp_dir'`` and ``'data_dir'``.
            train_type: Mapping keyed by ``TtsTrainType`` members; a stage
                runs only when its key is present and the mapped value is
                forwarded to that stage as its hyper-parameters.
            configs_path_dict: Optional mapping with ``'am_config'`` and
                ``'voc_config'`` paths.  For an existing voice a path
                overrides the voice's own config only when it exists on
                disk.  For a new voice the defaults are ``'am_config.yaml'``
                and ``'voc_config.yaml'``.
            ignore_pretrain: Forwarded to the training routines.
            create_if_not_exists: Create an empty voice when ``voice`` is
                not loaded instead of raising.
            hparam: Unused; kept for interface compatibility.

        Raises:
            TtsVoiceNotExistsException: ``voice`` is unknown and
                ``create_if_not_exists`` is false.
            TtsTrainingCfgNotExistsException: A stage was requested for a
                new voice with an empty config path.
        """
        plt.set_loglevel('info')
        work_dir = dirs['work_dir']
        am_dir = dirs['am_tmp_dir']
        voc_dir = dirs['voc_tmp_dir']
        data_dir = dirs['data_dir']
        configs_path_dict = configs_path_dict or {}
        train_am = TtsTrainType.TRAIN_TYPE_SAMBERT in train_type
        train_voc = TtsTrainType.TRAIN_TYPE_VOC in train_type

        if voice not in self.voices:
            if not create_if_not_exists:
                raise TtsVoiceNotExistsException(
                    f'modelscope error: Voice {voice} not exists')
            am_config_path = configs_path_dict.get('am_config',
                                                   'am_config.yaml')
            voc_config_path = configs_path_dict.get('voc_config',
                                                    'voc_config.yaml')
            if train_am and not am_config_path:
                raise TtsTrainingCfgNotExistsException(
                    'training new voice am with empty am_config')
            if train_voc and not voc_config_path:
                raise TtsTrainingCfgNotExistsException(
                    'training new voice voc with empty voc_config')
            # Without this the training calls below would run on ``None``.
            # NOTE(review): the optional 'audio_config' key is an assumption
            # about where a new voice's audio config comes from -- confirm.
            self.create_empty_voice(voice,
                                    configs_path_dict.get('audio_config'),
                                    am_config_path, voc_config_path)
            target_voice = self.voices[voice]
        else:
            target_voice = self.voices[voice]
            am_config_path = target_voice.am_config_path
            voc_config_path = target_voice.voc_config_path
            # Overrides apply only when the given file is actually present.
            am_override = configs_path_dict.get('am_config')
            if am_override and os.path.exists(am_override):
                am_config_path = am_override
            voc_override = configs_path_dict.get('voc_config')
            if voc_override and os.path.exists(voc_override):
                voc_config_path = voc_override

        logger.info('Start training....')
        if train_am:
            logger.info('Start SAMBERT training...')
            started = datetime.datetime.now()
            hparams = train_type[TtsTrainType.TRAIN_TYPE_SAMBERT]
            target_voice.train_sambert(work_dir, am_dir, data_dir,
                                       am_config_path, ignore_pretrain,
                                       hparams)
            elapsed = datetime.datetime.now() - started
            logger.info('SAMBERT training spent: {:.2f} hours\n'.format(
                elapsed.total_seconds() / 3600.0))
        else:
            logger.info('skip SAMBERT training...')

        if train_voc:
            logger.info('Start HIFIGAN training...')
            started = datetime.datetime.now()
            hparams = train_type[TtsTrainType.TRAIN_TYPE_VOC]
            target_voice.train_hifigan(work_dir, voc_dir, data_dir,
                                       voc_config_path, ignore_pretrain,
                                       hparams)
            elapsed = datetime.datetime.now() - started
            logger.info('HIFIGAN training spent: {:.2f} hours\n'.format(
                elapsed.total_seconds() / 3600.0))
        else:
            logger.info('skip HIFIGAN training...')

    def forward(self, text: str, voice_name: str = None):
        """Synthesize ``text`` and return it through ``ndarray_pcm_to_wav``.

        The frontend output is processed line by line; the second
        tab-separated field of each line is synthesized and the resulting
        audio pieces are concatenated in order.

        Args:
            text: Text to synthesize.
            voice_name: Voice to use; defaults to ``default_voice_name``.

        Returns:
            The result of ``ndarray_pcm_to_wav(sample_rate, pcm)`` where
            ``pcm`` is the concatenated int16 audio.

        Raises:
            TtsVoiceNotExistsException: The selected voice is not loaded.
        """
        voice = self.default_voice_name if voice_name is None else voice_name
        result = self.frontend.gen_tacotron_symbols(text)

        pieces = []
        for raw_line in result.splitlines():
            fields = raw_line.strip().split('\t')
            if len(fields) < 2:
                # Blank lines are expected; anything else is reported.
                if raw_line.strip():
                    logger.warning(
                        'skip malformed frontend line: %r', raw_line)
                continue
            audio = self.synthesis_one_sentences(voice, fields[1])
            # Clip before the cast: a sample at or beyond full scale would
            # otherwise wrap around when converted to int16.
            # assumes audio is a float ndarray scaled to [-1, 1] -- TODO
            # confirm against Voice.forward.
            pcm = np.clip(32768.0 * audio, -32768, 32767).astype('int16')
            pieces.append(pcm)

        # One concatenation instead of re-allocating on every sentence.
        if pieces:
            audio_total = np.concatenate(pieces, axis=0)
        else:
            audio_total = np.empty((0), dtype='int16')
        return ndarray_pcm_to_wav(self.sample_rate, audio_total)
|