| import logging |
| logging.getLogger('numba').setLevel(logging.WARNING) |
| logging.getLogger('matplotlib').setLevel(logging.WARNING) |
| logging.getLogger('urllib3').setLevel(logging.WARNING) |
| import json |
| import re |
| import numpy as np |
| import IPython.display as ipd |
| import torch |
| import commons |
| import utils |
| from models import SynthesizerTrn |
| from text.symbols import symbols |
| from text import text_to_sequence |
| import gradio as gr |
| import time |
| import datetime |
| import os |
| import pickle |
| import openai |
| from scipy.io.wavfile import write |
def is_japanese(string):
    """Report whether *string* contains at least one kana character.

    A character counts when its code point lies strictly between U+3040
    and U+30FF (both bounds excluded).  Text made only of kanji, Latin
    letters or digits therefore yields ``False``, as does the empty string.
    """
    return any(0x3040 < ord(ch) < 0x30FF for ch in string)
|
|
def is_english(string):
    """Return ``True`` when *string* is non-empty and consists solely of
    ASCII letters, digits, spaces and the punctuation ``. , : ; ! ? ( ) _ * " '``.
    """
    import re

    allowed = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
    return allowed.fullmatch(string) is not None
|
|
def to_html(chat_history):
    """Render a chat transcript as an HTML fragment of chat bubbles.

    Args:
        chat_history: iterable of mappings with a ``'role'`` and a
            ``'content'`` key.  Entries whose role is ``'user'`` are drawn
            as right-aligned green bubbles; every other role is drawn as a
            left-aligned white bubble.

    Returns:
        A string holding a fixed-height (400px) scrollable ``<div>`` that
        contains one bubble per entry, in input order.

    The message content is HTML-escaped before insertion so that text typed
    by the user or produced by the model cannot inject markup into the page.
    """
    # Local import: ``html`` is not among this module's top-level imports.
    import html

    bubbles = []
    for item in chat_history:
        if item['role'] == 'user':
            align, margin, background = "right", "margin-right", "#4CAF50"
        else:
            align, margin, background = "left", "margin-left", "white"
        content = html.escape(str(item['content']))
        bubbles.append(f"""
        <div style="margin-bottom: 20px;">
        <div style="text-align: {align}; {margin}: 20px;">
        <span style="background-color: {background}; color: black; padding: 10px; border-radius: 10px; display: inline-block; max-width: 80%; word-wrap: break-word;">
        {content}
        </span>
        </div>
        </div>
        """)
    chat_html = "".join(bubbles)
    output_html = f"""
    <div style="height: 400px; overflow-y: scroll; padding: 10px;">
    {chat_html}
    </div>
    """
    return output_html
|
|
def extrac(text):
    """Split *text* into short sentences for sentence-by-sentence synthesis.

    Processing steps, in order:

    1. Everything enclosed in ``<...>`` is removed.
    2. The text is split on newlines.
    3. A line that ``is_english`` accepts is converted to katakana with
       ``romajitable``.
    4. Spaces are stripped from the line; lines of one character or less
       are dropped.
    5. Lines of at most 20 characters are kept as they are.  Longer lines
       are split on ``。`` and ``!``; each piece longer than one character
       is kept with a ``。`` appended.

    Args:
        text: the raw input string.

    Returns:
        The list of resulting sentences (also printed to stdout).

    Raises:
        ImportError: if an English line is encountered and ``romajitable``
            is not installed.
    """
    text = re.sub("<[^>]*>", "", text)
    sentences = []
    for line in text.split('\n'):
        if is_english(line):
            # ``romajitable`` is used here but is not imported at the top of
            # this file, which made this branch fail with NameError.  Import
            # it lazily so the dependency is only needed for English input.
            import romajitable
            line = romajitable.to_kana(line).katakana
        line = line.replace('\n', '').replace(' ', '')

        if len(line) <= 1:
            continue
        if len(line) <= 20:
            sentences.append(line)
            continue
        # Long line: break it at sentence terminators and re-terminate each
        # kept piece with a full stop.
        for piece in re.split(r'。|!', line):
            if len(piece) > 1:
                sentences.append(piece + '。')
    print(sentences)
    return sentences
|
|
def to_numpy(tensor: torch.Tensor):
    """Convert *tensor* to a NumPy array.

    The tensor is detached from the autograd graph and moved to the CPU
    before conversion.  Previously the CPU move was only performed when
    ``requires_grad`` was set, so a non-CPU tensor without gradients could
    not be converted.

    Args:
        tensor: any ``torch.Tensor``.

    Returns:
        A ``numpy.ndarray`` with the tensor's values.
    """
    return tensor.detach().cpu().numpy()
|
|
def chatgpt(text):
    """Send *text* to ``gpt-3.5-turbo`` along with any stored history.

    The conversation history is read from ``log.pickle`` in the current
    working directory.  If that file is missing, unreadable, or does not
    contain a list, the conversation starts from an empty history.

    Args:
        text: the user's message.

    Returns:
        A ``(reply, messages)`` tuple: the assistant's reply text and the
        message list (history, the new user message, and the reply) that
        was used for this call.

    Raises:
        Whatever ``openai.ChatCompletion.create`` raises.  Unlike the
        earlier implementation, a failed request is not silently re-sent
        with the user message appended a second time.
    """
    # SECURITY NOTE(review): unpickling executes arbitrary code if
    # ``log.pickle`` can be written by an untrusted party.  A JSON file
    # would be a safer store for this list of plain dicts.
    messages = []
    history_loaded = False
    try:
        with open('log.pickle', 'rb') as f:
            stored = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError,
            AttributeError, ImportError, IndexError):
        # No usable history on disk: begin a fresh conversation.
        pass
    else:
        if isinstance(stored, list):
            messages = stored
            history_loaded = True

    messages.append({"role": "user", "content": text})
    chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    reply = chat.choices[0].message.content
    messages.append({"role": "assistant", "content": reply})
    print(messages[-1])

    if len(messages) == 12:
        # Drop the 7th and 8th entries, keeping the first six and the last
        # four messages (10 in total).
        del messages[6:8]

    # NOTE(review): behaviour kept from the original code -- when a history
    # file was loaded, an EMPTY list is written back, so the stored history
    # is reset on every other call.  Only a conversation that started
    # without a history file is persisted.  Confirm whether this reset is
    # intended before changing it.
    with open('log.pickle', 'wb') as f:
        pickle.dump([] if history_loaded else messages, f)
    return reply, messages
|
|
def get_symbols_from_json(path):
    """Load the ``'symbols'`` entry from the JSON file at *path*.

    The file is read as UTF-8 so that non-ASCII symbols load correctly
    regardless of the platform's default encoding.

    Args:
        path: path to an existing JSON file whose top-level object has a
            ``'symbols'`` key.

    Returns:
        The value stored under ``'symbols'``.

    Raises:
        AssertionError: if *path* is not an existing regular file.
        KeyError: if the JSON object has no ``'symbols'`` key.
    """
    assert os.path.isfile(path), f"symbol file not found: {path}"
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data['symbols']
|
|
def sle(language, text):
    """Strip whitespace from *text* and wrap it in a language tag.

    Newlines, carriage returns and spaces are removed first.  The result
    then depends on *language*:

    * ``"中文"`` -> ``[ZH]text[ZH]``
    * ``"日文"`` -> ``[JA]text[JA]``
    * ``"英文"`` -> ``[EN]text[EN]``
    * ``"自动"`` -> ``[JA]`` tags when ``is_japanese`` accepts the text,
      ``[ZH]`` tags otherwise
    * ``"手动"`` -> the stripped text without any tag

    Any other *language* value yields ``None``.
    """
    compact = text.translate(str.maketrans('', '', '\n\r '))
    if language == "手动":
        return compact
    if language == "自动":
        tag = "JA" if is_japanese(compact) else "ZH"
    else:
        tag = {"中文": "ZH", "日文": "JA", "英文": "EN"}.get(language)
        if tag is None:
            return None
    return f"[{tag}]{compact}[{tag}]"
|
|
def get_text(text, hps_ms):
    """Encode *text* as a ``torch.LongTensor`` of symbol ids.

    The text is passed through ``text_to_sequence`` with the cleaners named
    in ``hps_ms.data.text_cleaners``.  When ``hps_ms.data.add_blank`` is
    truthy, the sequence is additionally run through
    ``commons.intersperse(sequence, 0)`` before being turned into a tensor.
    """
    sequence = text_to_sequence(text, hps_ms.data.text_cleaners)
    if hps_ms.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
|
|
_TTS_LOGGER = logging.getLogger(__name__)


def _srt_timestamp(offset):
    """Format a ``datetime.timedelta`` as an SRT timestamp ``HH:MM:SS,mmm``."""
    total_ms = offset // datetime.timedelta(milliseconds=1)
    hours, rest = divmod(total_ms, 3600 * 1000)
    minutes, rest = divmod(rest, 60 * 1000)
    seconds, millis = divmod(rest, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"


def _synthesize(net_g, speaker_id, stn_tst, n_scale, n_scale_w, l_scale):
    """Run ``net_g.infer`` on one encoded sequence and return a NumPy waveform."""
    # ``dev`` is the module-level torch device assigned in this file's
    # ``__main__`` section; it is looked up when the function is called.
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(dev)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
        sid = torch.LongTensor([speaker_id]).to(dev)
        return net_g.infer(
            x_tst, x_tst_lengths, sid=sid,
            noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale,
        )[0][0, 0].data.cpu().float().numpy()


def _export_audio(audiopath, sampling_rate, audio, is_audio, repeat_time):
    """Best-effort export of *audio* to ``audiopath + '.wav'`` plus ffmpeg copies.

    When *is_audio* is truthy, ffmpeg is invoked *repeat_time* times to
    resample the WAV to 44100 Hz; the i-th output path is *audiopath* with
    ``'temp'`` replaced by ``'temp<i>'``.  Failures are logged, never raised.
    """
    # Local import: ``subprocess`` is not among the module's top-level imports.
    import subprocess

    try:
        wav_path = audiopath + '.wav'
        write(wav_path, sampling_rate, audio)
        if not is_audio:
            return
        for i in range(repeat_time):
            target = audiopath.replace('temp', 'temp' + str(i))
            # Argument list instead of a shell string: the path comes from a
            # user-editable text box and must not be interpreted by a shell.
            subprocess.run(
                ['ffmpeg', '-y', '-i', wav_path, '-ar', '44100', target],
                check=False,
            )
    except Exception as err:  # export is optional; keep the TTS result usable
        _TTS_LOGGER.warning("Could not export audio to %r: %s", audiopath, err)


def create_tts_fn(net_g, hps, speaker_id):
    """Build the Gradio callback that synthesizes speech for one speaker.

    Args:
        net_g: the synthesizer; its ``infer`` method is called for inference.
        hps: hyper-parameter object; ``hps.data.sampling_rate`` is used as
            the output sample rate and ``hps`` is passed on to ``get_text``.
        speaker_id: speaker index, converted with ``int()``.

    Returns:
        ``tts_fn(is_gpt, api_key, is_audio, audiopath, repeat_time, text,
        language, extract, n_scale=0.667, n_scale_w=0.8, l_scale=1)``, which
        returns ``((sampling_rate, audio), "subtitles.srt", chat_html)``.

        * If *is_gpt* is truthy, *text* is first sent to ``chatgpt`` and the
          reply is synthesized; ``chat_html`` is the rendered conversation,
          otherwise it is ``''``.
        * If *extract* is falsy the whole text is synthesized in one pass.
        * If *extract* is truthy, bracketed passages are removed, the text
          is split with ``extrac`` and synthesized sentence by sentence,
          and one subtitle entry per synthesized sentence is written to
          ``subtitles.srt``.  A sentence that fails is logged and skipped.

        ``tts_fn`` raises ``ValueError`` in extract mode when no sentence
        could be synthesized.
    """
    speaker_id = int(speaker_id)

    def tts_fn(is_gpt, api_key, is_audio, audiopath, repeat_time, text, language, extract,
               n_scale=0.667, n_scale_w=0.8, l_scale=1):
        # The dropdown may deliver the count as a string; the result was
        # previously stored under a misspelled name and never used.
        repeat_time = int(repeat_time)
        sampling_rate = hps.data.sampling_rate
        file_path = "subtitles.srt"

        if is_gpt:
            openai.api_key = api_key
            text, messages = chatgpt(text)
            htm = to_html(messages)
        else:
            htm = ''

        if not extract:
            t1 = time.time()
            stn_tst = get_text(sle(language, text), hps)
            audio = _synthesize(net_g, speaker_id, stn_tst, n_scale, n_scale_w, l_scale)
            t2 = time.time()
            spending_time = "推理时间为:" + str(t2 - t1) + "s"
            print(spending_time)
            _export_audio(audiopath, sampling_rate, audio, is_audio, repeat_time)
            return (sampling_rate, audio), file_path, htm

        # Turn every supported bracket pair into <...> so that ``extrac``
        # strips the bracketed passages.  The full-width parentheses
        # (U+FF08 / U+FF09) are included next to the ASCII ones.
        for opening in ('【', '[', '(', '\uff08'):
            text = text.replace(opening, '<')
        for closing in ('】', ']', ')', '\uff09'):
            text = text.replace(closing, '>')
        final_list = extrac(text.replace('“', '').replace('”', ''))

        audio_fin = []
        elapsed = datetime.timedelta(seconds=0)
        # Open the subtitle file once: reopening it in 'w' mode for every
        # sentence truncated the entries written so far.
        with open(file_path, 'w', encoding='utf-8') as srt:
            for c, sentence in enumerate(final_list, start=1):
                try:
                    stn_tst = get_text(sle(language, sentence), hps)
                    t1 = time.time()
                    audio = _synthesize(net_g, speaker_id, stn_tst, n_scale, n_scale_w, l_scale)
                    t2 = time.time()
                except Exception:
                    _TTS_LOGGER.exception("Skipping sentence %d (%r): synthesis failed", c, sentence)
                    continue
                spending_time = "第" + str(c) + "句的推理时间为:" + str(t2 - t1) + "s"
                print(spending_time)
                time_start = _srt_timestamp(elapsed)
                elapsed += datetime.timedelta(seconds=len(audio) / float(sampling_rate))
                time_end = _srt_timestamp(elapsed)
                print(time_end)
                # NOTE(review): entry numbers start at 0, as before; SRT
                # files conventionally start at 1.
                srt.write(str(c - 1) + '\n' + time_start + ' --> ' + time_end + '\n' + sentence + '\n\n')
                audio_fin.append(audio)

        if not audio_fin:
            raise ValueError("no sentence could be synthesized from the given text")
        combined = np.concatenate(audio_fin)
        _export_audio(audiopath, sampling_rate, combined, is_audio, repeat_time)
        return (sampling_rate, combined), file_path, htm

    return tts_fn
|
|
if __name__ == '__main__':
    # --- model setup -----------------------------------------------------
    # ``dev`` stays a module-level name: the TTS callbacks read it as a global.
    hps = utils.get_hparams_from_file('checkpoints/Nijigaku/config.json')
    dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    models = []
    schools = ["Nijigasaki High School"]
    lan = ["中文", "日文", "自动", "手动"]
    with open("checkpoints/info.json", "r", encoding="utf-8") as f:
        models_info = json.load(f)
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(dev)
    _ = net_g.eval()
    _ = utils.load_checkpoint("checkpoints/Nijigaku/model.pth", net_g)

    # One entry in ``models`` per school listed in info.json; each entry is
    # a list of (sid, name, school record, example text, tts callback).
    for school in models_info.values():
        speakers = school["speakers"]
        phone_dict = {symbol: index for index, symbol in enumerate(symbols)}
        content = []
        for speaker in speakers.values():
            speaker_sid = int(speaker['sid'])
            content.append((
                speaker_sid,
                speaker["name"],
                school,
                speaker['speech'],
                create_tts_fn(net_g, hps, speaker_sid),
            ))
        models.append(content)

    # --- user interface ----------------------------------------------------
    # One tab per school, one nested tab per speaker.
    with gr.Blocks() as app:
        with gr.Tabs():
            for school_index, school_name in enumerate(schools):
                with gr.TabItem(school_name):
                    for (sid, name, title, example, tts_fn) in models[school_index]:
                        with gr.TabItem(name):
                            with gr.Column():
                                with gr.Row():
                                    with gr.Row():
                                        portrait_html = (
                                            '<div align="center">'
                                            f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
                                            '</div>'
                                        )
                                        gr.Markdown(portrait_html)
                                    chat_view = gr.outputs.HTML()
                                with gr.Row():
                                    with gr.Column(scale=0.85):
                                        text_box = gr.TextArea(label="Text", value=example, lines=1)
                                    with gr.Column(scale=0.15, min_width=0):
                                        send_button = gr.Button("Send")
                                audio_out = gr.Audio(label="采样率22050")
                                with gr.Accordion(label="Setting(TTS)", open=False):
                                    language_dd = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
                                    noise_slider = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
                                    noise_w_slider = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
                                    duration_slider = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
                                with gr.Accordion(label="Advanced Setting(GPT3.5接口+长句子合成,建议克隆本仓库后运行main.py)", open=False):
                                    split_checkbox = gr.Checkbox(value=False, label="长句切割(小说合成)")
                                    subtitle_file = gr.outputs.File(label="字幕文件:subtitles.srt")
                                    gpt_checkbox = gr.Checkbox(value=False, label="接入chatgpt")
                                    api_key_box = gr.TextArea(label="api-key", lines=1, value='见 https://openai.com/blog/openai-api')
                                    live2d_checkbox = gr.Checkbox(value=False, label="修改音频路径(live2d)")
                                    audio_path_box = gr.TextArea(label="音频路径", lines=1, value='#参考 D:/app_develop/live2d_whole/2010002/sounds/temp.wav')
                                    repeat_dd = gr.Dropdown(label="重复生成次数", choices=list(range(101)), value='0', interactive=True)
                                # Input order must match the positional
                                # parameters of the tts callback.
                                callback_inputs = [
                                    gpt_checkbox, api_key_box,
                                    live2d_checkbox, audio_path_box, repeat_dd,
                                    text_box, language_dd, split_checkbox,
                                    noise_slider, noise_w_slider, duration_slider,
                                ]
                                callback_outputs = [audio_out, subtitle_file, chat_view]
                                send_button.click(tts_fn, inputs=callback_inputs, outputs=callback_outputs)

    app.launch()