Spaces:

Mahiruoshi
/

Lovelive-Nijigasaku-Chat-iSTFT-GPT3

Running

App Files Files Community

Lovelive-Nijigasaku-Chat-iSTFT-GPT3 / app.py

Mahiruoshi

Upload 90 files

9c49b65 about 3 years ago

Raw

History Blame Contribute Delete

13.5 kB

	import logging
	logging.getLogger('numba').setLevel(logging.WARNING)
	logging.getLogger('matplotlib').setLevel(logging.WARNING)
	logging.getLogger('urllib3').setLevel(logging.WARNING)
	import json
	import re
	import numpy as np
	import IPython.display as ipd
	import torch
	import commons
	import utils
	from models import SynthesizerTrn
	from text.symbols import symbols
	from text import text_to_sequence
	import gradio as gr
	import time
	import datetime
	import os
	import pickle
	import openai
	from scipy.io.wavfile import write
	def is_japanese(string):
	    """Return True if *string* contains at least one kana character.

	    A character counts when its code point lies strictly between U+3040
	    and U+30FF. Text without any such character (kanji-only text, Latin
	    text, the empty string) yields False.
	    """
	    return any(0x3040 < ord(char) < 0x30FF for char in string)

	def is_english(string):
	    """Return True if *string* is non-empty and made only of ASCII text.

	    Allowed characters are letters, digits, the space, and the
	    punctuation listed in the pattern below; anything else (including a
	    newline or a hyphen) makes the result False.
	    """
	    import re
	    allowed = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
	    return allowed.fullmatch(string) is not None

	def to_html(chat_history):
	    """Render a chat transcript as a scrollable HTML fragment.

	    Args:
	        chat_history: iterable of dicts with a 'role' and a 'content' key.
	            Items whose role is 'user' are drawn as right-aligned green
	            bubbles; every other role is drawn left-aligned on white.

	    Returns:
	        A string holding one outer 400px-high scrolling <div> that wraps
	        one bubble per history item (no bubbles for an empty history).
	    """
	    import html

	    bubbles = []
	    for item in chat_history:
	        if item['role'] == 'user':
	            align, margin, background = 'right', 'margin-right', '#4CAF50'
	        else:
	            align, margin, background = 'left', 'margin-left', 'white'
	        # The content comes from the user and from the model, so it must be
	        # escaped before it is embedded in markup.
	        content = html.escape(str(item['content']))
	        bubbles.append(
	            '\n<div style="margin-bottom: 20px;">\n'
	            f'<div style="text-align: {align}; {margin}: 20px;">\n'
	            f'<span style="background-color: {background}; color: black; '
	            'padding: 10px; border-radius: 10px; display: inline-block; '
	            'max-width: 80%; word-wrap: break-word;">\n'
	            f'{content}\n'
	            '</span>\n'
	            '</div>\n'
	            '</div>\n'
	        )
	    chat_html = ''.join(bubbles)
	    return (
	        '\n<div style="height: 400px; overflow-y: scroll; padding: 10px;">\n'
	        f'{chat_html}\n'
	        '</div>\n'
	    )

	def extrac(text, max_len=20):
	    """Split *text* into short sentences, one per synthesis call.

	    Steps, per input line (lines are separated by a newline):
	      1. anything of the form <...> is removed from the whole text first;
	      2. a line that `is_english` accepts is converted to katakana with
	         `romajitable` (skipped when that package cannot be imported);
	      3. spaces are removed and lines of length <= 1 are dropped;
	      4. a line longer than *max_len* is cut at every '。' or '！' and each
	         piece longer than one character is kept with a trailing '。'.

	    Args:
	        text: raw text to split.
	        max_len: longest line that is kept whole (default 20).

	    Returns:
	        The list of sentences, which is also printed.
	    """
	    # NOTE(review): the visible file never imports `romajitable` although
	    # this function uses it; import it here and degrade gracefully when it
	    # is not installed instead of failing with NameError.
	    try:
	        import romajitable
	    except ImportError:
	        romajitable = None

	    text = re.sub("<[^>]*>", "", text)
	    final_list = []
	    for line in text.split('\n'):
	        if romajitable is not None and is_english(line):
	            line = romajitable.to_kana(line).katakana
	        line = line.replace(' ', '')
	        if len(line) <= 1:
	            continue
	        if len(line) <= max_len:
	            final_list.append(line)
	            continue
	        # A character class is required here: the former pattern escaped
	        # the '|' and therefore only matched the literal text "。|！".
	        for piece in re.split(r'[。！]', line):
	            if len(piece) > 1:
	                final_list.append(piece + '。')
	    print(final_list)
	    return final_list

	def to_numpy(tensor: torch.Tensor):
	    """Return the data of *tensor* as a NumPy array.

	    The tensor is detached from the autograd graph and moved to the CPU
	    first, so the call works whether or not the tensor requires grad and
	    whatever device it lives on.
	    """
	    return tensor.detach().cpu().numpy()

	def chatgpt(text):
	    """Send *text* to gpt-3.5-turbo and return the reply with the history.

	    The conversation is kept between calls in 'log.pickle' in the current
	    working directory. A missing or unreadable file starts a new
	    conversation. Once the history reaches 12 messages, the messages at
	    positions 6 and 7 are dropped so that the first six and the most recent
	    ones are kept.

	    Args:
	        text: the user's message.

	    Returns:
	        A tuple (reply, messages): the assistant's reply text and the
	        updated list of {"role", "content"} dicts.

	    Raises:
	        Whatever `openai.ChatCompletion.create` raises; the request is
	        sent exactly once.
	    """
	    history_path = 'log.pickle'
	    # NOTE(security): pickle must only ever be used on a file this program
	    # wrote itself; loading a pickle from another source can execute code.
	    try:
	        with open(history_path, 'rb') as f:
	            messages = pickle.load(f)
	    except (OSError, EOFError, pickle.UnpicklingError,
	            AttributeError, ImportError, IndexError):
	        messages = []
	    if not isinstance(messages, list):
	        messages = []

	    messages.append({"role": "user", "content": text})
	    chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
	    reply = chat.choices[0].message.content
	    messages.append({"role": "assistant", "content": reply})
	    print(messages[-1])

	    # Bound the history: drop the oldest exchange after the first six
	    # messages. For exactly 12 messages this leaves the same 10 as before.
	    while len(messages) >= 12:
	        del messages[6:8]

	    # Persist the real history (an empty list used to be written on the
	    # path where the file already existed, discarding the conversation).
	    with open(history_path, 'wb') as f:
	        pickle.dump(messages, f)
	    return reply, messages

	def get_symbols_from_json(path):
	    """Return the value stored under the 'symbols' key of a JSON file.

	    Args:
	        path: path of the JSON file.

	    Returns:
	        Whatever the file holds under 'symbols'.

	    Raises:
	        AssertionError: *path* is not an existing regular file. The
	            exception type is kept for existing callers, but it is raised
	            explicitly so the check survives `python -O`.
	        KeyError: the file has no 'symbols' key.
	    """
	    if not os.path.isfile(path):
	        raise AssertionError(f"not a file: {path}")
	    # Read as UTF-8 explicitly so non-ASCII symbols do not depend on the
	    # platform's default encoding.
	    with open(path, 'r', encoding='utf-8') as f:
	        data = json.load(f)
	    return data['symbols']

	def sle(language, text):
	    """Strip whitespace from *text* and wrap it in a language tag.

	    Newlines, carriage returns and spaces are removed. The UI language
	    name then selects the tag put on both sides of the text:
	    "中文" -> [ZH], "日文" -> [JA], "英文" -> [EN], and "自动" picks [JA]
	    when `is_japanese` accepts the text, otherwise [ZH]. "手动" returns
	    the stripped text with no tag. Any other value returns None.
	    """
	    stripped = text.translate({ord('\n'): None, ord('\r'): None, ord(' '): None})
	    if language == "手动":
	        return stripped
	    if language == "自动":
	        tag = "[JA]" if is_japanese(stripped) else "[ZH]"
	    else:
	        tag = {"中文": "[ZH]", "日文": "[JA]", "英文": "[EN]"}.get(language)
	        if tag is None:
	            return None
	    return tag + stripped + tag
	def get_text(text, hps_ms):
	    """Turn *text* into the id tensor passed to the synthesizer.

	    The text is converted with `text_to_sequence` using the cleaners named
	    in `hps_ms.data.text_cleaners`. When `hps_ms.data.add_blank` is
	    truthy, a 0 is interspersed through the sequence with
	    `commons.intersperse`. The result is returned as a `torch.LongTensor`.
	    """
	    sequence = text_to_sequence(text, hps_ms.data.text_cleaners)
	    if hps_ms.data.add_blank:
	        sequence = commons.intersperse(sequence, 0)
	    return torch.LongTensor(sequence)

	def _srt_timestamp(delta):
	    """Format a `datetime.timedelta` as an SRT timestamp HH:MM:SS,mmm."""
	    total_ms = int(round(delta.total_seconds() * 1000))
	    hours, rest = divmod(total_ms, 3600000)
	    minutes, rest = divmod(rest, 60000)
	    seconds, millis = divmod(rest, 1000)
	    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"


	def _export_audio(audiopath, sampling_rate, audio, is_audio, repeat_time):
	    """Best-effort: write *audio* to '<audiopath>.wav', then resample copies.

	    When *is_audio* is truthy, ffmpeg is run *repeat_time* times to write a
	    44100 Hz copy to *audiopath* with 'temp' replaced by 'temp<i>'.
	    Failures are logged and never raised, because export is optional.
	    """
	    import subprocess

	    wav_path = audiopath + '.wav'
	    try:
	        write(wav_path, sampling_rate, audio)
	    except (OSError, ValueError):
	        logging.exception("could not write %s", wav_path)
	        return
	    if not is_audio:
	        return
	    for i in range(repeat_time):
	        target = audiopath.replace('temp', 'temp' + str(i))
	        # An argument list (no shell) keeps a user-supplied path from being
	        # interpreted as shell syntax.
	        try:
	            subprocess.run(
	                ['ffmpeg', '-y', '-i', wav_path, '-ar', '44100', target],
	                check=False,
	            )
	        except OSError:
	            logging.exception("could not run ffmpeg for %s", target)
	            return


	def create_tts_fn(net_g, hps, speaker_id):
	    """Build the Gradio callback that synthesizes speech for one speaker.

	    Args:
	        net_g: the synthesizer; its `infer` method is called per sentence.
	        hps: hyperparameters; `hps.data.sampling_rate` is used for every
	            audio output and `hps` is passed on to `get_text`.
	        speaker_id: speaker index, converted with `int`.

	    Returns:
	        The function `tts_fn` described below. It reads the module-level
	        device `dev`, which the script entry point defines.
	    """
	    speaker_id = int(speaker_id)
	    sampling_rate = hps.data.sampling_rate
	    subtitle_path = "subtitles.srt"

	    def _infer(tagged_text, n_scale, n_scale_w, l_scale):
	        """Synthesize one tagged text and return the waveform as a NumPy array."""
	        stn_tst = get_text(tagged_text, hps)
	        with torch.no_grad():
	            x_tst = stn_tst.unsqueeze(0).to(dev)
	            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
	            sid = torch.LongTensor([speaker_id]).to(dev)
	            return net_g.infer(
	                x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale,
	                noise_scale_w=n_scale_w, length_scale=l_scale,
	            )[0][0, 0].data.cpu().float().numpy()

	    def tts_fn(is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
	        """Synthesize *text* (or the chat reply to it) with this speaker.

	        Args:
	            is_gpt: when truthy, *text* is first sent to `chatgpt` and the
	                reply is spoken instead.
	            api_key: OpenAI key, used only when *is_gpt* is truthy.
	            is_audio: when truthy, resampled copies are exported via ffmpeg.
	            audiopath: base path for the exported audio.
	            repeat_time: number of resampled copies; converted with `int`.
	            text: the input text.
	            language: UI language name understood by `sle`.
	            extract: when truthy, the text is split into sentences with
	                `extrac` and a subtitle file is written.
	            n_scale, n_scale_w, l_scale: forwarded to `net_g.infer` as
	                noise_scale, noise_scale_w and length_scale.

	        Returns:
	            ((sampling_rate, audio), "subtitles.srt", chat_html) where
	            chat_html is '' unless *is_gpt* is truthy.
	        """
	        # The converted value used to be stored under a misspelled name, so
	        # the loops below received the raw dropdown value.
	        repeat_time = int(repeat_time)
	        if is_gpt:
	            openai.api_key = api_key
	            text, messages = chatgpt(text)
	            htm = to_html(messages)
	        else:
	            htm = ''

	        if not extract:
	            t1 = time.time()
	            audio = _infer(sle(language, text), n_scale, n_scale_w, l_scale)
	            t2 = time.time()
	            print("推理时间为："+str(t2-t1)+"s")
	            _export_audio(audiopath, sampling_rate, audio, is_audio, repeat_time)
	            return (sampling_rate, audio), subtitle_path, htm

	        # Bracketed asides become <...>, which `extrac` then removes.
	        for opener in ('【', '[', '(', '（'):
	            text = text.replace(opener, '<')
	        for closer in ('】', ']', ')', '）'):
	            text = text.replace(closer, '>')
	        final_list = extrac(text.replace('“', '').replace('”', ''))

	        audio_fin = []
	        elapsed = datetime.timedelta(seconds=0)
	        # Open the subtitle file once: reopening it in 'w' mode for every
	        # sentence kept only the last cue and leaked the handles.
	        with open(subtitle_path, 'w', encoding='utf-8') as srt:
	            for c, sentence in enumerate(final_list, start=1):
	                try:
	                    t1 = time.time()
	                    audio = _infer(sle(language, sentence), n_scale, n_scale_w, l_scale)
	                    t2 = time.time()
	                except Exception:
	                    # One bad sentence must not abort the whole text.
	                    logging.exception("synthesis failed for sentence %d", c)
	                    continue
	                print("第"+str(c)+"句的推理时间为："+str(t2-t1)+"s")
	                time_start = _srt_timestamp(elapsed)
	                elapsed += datetime.timedelta(seconds=len(audio) / float(sampling_rate))
	                time_end = _srt_timestamp(elapsed)
	                print(time_end)
	                srt.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
	                audio_fin.append(audio)

	        # np.concatenate rejects an empty list, so fall back to silence.
	        if audio_fin:
	            full_audio = np.concatenate(audio_fin)
	        else:
	            full_audio = np.zeros(0, dtype=np.float32)
	        _export_audio(audiopath, sampling_rate, full_audio, is_audio, repeat_time)
	        return (sampling_rate, full_audio), subtitle_path, htm

	    return tts_fn

	# Script entry point: load the model, build one TTS callback per speaker,
	# then describe and launch the Gradio interface.
	# NOTE(review): this copy of the file has lost its leading indentation, so
	# the nesting of the `for`/`with` statements below cannot be read from the
	# text and has to be restored before the script can run.
	if __name__ == '__main__':
	hps = utils.get_hparams_from_file('checkpoints/Nijigaku/config.json')
	# `dev` is read as a global by the callbacks made in create_tts_fn.
	dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	models = []
	schools = ["Nijigasaki High School"]
	# Language names offered in the UI; sle() also accepts "英文", which is
	# not listed here.
	lan = ["中文","日文","自动","手动"]
	with open("checkpoints/info.json", "r", encoding="utf-8") as f:
	models_info = json.load(f)
	# One synthesizer is built, switched to eval mode, loaded from the
	# checkpoint, and shared by every speaker.
	net_g = SynthesizerTrn(
	len(symbols),
	hps.data.filter_length // 2 + 1,
	hps.train.segment_size // hps.data.hop_length,
	n_speakers=hps.data.n_speakers,
	**hps.model).to(dev)
	_ = net_g.eval()
	_ = utils.load_checkpoint("checkpoints/Nijigaku/model.pth" , net_g)
	# Collect (sid, name, title, example, tts_fn) for every speaker of every
	# entry in info.json; one list of tuples is appended to `models` per entry.
	for i in models_info:
	school = models_info[i]
	speakers = school["speakers"]
	# NOTE(review): phone_dict is built but not used anywhere in the visible code.
	phone_dict = {
	symbol: i for i, symbol in enumerate(symbols)
	}
	content = []
	for j in speakers:
	sid = int(speakers[j]['sid'])
	# NOTE(review): `school` is the info.json entry itself, not its name.
	title = school
	example = speakers[j]['speech']
	name = speakers[j]["name"]
	content.append((sid, name, title, example, create_tts_fn(net_g,hps,sid)))
	models.append(content)

	# UI: one tab per school and, inside it, one tab per speaker.
	with gr.Blocks() as app:
	with gr.Tabs():
	for i in schools:
	with gr.TabItem(i):
	# NOTE(review): assumes `models` is in the same order as `schools` — confirm against info.json.
	for (sid, name, title, example, tts_fn) in models[schools.index(i)]:
	with gr.TabItem(name):
	with gr.Column():
	with gr.Row():
	with gr.Row():
	gr.Markdown(
	'<div align="center">'
	f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
	'</div>'
	)
	# Receives the chat transcript HTML (third return value of tts_fn).
	output_UI = gr.outputs.HTML()
	with gr.Row():
	with gr.Column(scale=0.85):
	input1 = gr.TextArea(label="Text", value=example,lines = 1)
	with gr.Column(scale=0.15, min_width=0):
	btnVC = gr.Button("Send")
	output1 = gr.Audio(label="采样率22050")
	with gr.Accordion(label="Setting(TTS)", open=False):
	input2 = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
	input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale)，以控制情感", value=0.6)
	input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w)，以控制音素长短", value=0.668)
	input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
	with gr.Accordion(label="Advanced Setting(GPT3.5接口+长句子合成，建议克隆本仓库后运行main.py)", open=False):
	input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
	output2 = gr.outputs.File(label="字幕文件：subtitles.srt")
	api_input1 = gr.Checkbox(value=False, label="接入chatgpt")
	api_input2 = gr.TextArea(label="api-key",lines=1,value = '见 https://openai.com/blog/openai-api')
	audio_input1 = gr.Checkbox(value=False, label="修改音频路径(live2d)")
	audio_input2 = gr.TextArea(label="音频路径",lines=1,value = '#参考 D:/app_develop/live2d_whole/2010002/sounds/temp.wav')
	# NOTE(review): the choices are the ints 0..100 but the initial value is the string '0'.
	audio_input3 = gr.Dropdown(label="重复生成次数", choices=list(range(101)), value='0', interactive=True)
	# The inputs are listed in the positional order of tts_fn's parameters:
	# is_gpt, api_key, is_audio, audiopath, repeat_time, text, language,
	# extract, n_scale, n_scale_w, l_scale.
	btnVC.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])

	app.launch()