# wasm-spad / app.py
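"""Gradio Space: chat with Gemini in the Najdi Saudi dialect and stream the replies as Arabic speech.

The app uses the wasmdashai/vits-ar-sa-huba VITS checkpoint for text-to-speech (both
locally and through the hosted Inference API) and gemini-1.5-pro for the chat. Two
environment variables are expected: ``id_gmkey`` (Gemini API key) and ``key_``
(Hugging Face token used when loading the model and calling the Inference API).
"""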
import io
import os
import re
from typing import Iterator, Optional

import gradio as gr
import numpy as np
import requests
import torch
import torch.nn as nn
import torchaudio
from transformers import AutoTokenizer, VitsModel

import google.generativeai as genai

api_key = os.environ.get("id_gmkey")
token = os.environ.get("key_")
genai.configure(api_key=api_key)
tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vits-ar-sa-huba",token=token)
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_vits=VitsModel.from_pretrained("wasmdashai/vits-ar-sa-huba",token=token)#.to(device)
model_vits.decoder.apply_weight_norm()
# torch.nn.utils.weight_norm(self.decoder.conv_pre)
# torch.nn.utils.weight_norm(self.decoder.conv_post)
for flow in model_vits.flow.flows:
    torch.nn.utils.weight_norm(flow.conv_pre)
    torch.nn.utils.weight_norm(flow.conv_post)
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 64,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}
API_URL = "https://api-inference.huggingface.co/models/wasmdashai/vits-ar-sa-huba"
headers = {"Authorization": f"Bearer {token}"}


def query(payload):
    """Call the hosted Inference API for the VITS model and return the raw audio bytes."""
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.content
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro",
    generation_config=generation_config,
    # safety_settings = Adjust safety settings
    # See https://ai.google.dev/gemini-api/docs/safety-settings
)
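# _inference_forward_stream mirrors VitsModel's usual forward pass, but runs the
# HiFi-GAN decoder over fixed-size spectrogram chunks so waveform segments can be
# yielded as soon as they are vocoded instead of waiting for the full utterance.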
def _inference_forward_stream(
    self,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    speaker_embeddings: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    padding_mask: Optional[torch.Tensor] = None,
    chunk_size: int = 32,  # Chunk size for streaming output
) -> Iterator[torch.Tensor]:
    """Generates speech waveforms in a streaming fashion."""
    if attention_mask is not None:
        padding_mask = attention_mask.unsqueeze(-1).float()
    else:
        padding_mask = torch.ones_like(input_ids).unsqueeze(-1).float()

    text_encoder_output = self.text_encoder(
        input_ids=input_ids,
        padding_mask=padding_mask,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = text_encoder_output[0] if not return_dict else text_encoder_output.last_hidden_state
    hidden_states = hidden_states.transpose(1, 2)
    input_padding_mask = padding_mask.transpose(1, 2)

    prior_means = text_encoder_output[1] if not return_dict else text_encoder_output.prior_means
    prior_log_variances = text_encoder_output[2] if not return_dict else text_encoder_output.prior_log_variances

    if self.config.use_stochastic_duration_prediction:
        log_duration = self.duration_predictor(
            hidden_states,
            input_padding_mask,
            speaker_embeddings,
            reverse=True,
            noise_scale=self.noise_scale_duration,
        )
    else:
        log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)

    length_scale = 1.0 / self.speaking_rate
    duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
    predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()

    # Create a padding mask for the output lengths of shape (batch, 1, max_output_length)
    indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
    output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
    output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)

    # Reconstruct an attention tensor of shape (batch, 1, out_length, in_length)
    attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
    batch_size, _, output_length, input_length = attn_mask.shape
    cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
    indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
    valid_indices = indices.unsqueeze(0) < cum_duration
    valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
    padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
    attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask

    # Expand prior distribution
    prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2)
    prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2)

    prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
    latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)

    spectrogram = latents * output_padding_mask
    # Decode the spectrogram a chunk at a time and yield each waveform segment
    for i in range(0, spectrogram.size(-1), chunk_size):
        with torch.no_grad():
            wav = self.decoder(spectrogram[:, :, i : i + chunk_size], speaker_embeddings)
        yield wav.squeeze().cpu().numpy()
def create_chat_session():
    # Seed the Gemini chat with a short history that asks the model to always answer
    # in the Najdi Saudi dialect and to keep replies to roughly two lines.
    chat_session = model.start_chat(
        history=[
            {
                "role": "user",
                "parts": [
                    "السلام عليكم اريد منك ان ترد على اسئلتي دائما باللهجة السعودية النجدية \n\n",
                ],
            },
            {
                "role": "model",
                "parts": [
                    "هلا والله، إسأل ما في خاطرك وأنا حاضر أساعدك، بس بشرط واحد، أسئلتك تكون واضحة عشان أفهم عليك عدل وأعطيك الجواب الزين. قل وش تبي وأنا حاضر! \n",
                ],
            },
            {
                "role": "user",
                "parts": [
                    "كيف حالك اخبارك\n",
                ],
            },
            {
                "role": "model",
                "parts": [
                    "هلا والله وغلا، أنا طيب وبخير الحمد لله، انت كيفك؟ عساك طيب؟ \n \n وش عندك أخبار؟ عسى كلها زينة. \n",
                ],
            },
            {
                "role": "user",
                "parts": [
                    "اريد ايضا ان تكون اجابتك مختصره على سبيل المثال ااكثر اجابة سطرين\n",
                ],
            },
            {
                "role": "model",
                "parts": [
                    "خلاص، فهمتك. من عيوني، أسئلتك من اليوم وطالع أجوبتها ما تتعدى سطرين. \n \n إسأل وشف! \n",
                ],
            },
        ]
    )
    return chat_session
# AI=create_chat_session()
def generate_audio(text, speaker_id=None):
    """Stream (sampling_rate, waveform_chunk) tuples for `text` using the local VITS model."""
    inputs = tokenizer(text, return_tensors="pt")
    speaker_embeddings = None
    with torch.no_grad():
        for chunk in _inference_forward_stream(
            model_vits,
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            speaker_embeddings=speaker_embeddings,
            chunk_size=256,
        ):
            yield 16000, chunk
def get_answer_ai(text, session_ai):
    # A fresh chat session is created for every request (the stored one is not reused).
    session_ai = create_chat_session()
    try:
        response = session_ai.send_message(text, stream=True)
        return response, session_ai
    except Exception:
        # Recreate the session and retry once if the request fails.
        session_ai = create_chat_session()
        response = session_ai.send_message(text, stream=True)
        return response, session_ai
def modelspeech(text):
    """Yield a (sampling_rate, waveform) tuple for `text` via the hosted Inference API."""
    audio_bytes = query({"inputs": text})
    # torchaudio.load expects a path or file-like object, so wrap the raw response bytes
    wav, sr = torchaudio.load(io.BytesIO(audio_bytes))
    yield sr, wav.squeeze().cpu().numpy()
    # Local VITS fallback kept for reference; it is unreachable here because a
    # `return value` inside a generator is discarded by the caller.
    # with torch.no_grad():
    #     inputs = tokenizer(text, return_tensors="pt")
    #     wav = model_vits(input_ids=inputs["input_ids"]).waveform.cpu().numpy().reshape(-1)
    #     return model_vits.config.sampling_rate, wav
def modelspeechstr(text):
    """Synthesize `text` locally and return the waveform rendered with np.array2string."""
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        wav = model_vits(input_ids=inputs["input_ids"]).waveform.cpu().numpy().reshape(-1)
        return np.array2string(wav)
def clean_text(text):
    # Strip punctuation/symbols and collapse whitespace before synthesis
    cleaned_text = re.sub(r'[^\w\s]', ' ', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    return cleaned_text.strip()
def text_to_speech(text, session_ai):
    """Stream the Gemini reply and synthesize it piece by piece (roughly every 10+ characters)."""
    response = dash(text, session_ai, False)
    pad_text = ''
    k = 0
    for chunk in response:
        chunk, session_ai = chunk
        pad_text += str(clean_text(chunk))
        if pad_text != '' and len(pad_text) > 10:
            out = pad_text
            pad_text = ''
            k += 1
            for outmodel in modelspeech(out):
                yield outmodel, session_ai
    # Flush whatever text is left once the reply is finished
    if pad_text != '':
        for outmodel in modelspeech(pad_text):
            yield outmodel, session_ai
def text_to_speechstr(text, session_ai):
    """Same chunking as text_to_speech, but yields the waveform rendered as text."""
    response = dash(text, session_ai, False)
    pad_text = ''
    k = 0
    for chunk in response:
        chunk, session_ai = chunk
        pad_text += str(clean_text(chunk))
        if pad_text != '' and len(pad_text) > 10:
            out = pad_text
            pad_text = ''
            k += 1
            yield modelspeechstr(out), session_ai
    if pad_text != '':
        yield modelspeechstr(pad_text), session_ai
def dash(text, session_ai, is_state=True):
    """Stream the Gemini reply; accumulate the text for the chat tab, pass raw chunks otherwise."""
    response, session_ai = get_answer_ai(text, session_ai)
    txt = ' '
    for chunk in response:
        if chunk is not None:
            if is_state:
                txt += chunk.text
            else:
                txt = chunk.text
            yield txt, session_ai
# demo = gr.Interface(fn=dash, inputs=["text"], outputs=['text'])
# demo.launch()
with gr.Blocks() as demo:
    session_ai = gr.State()

    with gr.Tab("AI Text"):
        gr.Markdown("# Text to Speech")
        text_input = gr.Textbox(label="Enter Text")
        text_out = gr.Textbox()
        text_input.submit(dash, [text_input, session_ai], [text_out, session_ai])

    with gr.Tab("AI Speech"):
        gr.Markdown("# Text to Speech")
        text_input2 = gr.Textbox(label="Enter Text")
        audio_output = gr.Audio(streaming=True, autoplay=True)
        text_input2.submit(text_to_speech, [text_input2, session_ai], [audio_output, session_ai])

    with gr.Tab("AI Speechstr"):
        gr.Markdown("# Text to Speech")
        text_input3 = gr.Textbox(label="Enter Text")
        text_input4 = gr.Textbox(label="out Text")
        text_input3.submit(text_to_speechstr, [text_input3, session_ai], [text_input4, session_ai])

demo.launch(show_error=True)