import concurrent.futures
import os
import tempfile
from typing import Optional, Tuple

import numpy as np
import spaces
from transformers import pipeline
import gradio as gr
import torch
import torchaudio
from resemble_enhance.enhancer.inference import denoise, enhance

from flore200_codes import flores_codes
from tts import BambaraTTS

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Translation pipeline
translation_model = "oza75/nllb-600M-mt-french-bambara"
# translation_model = "oza75/nllb-1.3B-mt-french-bambara"
translator = pipeline("translation", model=translation_model, max_length=512)

# Text-to-Speech pipeline
tts_model = "oza75/bambara-tts"
tts = BambaraTTS(tts_model)

# Function to translate text to Bambara
def translate_to_bambara(text, src_lang):
    translation = translator(text, src_lang=src_lang, tgt_lang="bam_Latn")
    return str(translation[0]['translation_text'])
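
# Example (hypothetical values): the FLORES-200 code for French is "fra_Latn", so
#   translate_to_bambara("Bonjour, comment allez-vous ?", "fra_Latn")
# returns the Bambara translation produced by the NLLB model as a plain string.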

# Function to convert text to speech, optionally cloning the voice from a user-provided recording
def text_to_speech(bambara_text, reference_speaker: str, reference_audio: Optional[Tuple] = None):
    if reference_audio is not None:
        ref_sr, ref_audio = reference_audio
        ref_audio = torch.from_numpy(ref_audio)

        # Add a channel dimension if the audio is 1D
        if ref_audio.ndim == 1:
            ref_audio = ref_audio.unsqueeze(0)

        # Save the reference audio to a temporary file so the TTS model can read it from disk
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
            torchaudio.save(tmp.name, ref_audio, ref_sr)
            tmp_path = tmp.name

        # Use the temporary file as the speaker reference
        sr, audio = tts.text_to_speech(bambara_text, speaker_reference_wav_path=tmp_path, enable_text_splitting=True)

        # Clean up the temporary file
        os.unlink(tmp_path)
    else:
        # If no reference audio is provided, fall back to the selected reference speaker file
        sr, audio = tts.text_to_speech(bambara_text, speaker_reference_wav_path=reference_speaker)

    # Collapse multi-channel output to mono
    audio = audio.mean(dim=0)
    return audio, sr
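
# Example (hypothetical): cloning a voice from a Gradio-style (sample_rate, np.ndarray) tuple.
#   ref = (16000, np.zeros(16000, dtype=np.int16))  # one second of silence as a placeholder
#   audio, sr = text_to_speech("Aw ni ce", "male_3.wav", reference_audio=ref)
# When reference_audio is given, the reference_speaker file is ignored.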

# Function to enhance speech: denoise and enhance with resemble-enhance, running both in parallel
def enhance_speech(audio_array, sampling_rate, solver, nfe, tau, denoise_before_enhancement):
    solver = solver.lower()
    nfe = int(nfe)
    lambd = 0.9 if denoise_before_enhancement else 0.1

    def denoise_audio():
        try:
            return denoise(audio_array, sampling_rate, device)
        except Exception as e:
            print("> Error while denoising:", str(e))
            return audio_array, sampling_rate

    def enhance_audio():
        try:
            return enhance(audio_array, sampling_rate, device, nfe=nfe, solver=solver, lambd=lambd, tau=tau)
        except Exception as e:
            print("> Error while enhancing:", str(e))
            return audio_array, sampling_rate

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_denoise = executor.submit(denoise_audio)
        future_enhance = executor.submit(enhance_audio)
        denoised_audio, new_sr1 = future_denoise.result()
        enhanced_audio, new_sr2 = future_enhance.result()

    # Convert to numpy and return
    return (new_sr1, denoised_audio.cpu().numpy()), (new_sr2, enhanced_audio.cpu().numpy())
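
# Usage sketch (hypothetical values): both results come back as (sample_rate, np.ndarray)
# tuples ready for gr.Audio. lambd sets how strongly the signal is denoised before
# enhancement and tau is the CFM prior temperature in resemble-enhance:
#   (dn_sr, dn_wav), (en_sr, en_wav) = enhance_speech(audio, sr, "Midpoint", 64, 0.5, False)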

# Core pipeline behind the Gradio interface: translate, synthesize, then enhance
def _fn(
    src_lang,
    text,
    reference_speaker,
    reference_audio=None,
    solver="Midpoint",
    nfe=128,
    prior_temp=0.01,
    denoise_before_enhancement=False
):
    source_lang = flores_codes[src_lang]
    reference_speaker = os.path.join("./audios", reference_speaker)

    # Step 1: Translate the text to Bambara
    bambara_text = translate_to_bambara(text, source_lang)
    yield bambara_text, None, None, None

    # Step 2: Convert the translated text to speech, cloning the reference audio if provided
    if reference_audio is not None:
        audio_array, sampling_rate = text_to_speech(bambara_text, reference_speaker, reference_audio)
    else:
        audio_array, sampling_rate = text_to_speech(bambara_text, reference_speaker=reference_speaker)
    yield bambara_text, (sampling_rate, audio_array.numpy()), None, None

    # Step 3: Enhance the audio
    denoised_audio, enhanced_audio = enhance_speech(
        audio_array,
        sampling_rate,
        solver,
        nfe,
        prior_temp,
        denoise_before_enhancement
    )
    yield bambara_text, (sampling_rate, audio_array.numpy()), denoised_audio, enhanced_audio
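
# Note: _fn is a generator, so gr.Interface streams each yield to the UI: the Bambara
# translation appears first, then the raw TTS audio, and finally the denoised and
# enhanced versions once resemble-enhance finishes.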

def main():
    lang_codes = list(flores_codes.keys())

    # List all files in the ./audios directory for the voice dropdown
    audio_files = [f for f in os.listdir('./audios') if os.path.isfile(os.path.join('./audios', f))]

    # Build the Gradio app
    app = gr.Interface(
        fn=_fn,
        inputs=[
            gr.Dropdown(label="Source Language", choices=lang_codes, value='French'),
            gr.Textbox(label="Text to Translate", lines=3, value="Thomas Isidore Noël Sankara est fils d'un père Peul — originaire du village de Sitoèga dans le département de Bokin dans la province du Passoré — et d'une mère mossi, et grandit entre valeurs militaires et religiosité chrétienne."),
            gr.Dropdown(label="Voice", choices=audio_files, value="male_3.wav"),
            gr.Audio(label="Clone your voice (optional)", type="numpy", format="wav", elem_id="clone_voice_input"),
            # gr.Dropdown(
            #     choices=["Midpoint", "RK4", "Euler"], value="Midpoint",
            #     label="ODE Solver (Midpoint is recommended)"
            # ),
            # gr.Slider(minimum=1, maximum=128, value=64, step=1, label="Number of Function Evaluations"),
            # gr.Slider(minimum=0.1, maximum=1, value=0.5, step=0.01, label="Prior Temperature"),
            # gr.Checkbox(value=False, label="Denoise Before Enhancement")
        ],
        outputs=[
            gr.Textbox(label="Translated Text"),
            gr.Audio(label="Original TTS Audio", format='wav'),
            gr.Audio(label="Denoised Audio", format='wav'),
            gr.Audio(label="Enhanced Audio", format='wav')
        ],
        examples=[
            ['French', "Mais il n'avait pas l'air content. Je lui ai même dit : « Ce n'est pas de ma faute. » Il n'a pas répondu. J'ai pensé alors que je n'aurais pas dû lui dire cela. En somme, je n'avais pas à m'excuser. C'était plutôt à lui de me présenter ses condoléances.", "male_3.wav", None],
            ['French', "l'asile de vieillards est à Marengo, à quatre-vingts kilomètres d'Alger. Je prendrai l'autobus à deux heures et j'arriverai dans l'après-midi. Ainsi, je pourrai veiller et je rentrerai demain soir. J'ai demandé deux jours de congé à mon patron et il ne pouvait pas me les refuser avec une excuse pareille", "male_3.wav", None],
            ['English', "Today, my mother is dead. Or maybe yesterday, I don't know. I received a telegram from the asylum: “Mother deceased. Hand burial. Distinguished feelings.” It does not mean anything. Maybe it was yesterday.", "male_2.wav", None],
        ],
        css="#clone_voice_input .audio-container button.boundedheight { height: 147px !important; }",
        title="Bambara Translation and Text to Speech with Audio Enhancement",
        description="Translate text to Bambara and convert it to speech with options to enhance audio quality."
    )
    app.launch(share=False)

if __name__ == "__main__":
    main()