"""Gradio demo with two tabs: speech translation and a small voice assistant.

Both flows start from a recorded or uploaded audio file:

* "Instant Translation": Whisper translates the speech to text, then the
  text is synthesised back to audio.
* "Voice Assistant": Whisper transcribes the speech, an LLM (Ollama first,
  a local transformers pipeline as fallback) answers, and the answer is
  synthesised to audio.
"""
import functools
import os

import emoji
import gradio as gr
import numpy as np
import ollama
import torch
from datasets import load_dataset
from huggingface_hub import login
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
    pipeline,
)

from TTS_models import *

login(token=os.getenv('HF_TOKEN'))

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# load speech translation checkpoint
STT_model_id = "openai/whisper-tiny"

# load llm
llm_model_id = "gemma2:2b"

# init TTS model
TTS_model_id = "tts_models/en/ljspeech/tacotron2-DDC_ph"

# Sample rate reported to Gradio for synthesised audio and used by the
# waveform widgets.
SAMPLE_RATE = 22050

# Text shown in place of a transcription when neither audio input is filled.
NO_AUDIO_MESSAGE = "Sorry no audio was found."

client = ollama.Client()

# Local fallback LLM, used by chatCompletion() when the Ollama call fails.
llmpipe = pipeline(
    "text-generation",
    model="google/gemma-2-2b-it",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device=device
)

TRANSLATION_TAB_MD = """
# Translation of audio to audio
The aim of this tab is to demonstrate the speech-to-speech translation capabilities of the [whisper-tiny](https://huggingface.co/openai/whisper-tiny) model.

It uses:
- [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to transcribe,
- and glow-tts as a voice synthesizer.

You can either record yourself or upload an audio file in the tabs below. This will translate to english.
"""

ASSISTANT_TAB_MD = """
# Voice Assistant
This is a demo to show what are the possibilities for building your own voice assistant.

This demo uses:
- [whisper-tiny](https://huggingface.co/openai/whisper-tiny) to transcribe,
- [ollama/gemma2:2b](https://ollama.com/library/gemma2:2b) model to generate the answer of the assistant,
- and glow-tts as a voice synthesizer.

This means that you need to install ollama on your machine to be able to use this.

You can either record yourself or upload an audio file in the tabs below.
"""


@functools.lru_cache(maxsize=None)
def _get_asr_pipeline(model_id):
    """Build the speech-recognition pipeline for *model_id* once and reuse it."""
    return pipeline("automatic-speech-recognition", model=model_id, device=device)


def _pick_audio(audioMic, audioFile):
    """Return the microphone recording if present, otherwise the uploaded file.

    Returns None when both inputs are None.
    """
    return audioMic if audioMic is not None else audioFile


def translate(audio):
    """Run the ASR pipeline on *audio* with the "translate" task.

    The source language is fixed to French ("fr") in the generation
    arguments. Returns the "text" field of the pipeline output.
    """
    asr_pipe = _get_asr_pipeline(STT_model_id)
    outputs = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs={"task": "translate", "language": "fr"},
    )
    print(f'Translated {outputs} using {asr_pipe.model}')
    return outputs["text"]


def transcribe(audio):
    """Run the ASR pipeline on *audio* with the "transcribe" task.

    Returns the "text" field of the pipeline output.
    """
    asr_pipe = _get_asr_pipeline(STT_model_id)
    outputs = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe"},
    )
    print(f'[transcribe] Transcribe {outputs}')
    return outputs["text"]


def chatCompletion(text):
    """Return the assistant's answer to *text*.

    The Ollama client is tried first; if anything in that path raises, the
    module-level transformers pipeline ``llmpipe`` is used instead.
    """
    messages = [
        {
            "role": "user",
            "content": "You are a helpful assistant. Answer in English only in text.\n\n" + text,
        },
    ]
    try:
        # Probe call kept from the original: it raises early when Ollama
        # cannot be reached, which triggers the fallback below.
        ollama.list()
        response = client.chat(
            model=llm_model_id,
            messages=messages,
            stream=True,
            options={
                'num_predict': 256,
                'temperature': 0.5,
                'low_vram': True,
            },
        )
        # The response is streamed; concatenate the content of every chunk.
        buffer = "".join(chunk["message"]["content"] for chunk in response)
    except Exception as err:
        # Deliberate best-effort fallback, but no longer swallowing
        # KeyboardInterrupt/SystemExit and no longer hiding the cause.
        print(f'[chatCompletion] Ollama request failed ({err!r}); using the transformers pipeline')
        outputs = llmpipe(messages, max_new_tokens=256)
        buffer = outputs[0]["generated_text"][-1]["content"].strip()
    print(f'[chatCompletion] {buffer}')
    return buffer


def synthesise(text):
    """Synthesise *text* to speech and return it as an int16 NumPy array.

    Emoji in *text* are replaced by "!" before synthesis.
    """
    text = emoji.replace_emoji(text, replace="!")
    # NOTE(review): XTTS comes from the project-local TTS_models module and is
    # still constructed per call; confirm whether an instance can be reused.
    synthesiser = XTTS(TTS_model_id)
    speech = synthesiser.synthesize(text)
    # Assumes float samples nominally in [-1, 1] (implied by the 32767 scale).
    # Clipping first prevents out-of-range values from wrapping around in the
    # int16 conversion.
    samples = np.clip(np.array(speech), -1.0, 1.0)
    return (samples * 32767).astype(np.int16)


def speech_to_speech_translation(audioMic, audioFile):
    """Translate the supplied audio and speak the translation.

    Returns ``((sample_rate, samples), translated_text)``. When neither input
    is provided, returns ``(None, NO_AUDIO_MESSAGE)`` instead of passing None
    to the ASR pipeline.
    """
    audio = _pick_audio(audioMic, audioFile)
    if audio is None:
        return None, NO_AUDIO_MESSAGE

    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    return (SAMPLE_RATE, synthesised_speech), translated_text


def speech_to_speech(audioMic, audioFile):
    """Transcribe the supplied audio, ask the LLM, and speak its answer.

    Returns ``((sample_rate, samples), transcribed_text, answer)``. When
    neither input is provided, returns ``(None, NO_AUDIO_MESSAGE, "")`` so
    that all three outputs are always defined.
    """
    audio = _pick_audio(audioMic, audioFile)
    if audio is None:
        return None, NO_AUDIO_MESSAGE, ""

    # Transcribe audio
    translated_text = transcribe(audio)
    # Call LLM
    answer = chatCompletion(translated_text)
    # Synthesize answer
    synthesised_speech = synthesise(answer)

    print(f'[speech_to_speech] Transcribed text {translated_text}')
    print(f'[speech_to_speech] LLM answer {answer}')
    return (SAMPLE_RATE, synthesised_speech), translated_text, answer


with gr.Blocks() as demo:
    options = gr.WaveformOptions(sample_rate=SAMPLE_RATE)

    with gr.Tab("Instant Translation"):
        gr.Markdown(TRANSLATION_TAB_MD)
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Tab("Record Audio"):
                    audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
                with gr.Tab("Upload Audio"):
                    audioFile = gr.Audio(sources="upload", type="filepath")
                transcribeBtn = gr.Button("Submit", size='lg')
            with gr.Column(scale=1):
                textOutput = gr.Textbox(label="Transcribed text")
                audioOutput = gr.Audio(waveform_options=options, type="numpy")

        # NOTE(review): both buttons register api_name="report_generation";
        # confirm how Gradio disambiguates the two endpoints.
        transcribeBtn.click(
            fn=speech_to_speech_translation,
            inputs=[audioMic, audioFile],
            outputs=[audioOutput, textOutput],
            api_name="report_generation",
        )

    with gr.Tab("Voice Assistant"):
        gr.Markdown(ASSISTANT_TAB_MD)
        with gr.Row():
            with gr.Column(scale=1):
                # The component names below are rebound on purpose: the first
                # tab's click handler already holds its own components.
                with gr.Tab("Record Audio"):
                    audioMic = gr.Audio(sources="microphone", waveform_options=options, type="filepath")
                with gr.Tab("Upload Audio"):
                    audioFile = gr.Audio(sources="upload", type="filepath")
                translateBtn = gr.Button("Submit", size='lg')
            with gr.Column(scale=1):
                textOutput = gr.Textbox(label="Transcribed text")
                textAnswer = gr.Textbox(label="Assistant's Answer")
                audioOutput = gr.Audio(waveform_options=options, type="numpy")

        translateBtn.click(
            fn=speech_to_speech,
            inputs=[audioMic, audioFile],
            outputs=[audioOutput, textOutput, textAnswer],
            api_name="report_generation",
        )

demo.launch()