AudioTranscribe / app.py
ZennyKenny's picture
support for ad hoc recording
e88a1f3 verified
raw
history blame
3.02 kB
import numbers
import os
import tempfile

import gradio as gr
import librosa
import soundfile as sf
import spaces
import torch
from transformers import pipeline
def split_audio(audio_data, sr, chunk_duration=30):
    """Split audio into consecutive chunks of at most ``chunk_duration`` seconds.

    Args:
        audio_data: Sliceable sequence of samples (e.g. a list or an array);
            slicing is applied along the first axis.
        sr: Sample rate, in samples per second.
        chunk_duration: Length of each chunk, in seconds.

    Returns:
        A list of slices of ``audio_data`` in their original order. The last
        chunk holds whatever samples remain and may be shorter than the
        others. Empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_duration * sr`` truncates to less than one
            sample, since no progress could be made through the audio.
    """
    # Compute the chunk size once instead of on every loop iteration.
    samples_per_chunk = int(chunk_duration * sr)
    if samples_per_chunk <= 0:
        # A zero step would make range() fail with an unrelated-looking error,
        # and a negative step would silently drop all of the audio.
        raise ValueError(
            "chunk_duration * sr must be at least one sample, "
            f"got chunk_duration={chunk_duration!r}, sr={sr!r}"
        )
    return [
        audio_data[start:start + samples_per_chunk]
        for start in range(0, len(audio_data), samples_per_chunk)
    ]
def _unpack_raw_audio(audio_input):
    """Return ``(samples, sample_rate)`` from a two-item raw-audio pair."""
    first, second = audio_input
    # Gradio's numpy audio arrives as (sample_rate, samples). The reverse
    # order is still accepted so callers passing (samples, sample_rate)
    # keep working.
    if isinstance(first, numbers.Real):
        return second, first
    return first, second


def transcribe_long_audio(audio_input, transcriber, chunk_duration=30):
    """Transcribe long audio by splitting it into smaller chunks.

    Args:
        audio_input: Either a path to an audio file (``str``), which is loaded
            with librosa at its native sample rate, or a two-item pair holding
            the sample rate and the sample data in either order.
        transcriber: Callable that takes the path of a WAV file and returns a
            mapping with a ``"text"`` entry.
        chunk_duration: Length, in seconds, of each chunk passed to
            ``transcriber``.

    Returns:
        The chunk transcriptions joined with single spaces, in audio order.

    Raises:
        ValueError: If ``audio_input`` is ``None``.
    """
    if audio_input is None:
        raise ValueError("No audio was provided.")

    # Check if audio_input is a file path or raw data
    if isinstance(audio_input, str):
        audio_data, sr = librosa.load(audio_input, sr=None)
    else:  # Raw audio data (e.g., from recording)
        audio_data, sr = _unpack_raw_audio(audio_input)

    chunks = split_audio(audio_data, sr, chunk_duration)
    transcriptions = []
    # A private temporary directory keeps concurrent calls from overwriting
    # each other's chunk files, and it is removed together with its contents
    # even when writing or transcription raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        for i, chunk in enumerate(chunks):
            temp_path = os.path.join(temp_dir, f"temp_chunk_{i}.wav")
            sf.write(temp_path, chunk, sr)  # Save the chunk as a WAV file
            transcriptions.append(transcriber(temp_path)["text"])
    return " ".join(transcriptions)
@spaces.GPU(duration=3)
def main():
    """Load the speech and summarization models, then build and launch the UI.

    The Gradio interface has one audio input and a button; pressing the button
    fills a "Full Transcription" box and a "Summary" box.

    Raises:
        Exception: Re-raised unchanged (after being printed) if either model
            pipeline fails to load.
    """
    # NOTE(review): the GPU decorator wraps the app launcher rather than the
    # inference callback, with duration=3 — confirm this is what the target
    # Spaces hardware expects.

    # Force GPU if available, fallback to CPU
    device = 0 if torch.cuda.is_available() else -1
    try:
        # Load both models on the same explicit device.
        transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
    except Exception as e:
        print(f"Error loading models: {e}")
        raise

    def process_audio(audio_input):
        """Return ``(transcription, summary)`` for the submitted audio.

        Failures are reported as text in the first output instead of raising,
        so the message is shown in the transcription box.
        """
        if audio_input is None:
            # The button was pressed before anything was recorded.
            return "Error processing audio: no audio was provided", ""
        try:
            # Transcribe the audio (long-form support)
            transcription = transcribe_long_audio(audio_input, transcriber, chunk_duration=30)
            if not transcription.strip():
                # Nothing was recognised, so there is nothing to summarize.
                return transcription, ""
            # Truncate over-long transcripts to the summarizer's input limit
            # rather than failing and discarding the transcription.
            summary = summarizer(
                transcription,
                max_length=50,
                min_length=10,
                do_sample=False,
                truncation=True,
            )[0]["summary_text"]
            return transcription, summary
        except Exception as e:
            return f"Error processing audio: {e}", ""

    # Gradio Interface with Horizontal Layout
    with gr.Blocks() as interface:
        with gr.Row():
            with gr.Column():
                # NOTE(review): the label mentions uploading, but the source is
                # microphone-only — confirm against the installed Gradio version.
                audio_input = gr.Audio(source="microphone", type="numpy", label="Record or Upload Audio")
                process_button = gr.Button("Process Audio")
            with gr.Column():
                transcription_output = gr.Textbox(label="Full Transcription", lines=10)
                summary_output = gr.Textbox(label="Summary", lines=5)
        process_button.click(
            process_audio,
            inputs=[audio_input],
            outputs=[transcription_output, summary_output],
        )

    # Launch the interface with optional public sharing
    interface.launch(share=True)
# Script entry point: start the app only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()