from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    MBartForConditionalGeneration,
    MBart50Tokenizer,
)
from gtts import gTTS
from PIL import Image
import gradio as gr
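# Assumed dependencies (the usual PyPI package names; versions not pinned here):
#   pip install transformers torch gtts pillow gradio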
class ImageToText:
    def __init__(self):
        """Initializes the BLIP model for image captioning."""
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        print("BLIP Image Captioning Model Loaded")

    def generate_caption(self, img):
        """Generates a caption for the given image."""
        inputs = self.processor(images=img, return_tensors="pt")
        generated_ids = self.model.generate(**inputs)
        caption = self.processor.decode(generated_ids[0], skip_special_tokens=True)
        return caption
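# Quick standalone check of the captioner (a minimal sketch; assumes a local
# image file "example.jpg" exists):
#
#   captioner = ImageToText()
#   print(captioner.generate_caption(Image.open("example.jpg")))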
class ArabicTranslator:
    def __init__(self):
        """Initializes the mBART model for English-to-Arabic translation."""
        self.tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
        self.model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
        print("mBART Arabic Translation Model Loaded")

    def translate(self, text):
        """Translates the given English text to Arabic."""
        # mBART-50 expects the source language to be set on the tokenizer
        # before encoding, not passed to __call__.
        self.tokenizer.src_lang = "en_XX"
        inputs = self.tokenizer(text, return_tensors="pt")
        # Force the decoder to start with the Arabic language token.
        translated = self.model.generate(
            **inputs,
            forced_bos_token_id=self.tokenizer.lang_code_to_id["ar_AR"],
        )
        translated_text = self.tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
        return translated_text
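# Standalone translation check (illustrative; the English sentence is
# arbitrary):
#
#   translator = ArabicTranslator()
#   print(translator.translate("A cat is sitting on a chair."))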
class TextToSpeech:
    def __init__(self, lang='ar'):
        """Initializes the text-to-speech system for Arabic."""
        self.lang = lang

    def generate_audio(self, text):
        """Generates audio from the given Arabic text."""
        tts = gTTS(text=text, lang=self.lang, slow=False)
        audio_file_path = 'output.mp3'
        tts.save(audio_file_path)
        return audio_file_path
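# Standalone TTS check (note that gTTS calls Google's online TTS service, so
# this needs network access, and 'output.mp3' is overwritten on every call):
#
#   tts = TextToSpeech()
#   print(tts.generate_audio("مرحبا بالعالم"))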
class ImageToArabicSpeechPipeline:
    def __init__(self):
        """Initializes all pipeline components."""
        self.caption_model = ImageToText()
        self.translation_model = ArabicTranslator()
        self.tts_model = TextToSpeech()

    def process_image(self, img):
        """Captions the image, translates the caption to Arabic, and converts it to speech."""
        caption = self.caption_model.generate_caption(img)
        translated_text = self.translation_model.translate(caption)
        audio_file = self.tts_model.generate_audio(translated_text)
        return caption, translated_text, audio_file
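# End-to-end use without the Gradio UI (a sketch; again assumes a local
# "example.jpg"):
#
#   pipe = ImageToArabicSpeechPipeline()
#   caption, arabic_text, audio_path = pipe.process_image(Image.open("example.jpg"))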
# Build the pipeline once at module load; constructing it inside the handler
# would reload every model on each request.
pipeline = ImageToArabicSpeechPipeline()


def demo(image):
    """Gradio handler: returns the caption, its Arabic translation, and the generated audio."""
    img = Image.open(image)
    caption, translated_text, audio_file = pipeline.process_image(img)
    return caption, translated_text, audio_file
iface = gr.Interface(
    fn=demo,
    inputs=gr.Image(type="filepath"),
    outputs=[
        gr.Textbox(label="Caption"),
        gr.Textbox(label="Translated Text"),
        gr.Audio(label="Generated Speech"),
    ],
)

if __name__ == "__main__":
    iface.launch()
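# When running in a hosted notebook, a temporary public URL can be requested
# with iface.launch(share=True).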