# Import necessary libraries and modules
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    MBartForConditionalGeneration,
    MBart50Tokenizer,
)
from gtts import gTTS
from PIL import Image
import gradio as gr


# Pipeline Component 1: Image Captioning Model
class ImageToText:
    def __init__(self):
        """Initializes the BLIP model for image captioning."""
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        print("BLIP Image Captioning Model Loaded")

    def generate_caption(self, img):
        """Generates a caption for the given image."""
        inputs = self.processor(images=img, return_tensors="pt")
        generated_ids = self.model.generate(**inputs)
        caption = self.processor.decode(generated_ids[0], skip_special_tokens=True)
        return caption


# Pipeline Component 2: Arabic Translation Model (mBART)
class ArabicTranslator:
    def __init__(self):
        """Initializes the mBART model for English-to-Arabic translation."""
        self.tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
        self.model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
        print("mBART Arabic Translation Model Loaded")

    def translate(self, text):
        """Translates the given English text to Arabic."""
        # mBART-50 expects the source language to be set on the tokenizer
        # before encoding; __call__ does not accept a src_lang argument.
        self.tokenizer.src_lang = "en_XX"
        inputs = self.tokenizer(text, return_tensors="pt")
        # Force the decoder to start with the Arabic language token.
        translated = self.model.generate(
            **inputs,
            forced_bos_token_id=self.tokenizer.lang_code_to_id["ar_AR"],
        )
        translated_text = self.tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
        return translated_text


# Pipeline Component 3: Text-to-Speech Model (gTTS)
class TextToSpeech:
    def __init__(self, lang="ar"):
        """Initializes the text-to-speech system for Arabic."""
        self.lang = lang

    def generate_audio(self, text):
        """Generates audio from the given Arabic text."""
        tts = gTTS(text=text, lang=self.lang, slow=False)
        audio_file_path = "output.mp3"
        tts.save(audio_file_path)
        return audio_file_path


# Main Pipeline Integration
class ImageToArabicSpeechPipeline:
    def __init__(self):
        """Initializes all pipeline components."""
        self.caption_model = ImageToText()
        self.translation_model = ArabicTranslator()
        self.tts_model = TextToSpeech()

    def process_image(self, img):
        """Generates a caption, translates it to Arabic, and converts it to speech."""
        caption = self.caption_model.generate_caption(img)
        translated_text = self.translation_model.translate(caption)
        audio_file = self.tts_model.generate_audio(translated_text)
        return caption, translated_text, audio_file


# Instantiate the pipeline once at module load so the models are not
# reloaded on every Gradio request.
pipeline = ImageToArabicSpeechPipeline()


# Gradio Interface Setup
def demo(image):
    """Processes the uploaded image and returns the caption, translation, and audio."""
    img = Image.open(image).convert("RGB")  # BLIP expects a 3-channel RGB image
    return pipeline.process_image(img)


# Define the Gradio interface
iface = gr.Interface(
    fn=demo,
    inputs=gr.Image(type="filepath"),
    outputs=[
        gr.Textbox(label="Caption"),
        gr.Textbox(label="Translated Text"),
        gr.Audio(label="Generated Speech"),
    ],
)

# Launch the Gradio interface
iface.launch()
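
# --- Optional: quick local smoke test without the Gradio UI ---
# A minimal sketch for exercising the pipeline from a script or notebook.
# "sample.jpg" is a hypothetical local image path, not part of this project;
# replace it with a real file. Comment out iface.launch() above first, since
# launch() blocks and this code would otherwise never run.
#
# test_img = Image.open("sample.jpg").convert("RGB")
# caption, arabic_text, audio_path = pipeline.process_image(test_img)
# print("Caption:", caption)
# print("Arabic:", arabic_text)
# print("Audio saved to:", audio_path)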