import pickle
import gradio as gr
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
from transformers import AutoTokenizer
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"])
subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])

with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

with open('mnb_model.pkl', 'rb') as model_file:
    mnb = pickle.load(model_file)


checkpoint = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tf_idf = TfidfVectorizer()

nlp = spacy.load("en_core_web_sm")

class TextPreprocessing:
    def __init__(self, text: str, tokenizer, tfidf_vectorizer: TfidfVectorizer = None):
        self.text = text
        self.tokenizer = tokenizer
        self.tfidf_vectorizer = tfidf_vectorizer or TfidfVectorizer()

    @staticmethod
    def Cleaning_text(text: str) -> str:
        """
        Cleans the input text by converting to lowercase,
        removing URLs, special characters, and unnecessary spaces.
        """
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        text = re.sub(r"[^a-zA-Z\s]", '', text)
        text = re.sub(r"n't", ' not', text)
        text = re.sub(r"'s", '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    @staticmethod
    def Tokenization_text(text: str) -> list:
        """
        Tokenizes the text into a list of words, excluding punctuations and spaces.
        """
        doc = nlp(text)
        tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
        return tokens

    @staticmethod
    def Lemmatization_text(text: str) -> str:
        """
        Performs lemmatization on the text and returns the lemmatized version.
        """
        doc = nlp(text)
        lemmatized_text = ' '.join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])
        return lemmatized_text

    @staticmethod
    def Stopwords_removal(text: str) -> str:
        """
        Removes stopwords from the input text.
        """
        doc = nlp(text)
        text_without_stopwords = ' '.join([token.text for token in doc if not token.is_stop])
        return text_without_stopwords

    def ModernBert_Tokenization(self) -> dict:
        """
        Tokenizes the cleaned text using ModernBERT's tokenizer.
        """
        cleaned_text = self.Cleaning_text(self.text)
        tokenized_output = self.tokenizer(cleaned_text, return_tensors='pt', truncation=True, padding=True)
        return tokenized_output

    def Tfidf_Transformation(self, texts: list) -> np.ndarray:
        """
        Applies TF-IDF transformation to a list of texts.

        Args:
            texts (list of str): List of text strings to apply the TF-IDF transformation.

        Returns:
            np.ndarray: TF-IDF feature matrix.
        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
        return tfidf_matrix.toarray()

    def BagOfWords_Transformation(self, texts: list) -> np.ndarray:
        """
        Applies Bag of Words (BoW) transformation to a list of texts.

        Args:
            texts (list of str): List of text strings to apply the BoW transformation.

        Returns:
            np.ndarray: Bag of Words feature matrix.
        """
        vectorizer = CountVectorizer()
        bow_matrix = vectorizer.fit_transform(texts)
        return bow_matrix.toarray()

    def Ngram_Transformation(self, texts: list, ngram_range=(1, 2)) -> np.ndarray:
        """
        Applies N-gram transformation (uni-grams, bi-grams, etc.) to a list of texts.

        Args:
            texts (list of str): List of text strings to apply the N-gram transformation.
            ngram_range (tuple): The range of n-values for n-grams to extract. Default is (1, 2) for unigrams and bigrams.

        Returns:
            np.ndarray: N-gram feature matrix.
        """
        vectorizer = CountVectorizer(ngram_range=ngram_range)
        ngram_matrix = vectorizer.fit_transform(texts)
        return ngram_matrix.toarray()

    
def preprocess_text(text):
    text_preprocessor = TextPreprocessing(text=None, tokenizer=None)
    cleaned_text = text_preprocessor.Cleaning_text(text)
    return cleaned_text


def predict_news(text):
    cleaned_text = preprocess_text(text)
    X_input = tfidf_vectorizer.transform([cleaned_text])
    prediction = mnb.predict(X_input)
    return "Fake News" if prediction == 0 else "Real News"


iface = gr.Interface(
    fn=predict_news, 
    inputs=gr.Textbox(lines=7, placeholder="Enter the news article here..."), 
    outputs="text", 
    title="Fake News Classification", 
    description="Classify news articles as real or fake."
)

iface.launch()