import pickle import gradio as gr import re import spacy from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer import numpy as np from transformers import AutoTokenizer import subprocess import sys subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"]) subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"]) with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file: tfidf_vectorizer = pickle.load(vectorizer_file) with open('mnb_model.pkl', 'rb') as model_file: mnb = pickle.load(model_file) checkpoint = "answerdotai/ModernBERT-base" tokenizer = AutoTokenizer.from_pretrained(checkpoint) tf_idf = TfidfVectorizer() nlp = spacy.load("en_core_web_sm") class TextPreprocessing: def __init__(self, text: str, tokenizer, tfidf_vectorizer: TfidfVectorizer = None): self.text = text self.tokenizer = tokenizer self.tfidf_vectorizer = tfidf_vectorizer or TfidfVectorizer() @staticmethod def Cleaning_text(text: str) -> str: """ Cleans the input text by converting to lowercase, removing URLs, special characters, and unnecessary spaces. """ text = text.lower() text = re.sub(r'http\S+|www\S+|https\S+', '', text) text = re.sub(r"[^a-zA-Z\s]", '', text) text = re.sub(r"n't", ' not', text) text = re.sub(r"'s", '', text) text = re.sub(r'\s+', ' ', text).strip() return text @staticmethod def Tokenization_text(text: str) -> list: """ Tokenizes the text into a list of words, excluding punctuations and spaces. """ doc = nlp(text) tokens = [token.text for token in doc if not token.is_punct and not token.is_space] return tokens @staticmethod def Lemmatization_text(text: str) -> str: """ Performs lemmatization on the text and returns the lemmatized version. """ doc = nlp(text) lemmatized_text = ' '.join([token.lemma_ for token in doc if not token.is_punct and not token.is_space]) return lemmatized_text @staticmethod def Stopwords_removal(text: str) -> str: """ Removes stopwords from the input text. """ doc = nlp(text) text_without_stopwords = ' '.join([token.text for token in doc if not token.is_stop]) return text_without_stopwords def ModernBert_Tokenization(self) -> dict: """ Tokenizes the cleaned text using ModernBERT's tokenizer. """ cleaned_text = self.Cleaning_text(self.text) tokenized_output = self.tokenizer(cleaned_text, return_tensors='pt', truncation=True, padding=True) return tokenized_output def Tfidf_Transformation(self, texts: list) -> np.ndarray: """ Applies TF-IDF transformation to a list of texts. Args: texts (list of str): List of text strings to apply the TF-IDF transformation. Returns: np.ndarray: TF-IDF feature matrix. """ tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts) return tfidf_matrix.toarray() def BagOfWords_Transformation(self, texts: list) -> np.ndarray: """ Applies Bag of Words (BoW) transformation to a list of texts. Args: texts (list of str): List of text strings to apply the BoW transformation. Returns: np.ndarray: Bag of Words feature matrix. """ vectorizer = CountVectorizer() bow_matrix = vectorizer.fit_transform(texts) return bow_matrix.toarray() def Ngram_Transformation(self, texts: list, ngram_range=(1, 2)) -> np.ndarray: """ Applies N-gram transformation (uni-grams, bi-grams, etc.) to a list of texts. Args: texts (list of str): List of text strings to apply the N-gram transformation. ngram_range (tuple): The range of n-values for n-grams to extract. Default is (1, 2) for unigrams and bigrams. Returns: np.ndarray: N-gram feature matrix. """ vectorizer = CountVectorizer(ngram_range=ngram_range) ngram_matrix = vectorizer.fit_transform(texts) return ngram_matrix.toarray() def preprocess_text(text): text_preprocessor = TextPreprocessing(text=None, tokenizer=None) cleaned_text = text_preprocessor.Cleaning_text(text) return cleaned_text def predict_news(text): cleaned_text = preprocess_text(text) X_input = tfidf_vectorizer.transform([cleaned_text]) prediction = mnb.predict(X_input) return "Fake News" if prediction == 0 else "Real News" iface = gr.Interface( fn=predict_news, inputs=gr.Textbox(lines=7, placeholder="Enter the news article here..."), outputs="text", title="Fake News Classification", description="Classify news articles as real or fake." ) iface.launch()