import subprocess
import sys

# Install spaCy and its small English model at startup (on Hugging Face
# Spaces this is normally handled by requirements.txt; the runtime install
# is kept as a fallback). This must run before spaCy is imported below.
subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"])
subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])

import pickle
import re

import gradio as gr
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from transformers import AutoTokenizer
# Load the fitted TF-IDF vectorizer and Multinomial Naive Bayes classifier
# that were trained offline and saved alongside this app.
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

with open('mnb_model.pkl', 'rb') as model_file:
    mnb = pickle.load(model_file)

checkpoint = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

nlp = spacy.load("en_core_web_sm")
class TextPreprocessing:
    def __init__(self, text: str, tokenizer, tfidf_vectorizer: TfidfVectorizer = None):
        self.text = text
        self.tokenizer = tokenizer
        self.tfidf_vectorizer = tfidf_vectorizer or TfidfVectorizer()
    @staticmethod
    def Cleaning_text(text: str) -> str:
        """
        Cleans the input text by lowercasing it and removing URLs,
        special characters, and redundant whitespace.
        """
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        # Expand "n't" and drop "'s" before stripping punctuation; otherwise
        # the apostrophes are already gone and these patterns never match.
        text = re.sub(r"n't", ' not', text)
        text = re.sub(r"'s", '', text)
        text = re.sub(r"[^a-zA-Z\s]", '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text
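    # Illustrative example (hypothetical input, not from the original app):
    #   Cleaning_text("Don't miss this! http://example.com") -> "do not miss this"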
    @staticmethod
    def Tokenization_text(text: str) -> list:
        """
        Tokenizes the text into a list of words, excluding punctuation and whitespace.
        """
        doc = nlp(text)
        tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
        return tokens
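    # Illustrative example (hypothetical input):
    #   Tokenization_text("Hello, world!") -> ["Hello", "world"]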
    @staticmethod
    def Lemmatization_text(text: str) -> str:
        """
        Performs lemmatization on the text and returns the lemmatized version.
        """
        doc = nlp(text)
        lemmatized_text = ' '.join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])
        return lemmatized_text
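    # Illustrative example (hypothetical input):
    #   Lemmatization_text("The cats were running") -> "the cat be run"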
    @staticmethod
    def Stopwords_removal(text: str) -> str:
        """
        Removes stopwords from the input text.
        """
        doc = nlp(text)
        text_without_stopwords = ' '.join([token.text for token in doc if not token.is_stop])
        return text_without_stopwords
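    # Illustrative example (hypothetical input):
    #   Stopwords_removal("this is a simple example") -> "simple example"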
    def ModernBert_Tokenization(self) -> dict:
        """
        Tokenizes the cleaned text using ModernBERT's tokenizer.
        """
        cleaned_text = self.Cleaning_text(self.text)
        tokenized_output = self.tokenizer(cleaned_text, return_tensors='pt', truncation=True, padding=True)
        return tokenized_output
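    # The Hugging Face tokenizer returns a BatchEncoding with 'input_ids' and
    # 'attention_mask' tensors, ready to feed to a ModernBERT model.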
    def Tfidf_Transformation(self, texts: list) -> np.ndarray:
        """
        Applies TF-IDF transformation to a list of texts.

        Args:
            texts (list of str): List of text strings to apply the TF-IDF transformation.

        Returns:
            np.ndarray: TF-IDF feature matrix.
        """
        # Note: fit_transform refits this instance's vectorizer on `texts`;
        # it is independent of the pickled vectorizer used for prediction.
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
        return tfidf_matrix.toarray()
    def BagOfWords_Transformation(self, texts: list) -> np.ndarray:
        """
        Applies a Bag-of-Words (BoW) transformation to a list of texts.

        Args:
            texts (list of str): List of text strings to apply the BoW transformation.

        Returns:
            np.ndarray: Bag-of-Words feature matrix.
        """
        vectorizer = CountVectorizer()
        bow_matrix = vectorizer.fit_transform(texts)
        return bow_matrix.toarray()
    def Ngram_Transformation(self, texts: list, ngram_range=(1, 2)) -> np.ndarray:
        """
        Applies an N-gram transformation (unigrams, bigrams, etc.) to a list of texts.

        Args:
            texts (list of str): List of text strings to apply the N-gram transformation.
            ngram_range (tuple): Range of n-values for the n-grams to extract.
                Default is (1, 2), i.e. unigrams and bigrams.

        Returns:
            np.ndarray: N-gram feature matrix.
        """
        vectorizer = CountVectorizer(ngram_range=ngram_range)
        ngram_matrix = vectorizer.fit_transform(texts)
        return ngram_matrix.toarray()
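# Illustrative sketch (not called by the app): how the feature-extraction
# helpers above could be exercised on a toy corpus. The sample texts are
# hypothetical.
def _demo_feature_extraction():
    sample_texts = ["the cat sat on the mat", "the dog sat on the log"]
    preprocessor = TextPreprocessing(text=sample_texts[0], tokenizer=tokenizer)
    print(preprocessor.Tfidf_Transformation(sample_texts).shape)       # (2, 7): 7 unique words
    print(preprocessor.BagOfWords_Transformation(sample_texts).shape)  # (2, 7): same vocabulary
    print(preprocessor.Ngram_Transformation(sample_texts, (1, 2)).shape)  # (2, 15): 7 unigrams + 8 bigrams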
def preprocess_text(text):
    # Only the static cleaning step is needed here, so call it directly
    # instead of instantiating TextPreprocessing.
    return TextPreprocessing.Cleaning_text(text)
def predict_news(text):
    cleaned_text = preprocess_text(text)
    X_input = tfidf_vectorizer.transform([cleaned_text])
    prediction = mnb.predict(X_input)
    # predict() returns an array; label 0 is treated as fake, anything else as real.
    return "Fake News" if prediction[0] == 0 else "Real News"
iface = gr.Interface(
    fn=predict_news,
    inputs=gr.Textbox(lines=7, placeholder="Enter the news article here..."),
    outputs="text",
    title="Fake News Classification",
    description="Classify news articles as real or fake.",
)

iface.launch()