import subprocess
import sys

# Install spaCy and its small English model at startup (useful on hosted
# environments such as Hugging Face Spaces); this must run before spaCy is imported.
subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"])
subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])

import pickle
import re

import gradio as gr
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from transformers import AutoTokenizer

# Load the fitted TF-IDF vectorizer and the trained Multinomial Naive Bayes model.
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

with open('mnb_model.pkl', 'rb') as model_file:
    mnb = pickle.load(model_file)

checkpoint = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tf_idf = TfidfVectorizer()
nlp = spacy.load("en_core_web_sm")


class TextPreprocessing:
    def __init__(self, text: str, tokenizer, tfidf_vectorizer: TfidfVectorizer = None):
        self.text = text
        self.tokenizer = tokenizer
        self.tfidf_vectorizer = tfidf_vectorizer or TfidfVectorizer()

    @staticmethod
    def Cleaning_text(text: str) -> str:
        """
        Cleans the input text by converting to lowercase, removing URLs,
        expanding "n't" contractions, and stripping special characters
        and unnecessary spaces.
        """
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        # Handle contractions before removing non-letter characters; otherwise the
        # apostrophes are already stripped and these patterns can never match.
        text = re.sub(r"n't", ' not', text)
        text = re.sub(r"'s", '', text)
        text = re.sub(r"[^a-zA-Z\s]", '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    @staticmethod
    def Tokenization_text(text: str) -> list:
        """
        Tokenizes the text into a list of words, excluding punctuation and spaces.
        """
        doc = nlp(text)
        tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
        return tokens

    @staticmethod
    def Lemmatization_text(text: str) -> str:
        """
        Performs lemmatization on the text and returns the lemmatized version.
        """
        doc = nlp(text)
        lemmatized_text = ' '.join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])
        return lemmatized_text

    @staticmethod
    def Stopwords_removal(text: str) -> str:
        """
        Removes stopwords from the input text.
        """
        doc = nlp(text)
        text_without_stopwords = ' '.join([token.text for token in doc if not token.is_stop])
        return text_without_stopwords

    def ModernBert_Tokenization(self) -> dict:
        """
        Tokenizes the cleaned text using ModernBERT's tokenizer.
        """
        cleaned_text = self.Cleaning_text(self.text)
        tokenized_output = self.tokenizer(cleaned_text, return_tensors='pt', truncation=True, padding=True)
        return tokenized_output

    def Tfidf_Transformation(self, texts: list) -> np.ndarray:
        """
        Applies TF-IDF transformation to a list of texts.

        Args:
            texts (list of str): List of text strings to apply the TF-IDF transformation.

        Returns:
            np.ndarray: TF-IDF feature matrix.
        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
        return tfidf_matrix.toarray()

    def BagOfWords_Transformation(self, texts: list) -> np.ndarray:
        """
        Applies Bag of Words (BoW) transformation to a list of texts.

        Args:
            texts (list of str): List of text strings to apply the BoW transformation.

        Returns:
            np.ndarray: Bag of Words feature matrix.
        """
        vectorizer = CountVectorizer()
        bow_matrix = vectorizer.fit_transform(texts)
        return bow_matrix.toarray()

    def Ngram_Transformation(self, texts: list, ngram_range=(1, 2)) -> np.ndarray:
        """
        Applies N-gram transformation (uni-grams, bi-grams, etc.) to a list of texts.

        Args:
            texts (list of str): List of text strings to apply the N-gram transformation.
            ngram_range (tuple): The range of n-values for n-grams to extract.
                Default is (1, 2) for unigrams and bigrams.

        Returns:
            np.ndarray: N-gram feature matrix.
        """
        vectorizer = CountVectorizer(ngram_range=ngram_range)
        ngram_matrix = vectorizer.fit_transform(texts)
        return ngram_matrix.toarray()
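

# Illustrative usage of the helpers above (a sketch, not part of the app flow):
#
#     preprocessor = TextPreprocessing("Breaking news: ...", tokenizer)
#     cleaned = preprocessor.Cleaning_text(preprocessor.text)
#     bert_inputs = preprocessor.ModernBert_Tokenization()
#     features = preprocessor.Tfidf_Transformation([cleaned])
#
# Note that Tfidf_Transformation fits a fresh vectorizer on the texts it is given;
# the prediction path below uses the already-fitted tfidf_vectorizer loaded from disk.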


def preprocess_text(text):
    # Cleaning_text is a static method, so no instance state is needed here.
    return TextPreprocessing.Cleaning_text(text)


def predict_news(text):
    cleaned_text = preprocess_text(text)
    X_input = tfidf_vectorizer.transform([cleaned_text])
    prediction = mnb.predict(X_input)[0]
    return "Fake News" if prediction == 0 else "Real News"


iface = gr.Interface(
    fn=predict_news,
    inputs=gr.Textbox(lines=7, placeholder="Enter the news article here..."),
    outputs="text",
    title="Fake News Classification",
    description="Classify news articles as real or fake."
)

iface.launch()
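
# Quick sanity check (illustrative; assumes tfidf_vectorizer.pkl and mnb_model.pkl
# sit next to this script): calling predict_news("Some article text ...") directly
# should return either "Fake News" or "Real News".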