import pickle
import re
import subprocess
import sys

import gradio as gr
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from transformers import AutoTokenizer

# Install spaCy and download its small English model before importing/loading them,
# so a fresh environment does not fail on the import below.
subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"])
subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])

import spacy

# Load the pre-fitted TF-IDF vectorizer and the trained Multinomial Naive Bayes model.
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

with open('mnb_model.pkl', 'rb') as model_file:
    mnb = pickle.load(model_file)


checkpoint = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tf_idf = TfidfVectorizer()

nlp = spacy.load("en_core_web_sm")

class TextPreprocessing:
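    """
    Bundles the text-preprocessing helpers used by this app: regex-based cleaning,
    spaCy tokenization/lemmatization/stop-word removal, ModernBERT tokenization,
    and TF-IDF / bag-of-words / n-gram feature extraction.
    """
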
    def __init__(self, text: str, tokenizer, tfidf_vectorizer: TfidfVectorizer = None):
        self.text = text
        self.tokenizer = tokenizer
        self.tfidf_vectorizer = tfidf_vectorizer or TfidfVectorizer()

    @staticmethod
    def Cleaning_text(text: str) -> str:
        """
        Cleans the input text by converting to lowercase,
        removing URLs, special characters, and unnecessary spaces.
        """
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        # Expand contractions before stripping non-letter characters; otherwise the
        # apostrophes are already gone and these two patterns can never match.
        text = re.sub(r"n't", ' not', text)
        text = re.sub(r"'s", '', text)
        text = re.sub(r"[^a-zA-Z\s]", '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text
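
    # A minimal illustration with a hypothetical input (contractions are expanded
    # before the non-letter characters are stripped):
    #   Cleaning_text("Check https://x.co, it isn't REAL!")  ->  "check it is not real"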

    @staticmethod
    def Tokenization_text(text: str) -> list:
        """
        Tokenizes the text into a list of words, excluding punctuations and spaces.
        """
        doc = nlp(text)
        tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
        return tokens
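
    # For instance (hypothetical input), punctuation and whitespace tokens are dropped:
    #   Tokenization_text("Breaking news: aliens land!")  ->  ['Breaking', 'news', 'aliens', 'land']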

    @staticmethod
    def Lemmatization_text(text: str) -> str:
        """
        Performs lemmatization on the text and returns the lemmatized version.
        """
        doc = nlp(text)
        lemmatized_text = ' '.join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])
        return lemmatized_text
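
    # For instance (hypothetical input; exact lemmas depend on the spaCy model):
    #   Lemmatization_text("the senators were voting")  ->  roughly "the senator be vote"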

    @staticmethod
    def Stopwords_removal(text: str) -> str:
        """
        Removes stopwords from the input text.
        """
        doc = nlp(text)
        text_without_stopwords = ' '.join([token.text for token in doc if not token.is_stop])
        return text_without_stopwords
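
    # For instance (hypothetical input), "this" and "is" are in spaCy's stop-word list:
    #   Stopwords_removal("this is breaking news")  ->  "breaking news"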

    def ModernBert_Tokenization(self) -> dict:
        """
        Tokenizes the cleaned text using ModernBERT's tokenizer.
        """
        cleaned_text = self.Cleaning_text(self.text)
        tokenized_output = self.tokenizer(cleaned_text, return_tensors='pt', truncation=True, padding=True)
        return tokenized_output
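
    # The result is a batch encoding with PyTorch tensors (keys such as 'input_ids'
    # and 'attention_mask'), e.g. with a hypothetical input:
    #   TextPreprocessing("Some headline", tokenizer).ModernBert_Tokenization()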

    def Tfidf_Transformation(self, texts: list) -> np.ndarray:
        """
        Applies TF-IDF transformation to a list of texts.

        Args:
            texts (list of str): List of text strings to apply the TF-IDF transformation.

        Returns:
            np.ndarray: TF-IDF feature matrix.
        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
        return tfidf_matrix.toarray()
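
    # A small usage sketch (hypothetical corpus): one row per text and one column per
    # vocabulary term, so ["fake news", "real news"] gives a (2, 3) matrix over the
    # vocabulary ['fake', 'news', 'real']. Note that fit_transform re-fits whichever
    # vectorizer this instance holds, independently of the pickled tfidf_vectorizer
    # used by predict_news below.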

    def BagOfWords_Transformation(self, texts: list) -> np.ndarray:
        """
        Applies Bag of Words (BoW) transformation to a list of texts.

        Args:
            texts (list of str): List of text strings to apply the BoW transformation.

        Returns:
            np.ndarray: Bag of Words feature matrix.
        """
        vectorizer = CountVectorizer()
        bow_matrix = vectorizer.fit_transform(texts)
        return bow_matrix.toarray()
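
    # For instance (hypothetical corpus), raw term counts over ['fake', 'news', 'real']:
    #   BagOfWords_Transformation(["fake news", "real news"])
    #   -> [[1, 1, 0],
    #       [0, 1, 1]]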

    def Ngram_Transformation(self, texts: list, ngram_range=(1, 2)) -> np.ndarray:
        """
        Applies N-gram transformation (uni-grams, bi-grams, etc.) to a list of texts.

        Args:
            texts (list of str): List of text strings to apply the N-gram transformation.
            ngram_range (tuple): The range of n-values for n-grams to extract. Default is (1, 2) for unigrams and bigrams.

        Returns:
            np.ndarray: N-gram feature matrix.
        """
        vectorizer = CountVectorizer(ngram_range=ngram_range)
        ngram_matrix = vectorizer.fit_transform(texts)
        return ngram_matrix.toarray()
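
    # With the default ngram_range=(1, 2) the features are unigrams plus bigrams, so a
    # hypothetical corpus ["fake news", "real news"] yields the five columns
    # ['fake', 'fake news', 'news', 'real', 'real news'] and a (2, 5) count matrix.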

    


def preprocess_text(text):
    # Cleaning_text is a static method, so it can be called on the class directly
    # without building a throwaway instance.
    return TextPreprocessing.Cleaning_text(text)


def predict_news(text):
    cleaned_text = preprocess_text(text)
    X_input = tfidf_vectorizer.transform([cleaned_text])
    # predict() returns an array; take the single label for this one input.
    prediction = mnb.predict(X_input)[0]
    return "Fake News" if prediction == 0 else "Real News"


iface = gr.Interface(
    fn=predict_news, 
    inputs=gr.Textbox(lines=7, placeholder="Enter the news article here..."), 
    outputs="text", 
    title="Fake News Classification", 
    description="Classify news articles as real or fake."
)

iface.launch()
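
# launch() with no arguments is sufficient on a hosted platform such as Hugging Face
# Spaces; when running locally, iface.launch(share=True) would additionally create a
# temporary public URL (a standard Gradio option, not used here).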