RaghuCourage9605
committed on
Update app.py
app.py
CHANGED
@@ -4,36 +4,38 @@ import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
-import
+from transformers import AutoTokenizer
import subprocess
-import os
import sys

-
subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"])
subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])

-# Load pre-trained models and vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

with open('mnb_model.pkl', 'rb') as model_file:
    mnb = pickle.load(model_file)

-nlp = spacy.load("en_core_web_sm")

-
-
-
+checkpoint = "answerdotai/ModernBERT-base"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+tf_idf = TfidfVectorizer()
+
+nlp = spacy.load("en_core_web_sm")

class TextPreprocessing:
-    def __init__(self, text: str, tokenizer
+    def __init__(self, text: str, tokenizer, tfidf_vectorizer: TfidfVectorizer = None):
        self.text = text
        self.tokenizer = tokenizer
        self.tfidf_vectorizer = tfidf_vectorizer or TfidfVectorizer()

    @staticmethod
    def Cleaning_text(text: str) -> str:
+        """
+        Cleans the input text by converting to lowercase,
+        removing URLs, special characters, and unnecessary spaces.
+        """
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        text = re.sub(r"[^a-zA-Z\s]", '', text)
@@ -44,88 +46,103 @@ class TextPreprocessing:

    @staticmethod
    def Tokenization_text(text: str) -> list:
+        """
+        Tokenizes the text into a list of words, excluding punctuations and spaces.
+        """
        doc = nlp(text)
        tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
        return tokens

    @staticmethod
    def Lemmatization_text(text: str) -> str:
+        """
+        Performs lemmatization on the text and returns the lemmatized version.
+        """
        doc = nlp(text)
        lemmatized_text = ' '.join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])
        return lemmatized_text

    @staticmethod
    def Stopwords_removal(text: str) -> str:
+        """
+        Removes stopwords from the input text.
+        """
        doc = nlp(text)
        text_without_stopwords = ' '.join([token.text for token in doc if not token.is_stop])
        return text_without_stopwords

-    def ModernBert_Tokenization(self):
-
-
+    def ModernBert_Tokenization(self) -> dict:
+        """
+        Tokenizes the cleaned text using ModernBERT's tokenizer.
+        """
        cleaned_text = self.Cleaning_text(self.text)
        tokenized_output = self.tokenizer(cleaned_text, return_tensors='pt', truncation=True, padding=True)
        return tokenized_output

    def Tfidf_Transformation(self, texts: list) -> np.ndarray:
+        """
+        Applies TF-IDF transformation to a list of texts.
+
+        Args:
+            texts (list of str): List of text strings to apply the TF-IDF transformation.
+
+        Returns:
+            np.ndarray: TF-IDF feature matrix.
+        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
        return tfidf_matrix.toarray()

    def BagOfWords_Transformation(self, texts: list) -> np.ndarray:
+        """
+        Applies Bag of Words (BoW) transformation to a list of texts.
+
+        Args:
+            texts (list of str): List of text strings to apply the BoW transformation.
+
+        Returns:
+            np.ndarray: Bag of Words feature matrix.
+        """
        vectorizer = CountVectorizer()
        bow_matrix = vectorizer.fit_transform(texts)
        return bow_matrix.toarray()

    def Ngram_Transformation(self, texts: list, ngram_range=(1, 2)) -> np.ndarray:
+        """
+        Applies N-gram transformation (uni-grams, bi-grams, etc.) to a list of texts.
+
+        Args:
+            texts (list of str): List of text strings to apply the N-gram transformation.
+            ngram_range (tuple): The range of n-values for n-grams to extract. Default is (1, 2) for unigrams and bigrams.
+
+        Returns:
+            np.ndarray: N-gram feature matrix.
+        """
        vectorizer = CountVectorizer(ngram_range=ngram_range)
        ngram_matrix = vectorizer.fit_transform(texts)
        return ngram_matrix.toarray()

+
+
+
def preprocess_text(text):
    text_preprocessor = TextPreprocessing(text=None, tokenizer=None)
    cleaned_text = text_preprocessor.Cleaning_text(text)
    return cleaned_text

-def get_fact_checkability_score(text):
-    """
-    Calls the ClaimBuster API to get the fact-checkability score of the text.
-    """
-    headers = {"x-api-key": API_KEY}
-    response = requests.get(CLAIMBUSTER_API_URL, headers=headers, params={"query": text})
-    if response.status_code == 200:
-        data = response.json()
-        return data['results'][0]['score']
-    else:
-        return None

def predict_news(text):
-    """
-    Predicts whether the news is real or fake and evaluates fact-checkability.
-    """
    cleaned_text = preprocess_text(text)
    X_input = tfidf_vectorizer.transform([cleaned_text])
    prediction = mnb.predict(X_input)
-
-
-    # Get fact-checkability score
-    score = get_fact_checkability_score(cleaned_text)
-    if score is not None:
-        if score > 0.5:
-            fact_check_message = "This article should be investigated for factual accuracy."
-        else:
-            fact_check_message = "This article does not require immediate factual investigation."
-    else:
-        fact_check_message = "Unable to retrieve fact-checkability score."
+    return "Fake News" if prediction == 0 else "Real News"

-    return f"{classification}\n\nFact-Checkability: {fact_check_message}"

-# Gradio interface
iface = gr.Interface(
-    fn=predict_news,
-    inputs=gr.Textbox(lines=7, placeholder="Enter the news article here..."),
-    outputs="text",
-    title="Fake News Classification
-    description="Classify news articles as real or fake
+    fn=predict_news,
+    inputs=gr.Textbox(lines=7, placeholder="Enter the news article here..."),
+    outputs="text",
+    title="Fake News Classification",
+    description="Classify news articles as real or fake."
)

iface.launch()
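For reference, a minimal smoke-test sketch of the updated pipeline, assuming it runs inside app.py just above iface.launch() so that tfidf_vectorizer, mnb, tokenizer and TextPreprocessing are already in scope, and that torch is available for return_tensors='pt'; the sample headline is illustrative only.

# Hypothetical smoke test (assumptions noted above), not part of the committed file.
sample = "Scientists announce a breakthrough in renewable energy storage."

pre = TextPreprocessing(text=sample, tokenizer=tokenizer)
cleaned = pre.Cleaning_text(sample)              # lowercased, URLs and non-letter characters stripped
bert_inputs = pre.ModernBert_Tokenization()      # ModernBERT input_ids / attention_mask tensors

X_input = tfidf_vectorizer.transform([cleaned])  # reuse the fitted vectorizer: transform, not fit_transform
print(predict_news(sample))                      # per the code above: 0 -> "Fake News", otherwise "Real News"

Note that predict_news calls transform on the pickled tfidf_vectorizer rather than fit_transform, so incoming articles are projected onto the same feature space the MultinomialNB model was trained on.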