RaghuCourage9605 committed on
Commit fcb77f0 · verified · 1 Parent(s): 73919ae

Update app.py

Files changed (1)
  1. app.py +61 -44
app.py CHANGED
@@ -4,36 +4,38 @@ import re
 import spacy
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 import numpy as np
-import requests
+from transformers import AutoTokenizer
 import subprocess
-import os
 import sys
 
-
 subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"])
 subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
 
-# Load pre-trained models and vectorizer
 with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
     tfidf_vectorizer = pickle.load(vectorizer_file)
 
 with open('mnb_model.pkl', 'rb') as model_file:
     mnb = pickle.load(model_file)
 
-nlp = spacy.load("en_core_web_sm")
 
-# ClaimBuster API URL and key (replace YOUR_API_KEY with your actual key)
-CLAIMBUSTER_API_URL = "https://idir.uta.edu/claimbuster/api/v2/score/text/"
-API_KEY = os.getenv("CLAIMBURST_API")
+checkpoint = "answerdotai/ModernBERT-base"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+tf_idf = TfidfVectorizer()
+
+nlp = spacy.load("en_core_web_sm")
 
 class TextPreprocessing:
-    def __init__(self, text: str, tokenizer=None, tfidf_vectorizer: TfidfVectorizer = None):
+    def __init__(self, text: str, tokenizer, tfidf_vectorizer: TfidfVectorizer = None):
         self.text = text
         self.tokenizer = tokenizer
         self.tfidf_vectorizer = tfidf_vectorizer or TfidfVectorizer()
 
     @staticmethod
     def Cleaning_text(text: str) -> str:
+        """
+        Cleans the input text by converting to lowercase,
+        removing URLs, special characters, and unnecessary spaces.
+        """
         text = text.lower()
         text = re.sub(r'http\S+|www\S+|https\S+', '', text)
         text = re.sub(r"[^a-zA-Z\s]", '', text)
@@ -44,88 +46,103 @@ class TextPreprocessing:
 
     @staticmethod
     def Tokenization_text(text: str) -> list:
+        """
+        Tokenizes the text into a list of words, excluding punctuations and spaces.
+        """
         doc = nlp(text)
         tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
         return tokens
 
     @staticmethod
     def Lemmatization_text(text: str) -> str:
+        """
+        Performs lemmatization on the text and returns the lemmatized version.
+        """
         doc = nlp(text)
         lemmatized_text = ' '.join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])
         return lemmatized_text
 
     @staticmethod
     def Stopwords_removal(text: str) -> str:
+        """
+        Removes stopwords from the input text.
+        """
         doc = nlp(text)
         text_without_stopwords = ' '.join([token.text for token in doc if not token.is_stop])
         return text_without_stopwords
 
-    def ModernBert_Tokenization(self):
-        if not self.tokenizer:
-            raise ValueError("Tokenizer not provided.")
+    def ModernBert_Tokenization(self) -> dict:
+        """
+        Tokenizes the cleaned text using ModernBERT's tokenizer.
+        """
         cleaned_text = self.Cleaning_text(self.text)
         tokenized_output = self.tokenizer(cleaned_text, return_tensors='pt', truncation=True, padding=True)
         return tokenized_output
 
     def Tfidf_Transformation(self, texts: list) -> np.ndarray:
+        """
+        Applies TF-IDF transformation to a list of texts.
+
+        Args:
+            texts (list of str): List of text strings to apply the TF-IDF transformation.
+
+        Returns:
+            np.ndarray: TF-IDF feature matrix.
+        """
         tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
         return tfidf_matrix.toarray()
 
     def BagOfWords_Transformation(self, texts: list) -> np.ndarray:
+        """
+        Applies Bag of Words (BoW) transformation to a list of texts.
+
+        Args:
+            texts (list of str): List of text strings to apply the BoW transformation.
+
+        Returns:
+            np.ndarray: Bag of Words feature matrix.
+        """
         vectorizer = CountVectorizer()
         bow_matrix = vectorizer.fit_transform(texts)
         return bow_matrix.toarray()
 
     def Ngram_Transformation(self, texts: list, ngram_range=(1, 2)) -> np.ndarray:
+        """
+        Applies N-gram transformation (uni-grams, bi-grams, etc.) to a list of texts.
+
+        Args:
+            texts (list of str): List of text strings to apply the N-gram transformation.
+            ngram_range (tuple): The range of n-values for n-grams to extract. Default is (1, 2) for unigrams and bigrams.
+
+        Returns:
+            np.ndarray: N-gram feature matrix.
+        """
         vectorizer = CountVectorizer(ngram_range=ngram_range)
         ngram_matrix = vectorizer.fit_transform(texts)
         return ngram_matrix.toarray()
 
+
+
+
 def preprocess_text(text):
     text_preprocessor = TextPreprocessing(text=None, tokenizer=None)
     cleaned_text = text_preprocessor.Cleaning_text(text)
     return cleaned_text
 
-def get_fact_checkability_score(text):
-    """
-    Calls the ClaimBuster API to get the fact-checkability score of the text.
-    """
-    headers = {"x-api-key": API_KEY}
-    response = requests.get(CLAIMBUSTER_API_URL, headers=headers, params={"query": text})
-    if response.status_code == 200:
-        data = response.json()
-        return data['results'][0]['score']
-    else:
-        return None
 
 def predict_news(text):
-    """
-    Predicts whether the news is real or fake and evaluates fact-checkability.
-    """
     cleaned_text = preprocess_text(text)
     X_input = tfidf_vectorizer.transform([cleaned_text])
     prediction = mnb.predict(X_input)
-    classification = "Fake News" if prediction == 0 else "Real News"
-
-    # Get fact-checkability score
-    score = get_fact_checkability_score(cleaned_text)
-    if score is not None:
-        if score > 0.5:
-            fact_check_message = "This article should be investigated for factual accuracy."
-        else:
-            fact_check_message = "This article does not require immediate factual investigation."
-    else:
-        fact_check_message = "Unable to retrieve fact-checkability score."
+    return "Fake News" if prediction == 0 else "Real News"
 
-    return f"{classification}\n\nFact-Checkability: {fact_check_message}"
 
-# Gradio interface
 iface = gr.Interface(
-    fn=predict_news,
-    inputs=gr.Textbox(lines=7, placeholder="Enter the news article here..."),
-    outputs="text",
-    title="Fake News Classification with Fact-Checkability",
-    description="Classify news articles as real or fake, and get a suggestion on whether the article should be investigated for factual accuracy."
+    fn=predict_news,
+    inputs=gr.Textbox(lines=7, placeholder="Enter the news article here..."),
+    outputs="text",
+    title="Fake News Classification",
+    description="Classify news articles as real or fake."
 )
 
 iface.launch()
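
For readers skimming the diff, the change drops the ClaimBuster fact-checkability call and adds a ModernBERT tokenization path alongside the existing TF-IDF classifier. Below is a minimal, self-contained sketch of that new path: the cleaning regexes and the checkpoint name are taken from the diff above; the clean() helper and the sample string are illustrative assumptions, and running it requires transformers with a PyTorch backend installed.

# Illustrative sketch only: mirrors the cleaning + ModernBERT tokenization
# introduced in this commit. clean() and the sample text are hypothetical;
# the checkpoint name and regexes come from the diff.
import re
from transformers import AutoTokenizer

checkpoint = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def clean(text: str) -> str:
    # Same steps as TextPreprocessing.Cleaning_text: lowercase, drop URLs,
    # strip non-alphabetic characters.
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r"[^a-zA-Z\s]", '', text)
    return text.strip()

sample = "BREAKING!!! Scientists confirm shocking claim, see http://example.com"
encoded = tokenizer(clean(sample), return_tensors='pt', truncation=True, padding=True)
print(encoded["input_ids"].shape)  # shape [1, number_of_tokens]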