RaghuCourage9605 committed on
Commit b7bbf35
1 Parent(s): aa31cef

Update app.py

Files changed (1):
  1. app.py +18 -99
app.py CHANGED
@@ -2,34 +2,27 @@ import pickle
 import gradio as gr
 import re
 import spacy
-from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
 import numpy as np
-from transformers import AutoTokenizer
+from sklearn.linear_model import PassiveAggressiveClassifier
+
+# Ensure required Spacy model is installed
 import subprocess
 import sys
-
 subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"])
 subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])

+# Load the saved vectorizer and model
 with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
     tfidf_vectorizer = pickle.load(vectorizer_file)

-with open('mnb_model.pkl', 'rb') as model_file:
-    mnb = pickle.load(model_file)
-
-
-checkpoint = "answerdotai/ModernBERT-base"
-tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tf_idf = TfidfVectorizer()
+with open('pac_model.pkl', 'rb') as model_file:  # Updated to PAC model
+    pac_model = pickle.load(model_file)

+# Load Spacy language model
 nlp = spacy.load("en_core_web_sm")

 class TextPreprocessing:
-    def __init__(self, text: str, tokenizer, tfidf_vectorizer: TfidfVectorizer = None):
-        self.text = text
-        self.tokenizer = tokenizer
-        self.tfidf_vectorizer = tfidf_vectorizer or TfidfVectorizer()
-
     @staticmethod
     def Cleaning_text(text: str) -> str:
         """
@@ -44,105 +37,31 @@ class TextPreprocessing:
         text = re.sub(r'\s+', ' ', text).strip()
         return text

-    @staticmethod
-    def Tokenization_text(text: str) -> list:
-        """
-        Tokenizes the text into a list of words, excluding punctuations and spaces.
-        """
-        doc = nlp(text)
-        tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
-        return tokens
-
-    @staticmethod
-    def Lemmatization_text(text: str) -> str:
-        """
-        Performs lemmatization on the text and returns the lemmatized version.
-        """
-        doc = nlp(text)
-        lemmatized_text = ' '.join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])
-        return lemmatized_text
-
-    @staticmethod
-    def Stopwords_removal(text: str) -> str:
-        """
-        Removes stopwords from the input text.
-        """
-        doc = nlp(text)
-        text_without_stopwords = ' '.join([token.text for token in doc if not token.is_stop])
-        return text_without_stopwords
-
-    def ModernBert_Tokenization(self) -> dict:
-        """
-        Tokenizes the cleaned text using ModernBERT's tokenizer.
-        """
-        cleaned_text = self.Cleaning_text(self.text)
-        tokenized_output = self.tokenizer(cleaned_text, return_tensors='pt', truncation=True, padding=True)
-        return tokenized_output
-
-    def Tfidf_Transformation(self, texts: list) -> np.ndarray:
-        """
-        Applies TF-IDF transformation to a list of texts.
-
-        Args:
-            texts (list of str): List of text strings to apply the TF-IDF transformation.
-
-        Returns:
-            np.ndarray: TF-IDF feature matrix.
-        """
-        tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
-        return tfidf_matrix.toarray()
-
-    def BagOfWords_Transformation(self, texts: list) -> np.ndarray:
-        """
-        Applies Bag of Words (BoW) transformation to a list of texts.
-
-        Args:
-            texts (list of str): List of text strings to apply the BoW transformation.
-
-        Returns:
-            np.ndarray: Bag of Words feature matrix.
-        """
-        vectorizer = CountVectorizer()
-        bow_matrix = vectorizer.fit_transform(texts)
-        return bow_matrix.toarray()
-
-    def Ngram_Transformation(self, texts: list, ngram_range=(1, 2)) -> np.ndarray:
-        """
-        Applies N-gram transformation (uni-grams, bi-grams, etc.) to a list of texts.
-
-        Args:
-            texts (list of str): List of text strings to apply the N-gram transformation.
-            ngram_range (tuple): The range of n-values for n-grams to extract. Default is (1, 2) for unigrams and bigrams.
-
-        Returns:
-            np.ndarray: N-gram feature matrix.
-        """
-        vectorizer = CountVectorizer(ngram_range=ngram_range)
-        ngram_matrix = vectorizer.fit_transform(texts)
-        return ngram_matrix.toarray()
-
-
-

 def preprocess_text(text):
-    text_preprocessor = TextPreprocessing(text=None, tokenizer=None)
-    cleaned_text = text_preprocessor.Cleaning_text(text)
+    """
+    Preprocess the text by cleaning it using the TextPreprocessing class.
+    """
+    cleaned_text = TextPreprocessing.Cleaning_text(text)
     return cleaned_text


 def predict_news(text):
+    """
+    Predict whether the input news text is real or fake.
+    """
     cleaned_text = preprocess_text(text)
     X_input = tfidf_vectorizer.transform([cleaned_text])
-    prediction = mnb.predict(X_input)
+    prediction = pac_model.predict(X_input)
     return "Fake News" if prediction == 0 else "Real News"

-
+# Gradio Interface
 iface = gr.Interface(
     fn=predict_news,
     inputs=gr.Textbox(lines=7, placeholder="Enter the news article here..."),
     outputs="text",
     title="Fake News Classification",
-    description="Classify news articles as real or fake."
+    description="Classify news articles as real or fake using a Passive Aggressive Classifier."
 )

 iface.launch()
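For context: this commit only consumes the two pickled artifacts, tfidf_vectorizer.pkl and pac_model.pkl. Below is a minimal sketch of how such artifacts could be produced with scikit-learn. The dataset path, column names, vectorizer settings, and the label encoding (0 = fake, 1 = real, matching the mapping in predict_news) are assumptions, not part of this repository.

# train_and_save.py -- hypothetical training script, not part of this commit.
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split

# Assumed dataset: a CSV with 'text' and 'label' columns,
# where label 0 = fake and 1 = real (matching predict_news above).
df = pd.read_csv("news.csv")
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
)

# Fit the TF-IDF vectorizer on the training texts only.
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Passive Aggressive Classifier: an online max-margin linear model,
# a common baseline for TF-IDF fake-news classification.
pac_model = PassiveAggressiveClassifier(max_iter=1000, random_state=42)
pac_model.fit(X_train_tfidf, y_train)
print("Test accuracy:", pac_model.score(tfidf_vectorizer.transform(X_test), y_test))

# Persist the exact fitted objects that app.py unpickles.
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)
with open("pac_model.pkl", "wb") as f:
    pickle.dump(pac_model, f)

Saving the fitted vectorizer alongside the model is what lets app.py call tfidf_vectorizer.transform (not fit_transform) at inference time, so input articles land in the same feature space the classifier was trained on. Note also that pac_model.predict returns a one-element NumPy array; prediction == 0 works for a single sample because a one-element boolean array is truthy, but prediction[0] == 0 would be the more explicit form.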
 
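One further observation on startup cost: the new file still shells out to pip and to the spaCy downloader on every launch. A lighter pattern, sketched below under the assumption that spacy itself is already pinned in the Space's requirements.txt, is to download en_core_web_sm only when it is actually missing:

import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Model not installed yet: fetch it once, then load it.
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")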