RaghuCourage9605 committed on
Commit fcb77f0 · verified · 1 Parent(s): 73919ae

Update app.py

Files changed (1)
  1. app.py +61 -44
app.py CHANGED
@@ -4,36 +4,38 @@ import re
 import spacy
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 import numpy as np
-import requests
+from transformers import AutoTokenizer
 import subprocess
-import os
 import sys
 
-
 subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"])
 subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
 
-# Load pre-trained models and vectorizer
 with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
     tfidf_vectorizer = pickle.load(vectorizer_file)
 
 with open('mnb_model.pkl', 'rb') as model_file:
     mnb = pickle.load(model_file)
 
-nlp = spacy.load("en_core_web_sm")
 
-# ClaimBuster API URL and key (replace YOUR_API_KEY with your actual key)
-CLAIMBUSTER_API_URL = "https://idir.uta.edu/claimbuster/api/v2/score/text/"
-API_KEY = os.getenv("CLAIMBURST_API")
+checkpoint = "answerdotai/ModernBERT-base"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+tf_idf = TfidfVectorizer()
+
+nlp = spacy.load("en_core_web_sm")
 
 class TextPreprocessing:
-    def __init__(self, text: str, tokenizer=None, tfidf_vectorizer: TfidfVectorizer = None):
+    def __init__(self, text: str, tokenizer, tfidf_vectorizer: TfidfVectorizer = None):
         self.text = text
         self.tokenizer = tokenizer
         self.tfidf_vectorizer = tfidf_vectorizer or TfidfVectorizer()
 
     @staticmethod
     def Cleaning_text(text: str) -> str:
+        """
+        Cleans the input text by converting to lowercase,
+        removing URLs, special characters, and unnecessary spaces.
+        """
         text = text.lower()
         text = re.sub(r'http\S+|www\S+|https\S+', '', text)
         text = re.sub(r"[^a-zA-Z\s]", '', text)
@@ -44,88 +46,103 @@ class TextPreprocessing:
 
     @staticmethod
     def Tokenization_text(text: str) -> list:
+        """
+        Tokenizes the text into a list of words, excluding punctuations and spaces.
+        """
         doc = nlp(text)
         tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
         return tokens
 
     @staticmethod
     def Lemmatization_text(text: str) -> str:
+        """
+        Performs lemmatization on the text and returns the lemmatized version.
+        """
         doc = nlp(text)
         lemmatized_text = ' '.join([token.lemma_ for token in doc if not token.is_punct and not token.is_space])
         return lemmatized_text
 
     @staticmethod
     def Stopwords_removal(text: str) -> str:
+        """
+        Removes stopwords from the input text.
+        """
         doc = nlp(text)
         text_without_stopwords = ' '.join([token.text for token in doc if not token.is_stop])
         return text_without_stopwords
 
-    def ModernBert_Tokenization(self):
-        if not self.tokenizer:
-            raise ValueError("Tokenizer not provided.")
+    def ModernBert_Tokenization(self) -> dict:
+        """
+        Tokenizes the cleaned text using ModernBERT's tokenizer.
+        """
         cleaned_text = self.Cleaning_text(self.text)
         tokenized_output = self.tokenizer(cleaned_text, return_tensors='pt', truncation=True, padding=True)
         return tokenized_output
 
     def Tfidf_Transformation(self, texts: list) -> np.ndarray:
+        """
+        Applies TF-IDF transformation to a list of texts.
+
+        Args:
+            texts (list of str): List of text strings to apply the TF-IDF transformation.
+
+        Returns:
+            np.ndarray: TF-IDF feature matrix.
+        """
         tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
         return tfidf_matrix.toarray()
 
     def BagOfWords_Transformation(self, texts: list) -> np.ndarray:
+        """
+        Applies Bag of Words (BoW) transformation to a list of texts.
+
+        Args:
+            texts (list of str): List of text strings to apply the BoW transformation.
+
+        Returns:
+            np.ndarray: Bag of Words feature matrix.
+        """
         vectorizer = CountVectorizer()
         bow_matrix = vectorizer.fit_transform(texts)
         return bow_matrix.toarray()
 
     def Ngram_Transformation(self, texts: list, ngram_range=(1, 2)) -> np.ndarray:
+        """
+        Applies N-gram transformation (uni-grams, bi-grams, etc.) to a list of texts.
+
+        Args:
+            texts (list of str): List of text strings to apply the N-gram transformation.
+            ngram_range (tuple): The range of n-values for n-grams to extract. Default is (1, 2) for unigrams and bigrams.
+
+        Returns:
+            np.ndarray: N-gram feature matrix.
+        """
         vectorizer = CountVectorizer(ngram_range=ngram_range)
         ngram_matrix = vectorizer.fit_transform(texts)
         return ngram_matrix.toarray()
 
+
+
+
 def preprocess_text(text):
     text_preprocessor = TextPreprocessing(text=None, tokenizer=None)
     cleaned_text = text_preprocessor.Cleaning_text(text)
     return cleaned_text
 
-def get_fact_checkability_score(text):
-    """
-    Calls the ClaimBuster API to get the fact-checkability score of the text.
-    """
-    headers = {"x-api-key": API_KEY}
-    response = requests.get(CLAIMBUSTER_API_URL, headers=headers, params={"query": text})
-    if response.status_code == 200:
-        data = response.json()
-        return data['results'][0]['score']
-    else:
-        return None
 
 def predict_news(text):
-    """
-    Predicts whether the news is real or fake and evaluates fact-checkability.
-    """
     cleaned_text = preprocess_text(text)
     X_input = tfidf_vectorizer.transform([cleaned_text])
     prediction = mnb.predict(X_input)
-    classification = "Fake News" if prediction == 0 else "Real News"
-
-    # Get fact-checkability score
-    score = get_fact_checkability_score(cleaned_text)
-    if score is not None:
-        if score > 0.5:
-            fact_check_message = "This article should be investigated for factual accuracy."
-        else:
-            fact_check_message = "This article does not require immediate factual investigation."
-    else:
-        fact_check_message = "Unable to retrieve fact-checkability score."
+    return "Fake News" if prediction == 0 else "Real News"
 
-    return f"{classification}\n\nFact-Checkability: {fact_check_message}"
 
-# Gradio interface
 iface = gr.Interface(
-    fn=predict_news,
-    inputs=gr.Textbox(lines=7, placeholder="Enter the news article here..."),
-    outputs="text",
-    title="Fake News Classification with Fact-Checkability",
-    description="Classify news articles as real or fake, and get a suggestion on whether the article should be investigated for factual accuracy."
+    fn=predict_news,
+    inputs=gr.Textbox(lines=7, placeholder="Enter the news article here..."),
+    outputs="text",
+    title="Fake News Classification",
+    description="Classify news articles as real or fake."
 )
 
 iface.launch()
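
For readers skimming the diff, the change drops the ClaimBuster fact-checkability call and adds a ModernBERT tokenization path alongside the existing TF-IDF classifier. Below is a minimal, self-contained sketch of that new path: the cleaning regexes and the checkpoint name are taken from the diff above; the clean() helper and the sample string are illustrative assumptions, and running it requires transformers with a PyTorch backend installed.

# Illustrative sketch only: mirrors the cleaning + ModernBERT tokenization
# introduced in this commit. clean() and the sample text are hypothetical;
# the checkpoint name and regexes come from the diff.
import re
from transformers import AutoTokenizer

checkpoint = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def clean(text: str) -> str:
    # Same steps as TextPreprocessing.Cleaning_text: lowercase, drop URLs,
    # strip non-alphabetic characters.
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r"[^a-zA-Z\s]", '', text)
    return text.strip()

sample = "BREAKING!!! Scientists confirm shocking claim, see http://example.com"
encoded = tokenizer(clean(sample), return_tensors='pt', truncation=True, padding=True)
print(encoded["input_ids"].shape)  # shape [1, number_of_tokens]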