import streamlit as st
import pdfplumber
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from gtts import gTTS
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
# Initialize necessary components
nltk.download("vader_lexicon", quiet=True)  # VADER lexicon is required by SentimentIntensityAnalyzer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
qa_pipeline = pipeline("question-answering")
sia = SentimentIntensityAnalyzer()
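# Note: Qwen/Qwen-7B is a large model (roughly 14 GB of weights in fp16), so loading it
# needs a GPU or substantial RAM, and trust_remote_code=True executes custom modeling
# code shipped in the model repository.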
# Helper functions
def extract_text_from_pdf(file):
    with pdfplumber.open(file) as pdf:
        # extract_text() can return None for image-only pages, so fall back to ''
        text = "\n".join([(page.extract_text() or "") for page in pdf.pages])
    return text

def clean_text(text):
    # Replace "Page N" markers with a space so surrounding words are not glued together
    text = re.sub(r'\s*Page \d+\s*', ' ', text)
    return text.strip()
def chunk_text(text, max_tokens=1024):
    words = text.split()
    chunks, current_chunk, current_token_count = [], [], 0
    for word in words:
        # Count tokens without special tokens so per-word counts add up correctly
        token_count = len(tokenizer(word, add_special_tokens=False)["input_ids"])
        if current_token_count + token_count > max_tokens and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_token_count = [], 0
        current_chunk.append(word)
        current_token_count += token_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
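# Usage sketch (illustrative only, not executed by the app):
#   chunks = chunk_text(long_document, max_tokens=1024)
# returns a list of strings, each fitting within the model's input window so that
# summarize_text_qwen() below never has to truncate mid-chunk.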
def summarize_text_qwen(text, max_length=800):
    input_text = f"summarize: {text}"
    tokens = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
    # Use max_new_tokens/min_new_tokens: plain max_length counts the prompt too, and a
    # 1024-token prompt would otherwise exhaust an 800-token budget before generating anything
    summary_ids = model.generate(
        tokens["input_ids"], max_new_tokens=max_length, min_new_tokens=200,
        length_penalty=2.0, num_beams=4, early_stopping=True
    )
    # Qwen is a causal LM: generate() returns the prompt followed by the new tokens,
    # so decode only what was produced after the prompt
    new_tokens = summary_ids[0][tokens["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
def summarize_large_document(text, max_length=800):
    chunks = chunk_text(text)
    summaries = [summarize_text_qwen(chunk, max_length=max_length) for chunk in chunks]
    return " ".join(summaries)
def answer_question_with_context(question, context, chunk_size=500):
    chunks = chunk_text(context, max_tokens=chunk_size)
    answers = []
    for chunk in chunks:
        try:
            answers.append(qa_pipeline({'question': question, 'context': chunk})['answer'])
        except Exception:
            # Skip chunks the QA model cannot handle instead of failing the whole query
            continue
    return " ".join(answers)
# Replace Tortoise-TTS with gTTS for text-to-speech functionality
def text_to_speech(text, language="en"):
    tts = gTTS(text=text, lang=language, slow=False)
    file_name = "output_audio.mp3"
    tts.save(file_name)
    return file_name
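# Note: gTTS synthesizes speech via Google's online text-to-speech endpoint, so this
# function needs network access; the available voices are fixed by that service.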
def extract_keywords(text, top_n=10):
    vectorizer = CountVectorizer(stop_words="english")
    word_counts = vectorizer.fit_transform([text])
    keywords = sorted(
        zip(vectorizer.get_feature_names_out(), word_counts.toarray()[0]),
        key=lambda x: x[1], reverse=True
    )[:top_n]
    return [word for word, count in keywords]
def analyze_sentiment(text):
    return sia.polarity_scores(text)
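# polarity_scores() returns a dict with 'neg', 'neu', 'pos', and a 'compound' score in
# [-1, 1]. VADER is tuned for short, social-media-style text, so treat a document-level
# score as a rough signal rather than a precise measurement.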
# Streamlit App Interface
st.title("Enhanced PDF to Audiobook App")
st.markdown("### Turn documents into interactive audiobooks with advanced features.")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    with st.spinner("Extracting and cleaning PDF content..."):
        raw_text = extract_text_from_pdf(uploaded_file)
        cleaned_text = clean_text(raw_text)
    st.text_area("Extracted Text", cleaned_text[:5000], height=300, help="Displaying first 5000 characters.")
if st.button("Summarize Document"): | |
with st.spinner("Summarizing document..."): | |
summary = summarize_large_document(cleaned_text, max_length=800) | |
st.text_area("Summary", summary, height=300) | |
if st.button("Convert Summary to Audiobook"): | |
with st.spinner("Generating audio..."): | |
audio_file = text_to_speech(summary) | |
st.audio(audio_file, format="audio/mp3", start_time=0) | |
st.markdown("### Ask Questions About the Document") | |
question = st.text_input("Your Question:") | |
if question: | |
with st.spinner("Answering your question..."): | |
answer = answer_question_with_context(question, cleaned_text) | |
st.write(f"**Answer:** {answer}") | |
if st.button("Convert Answer to Audio"): | |
with st.spinner("Generating answer audio..."): | |
answer_audio_file = text_to_speech(answer) | |
st.audio(answer_audio_file, format="audio/mp3", start_time=0) | |
st.markdown("### Document Insights") | |
if st.checkbox("Extract Keywords"): | |
with st.spinner("Extracting keywords..."): | |
keywords = extract_keywords(cleaned_text) | |
st.write("Keywords:", ", ".join(keywords)) | |
if st.checkbox("Analyze Sentiment"): | |
with st.spinner("Analyzing sentiment..."): | |
sentiment = analyze_sentiment(cleaned_text) | |
st.write("Sentiment Analysis:", sentiment) | |