Spaces:
Running
Running
File size: 4,906 Bytes
9cccd1c f15224d 9cccd1c f15224d 9cccd1c f15224d 9cccd1c f15224d 9cccd1c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import streamlit as st
import pdfplumber
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from gtts import gTTS
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer
# Initialize necessary components
@st.cache_resource
def _load_components():
    """Load the heavyweight NLP components once per server process.

    Streamlit re-executes this script from top to bottom on every user
    interaction. Without caching, the Qwen checkpoint and the
    question-answering pipeline would be re-instantiated on each rerun.

    Returns:
        A ``(tokenizer, model, qa_pipeline, sia)`` tuple.
    """
    # NOTE: trust_remote_code=True executes Python code shipped in the model
    # repository; keep it only for repositories you trust.
    qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
    qwen_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
    question_answerer = pipeline("question-answering")
    try:
        sentiment_analyzer = SentimentIntensityAnalyzer()
    except LookupError:
        # The analyzer needs the VADER lexicon resource; fetch it on first
        # use when the environment does not already ship it.
        import nltk

        nltk.download("vader_lexicon", quiet=True)
        sentiment_analyzer = SentimentIntensityAnalyzer()
    return qwen_tokenizer, qwen_model, question_answerer, sentiment_analyzer


tokenizer, model, qa_pipeline, sia = _load_components()
# Helper functions
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Whatever ``pdfplumber.open`` accepts; the app passes the
            object returned by ``st.file_uploader``.

    Returns:
        The per-page texts joined with newlines. A page that yields no text
        contributes an empty string.
    """
    with pdfplumber.open(file) as pdf:
        # extract_text() can yield None for a page with no extractable text
        # (e.g. a scanned image); coerce to "" so the join cannot raise.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next.
    return "\n".join(page_texts)
def clean_text(text):
    """Strip ``Page <number>`` markers and surrounding whitespace from text.

    Args:
        text: Raw text, typically as extracted from a PDF.

    Returns:
        The text with every ``Page <digits>`` marker removed and leading and
        trailing whitespace stripped.
    """
    # The pattern swallows the whitespace on both sides of the marker, so the
    # replacement must put one space back; otherwise the words before and
    # after the marker are glued together.
    text = re.sub(r'\s*Page \d+\s*', ' ', text)  # Remove page numbers
    return text.strip()
def chunk_text(text, max_tokens=1024):
    """Split text into whitespace-delimited chunks bounded by a token budget.

    Each word is tokenized on its own with the module-level ``tokenizer`` and
    the per-word counts are summed, so the budget is an estimate of the
    chunk's token length rather than an exact figure.

    Args:
        text: The text to split.
        max_tokens: Upper bound on the summed per-word token count of a chunk.

    Returns:
        A list of non-empty chunk strings, with words joined by single
        spaces. A single word whose own token count exceeds ``max_tokens``
        is emitted as a chunk by itself.
    """
    chunks, current_chunk, current_token_count = [], [], 0
    for word in text.split():
        token_count = len(tokenizer(word)['input_ids'])
        # Flush only when there is something to flush: if the very first word
        # of a chunk is already over budget, flushing would emit an empty
        # string as a chunk.
        if current_chunk and current_token_count + token_count > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_token_count = [], 0
        current_chunk.append(word)
        current_token_count += token_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def summarize_text_qwen(text, max_length=800):
    """Generate a summary of ``text`` with the module-level causal LM.

    Args:
        text: The passage to summarize. The prompt built from it is truncated
            to 1024 tokens.
        max_length: Maximum number of tokens to generate for the summary.

    Returns:
        The decoded generated text, without the prompt.
    """
    input_text = f"summarize: {text}"
    tokens = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
    input_ids = tokens["input_ids"]
    prompt_length = input_ids.shape[1]

    # For a decoder-only model, generate()'s max_length/min_length count the
    # prompt as well. With a prompt of up to 1024 tokens and a budget of 800
    # that leaves no room to generate, so bound the *new* tokens instead.
    generation_kwargs = {
        "max_new_tokens": max_length,
        "min_new_tokens": min(200, max_length),
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    attention_mask = tokens.get("attention_mask")
    if attention_mask is not None:
        generation_kwargs["attention_mask"] = attention_mask

    summary_ids = model.generate(input_ids, **generation_kwargs)
    # The returned sequence starts with the prompt tokens; decode only what
    # was generated after them so the prompt is not echoed into the summary.
    generated_ids = summary_ids[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)
def summarize_large_document(text, max_length=800):
    """Summarize a long text chunk by chunk and concatenate the results.

    Args:
        text: The full document text.
        max_length: Forwarded to ``summarize_text_qwen`` for every chunk.

    Returns:
        The per-chunk summaries joined with single spaces, in chunk order.
    """
    partial_summaries = []
    for piece in chunk_text(text):
        partial_summaries.append(summarize_text_qwen(piece, max_length=max_length))
    return " ".join(partial_summaries)
def answer_question_with_context(question, context, chunk_size=500):
    """Answer a question against a long context, one chunk at a time.

    The context is split with ``chunk_text`` and the module-level
    ``qa_pipeline`` is run on every chunk. This is best-effort: a chunk whose
    pipeline call fails is logged and skipped rather than aborting the answer.

    Args:
        question: The question to ask.
        context: The text to search for answers.
        chunk_size: Token budget per chunk, passed to ``chunk_text``.

    Returns:
        The answers from all chunks that produced one, joined with single
        spaces; an empty string when no chunk produced an answer.
    """
    import logging

    answers = []
    for chunk in chunk_text(context, max_tokens=chunk_size):
        if not chunk:
            # Nothing to search in an empty chunk.
            continue
        try:
            answers.append(qa_pipeline(question=question, context=chunk)['answer'])
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt/SystemExit still propagate, and leave a trace
            # of the failure instead of discarding it silently.
            logging.getLogger(__name__).warning(
                "Question answering failed for one chunk; skipping it.",
                exc_info=True,
            )
            continue
    return " ".join(answers)
# Replace Tortoise-TTS with gTTS for text-to-speech functionality
def text_to_speech(text, language="en", file_name="output_audio.mp3"):
    """Synthesize speech for ``text`` with gTTS and save it as an MP3 file.

    Args:
        text: The text to speak.
        language: Language code passed to gTTS as ``lang``.
        file_name: Path the audio is written to. It defaults to the
            previously hard-coded ``"output_audio.mp3"``; pass a distinct
            path when several clips must coexist, since every call with the
            same path overwrites the earlier file.

    Returns:
        The path the audio was saved to (``file_name``).
    """
    tts = gTTS(text=text, lang=language, slow=False)
    tts.save(file_name)
    return file_name
def extract_keywords(text, top_n=10):
    """Return the most frequent non-stop-word terms in ``text``.

    Args:
        text: The document text.
        top_n: Maximum number of keywords to return.

    Returns:
        Up to ``top_n`` terms ordered by descending count, or an empty list
        when the text contains no countable terms.
    """
    if not text or not text.strip():
        return []
    vectorizer = CountVectorizer(stop_words="english")
    try:
        word_counts = vectorizer.fit_transform([text])
    except ValueError:
        # CountVectorizer raises ValueError when the vocabulary ends up empty
        # (for example, text made only of stop words); report "no keywords"
        # instead of crashing the app.
        return []
    counts = word_counts.toarray()[0]
    ranked = sorted(
        zip(vectorizer.get_feature_names_out(), counts),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [word for word, _ in ranked[:top_n]]
def analyze_sentiment(text):
    """Score ``text`` with the module-level ``sia`` sentiment analyzer.

    Args:
        text: The text to score.

    Returns:
        Whatever ``sia.polarity_scores`` returns for ``text``.
    """
    scores = sia.polarity_scores(text)
    return scores
# Streamlit App Interface
st.title("Enhanced PDF to Audiobook App")
st.markdown("### Turn documents into interactive audiobooks with advanced features.")
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file:
    with st.spinner("Extracting and cleaning PDF content..."):
        raw_text = extract_text_from_pdf(uploaded_file)
        cleaned_text = clean_text(raw_text)
    st.text_area("Extracted Text", cleaned_text[:5000], height=300, help="Displaying first 5000 characters.")

    # Streamlit reruns the whole script on every interaction, and a button is
    # True only during the rerun caused by its own click. A button nested
    # inside another button's branch can therefore never fire. Keep the
    # summary in session state so the audio button sits outside that branch.
    # The stored summary is dropped whenever a different file is uploaded.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("summary_file_key") != file_key:
        st.session_state["summary_file_key"] = file_key
        st.session_state["summary"] = None

    if st.button("Summarize Document"):
        with st.spinner("Summarizing document..."):
            st.session_state["summary"] = summarize_large_document(cleaned_text, max_length=800)

    summary = st.session_state.get("summary")
    if summary is not None:
        st.text_area("Summary", summary, height=300)
        # Offer audio only when there is text to speak.
        if summary.strip() and st.button("Convert Summary to Audiobook"):
            with st.spinner("Generating audio..."):
                audio_file = text_to_speech(summary)
            st.audio(audio_file, format="audio/mp3", start_time=0)

    st.markdown("### Ask Questions About the Document")
    question = st.text_input("Your Question:")
    if question:
        # The text input keeps its value across reruns, so this branch (and
        # the button inside it) is still reached after the button is clicked.
        with st.spinner("Answering your question..."):
            answer = answer_question_with_context(question, cleaned_text)
        st.write(f"**Answer:** {answer}")
        if answer.strip() and st.button("Convert Answer to Audio"):
            with st.spinner("Generating answer audio..."):
                answer_audio_file = text_to_speech(answer)
            st.audio(answer_audio_file, format="audio/mp3", start_time=0)

    st.markdown("### Document Insights")
    if st.checkbox("Extract Keywords"):
        with st.spinner("Extracting keywords..."):
            keywords = extract_keywords(cleaned_text)
        st.write("Keywords:", ", ".join(keywords))
    if st.checkbox("Analyze Sentiment"):
        with st.spinner("Analyzing sentiment..."):
            sentiment = analyze_sentiment(cleaned_text)
        st.write("Sentiment Analysis:", sentiment)
|