import streamlit as st
import pdfplumber
import re
import nltk
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from gtts import gTTS
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer needs the VADER lexicon; fetch it once if missing.
nltk.download("vader_lexicon", quiet=True)


# Initialize heavyweight components once. @st.cache_resource keeps the models
# loaded across Streamlit reruns instead of reloading a 7B model on every
# widget interaction.
@st.cache_resource
def load_models():
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
    qa_pipeline = pipeline("question-answering")
    return tokenizer, model, qa_pipeline


tokenizer, model, qa_pipeline = load_models()
sia = SentimentIntensityAnalyzer()


# Helper functions
def extract_text_from_pdf(file):
    """Extract text from every page; extract_text() returns None for image-only pages."""
    with pdfplumber.open(file) as pdf:
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


def clean_text(text):
    # Strip "Page N" artifacts; replace with a space so surrounding words don't fuse.
    text = re.sub(r"\s*Page \d+\s*", " ", text)
    return text.strip()


def chunk_text(text, max_tokens=1024):
    """Split text into whitespace-delimited chunks of at most max_tokens tokens each."""
    words = text.split()
    chunks, current_chunk, current_token_count = [], [], 0
    for word in words:
        token_count = len(tokenizer(word)["input_ids"])
        if current_token_count + token_count > max_tokens and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_token_count = [], 0
        current_chunk.append(word)
        current_token_count += token_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


def summarize_text_qwen(text, max_length=800):
    # Qwen-7B is a causal LM, so use an instruction-style prompt rather than
    # the T5-style "summarize:" prefix.
    input_text = f"Summarize the following text.\n\nText: {text}\n\nSummary:"
    tokens = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
    summary_ids = model.generate(
        tokens["input_ids"],
        max_new_tokens=max_length,  # budget for generated tokens only; max_length would count the prompt too
        min_new_tokens=200,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    # Decode only the newly generated tokens; a causal LM echoes the prompt otherwise.
    new_tokens = summary_ids[0][tokens["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


def summarize_large_document(text, max_length=800):
    chunks = chunk_text(text)
    summaries = [summarize_text_qwen(chunk, max_length=max_length) for chunk in chunks]
    return " ".join(summaries)


def answer_question_with_context(question, context, chunk_size=500):
    """Run extractive QA over each chunk and concatenate the per-chunk answers."""
    chunks = chunk_text(context, max_tokens=chunk_size)
    answers = []
    for chunk in chunks:
        try:
            answers.append(qa_pipeline({"question": question, "context": chunk})["answer"])
        except Exception:
            continue  # Skip chunks the QA pipeline cannot process
    return " ".join(answers)


# gTTS replaces Tortoise-TTS for text-to-speech functionality
def text_to_speech(text, language="en"):
    tts = gTTS(text=text, lang=language, slow=False)
    file_name = "output_audio.mp3"
    tts.save(file_name)
    return file_name


def extract_keywords(text, top_n=10):
    """Return the top_n most frequent non-stopword terms."""
    vectorizer = CountVectorizer(stop_words="english")
    word_counts = vectorizer.fit_transform([text])
    keywords = sorted(
        zip(vectorizer.get_feature_names_out(), word_counts.toarray()[0]),
        key=lambda x: x[1],
        reverse=True,
    )[:top_n]
    return [word for word, count in keywords]


def analyze_sentiment(text):
    return sia.polarity_scores(text)


# Streamlit App Interface
st.title("Enhanced PDF to Audiobook App")
st.markdown("### Turn documents into interactive audiobooks with advanced features.")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file:
    with st.spinner("Extracting and cleaning PDF content..."):
        raw_text = extract_text_from_pdf(uploaded_file)
        cleaned_text = clean_text(raw_text)
    st.text_area("Extracted Text", cleaned_text[:5000], height=300, help="Displaying first 5000 characters.")

    if st.button("Summarize Document"):
        with st.spinner("Summarizing document..."):
            # Persist the summary in session_state: clicking any other widget
            # reruns the script, and a plain local variable would be lost
            # before the audio button below could use it.
            st.session_state["summary"] = summarize_large_document(cleaned_text, max_length=800)

    if "summary" in st.session_state:
        st.text_area("Summary", st.session_state["summary"], height=300)
        if st.button("Convert Summary to Audiobook"):
            with st.spinner("Generating audio..."):
                audio_file = text_to_speech(st.session_state["summary"])
            st.audio(audio_file, format="audio/mp3", start_time=0)
st.markdown("### Ask Questions About the Document") question = st.text_input("Your Question:") if question: with st.spinner("Answering your question..."): answer = answer_question_with_context(question, cleaned_text) st.write(f"**Answer:** {answer}") if st.button("Convert Answer to Audio"): with st.spinner("Generating answer audio..."): answer_audio_file = text_to_speech(answer) st.audio(answer_audio_file, format="audio/mp3", start_time=0) st.markdown("### Document Insights") if st.checkbox("Extract Keywords"): with st.spinner("Extracting keywords..."): keywords = extract_keywords(cleaned_text) st.write("Keywords:", ", ".join(keywords)) if st.checkbox("Analyze Sentiment"): with st.spinner("Analyzing sentiment..."): sentiment = analyze_sentiment(cleaned_text) st.write("Sentiment Analysis:", sentiment)