Arslan17121 committed on
Commit 9cccd1c · verified · 1 Parent(s): cee9b4c

Create app.py

Files changed (1)
app.py +121 -0
app.py ADDED
@@ -0,0 +1,121 @@
import streamlit as st
import pdfplumber
import re
import nltk
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize necessary components once; st.cache_resource keeps the heavy
# models in memory across Streamlit reruns instead of reloading them.
@st.cache_resource
def load_components():
    nltk.download("vader_lexicon", quiet=True)  # lexicon required by SentimentIntensityAnalyzer
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
    qa_pipeline = pipeline("question-answering")
    tts = TextToSpeech()  # tortoise-tts exposes TextToSpeech, not TTS
    sia = SentimentIntensityAnalyzer()
    return tokenizer, model, qa_pipeline, tts, sia

tokenizer, model, qa_pipeline, tts, sia = load_components()

# Helper functions
def extract_text_from_pdf(file):
    with pdfplumber.open(file) as pdf:
        # extract_text() returns None for image-only pages, so fall back to ''
        text = ''.join([(page.extract_text() or '') for page in pdf.pages])
    return text

def clean_text(text):
    text = re.sub(r'\s*Page \d+\s*', ' ', text)  # Drop "Page N" footers without gluing words together
    return text.strip()

def chunk_text(text, max_tokens=1024):
    # Greedy word-level chunking: accumulate words until the token budget is
    # exceeded, then start a new chunk (per-word counts are an approximation).
    words = text.split()
    chunks, current_chunk, current_token_count = [], [], 0
    for word in words:
        token_count = len(tokenizer(word)['input_ids'])
        if current_token_count + token_count > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_token_count = [], 0
        current_chunk.append(word)
        current_token_count += token_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

def summarize_text_qwen(text, max_length=800):
    input_text = f"summarize: {text}"
    tokens = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
    # Use max_new_tokens/min_new_tokens: plain max_length counts the prompt too,
    # so an 800-token cap could be smaller than the 1024-token input.
    summary_ids = model.generate(
        tokens["input_ids"], max_new_tokens=max_length, min_new_tokens=200,
        length_penalty=2.0, num_beams=4, early_stopping=True
    )
    # Qwen is a causal LM, so generate() echoes the prompt; decode only the new tokens
    new_tokens = summary_ids[0][tokens["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

def summarize_large_document(text, max_length=800):
    chunks = chunk_text(text)
    summaries = [summarize_text_qwen(chunk, max_length=max_length) for chunk in chunks]
    return " ".join(summaries)

def answer_question_with_context(question, context, chunk_size=500):
    # Run extractive QA on each chunk and keep the highest-confidence answer,
    # rather than concatenating one answer per chunk into a single string.
    chunks = chunk_text(context, max_tokens=chunk_size)
    best = {"answer": "", "score": 0.0}
    for chunk in chunks:
        try:
            result = qa_pipeline({'question': question, 'context': chunk})
            if result['score'] > best['score']:
                best = result
        except Exception:
            continue
    return best['answer']

def text_to_speech(text, voice="tom"):
    # tortoise-tts has no "emotional_voice"; "tom" is one of its built-in voices.
    # load_voice returns reference clips and latents used to condition the model.
    voice_samples, conditioning_latents = load_voice(voice)
    gen = tts.tts_with_preset(
        text, voice_samples=voice_samples,
        conditioning_latents=conditioning_latents, preset="standard"
    )
    return gen.squeeze().cpu().numpy()  # 1-D float waveform at 24 kHz

def extract_keywords(text, top_n=10):
    vectorizer = CountVectorizer(stop_words="english")
    word_counts = vectorizer.fit_transform([text])
    keywords = sorted(
        zip(vectorizer.get_feature_names_out(), word_counts.toarray()[0]),
        key=lambda x: x[1], reverse=True
    )[:top_n]
    return [word for word, count in keywords]

def analyze_sentiment(text):
    return sia.polarity_scores(text)

# Streamlit App Interface
st.title("Enhanced PDF to Audiobook App")
st.markdown("### Turn documents into interactive audiobooks with advanced features.")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    with st.spinner("Extracting and cleaning PDF content..."):
        raw_text = extract_text_from_pdf(uploaded_file)
        cleaned_text = clean_text(raw_text)
    st.text_area("Extracted Text", cleaned_text[:5000], height=300, help="Displaying first 5000 characters.")

    if st.button("Summarize Document"):
        with st.spinner("Summarizing document..."):
            # Keep the summary in session state so it survives the rerun that
            # clicking "Convert Summary to Audiobook" triggers.
            st.session_state["summary"] = summarize_large_document(cleaned_text, max_length=800)

    if "summary" in st.session_state:
        st.text_area("Summary", st.session_state["summary"], height=300)
        if st.button("Convert Summary to Audiobook"):
            with st.spinner("Generating audio..."):
                audio = text_to_speech(st.session_state["summary"])
            st.audio(audio, sample_rate=24000)  # raw waveform needs an explicit sample rate

    st.markdown("### Ask Questions About the Document")
    question = st.text_input("Your Question:")
    if question:
        with st.spinner("Answering your question..."):
            answer = answer_question_with_context(question, cleaned_text)
        st.write(f"**Answer:** {answer}")
        if st.button("Convert Answer to Audio"):
            with st.spinner("Generating answer audio..."):
                answer_audio = text_to_speech(answer)
            st.audio(answer_audio, sample_rate=24000)

    st.markdown("### Document Insights")
    if st.checkbox("Extract Keywords"):
        with st.spinner("Extracting keywords..."):
            keywords = extract_keywords(cleaned_text)
        st.write("Keywords:", ", ".join(keywords))

    if st.checkbox("Analyze Sentiment"):
        with st.spinner("Analyzing sentiment..."):
            sentiment = analyze_sentiment(cleaned_text)
        st.write("Sentiment Analysis:", sentiment)
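Usage note: with the imported dependencies installed (streamlit, pdfplumber, transformers, tortoise-tts, scikit-learn, nltk, plus torch, which Qwen and Tortoise require), the app starts with `streamlit run app.py`. The first run downloads Qwen/Qwen-7B, which is large, and both the summarizer and Tortoise are impractical without a GPU.

The greedy chunking logic above can also be sanity-checked outside the app. The sketch below reimplements the same algorithm standalone; "gpt2" is only a lightweight stand-in tokenizer for Qwen/Qwen-7B so the snippet runs quickly, and is not part of this commit:

# Minimal standalone sketch of chunk_text's greedy word-level splitting.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer (assumption)

def greedy_chunks(text, max_tokens=16):
    chunks, cur, count = [], [], 0
    for w in text.split():
        n = len(tok(w)["input_ids"])
        if count + n > max_tokens:  # budget exceeded: close the current chunk
            chunks.append(" ".join(cur))
            cur, count = [], 0
        cur.append(w)
        count += n
    if cur:
        chunks.append(" ".join(cur))
    return chunks

sample = "the quick brown fox jumps over the lazy dog " * 5
for i, c in enumerate(greedy_chunks(sample)):
    print(i, len(tok(c)["input_ids"]), c)

The printed per-chunk token counts typically stay at or below the budget, because summing per-word counts tends to over-estimate the token count of the joined chunk.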