import streamlit as st
import pdfplumber
import re
import nltk
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from gtts import gTTS
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer needs the VADER lexicon; fetch it once if it is missing.
nltk.download("vader_lexicon", quiet=True)

# Initialize necessary components
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
qa_pipeline = pipeline("question-answering")
sia = SentimentIntensityAnalyzer()

# Helper functions
def extract_text_from_pdf(file):
    with pdfplumber.open(file) as pdf:
        # extract_text() returns None for image-only pages, so fall back to an empty string.
        text = ''.join([(page.extract_text() or '') for page in pdf.pages])
    return text

def clean_text(text):
    text = re.sub(r'\s*Page \d+\s*', '', text)  # Remove page numbers
    return text.strip()

# Greedily pack whitespace-separated words into chunks that fit the model's token budget.
def chunk_text(text, max_tokens=1024):
    words = text.split()
    chunks, current_chunk, current_token_count = [], [], 0
    for word in words:
        # Count tokens without special tokens so the running total stays accurate.
        token_count = len(tokenizer(word, add_special_tokens=False)['input_ids'])
        if current_token_count + token_count > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_token_count = [], 0
        current_chunk.append(word)
        current_token_count += token_count
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Prompt the causal LM with a "summarize:" instruction and decode only the newly generated tokens.
def summarize_text_qwen(text, max_length=800):
    input_text = f"summarize: {text}"
    tokens = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
    summary_ids = model.generate(
        tokens["input_ids"], max_new_tokens=max_length, min_new_tokens=200,
        length_penalty=2.0, num_beams=4, early_stopping=True
    )
    # generate() echoes the prompt for causal LMs, so strip the input tokens before decoding.
    generated = summary_ids[0][tokens["input_ids"].shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)

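# Map-style summarization: summarize each chunk independently, then join the partial summaries.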
def summarize_large_document(text, max_length=800):
    chunks = chunk_text(text)
    summaries = [summarize_text_qwen(chunk, max_length=max_length) for chunk in chunks]
    return " ".join(summaries)

def answer_question_with_context(question, context, chunk_size=500):
    chunks = chunk_text(context, max_tokens=chunk_size)
    answers = []
    for chunk in chunks:
        try:
            answers.append(qa_pipeline({'question': question, 'context': chunk})['answer'])
        except Exception:
            # Skip chunks the QA pipeline cannot handle (e.g. empty or oversized text).
            continue
    return " ".join(answers)

# Replace Tortoise-TTS with gTTS for text-to-speech functionality
def text_to_speech(text, language="en"):
    tts = gTTS(text=text, lang=language, slow=False)
    file_name = "output_audio.mp3"
    tts.save(file_name)
    return file_name

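# Rank keywords by raw term frequency using CountVectorizer's bag-of-words counts.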
def extract_keywords(text, top_n=10):
    vectorizer = CountVectorizer(stop_words="english")
    word_counts = vectorizer.fit_transform([text])
    keywords = sorted(
        zip(vectorizer.get_feature_names_out(), word_counts.toarray()[0]), 
        key=lambda x: x[1], reverse=True
    )[:top_n]
    return [word for word, count in keywords]

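# VADER sentiment scoring: returns negative, neutral, positive, and compound values for the text.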
def analyze_sentiment(text):
    return sia.polarity_scores(text)

# Streamlit App Interface
st.title("Enhanced PDF to Audiobook App")
st.markdown("### Turn documents into interactive audiobooks with advanced features.")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    with st.spinner("Extracting and cleaning PDF content..."):
        raw_text = extract_text_from_pdf(uploaded_file)
        cleaned_text = clean_text(raw_text)
    st.text_area("Extracted Text", cleaned_text[:5000], height=300, help="Displaying first 5000 characters.")

    if st.button("Summarize Document"):
        with st.spinner("Summarizing document..."):
            summary = summarize_large_document(cleaned_text, max_length=800)
        st.text_area("Summary", summary, height=300)

        if st.button("Convert Summary to Audiobook"):
            with st.spinner("Generating audio..."):
                audio_file = text_to_speech(summary)
            st.audio(audio_file, format="audio/mp3", start_time=0)

    st.markdown("### Ask Questions About the Document")
    question = st.text_input("Your Question:")
    if question:
        with st.spinner("Answering your question..."):
            answer = answer_question_with_context(question, cleaned_text)
        st.write(f"**Answer:** {answer}")
        if st.button("Convert Answer to Audio"):
            with st.spinner("Generating answer audio..."):
                answer_audio_file = text_to_speech(answer)
            st.audio(answer_audio_file, format="audio/mp3", start_time=0)

    st.markdown("### Document Insights")
    if st.checkbox("Extract Keywords"):
        with st.spinner("Extracting keywords..."):
            keywords = extract_keywords(cleaned_text)
        st.write("Keywords:", ", ".join(keywords))

    if st.checkbox("Analyze Sentiment"):
        with st.spinner("Analyzing sentiment..."):
            sentiment = analyze_sentiment(cleaned_text)
        st.write("Sentiment Analysis:", sentiment)