vakodiya committed on
Commit
73866b8
·
1 Parent(s): 67e2cbc

Audio question and audio response

Browse files
Files changed (6) hide show
  1. SDLC.wav +0 -0
  2. app.py +18 -17
  3. audio_to_text.py +23 -0
  4. generate_answer.py +19 -0
  5. requirements.txt +4 -1
  6. text_to_audio.py +22 -0
SDLC.wav ADDED
Binary file (203 kB). View file
 
app.py CHANGED
@@ -1,26 +1,27 @@
1
  import streamlit as st
2
- from transformers import GPT2Tokenizer, GPT2LMHeadModel
3
- from langchain.prompts import PromptTemplate
 
 
4
 
5
- tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
6
- model = GPT2LMHeadModel.from_pretrained('gpt2')
7
 
8
  st.title("GPT 2 Chat Bot?")
9
- input_text = st.text_area("Enter text to classify:")
10
-
11
 
 
12
  if st.button("""Enter >>>>> """):
13
  if input_text:
14
- prompt_template = PromptTemplate(template="Answer the following question and classify it: {question}",
15
- input_variables=["question"], output_variables=["answer", "classification"])
16
- # Model loading
17
- format_prompt = prompt_template.format(question=input_text)
18
- encoded_input = tokenizer(format_prompt, return_tensors='pt')
19
- # Run the model
20
- output = model.generate(**encoded_input, max_length=100) # Use generate method for text generation
21
- # Decode the model output to text
22
- decoded_output = tokenizer.decode(output[0])
23
- response_text = decoded_output.split('\n\n')
24
- st.write(response_text[1])
25
 
26
 
 
1
  import streamlit as st
2
+ import os
3
+ from generate_answer import generate_answer
4
+ from audio_to_text import audio_to_text
5
+ from text_to_audio import text_to_audio
6
 
7
# Sample audio question shipped with the app. The path is resolved to an
# absolute one once so the audio player and the transcriber both receive the
# same location.
file_path_relative = 'SDLC.wav'
file_path_abs = os.path.abspath(file_path_relative)

st.title("GPT 2 Chat Bot?")

# --- Text question -> written answer -------------------------------------
input_text = st.text_area("Search query:")
if st.button("""Enter >>>>> """):
    if input_text:
        response_text = generate_answer(input_text)
        st.write(response_text)

# --- Audio question -> spoken and written answer --------------------------
st.audio(data=file_path_abs, format='audio/wav')
if st.button("""Use Audio question """):
    audio_text = audio_to_text(file_path_abs)
    # audio_to_text hands back the result of ``batch_decode`` (a list of
    # strings). Join it into one plain string so the prompt contains the
    # spoken question itself rather than the list's repr ("['...']").
    if not isinstance(audio_text, str):
        audio_text = " ".join(audio_text)
    audio_text = audio_text.strip()

    if audio_text:
        answer = generate_answer(audio_text)
        audio_bytes = text_to_audio(answer)
        st.audio(audio_bytes, format="audio/wav")
        st.write(answer)
    else:
        # Nothing was transcribed; do not prompt the model with an empty question.
        st.warning("Could not transcribe any speech from the audio file.")
25
+
26
 
27
 
audio_to_text.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
2
+ import torchaudio
3
+
4
+
5
# Load the Whisper processor and model once at import time so every call to
# audio_to_text() reuses them.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
# Clear any forced decoder prompt ids stored in the checkpoint's config.
model.config.forced_decoder_ids = None

# Sampling rate (Hz) the audio is resampled to and that is reported to the processor.
TARGET_SAMPLE_RATE = 16000


def audio_to_text(file_path_abs):
    """Transcribe an audio file with the module-level Whisper model.

    Args:
        file_path_abs: Path of the audio file to transcribe (anything
            ``torchaudio.load`` accepts).

    Returns:
        The list of decoded transcription strings produced by
        ``processor.batch_decode`` (one entry per decoded sequence).
    """
    # Load the audio and resample it to the rate passed to the processor.
    waveform, sample_rate = torchaudio.load(file_path_abs)
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
    waveform = resampler(waveform)

    # torchaudio returns a (channels, frames) tensor. A multi-channel file
    # would otherwise reach the processor as a 2-D array and be interpreted
    # as a batch of separate clips, so down-mix to a single channel first.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)
    samples = waveform.numpy()

    input_features = processor(
        samples, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
    ).input_features

    # Generate token ids, then decode them to text.
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription
generate_answer.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel
2
+ from langchain.prompts import PromptTemplate
3
+
4
# Load the GPT-2 tokenizer and model once at import time; generate_answer()
# reuses them on every call.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# The prompt template never changes, so build it once instead of per call.
# NOTE(review): the former ``output_variables`` keyword was dropped because it
# does not appear to be a PromptTemplate field -- confirm against langchain docs.
_PROMPT_TEMPLATE = PromptTemplate(
    template="Answer the following question within 100 words: {question}",
    input_variables=["question"],
)

# Upper bound on the number of tokens generated for the answer itself.
_MAX_NEW_TOKENS = 100


def generate_answer(question):
    """Generate a GPT-2 continuation for *question* and return it as text.

    Args:
        question: The user's question; it is inserted into the prompt template.

    Returns:
        The generated text only: the prompt is not echoed back and special
        tokens such as the end-of-text marker are removed.
    """
    format_prompt = _PROMPT_TEMPLATE.format(question=question)

    # Truncate over-long prompts so prompt + answer fits in the model's context.
    encoded_input = tokenizer(
        format_prompt,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.n_positions - _MAX_NEW_TOKENS,
    )
    prompt_length = encoded_input["input_ids"].shape[1]

    # ``max_new_tokens`` bounds the answer alone. The previous ``max_length=100``
    # also counted the prompt tokens, so a long question left little or no room
    # for an answer. GPT-2 has no pad token, so the EOS id is passed explicitly.
    output = model.generate(
        **encoded_input,
        max_new_tokens=_MAX_NEW_TOKENS,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The output sequence begins with the prompt tokens; decode only what the
    # model generated after them.
    generated_ids = output[0][prompt_length:]
    response_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    return response_text
requirements.txt CHANGED
@@ -4,4 +4,7 @@ langchain-community==0.2.5
4
  python-multipart==0.0.9
5
  transformers==4.41.2
6
  torch==2.3.1
7
- tensorflow==2.16.2
 
 
 
 
4
  python-multipart==0.0.9
5
  transformers==4.41.2
6
  torch==2.3.1
7
+ tensorflow==2.16.2
8
+ sentencepiece
9
+ soundfile
10
+ datasets
text_to_audio.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ from datasets import load_dataset
3
+ import soundfile as sf
4
+ import torch
5
+
6
# Build the text-to-speech pipeline once at import time; text_to_audio()
# reuses it on every call.
synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")

# Number of characters of the cleaned text that are sent to the synthesiser.
# (The original comment mentions a maximum size of 600; the code trims to 590.)
_MAX_TTS_CHARS = 590
# Row of the x-vector dataset used as the speaker voice.
_SPEAKER_INDEX = 7306
# Lazily populated cache so the embeddings dataset is loaded only once.
_speaker_embedding = None


def _get_speaker_embedding():
    """Return the speaker x-vector tensor, loading the dataset on first use."""
    global _speaker_embedding
    if _speaker_embedding is None:
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # You can replace this embedding with your own as well.
        _speaker_embedding = torch.tensor(
            embeddings_dataset[_SPEAKER_INDEX]["xvector"]
        ).unsqueeze(0)
    return _speaker_embedding


def text_to_audio(text):
    """Synthesise *text* to speech and return the result as WAV bytes.

    The audio is encoded in memory, so no ``output.wav`` file is written to
    the working directory and no file handle is left open.

    Args:
        text: The text to speak. Newlines become spaces, ``*`` characters are
            removed, and only the first ``_MAX_TTS_CHARS`` characters are used.

    Returns:
        The bytes of a WAV file containing the synthesised speech.
    """
    import io

    # Replace newlines with a space so words on adjacent lines are not glued
    # together, and drop markdown-style asterisks.
    text_clean = text.replace('\n', ' ').replace('*', '')
    text_trimmed = text_clean[:_MAX_TTS_CHARS]

    speech = synthesiser(
        text_trimmed,
        forward_params={"speaker_embeddings": _get_speaker_embedding()},
    )

    # soundfile needs an explicit format when writing to a file-like object.
    buffer = io.BytesIO()
    sf.write(buffer, speech["audio"], samplerate=speech["sampling_rate"], format="WAV")
    audio_bytes = buffer.getvalue()
    return audio_bytes