raphgonda committed on
Commit 29bc38c · verified
1 Parent(s): 10e3728

Update app.py

Files changed (1)
  1. app.py +333 -0
app.py CHANGED
@@ -0,0 +1,333 @@
+ import gradio as gr
+ import pandas as pd
+ import cv2
+ import torch
+ import tempfile
+ import os
+ import librosa
+ from fer import FER
+ from transformers import AutoModelForAudioClassification, pipeline
+ from moviepy.editor import VideoFileClip, AudioFileClip
+ import numpy as np
+ from torch.nn.functional import softmax
+ import whisper_timestamped as whisper
+ from translate import Translator
+
+ # Load pre-trained models
+ audio_model = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes", trust_remote_code=True)
+ face_detector = FER(mtcnn=True)
+ classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)
+
+ # Set mean and std for audio model
+ mean = audio_model.config.mean
+ std = audio_model.config.std
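+ # Note: mean, std, and sampling_rate are assumed to be exposed by this model's custom config
+ # (loaded with trust_remote_code=True); they are used below to normalize and window the audio.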
+
+ # Function to extract audio from video for audio emotion analysis
+ def extract_audio_from_video(video_path):
+     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
+         video_clip = VideoFileClip(video_path)
+         audio_clip = video_clip.audio
+         audio_clip.write_audiofile(temp_audio_file.name, codec="pcm_s16le")
+     return temp_audio_file.name
+
+ # Function to perform audio emotion detection per second
+ def process_audio_and_detect_emotions(audio_clip):
+     audio_np = np.array(audio_clip)
+     mask = torch.ones(1, len(audio_np))
+     wavs = torch.tensor(audio_np).unsqueeze(0)
+
+     with torch.no_grad():
+         pred = audio_model(wavs, mask)
+     logits = pred.logits if hasattr(pred, 'logits') else pred[0]
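+     # Keep seven of the model's output classes (index 6 is not used here) and renormalize them to sum to 1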
+     labels = {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise', 4: 'Fear', 5: 'Disgust', 7: 'Neutral'}
+     probabilities = softmax(logits, dim=-1).squeeze(0)[[0, 1, 2, 3, 4, 5, 7]]
+     probabilities = probabilities / probabilities.sum()
+     df = pd.DataFrame([probabilities.numpy()], columns=labels.values())
+     return df
+
+ # Function to analyze audio emotions
+ def analyze_audio_emotions(video_path):
+     temp_audio_path = None
+     try:
+         temp_audio_path = extract_audio_from_video(video_path)
+         raw_wav, _ = librosa.load(temp_audio_path, sr=audio_model.config.sampling_rate)
+         norm_wav = (raw_wav - mean) / (std + 0.000001)
+
+         times = []
+         emotions_dfs = []
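+         # Walk the normalized waveform in non-overlapping one-second windows (the final window may be shorter)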
+         for start_time in range(0, len(norm_wav), audio_model.config.sampling_rate):
+             audio_segment = norm_wav[start_time:start_time + audio_model.config.sampling_rate]
+             df = process_audio_and_detect_emotions(audio_segment)
+             times.append(start_time / audio_model.config.sampling_rate)
+             emotions_dfs.append(df)
+
+         emotions_df = pd.concat(emotions_dfs, ignore_index=True)
+         emotions_df.insert(0, "Time(s)", times)
+         emotion_rename_map = {'Angry': 'anger', 'Sad': 'sadness', 'Happy': 'happy', 'Surprise': 'surprise', 'Fear': 'fear', 'Disgust': 'disgust', 'Neutral': 'neutral'}
+         emotions_df.rename(columns=emotion_rename_map, inplace=True)
+
+         emotions_xlsx_path = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False).name
+         emotions_df.to_excel(emotions_xlsx_path, index=False)
+
+         return "Audio emotion detection completed successfully.", emotions_df, emotions_xlsx_path
+
+     except Exception as e:
+         return f"Error during audio emotion detection: {str(e)}", None, None
+     finally:
+         if temp_audio_path and os.path.exists(temp_audio_path):
+             os.remove(temp_audio_path)
+
+ # Function to detect facial emotions
+ def detect_faces_and_emotions(video_path):
+     temp_video_path = None
+     temp_audio_path = None
+     output_video_path = None
+     emotions_data = []
+     try:
+         temp_video = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
+         temp_video_path = temp_video.name
+         temp_audio = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
+         temp_audio_path = temp_audio.name
+         output_xlsx = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
+         output_xlsx_path = output_xlsx.name
+
+         original_video = VideoFileClip(video_path)
+         original_audio = original_video.audio
+         original_audio.write_audiofile(temp_audio_path)
+
+         cap = cv2.VideoCapture(video_path)
+         if not cap.isOpened():
+             raise Exception("Error: Could not open video file.")
+
+         fps = int(cap.get(cv2.CAP_PROP_FPS))
+         frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+         fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+         out = cv2.VideoWriter(temp_video_path, fourcc, fps, (frame_width, frame_height))
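+         # Annotated frames are written to a temporary video first; the original audio is re-attached with moviepy below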
+
+         frame_number = 0
+         while cap.isOpened():
+             ret, frame = cap.read()
+             if not ret:
+                 break
+             if frame is None:
+                 continue
+
+             time_seconds = round(frame_number / fps)
+             result = face_detector.detect_emotions(frame)
+
+             for face in result:
+                 bounding_box = face["box"]
+                 emotions = face["emotions"]
+                 cv2.rectangle(frame, (bounding_box[0], bounding_box[1]),
+                               (bounding_box[0] + bounding_box[2], bounding_box[1] + bounding_box[3]), (0, 155, 255), 2)
+                 for index, (emotion_name, score) in enumerate(emotions.items()):
+                     color = (211, 211, 211) if score < 0.01 else (255, 0, 0)
+                     emotion_score = "{}: {:.2f}".format(emotion_name, score)
+                     cv2.putText(frame, emotion_score, (bounding_box[0], bounding_box[1] + bounding_box[3] + 30 + index * 15),
+                                 cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1, cv2.LINE_AA)
+                 # Record the timestamp after drawing so "Time(s)" is not rendered as an emotion label
+                 emotions["Time(s)"] = time_seconds
+                 emotions_data.append(emotions)
+
+             out.write(frame)
+             frame_number += 1
+
+         cap.release()
+         out.release()
+
+         emotions_df = pd.DataFrame(emotions_data)
+         emotions_df['Time(s)'] = emotions_df['Time(s)'].round().astype(int)
+         max_time = emotions_df['Time(s)'].max()
+         all_times = pd.DataFrame({'Time(s)': range(max_time + 1)})
+         avg_scores = emotions_df.groupby("Time(s)").mean().reset_index()
+         df_merged = pd.merge(all_times, avg_scores, on='Time(s)', how='left')
+         df_merged.fillna(0, inplace=True)
+         df_merged['Time(s)'] = df_merged['Time(s)'].astype(str) + " sec"
+         df_merged.to_excel(output_xlsx_path, index=False)
+
+         processed_video = VideoFileClip(temp_video_path)
+         audio = AudioFileClip(temp_audio_path)
+         final_video = processed_video.set_audio(audio)
+         output_video = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
+         output_video_path = output_video.name
+         final_video.write_videofile(output_video_path, codec='libx264')
+
+         return "Face and emotion detection completed successfully.", df_merged, output_xlsx_path, output_video_path
+
+     except Exception as e:
+         return f"Error during processing: {str(e)}", None, None, None
+     finally:
+         if temp_video_path and os.path.exists(temp_video_path):
+             os.remove(temp_video_path)
+         if temp_audio_path and os.path.exists(temp_audio_path):
+             os.remove(temp_audio_path)
+
+ # Function to analyze text emotions
+ def process_video_text(video_path):
+     temp_audio_path = None
+     try:
+         video_clip = VideoFileClip(video_path)
+         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
+             temp_audio_path = temp_audio_file.name
+             video_clip.audio.write_audiofile(temp_audio_path)
+
+         audio = whisper.load_audio(temp_audio_path)
+         model = whisper.load_model("medium", device="cpu")
+         result = whisper.transcribe(model, audio)
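+         # whisper_timestamped returns segments that also carry word-level timestamps and confidence scores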
+
+         # Create lists to store word-level data with timestamps
+         word_texts = []
+         word_starts = []
+         word_ends = []
+         word_confidences = []
+
+         for segment in result['segments']:
+             for word in segment['words']:
+                 word_texts.append(word['text'])
+                 word_starts.append(word['start'])
+                 word_ends.append(word['end'])
+                 word_confidences.append(word['confidence'])
+
+         # Create segments DataFrame
+         segments_data = [{'text': seg['text'], 'start': seg['start'], 'end': seg['end'], 'confidence': seg['confidence']} for seg in result['segments']]
+         segments_df = pd.DataFrame(segments_data)
+
+         # Translate from Korean to English
+         translator = Translator(from_lang='ko', to_lang='en')
+         segments_df['Translated_Text'] = segments_df['text'].apply(lambda x: translator.translate(x))
+
+         # Apply the sentiment analysis model to the translated text
+         segments_df['Sentiment_Scores'] = segments_df['Translated_Text'].apply(lambda x: {entry['label']: entry['score'] for entry in classifier(x)[0]})
+
+         # Split the sentiment scores into individual columns
+         sentiment_df = segments_df['Sentiment_Scores'].apply(pd.Series)
+         sentiment_df = pd.concat([segments_df, sentiment_df], axis=1)
+
+         # Create words DataFrame
+         words_data = {
+             'text': word_texts,
+             'start': word_starts,
+             'end': word_ends,
+             'confidence': word_confidences
+         }
+         words_df = pd.DataFrame(words_data)
+
+         # Round up the start time to the next second
+         words_df['second'] = words_df['start'].apply(lambda x: int(np.ceil(x)))
+
+         # Group words by second, concatenating words that belong to the same second
+         words_grouped = words_df.groupby('second').agg({
+             'text': lambda x: ' '.join(x),
+             'start': 'min',
+             'end': 'max',
+             'confidence': 'mean'
+         }).reset_index()
+
+         # Fill in missing seconds
+         max_second = int(video_clip.duration)  # The last second in the video
+         all_seconds = pd.DataFrame({'second': np.arange(0, max_second + 1)})  # Start from 0 and go to the maximum second
+         words_grouped = all_seconds.merge(words_grouped, on='second', how='left')
+
+         # Fill missing values with blanks or zeros
+         words_grouped['text'].fillna('', inplace=True)
+         words_grouped.fillna(0, inplace=True)
+
+         # Initialize emotion columns with NaN values
+         emotion_columns = sentiment_df.columns.difference(['text', 'start', 'end', 'confidence', 'Translated_Text', 'Sentiment_Scores'])
+         for col in emotion_columns:
+             words_grouped[col] = np.nan
+
+         # For each second, find the corresponding segment and copy its emotion scores
+         for i, row in words_grouped.iterrows():
+             matching_segment = sentiment_df[(sentiment_df['start'] <= row['start']) & (sentiment_df['end'] >= row['end'])]
+             if not matching_segment.empty:
+                 for emotion in emotion_columns:
+                     words_grouped.at[i, emotion] = matching_segment.iloc[0][emotion]
+
+         # Replace any NaN values in emotion columns with 0
+         words_grouped[emotion_columns] = words_grouped[emotion_columns].fillna(0)
+
+         # Save DataFrames to XLSX files
+         segments_xlsx_path = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False).name
+         words_xlsx_path = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False).name
+         sentiment_df.to_excel(segments_xlsx_path, index=False)
+         words_grouped.to_excel(words_xlsx_path, index=False)
+
+         return words_grouped, sentiment_df, words_xlsx_path, segments_xlsx_path, "Text emotion processing completed successfully!"
+
+     except Exception as e:
+         return None, None, None, None, f"Error during text emotion processing: {str(e)}"
+     finally:
+         if temp_audio_path and os.path.exists(temp_audio_path):
+             os.remove(temp_audio_path)
+
+ # Gradio App
+ def gradio_app():
+     interface = gr.Blocks()
+
+     with interface:
+         gr.Markdown("## I-MEQ: Emotion Monitoring System")
+         video_input = gr.Video(label="Upload your video for analysis", height=600)
+
+         with gr.Row():
+             analyze_audio_button = gr.Button("Analyze Audio Emotions")
+             analyze_fer_button = gr.Button("Analyze Facial Emotions")
+             analyze_text_button = gr.Button("Transcribe & Analyze Textual Emotions")
+
+         with gr.Row():
+             with gr.Column():
+                 audio_analysis_status = gr.Textbox(label="Audio Emotion Analysis Status")
+                 audio_emotions_dataframe = gr.Dataframe(label="Audio Emotions DataFrame", interactive=False)
+                 audio_emotions_xlsx_download = gr.File(label="Download Audio Emotions XLSX")
+
+             with gr.Column():
+                 fer_analysis_status = gr.Textbox(label="Facial Emotion Analysis Status")
+                 fer_emotions_dataframe = gr.Dataframe(label="Facial Emotions DataFrame", interactive=False)
+                 fer_emotions_xlsx_download = gr.File(label="Download Facial Emotions XLSX")
+                 processed_video_download = gr.File(label="Download Processed Video")
+
+             with gr.Column():
+                 text_analysis_status = gr.Textbox(label="Text Sentiment Analysis Status")
+                 words_dataframe = gr.Dataframe(label="Words DataFrame", interactive=False)
+                 segments_dataframe = gr.Dataframe(label="Segments DataFrame", interactive=False)
+                 words_xlsx_download = gr.File(label="Download Words XLSX")
+                 segments_xlsx_download = gr.File(label="Download Segments XLSX")
+
+         analyze_audio_button.click(
+             analyze_audio_emotions,
+             inputs=video_input,
+             outputs=[
+                 audio_analysis_status,
+                 audio_emotions_dataframe,
+                 audio_emotions_xlsx_download
+             ]
+         )
+
+         analyze_fer_button.click(
+             detect_faces_and_emotions,
+             inputs=video_input,
+             outputs=[
+                 fer_analysis_status,
+                 fer_emotions_dataframe,
+                 fer_emotions_xlsx_download,
+                 processed_video_download
+             ]
+         )
+
+         analyze_text_button.click(
+             process_video_text,
+             inputs=video_input,
+             outputs=[
+                 words_dataframe,
+                 segments_dataframe,
+                 words_xlsx_download,
+                 segments_xlsx_download,
+                 text_analysis_status
+             ]
+         )
+
+     interface.launch()
+
+ # Start the Gradio app
+ gradio_app()