Update app.py
app.py
CHANGED
@@ -0,0 +1,333 @@
import gradio as gr
import pandas as pd
import cv2
import torch
import tempfile
import os
import librosa
from fer import FER
from transformers import AutoModelForAudioClassification, pipeline
from moviepy.editor import VideoFileClip, AudioFileClip
import numpy as np
from torch.nn.functional import softmax
import whisper_timestamped as whisper
from translate import Translator

# Load pre-trained models
audio_model = AutoModelForAudioClassification.from_pretrained("3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes", trust_remote_code=True)
face_detector = FER(mtcnn=True)
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)

# Set mean and std for audio model
mean = audio_model.config.mean
std = audio_model.config.std

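# The three models above back the three analysis pipelines wired into the Gradio UI below:
# the WavLM classifier scores speech emotions per second of audio, FER scores facial
# emotions per frame, and the DistilRoBERTa classifier scores emotions in the transcribed
# (and translated) text.
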
# Function to extract audio from video for audio emotion analysis
def extract_audio_from_video(video_path):
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
        video_clip = VideoFileClip(video_path)
        audio_clip = video_clip.audio
        audio_clip.write_audiofile(temp_audio_file.name, codec="pcm_s16le")
        return temp_audio_file.name

# Function to perform audio emotion detection per second
def process_audio_and_detect_emotions(audio_clip):
    audio_np = np.array(audio_clip)
    mask = torch.ones(1, len(audio_np))
    wavs = torch.tensor(audio_np).unsqueeze(0)

    with torch.no_grad():
        pred = audio_model(wavs, mask)
    logits = pred.logits if hasattr(pred, 'logits') else pred[0]
    labels = {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise', 4: 'Fear', 5: 'Disgust', 7: 'Neutral'}
    probabilities = softmax(logits, dim=-1).squeeze(0)[[0, 1, 2, 3, 4, 5, 7]]
    probabilities = probabilities / probabilities.sum()
    df = pd.DataFrame([probabilities.numpy()], columns=labels.values())
    return df

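# Minimal usage sketch (illustrative only, assuming a mono clip already normalized as in
# analyze_audio_emotions below):
#   one_second = norm_wav[:audio_model.config.sampling_rate]
#   df = process_audio_and_detect_emotions(one_second)
# The result is a one-row DataFrame whose seven columns (Angry ... Neutral) sum to 1.
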
# Function to analyze audio emotions
def analyze_audio_emotions(video_path):
    temp_audio_path = None
    try:
        temp_audio_path = extract_audio_from_video(video_path)
        raw_wav, _ = librosa.load(temp_audio_path, sr=audio_model.config.sampling_rate)
        norm_wav = (raw_wav - mean) / (std + 0.000001)

        times = []
        emotions_dfs = []
        for start_time in range(0, len(norm_wav), audio_model.config.sampling_rate):
            audio_segment = norm_wav[start_time:start_time + audio_model.config.sampling_rate]
            df = process_audio_and_detect_emotions(audio_segment)
            times.append(start_time / audio_model.config.sampling_rate)
            emotions_dfs.append(df)

        emotions_df = pd.concat(emotions_dfs, ignore_index=True)
        emotions_df.insert(0, "Time(s)", times)
        emotion_rename_map = {'Angry': 'anger', 'Sad': 'sadness', 'Happy': 'happy', 'Surprise': 'surprise', 'Fear': 'fear', 'Disgust': 'disgust', 'Neutral': 'neutral'}
        emotions_df.rename(columns=emotion_rename_map, inplace=True)

        emotions_xlsx_path = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False).name
        emotions_df.to_excel(emotions_xlsx_path, index=False)

        return "Audio emotion detection completed successfully.", emotions_df, emotions_xlsx_path

    except Exception as e:
        return f"Error during audio emotion detection: {str(e)}", None, None
    finally:
        if temp_audio_path and os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)

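# Usage sketch (the file name is hypothetical):
#   status, audio_df, audio_xlsx = analyze_audio_emotions("interview.mp4")
# audio_df holds one row of emotion probabilities per second of audio; audio_xlsx is the
# same table exported as an .xlsx file.
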
# Function to detect facial emotions
def detect_faces_and_emotions(video_path):
    temp_video_path = None
    temp_audio_path = None
    output_video_path = None
    emotions_data = []
    try:
        temp_video = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
        temp_video_path = temp_video.name
        temp_audio = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        temp_audio_path = temp_audio.name
        output_xlsx = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
        output_xlsx_path = output_xlsx.name

        # Write the original audio track out so it can be re-attached to the annotated video later
        original_video = VideoFileClip(video_path)
        original_audio = original_video.audio
        original_audio.write_audiofile(temp_audio_path)

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise Exception("Error: Could not open video file.")

        fps = int(cap.get(cv2.CAP_PROP_FPS))
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(temp_video_path, fourcc, fps, (frame_width, frame_height))

        frame_number = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame is None:
                continue

            time_seconds = round(frame_number / fps)
            result = face_detector.detect_emotions(frame)

            # Annotate each detected face with a bounding box and its emotion scores
            for face in result:
                bounding_box = face["box"]
                emotions = face["emotions"]
                emotions["Time(s)"] = time_seconds
                emotions_data.append(emotions)
                cv2.rectangle(frame, (bounding_box[0], bounding_box[1]),
                              (bounding_box[0] + bounding_box[2], bounding_box[1] + bounding_box[3]), (0, 155, 255), 2)
                for index, (emotion_name, score) in enumerate(emotions.items()):
                    color = (211, 211, 211) if score < 0.01 else (255, 0, 0)
                    emotion_score = "{}: {:.2f}".format(emotion_name, score)
                    cv2.putText(frame, emotion_score, (bounding_box[0], bounding_box[1] + bounding_box[3] + 30 + index * 15),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1, cv2.LINE_AA)

            out.write(frame)
            frame_number += 1

        cap.release()
        out.release()

        # Average the per-frame scores into one row per second, filling seconds with no detections with 0
        emotions_df = pd.DataFrame(emotions_data)
        emotions_df['Time(s)'] = emotions_df['Time(s)'].round().astype(int)
        max_time = emotions_df['Time(s)'].max()
        all_times = pd.DataFrame({'Time(s)': range(max_time + 1)})
        avg_scores = emotions_df.groupby("Time(s)").mean().reset_index()
        df_merged = pd.merge(all_times, avg_scores, on='Time(s)', how='left')
        df_merged.fillna(0, inplace=True)
        df_merged['Time(s)'] = df_merged['Time(s)'].astype(str) + " sec"
        df_merged.to_excel(output_xlsx_path, index=False)

        # Re-attach the original audio to the annotated frames and export the final video
        processed_video = VideoFileClip(temp_video_path)
        audio = AudioFileClip(temp_audio_path)
        final_video = processed_video.set_audio(audio)
        output_video = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
        output_video_path = output_video.name
        final_video.write_videofile(output_video_path, codec='libx264')

        return "Face and emotion detection completed successfully.", df_merged, output_xlsx_path, output_video_path

    except Exception as e:
        return f"Error during processing: {str(e)}", None, None, None
    finally:
        if temp_video_path and os.path.exists(temp_video_path):
            os.remove(temp_video_path)
        if temp_audio_path and os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)

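# Usage sketch (the file name is hypothetical):
#   status, fer_df, fer_xlsx, annotated_mp4 = detect_faces_and_emotions("interview.mp4")
# fer_df holds per-second averages of the facial emotion scores; annotated_mp4 is the input
# video re-encoded with bounding boxes and scores drawn on every detected face.
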
# Function to analyze text emotions
def process_video_text(video_path):
    temp_audio_path = None
    try:
        video_clip = VideoFileClip(video_path)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
            temp_audio_path = temp_audio_file.name
            video_clip.audio.write_audiofile(temp_audio_path)

        audio = whisper.load_audio(temp_audio_path)
        model = whisper.load_model("medium", device="cpu")
        result = whisper.transcribe(model, audio)

        # Create lists to store word-level data with timestamps
        word_texts = []
        word_starts = []
        word_ends = []
        word_confidences = []

        for segment in result['segments']:
            for word in segment['words']:
                word_texts.append(word['text'])
                word_starts.append(word['start'])
                word_ends.append(word['end'])
                word_confidences.append(word['confidence'])

        # Create segments DataFrame
        segments_data = [{'text': seg['text'], 'start': seg['start'], 'end': seg['end'], 'confidence': seg['confidence']} for seg in result['segments']]
        segments_df = pd.DataFrame(segments_data)

        # Translate from Korean to English
        translator = Translator(from_lang='ko', to_lang='en')
        segments_df['Translated_Text'] = segments_df['text'].apply(lambda x: translator.translate(x))

        # Apply the sentiment analysis model to the translated text
        segments_df['Sentiment_Scores'] = segments_df['Translated_Text'].apply(lambda x: {entry['label']: entry['score'] for entry in classifier(x)[0]})

        # Split the sentiment scores into individual columns
        sentiment_df = segments_df['Sentiment_Scores'].apply(pd.Series)
        sentiment_df = pd.concat([segments_df, sentiment_df], axis=1)

        # Create words DataFrame
        words_data = {
            'text': word_texts,
            'start': word_starts,
            'end': word_ends,
            'confidence': word_confidences
        }
        words_df = pd.DataFrame(words_data)

        # Round up the start time to the next second
        words_df['second'] = words_df['start'].apply(lambda x: int(np.ceil(x)))

        # Group words by second, concatenating words that belong to the same second
        words_grouped = words_df.groupby('second').agg({
            'text': lambda x: ' '.join(x),
            'start': 'min',
            'end': 'max',
            'confidence': 'mean'
        }).reset_index()

        # Fill in missing seconds
        max_second = int(video_clip.duration)  # The last second in the video
        all_seconds = pd.DataFrame({'second': np.arange(0, max_second + 1)})  # Start from 0 and go to the maximum second
        words_grouped = all_seconds.merge(words_grouped, on='second', how='left')

        # Fill missing values with blanks or zeros
        words_grouped['text'].fillna('', inplace=True)
        words_grouped.fillna(0, inplace=True)

        # Initialize emotion columns with NaN values
        emotion_columns = sentiment_df.columns.difference(['text', 'start', 'end', 'confidence', 'Translated_Text', 'Sentiment_Scores'])
        for col in emotion_columns:
            words_grouped[col] = np.nan

        # For each second, find the corresponding segment and copy its emotion scores
        for i, row in words_grouped.iterrows():
            matching_segment = sentiment_df[(sentiment_df['start'] <= row['start']) & (sentiment_df['end'] >= row['end'])]
            if not matching_segment.empty:
                for emotion in emotion_columns:
                    words_grouped.at[i, emotion] = matching_segment.iloc[0][emotion]

        # Replace any NaN values in emotion columns with 0
        words_grouped[emotion_columns] = words_grouped[emotion_columns].fillna(0)

        # Save DataFrames to XLSX files
        segments_xlsx_path = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False).name
        words_xlsx_path = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False).name
        sentiment_df.to_excel(segments_xlsx_path, index=False)
        words_grouped.to_excel(words_xlsx_path, index=False)

        return words_grouped, sentiment_df, words_xlsx_path, segments_xlsx_path, "Text emotion processing completed successfully!"

    except Exception as e:
        return None, None, None, None, f"Error during text emotion processing: {str(e)}"
    finally:
        if temp_audio_path and os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)

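# Usage sketch (the file name is hypothetical):
#   words_df, segments_df, words_xlsx, segments_xlsx, status = process_video_text("interview.mp4")
# Transcription uses Whisper (medium) on CPU, segments are translated from Korean to English,
# and the translated text is scored with the DistilRoBERTa emotion classifier.
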
# Gradio App
def gradio_app():
    interface = gr.Blocks()

    with interface:
        gr.Markdown("## I-MEQ: Emotion Monitoring System")
        video_input = gr.Video(label="Upload your video for analysis", height=600)

        with gr.Row():
            analyze_audio_button = gr.Button("Analyze Audio Emotions")
            analyze_fer_button = gr.Button("Analyze Facial Emotions")
            analyze_text_button = gr.Button("Transcribe & Analyze Textual Emotions")

        with gr.Row():
            with gr.Column():
                audio_analysis_status = gr.Textbox(label="Audio Emotion Analysis Status")
                audio_emotions_dataframe = gr.Dataframe(label="Audio Emotions DataFrame", interactive=False)
                audio_emotions_xlsx_download = gr.File(label="Download Audio Emotions XLSX")

            with gr.Column():
                fer_analysis_status = gr.Textbox(label="Facial Emotion Analysis Status")
                fer_emotions_dataframe = gr.Dataframe(label="Facial Emotions DataFrame", interactive=False)
                fer_emotions_xlsx_download = gr.File(label="Download Facial Emotions XLSX")
                processed_video_download = gr.File(label="Download Processed Video")

            with gr.Column():
                text_analysis_status = gr.Textbox(label="Text Sentiment Analysis Status")
                words_dataframe = gr.Dataframe(label="Words DataFrame", interactive=False)
                segments_dataframe = gr.Dataframe(label="Segments DataFrame", interactive=False)
                words_xlsx_download = gr.File(label="Download Words XLSX")
                segments_xlsx_download = gr.File(label="Download Segments XLSX")

        analyze_audio_button.click(
            analyze_audio_emotions,
            inputs=video_input,
            outputs=[
                audio_analysis_status,
                audio_emotions_dataframe,
                audio_emotions_xlsx_download
            ]
        )

        analyze_fer_button.click(
            detect_faces_and_emotions,
            inputs=video_input,
            outputs=[
                fer_analysis_status,
                fer_emotions_dataframe,
                fer_emotions_xlsx_download,
                processed_video_download
            ]
        )

        analyze_text_button.click(
            process_video_text,
            inputs=video_input,
            outputs=[
                words_dataframe,
                segments_dataframe,
                words_xlsx_download,
                segments_xlsx_download,
                text_analysis_status
            ]
        )

    interface.launch()

# Start the Gradio app
gradio_app()
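# Note: gradio_app() is invoked at import time, so running this file builds the Blocks UI
# and immediately starts the server via interface.launch().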