marcmaxmeister committed on
Commit
f02201b
·
1 Parent(s): 7c15f78

Adding an example of using the pretrained model to predict emotion in a local audio file

Browse files

This card had no practical example of how to use this model, but this repo (https://github.com/m3hrdadfi/soxan) had one that works with this model, so I am adding notes here for others.

Files changed (1) hide show
  1. README.md +53 -1
README.md CHANGED
@@ -6,4 +6,56 @@ tags:
6
  - audio
7
  - HUBert
8
  ---
9
- A place to hold the model for easier inference.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  - audio
7
  - HUBert
8
  ---
9
+
10
+
11
+ Working example of using the pretrained model to predict emotion in a local audio file:
12
+
13
+ ```
14
+
15
def predict_emotion_hubert(audio_file):
    """Predict the most likely emotions expressed in a local audio file.

    Inspired by an example from https://github.com/m3hrdadfi/soxan

    The model, feature extractor and config are loaded on every call, so
    the first call downloads the pretrained weights (about 362M).

    Args:
        audio_file: Path to an audio file readable by pydub's
            ``AudioSegment.from_file`` (for example an mp3).

    Returns:
        A list of at most two dicts, highest score first, each shaped like
        ``{"emo": <label>, "score": <percentage rounded to one decimal>}``.
        Labels whose rounded score is 0.0 are left out.
    """
    # NOTE(review): `audio_models` is presumably the module from the soxan
    # repository that defines HubertForSpeechClassification -- confirm it is
    # importable from your working directory.
    from audio_models import HubertForSpeechClassification
    from transformers import Wav2Vec2FeatureExtractor, AutoConfig
    import torch.nn.functional as F
    import torch
    import numpy as np
    from pydub import AudioSegment

    model_name = "Rajaram1996/Hubert_emotion"
    model = HubertForSpeechClassification.from_pretrained(model_name)  # Downloading: 362M
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
    sampling_rate = 16000  # defined by the model; must convert mp3 to this rate.
    config = AutoConfig.from_pretrained(model_name)

    def speech_file_to_array(path, sampling_rate):
        """Decode `path` into a 1-D array of mono samples at `sampling_rate`."""
        sound = AudioSegment.from_file(path)
        # pydub interleaves the channels in get_array_of_samples(), so a
        # stereo file must be downmixed first or the model would receive
        # alternating left/right samples as if they were one signal.
        sound = sound.set_channels(1)
        sound = sound.set_frame_rate(sampling_rate)
        # NOTE(review): samples stay as raw integer PCM values, exactly as in
        # the original example -- confirm whether scaling to [-1, 1] is wanted.
        return np.array(sound.get_array_of_samples())

    sound_array = speech_file_to_array(audio_file, sampling_rate)
    inputs = feature_extractor(
        sound_array,
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding=True,
    )
    inputs = {key: value.to("cpu").float() for key, value in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    scores = F.softmax(logits, dim=1).cpu().numpy()[0]
    outputs = [
        {
            "emo": config.id2label[i],
            # Plain float (not a NumPy scalar) so the result prints and
            # serializes like the documented example output.
            "score": round(float(score) * 100, 1),
        }
        for i, score in enumerate(scores)
    ]
    ranked = sorted(outputs, key=lambda row: row["score"], reverse=True)
    # The original compared the numeric score with the string '0.0%', which
    # is always unequal, so zero-score labels were never actually removed.
    return [row for row in ranked if row["score"] > 0][:2]
53
+ ```
54
+
55
+ ```
56
+
57
+ >>> result = predict_emotion_hubert("male-crying.mp3")
58
+ >>> result
59
+ [{'emo': 'male_sad', 'score': 91.0}, {'emo': 'male_fear', 'score': 4.8}]
60
+ ```
61
+