Vaibhav Srivastav hysts HF staff committed on
Commit
8fb8950
·
1 Parent(s): da26cb0

Initial commit

Browse files

Co-authored-by: hysts <[email protected]>

Files changed (6) hide show
  1. Dockerfile +65 -0
  2. README.md +3 -2
  3. app.py +221 -0
  4. mlg_config.json +186 -0
  5. requirements.txt +4 -0
  6. style.css +16 -0
Dockerfile ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for the SeamlessM4T Gradio demo.
# Builds on a CUDA 11.7 / cuDNN 8 base, installs Python through pyenv as an
# unprivileged user, then installs the model library and the app itself.
FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y --no-install-recommends \
    git \
    git-lfs \
    wget \
    curl \
    # python build dependencies \
    build-essential \
    libssl-dev \
    zlib1g-dev \
    libbz2-dev \
    libreadline-dev \
    libsqlite3-dev \
    libncursesw5-dev \
    xz-utils \
    tk-dev \
    libxml2-dev \
    libxmlsec1-dev \
    libffi-dev \
    liblzma-dev \
    # gradio dependencies \
    ffmpeg \
    # fairseq2 dependencies \
    libsndfile-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Run everything below as a non-root user with UID 1000.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:${PATH}
WORKDIR ${HOME}/app

# Install pyenv and build the requested Python version with it.
RUN curl https://pyenv.run | bash
ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
ARG PYTHON_VERSION=3.10.12
RUN pyenv install ${PYTHON_VERSION} && \
    pyenv global ${PYTHON_VERSION} && \
    pyenv rehash && \
    pip install --no-cache-dir -U pip setuptools wheel

# --no-cache-dir on every pip call keeps pip's download cache out of the image layers.
RUN pip install --no-cache-dir torch==2.0.1 gradio==3.40.1 && \
    pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ fairseq2==0.1.0rc0
# The GitHub token is supplied as a build secret mounted only for this RUN step;
# the checkout is removed again after installation.
RUN --mount=type=secret,id=GITHUB_TOKEN,mode=0444,required=true \
    git clone https://$(cat /run/secrets/GITHUB_TOKEN)@github.com/fairinternal/seamless_communication && \
    cd seamless_communication && \
    pip install --no-cache-dir . && \
    cd .. && \
    rm -rf seamless_communication

# Requirements are copied and installed before the app sources so this layer
# can be reused when only application files change.
COPY ./requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt

COPY --chown=1000 . ${HOME}/app
ENV PYTHONPATH=${HOME}/app \
    PYTHONUNBUFFERED=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_THEME=huggingface \
    SYSTEM=spaces
CMD ["python", "app.py"]
README.md CHANGED
@@ -1,10 +1,11 @@
1
  ---
2
- title: Seamless M4t
3
- emoji: 🌖
4
  colorFrom: blue
5
  colorTo: yellow
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Seamless M4T
3
+ emoji: 📞
4
  colorFrom: blue
5
  colorTo: yellow
6
  sdk: docker
7
  pinned: false
8
+ suggested_hardware: t4-medium
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ import gradio as gr
5
+ import numpy as np
6
+ import torch
7
+ import torchaudio
8
+ from seamless_communication.models.inference.translator import Translator
9
+
10
# Markdown heading rendered at the top of the demo page.
DESCRIPTION = "# SeamlessM4T"

# The language codes offered in the UI are the keys of the "multilingual"
# table in mlg_config.json.
with open("./mlg_config.json", "r", encoding="utf-8") as f:
    lang_idx_map = json.load(f)
# Materialise a real list instead of keeping a live dict view, so the choices
# are a plain, indexable, serialisable sequence.
LANGUAGES = list(lang_idx_map["multilingual"])

# Dropdown labels; the first whitespace-separated token of each label is the
# task code passed on to the translator.
TASK_NAMES = [
    "S2ST (Speech to Speech translation)",
    "S2TT (Speech to Text translation)",
    "T2ST (Text to Speech translation)",
    "T2TT (Text to Text translation)",
    "ASR (Automatic Speech Recognition)",
]

AUDIO_SAMPLE_RATE = 16000.0
MAX_INPUT_AUDIO_LENGTH = 60  # in seconds

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
translator = Translator(
    model_name_or_card="multitask_unity_large",
    vocoder_name_or_card="vocoder_36langs",
    device=device,
    sample_rate=AUDIO_SAMPLE_RATE,
)
34
+
35
+
36
def predict(
    task_name: str,
    audio_source: str,
    input_audio_mic: str | None,
    input_audio_file: str | None,
    input_text: str | None,
    source_language: str | None,
    target_language: str,
) -> tuple[tuple[int, np.ndarray] | None, str]:
    """Run the selected task on the current inputs.

    Args:
        task_name: Task label from the dropdown; only its first
            whitespace-separated token (e.g. ``"S2ST"``) is used.
        audio_source: ``"microphone"`` selects ``input_audio_mic``; any other
            value selects ``input_audio_file``.
        input_audio_mic: Path of the recorded audio file, or ``None``.
        input_audio_file: Path of the uploaded audio file, or ``None``.
        input_text: Text input, used for the tasks that do not take speech.
        source_language: Source language code forwarded to the translator.
        target_language: Target language code forwarded to the translator.

    Returns:
        ``(audio, text)`` where ``audio`` is ``(sample_rate, waveform)`` for
        S2ST/T2ST and ``None`` for every other task, and ``text`` is the text
        output of the translator.

    Raises:
        gr.Error: If a speech task is requested but no audio was provided.
    """
    task_name = task_name.split()[0]
    if task_name in ("S2ST", "S2TT", "ASR"):
        input_data = input_audio_mic if audio_source == "microphone" else input_audio_file
        if input_data is None:
            # Without this guard torchaudio.load(None) fails with an opaque error.
            raise gr.Error("Please provide an input audio clip.")

        target_sr = int(AUDIO_SAMPLE_RATE)
        arr, org_sr = torchaudio.load(input_data)
        new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=target_sr)
        max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
        if new_arr.shape[1] > max_length:
            new_arr = new_arr[:, :max_length]
            gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
        # The resampled (and possibly truncated) audio is written back to the
        # same path, which is then handed to the translator.
        torchaudio.save(input_data, new_arr, sample_rate=target_sr)
    else:
        input_data = input_text

    text_out, wav, sr = translator.predict(
        input=input_data,
        task_str=task_name,
        tgt_lang=target_language,
        src_lang=source_language,
    )
    if task_name in ("S2ST", "T2ST"):
        return (sr, wav.cpu().detach().numpy()), text_out
    return None, text_out
71
+
72
+
73
def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
    """Show the audio widget matching the chosen source and clear both widgets.

    Returns the updates for ``input_audio_mic`` and ``input_audio_file``,
    in that order.
    """
    use_mic = audio_source == "microphone"
    mic_update = gr.update(visible=use_mic, value=None)
    file_update = gr.update(visible=not use_mic, value=None)
    return mic_update, file_update
79
+
80
+
81
def update_input_ui(task_name: str) -> tuple[dict, dict, dict, dict]:
    """Toggle input widgets so only those relevant to the task are visible.

    Returns the updates for ``audio_box``, ``input_text``, ``source_language``
    and ``target_language``, in that order.

    Raises:
        ValueError: If the task code is not one of the known tasks.
    """
    task_code = task_name.split()[0]
    if task_code in ("S2ST", "S2TT", "ASR"):
        # Speech input: audio controls shown, text box and source language hidden.
        wants_speech, wants_text = True, False
    elif task_code in ("T2ST", "T2TT"):
        # Text input: text box and source language shown, audio controls hidden.
        wants_speech, wants_text = False, True
    else:
        raise ValueError(f"Unknown task: {task_code}")

    return (
        gr.update(visible=wants_speech),  # audio_box
        gr.update(visible=wants_text),  # input_text
        gr.update(visible=wants_text),  # source_language
        gr.update(visible=True),  # target_language
    )
106
+
107
+
108
def update_output_ui(task_name: str) -> tuple[dict, dict]:
    """Reset both outputs and show the audio player only for speech-output tasks.

    Returns the updates for ``output_audio`` and ``output_text``, in that order.

    Raises:
        ValueError: If the task code is not one of the known tasks.
    """
    task_code = task_name.split()[0]
    if task_code not in ("S2ST", "T2ST", "S2TT", "T2TT", "ASR"):
        raise ValueError(f"Unknown task: {task_code}")

    speech_output = task_code in ("S2ST", "T2ST")
    audio_update = gr.update(visible=speech_output, value=None)  # output_audio
    text_update = gr.update(value=None)  # output_text
    return audio_update, text_update
122
+
123
+
124
# NOTE(review): the nesting of the layout below was reconstructed from the
# statement order; confirm it against the rendered UI.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    # The duplicate button is shown only when SHOW_DUPLICATE_BUTTON is "1".
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
    )
    with gr.Group():
        task_name = gr.Dropdown(label="Task", choices=TASK_NAMES, value=TASK_NAMES[0])
        with gr.Row():
            # Hidden initially; update_input_ui reveals it for text-input tasks.
            source_language = gr.Dropdown(
                label="Source language",
                choices=LANGUAGES,
                value="eng",
                visible=False,
            )
            target_language = gr.Dropdown(
                label="Target language",
                choices=LANGUAGES,
                value="fra",
            )
        with gr.Row() as audio_box:
            audio_source = gr.Radio(
                label="Audio source",
                choices=["file", "microphone"],
                value="file",
            )
            # Two audio widgets exist; update_audio_ui shows one at a time.
            input_audio_mic = gr.Audio(
                label="Input speech",
                type="filepath",
                source="microphone",
                visible=False,
            )
            input_audio_file = gr.Audio(
                label="Input speech",
                type="filepath",
                source="upload",
                visible=True,
            )
        input_text = gr.Textbox(label="Input text", visible=False)
        btn = gr.Button("Translate")
        with gr.Column():
            output_audio = gr.Audio(
                label="Translated speech",
                autoplay=False,
                streaming=False,
                type="numpy",
            )
            output_text = gr.Textbox(label="Translated text")

    # Switch between the microphone and upload widgets.
    audio_source.change(
        fn=update_audio_ui,
        inputs=audio_source,
        outputs=[input_audio_mic, input_audio_file],
        queue=False,
        api_name=False,
    )
    # On a task change, first adjust the input widgets, then reset the outputs.
    task_name.change(
        fn=update_input_ui,
        inputs=task_name,
        outputs=[audio_box, input_text, source_language, target_language],
        queue=False,
        api_name=False,
    ).then(
        fn=update_output_ui,
        inputs=task_name,
        outputs=[output_audio, output_text],
        queue=False,
        api_name=False,
    )

    # The input order must match the parameter order of predict().
    btn.click(
        fn=predict,
        inputs=[
            task_name,
            audio_source,
            input_audio_mic,
            input_audio_file,
            input_text,
            source_language,
            target_language,
        ],
        outputs=[output_audio, output_text],
        api_name="run",
    )
demo.queue(max_size=50).launch()
mlg_config.json ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "multilingual": {
3
+ "arb": 0,
4
+ "ben": 1,
5
+ "cat": 2,
6
+ "ces": 3,
7
+ "cmn": 4,
8
+ "cym": 5,
9
+ "dan": 6,
10
+ "deu": 7,
11
+ "eng": 8,
12
+ "est": 9,
13
+ "fin": 10,
14
+ "fra": 11,
15
+ "hin": 12,
16
+ "ind": 13,
17
+ "ita": 14,
18
+ "jpn": 15,
19
+ "kor": 16,
20
+ "mlt": 17,
21
+ "nld": 18,
22
+ "pes": 19,
23
+ "pol": 20,
24
+ "por": 21,
25
+ "ron": 22,
26
+ "rus": 23,
27
+ "slk": 24,
28
+ "spa": 25,
29
+ "swe": 26,
30
+ "swh": 27,
31
+ "tel": 28,
32
+ "tgl": 29,
33
+ "tha": 30,
34
+ "tur": 31,
35
+ "ukr": 32,
36
+ "urd": 33,
37
+ "uzn": 34,
38
+ "vie": 35
39
+ },
40
+ "multispkr": {
41
+ "arb": [
42
+ 0
43
+ ],
44
+ "ben": [
45
+ 2,
46
+ 1
47
+ ],
48
+ "cat": [
49
+ 3
50
+ ],
51
+ "ces": [
52
+ 4
53
+ ],
54
+ "cmn": [
55
+ 5
56
+ ],
57
+ "cym": [
58
+ 6
59
+ ],
60
+ "dan": [
61
+ 7,
62
+ 8
63
+ ],
64
+ "deu": [
65
+ 9
66
+ ],
67
+ "eng": [
68
+ 10
69
+ ],
70
+ "est": [
71
+ 11,
72
+ 12,
73
+ 13
74
+ ],
75
+ "fin": [
76
+ 14
77
+ ],
78
+ "fra": [
79
+ 15
80
+ ],
81
+ "hin": [
82
+ 16
83
+ ],
84
+ "ind": [
85
+ 17,
86
+ 24,
87
+ 18,
88
+ 20,
89
+ 19,
90
+ 21,
91
+ 23,
92
+ 27,
93
+ 26,
94
+ 22,
95
+ 25
96
+ ],
97
+ "ita": [
98
+ 29,
99
+ 28
100
+ ],
101
+ "jpn": [
102
+ 30
103
+ ],
104
+ "kor": [
105
+ 31
106
+ ],
107
+ "mlt": [
108
+ 32,
109
+ 33,
110
+ 34
111
+ ],
112
+ "nld": [
113
+ 35
114
+ ],
115
+ "pes": [
116
+ 36
117
+ ],
118
+ "pol": [
119
+ 37
120
+ ],
121
+ "por": [
122
+ 38
123
+ ],
124
+ "ron": [
125
+ 39
126
+ ],
127
+ "rus": [
128
+ 40
129
+ ],
130
+ "slk": [
131
+ 41
132
+ ],
133
+ "spa": [
134
+ 42
135
+ ],
136
+ "swe": [
137
+ 43,
138
+ 45,
139
+ 44
140
+ ],
141
+ "swh": [
142
+ 46,
143
+ 48,
144
+ 47
145
+ ],
146
+ "tel": [
147
+ 49
148
+ ],
149
+ "tgl": [
150
+ 50
151
+ ],
152
+ "tha": [
153
+ 51,
154
+ 54,
155
+ 55,
156
+ 52,
157
+ 53
158
+ ],
159
+ "tur": [
160
+ 58,
161
+ 57,
162
+ 56
163
+ ],
164
+ "ukr": [
165
+ 59
166
+ ],
167
+ "urd": [
168
+ 60,
169
+ 61,
170
+ 62
171
+ ],
172
+ "uzn": [
173
+ 63,
174
+ 64,
175
+ 65
176
+ ],
177
+ "vie": [
178
+ 66,
179
+ 67,
180
+ 70,
181
+ 71,
182
+ 68,
183
+ 69
184
+ ]
185
+ }
186
+ }
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio==3.40.1
2
+ huggingface_hub==0.16.4
3
+ torch==2.0.1
4
+ torchaudio==2.0.2
style.css ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Center the page title. */
h1 {
  text-align: center;
}

/* Pill-shaped blue button, centered horizontally. */
#duplicate-button {
  background: #1565c0;
  color: #fff;
  border-radius: 100vh;
  margin: auto;
}

/* Constrain and center the element with id "component-0". */
#component-0 {
  margin: auto;
  max-width: 730px;
  padding-top: 1.5rem;
}
+ }