Pedro13543
committed on
Commit
·
90c6d0f
1
Parent(s):
121cd9c
download models fix
Browse files
- kokoro-v0_19-half.pth +3 -0
- kokoro.py +20 -11
- voices/example_mixed_af_bm_lewis_ratio_0.27.pt +3 -0
kokoro-v0_19-half.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:70cbf37f84610967f2ca72dadb95456fdd8b6c72cdd6dc7372c50f525889ff0c
|
3 |
+
size 163731194
|
kokoro.py
CHANGED
@@ -131,7 +131,7 @@ def length_to_mask(lengths):
|
|
131 |
return mask
|
132 |
|
133 |
@torch.no_grad()
|
134 |
-
def
|
135 |
device = ref_s.device
|
136 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
137 |
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
|
@@ -169,35 +169,44 @@ def generate(model, text, voicepack, lang='a', speed=1, ps=None):
|
|
169 |
return out, ps
|
170 |
|
171 |
|
|
|
172 |
@torch.no_grad()
|
173 |
def forward(model, tokens, ref_s, speed):
|
|
|
174 |
device = ref_s.device
|
|
|
|
|
175 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
176 |
-
input_lengths = torch.LongTensor([tokens.shape[-1]])
|
|
|
|
|
177 |
text_mask = length_to_mask(input_lengths).to(device)
|
178 |
-
|
179 |
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
|
180 |
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
|
181 |
s = ref_s[:, 128:]
|
182 |
d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
|
183 |
-
|
|
|
184 |
x, _ = model.predictor.lstm(d)
|
185 |
duration = model.predictor.duration_proj(x)
|
186 |
duration = torch.sigmoid(duration).sum(axis=-1) / speed
|
|
|
|
|
187 |
pred_dur = torch.round(duration).clamp(min=1).long()
|
188 |
-
|
189 |
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
|
190 |
c_frame = 0
|
191 |
for i in range(pred_aln_trg.size(0)):
|
192 |
pred_aln_trg[i, c_frame:c_frame + pred_dur[0, i].item()] = 1
|
193 |
c_frame += pred_dur[0, i].item()
|
194 |
-
|
|
|
195 |
en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
|
196 |
F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
|
197 |
-
|
|
|
198 |
t_en = model.text_encoder(tokens, input_lengths, text_mask)
|
199 |
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
|
200 |
-
|
201 |
-
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
|
202 |
-
|
203 |
-
|
|
|
131 |
return mask
|
132 |
|
133 |
@torch.no_grad()
|
134 |
+
def forward_2(model, tokens, ref_s, speed):
|
135 |
device = ref_s.device
|
136 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
137 |
input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
|
|
|
169 |
return out, ps
|
170 |
|
171 |
|
172 |
+
|
173 |
@torch.no_grad()
|
174 |
def forward(model, tokens, ref_s, speed):
|
175 |
+
# Device management
|
176 |
device = ref_s.device
|
177 |
+
|
178 |
+
# Tokenization
|
179 |
tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
|
180 |
+
input_lengths = torch.LongTensor([tokens.shape[-1]])
|
181 |
+
|
182 |
+
# Text Mask
|
183 |
text_mask = length_to_mask(input_lengths).to(device)
|
184 |
+
# Predictor
|
185 |
bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
|
186 |
d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
|
187 |
s = ref_s[:, 128:]
|
188 |
d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
|
189 |
+
|
190 |
+
# Fusion layers
|
191 |
x, _ = model.predictor.lstm(d)
|
192 |
duration = model.predictor.duration_proj(x)
|
193 |
duration = torch.sigmoid(duration).sum(axis=-1) / speed
|
194 |
+
|
195 |
+
# Prediction
|
196 |
pred_dur = torch.round(duration).clamp(min=1).long()
|
197 |
+
|
198 |
pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
|
199 |
c_frame = 0
|
200 |
for i in range(pred_aln_trg.size(0)):
|
201 |
pred_aln_trg[i, c_frame:c_frame + pred_dur[0, i].item()] = 1
|
202 |
c_frame += pred_dur[0, i].item()
|
203 |
+
|
204 |
+
# Decoder
|
205 |
en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
|
206 |
F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
|
207 |
+
|
208 |
+
# Output
|
209 |
t_en = model.text_encoder(tokens, input_lengths, text_mask)
|
210 |
asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
|
211 |
+
|
212 |
+
return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
|
|
|
|
voices/example_mixed_af_bm_lewis_ratio_0.27.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1e32f1a1d5905088625a0b486b1b363596b372347f5256c8e04704f648d92adc
|
3 |
+
size 263085
|