Pedro13543 committed on
Commit
90c6d0f
·
1 Parent(s): 121cd9c

download models fix

Browse files
kokoro-v0_19-half.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70cbf37f84610967f2ca72dadb95456fdd8b6c72cdd6dc7372c50f525889ff0c
3
+ size 163731194
kokoro.py CHANGED
@@ -131,7 +131,7 @@ def length_to_mask(lengths):
131
  return mask
132
 
133
  @torch.no_grad()
134
- def forward(model, tokens, ref_s, speed):
135
  device = ref_s.device
136
  tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
137
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
@@ -169,35 +169,44 @@ def generate(model, text, voicepack, lang='a', speed=1, ps=None):
169
  return out, ps
170
 
171
 
 
172
  @torch.no_grad()
173
  def forward(model, tokens, ref_s, speed):
 
174
  device = ref_s.device
 
 
175
  tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
176
- input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
 
 
177
  text_mask = length_to_mask(input_lengths).to(device)
178
-
179
  bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
180
  d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
181
  s = ref_s[:, 128:]
182
  d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
183
-
 
184
  x, _ = model.predictor.lstm(d)
185
  duration = model.predictor.duration_proj(x)
186
  duration = torch.sigmoid(duration).sum(axis=-1) / speed
 
 
187
  pred_dur = torch.round(duration).clamp(min=1).long()
188
-
189
  pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
190
  c_frame = 0
191
  for i in range(pred_aln_trg.size(0)):
192
  pred_aln_trg[i, c_frame:c_frame + pred_dur[0, i].item()] = 1
193
  c_frame += pred_dur[0, i].item()
194
-
 
195
  en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
196
  F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
197
-
 
198
  t_en = model.text_encoder(tokens, input_lengths, text_mask)
199
  asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
200
-
201
- return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
202
-
203
-
 
131
  return mask
132
 
133
  @torch.no_grad()
134
+ def forward_2(model, tokens, ref_s, speed):
135
  device = ref_s.device
136
  tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
137
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
 
169
  return out, ps
170
 
171
 
172
@torch.no_grad()
def forward(model, tokens, ref_s, speed):
    """Synthesize audio for a tokenized utterance.

    Runs the full inference pipeline: BERT text encoding, duration
    prediction, monotonic alignment expansion, F0/noise prediction,
    and waveform decoding.

    Args:
        model: bundle exposing .bert, .bert_encoder, .predictor,
            .text_encoder and .decoder submodules.
        tokens: sequence of phoneme token ids (without BOS/EOS; zeros
            are added here as padding markers).
        ref_s: reference style tensor; columns [:128] condition the
            decoder, columns [128:] condition the predictor.
        speed: scalar speed factor; larger values shorten durations.

    Returns:
        1-D numpy array holding the synthesized waveform (CPU).
    """
    # All computation follows the device of the reference style.
    dev = ref_s.device

    # Wrap the token ids with leading/trailing 0 and batch them.
    tokens = torch.LongTensor([[0, *tokens, 0]]).to(dev)
    # NOTE(review): lengths are deliberately kept on CPU here —
    # presumably for pack_padded_sequence inside the predictor; confirm.
    input_lengths = torch.LongTensor([tokens.shape[-1]])

    # Padding mask for the (single-item) batch.
    text_mask = length_to_mask(input_lengths).to(dev)

    # Contextual text features, projected for the duration predictor.
    bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
    d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
    s = ref_s[:, 128:]
    d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)

    # Per-token durations via LSTM + projection; speed scales them down.
    x, _ = model.predictor.lstm(d)
    duration = model.predictor.duration_proj(x)
    duration = torch.sigmoid(duration).sum(axis=-1) / speed

    # Integer frame counts, at least one frame per token.
    pred_dur = torch.round(duration).clamp(min=1).long()

    # Build the hard token→frame alignment matrix.
    pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
    frame = 0
    for tok in range(pred_aln_trg.size(0)):
        span = pred_dur[0, tok].item()
        pred_aln_trg[tok, frame:frame + span] = 1
        frame += span

    # Expand predictor features to frame rate; predict F0 and noise.
    en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(dev)
    F0_pred, N_pred = model.predictor.F0Ntrain(en, s)

    # Expand text-encoder features the same way for the decoder.
    t_en = model.text_encoder(tokens, input_lengths, text_mask)
    asr = t_en @ pred_aln_trg.unsqueeze(0).to(dev)

    # Decode to waveform, conditioned on the acoustic style half.
    return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
 
 
voices/example_mixed_af_bm_lewis_ratio_0.27.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e32f1a1d5905088625a0b486b1b363596b372347f5256c8e04704f648d92adc
3
+ size 263085