import json
import os
import time
from datetime import datetime
from os import walk

import gradio as gr
import numpy as np
import requests
import torch

from kokoro import generate
from models import build_model

# Load the model once; every request reuses it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL = build_model('kokoro-v0_19-half.pth', device)


def _scan_voices():
    """Load every file found directly inside ``voices/``.

    Only the top level of the directory is read (sub-directories are ignored).

    Returns:
        A ``(names, tensors)`` pair: the file names and, in the same order,
        the tensors loaded from them and moved to ``device``.
    """
    names = []
    tensors = []
    for _dirpath, _dirnames, filenames in walk("voices/"):
        names.extend(filenames)
        tensors = [
            torch.load(f'voices/{vn}', weights_only=True).to(device)
            for vn in names
        ]
        break  # top-level directory only
    return names, tensors


def _voice_differences(tensors):
    """Return ``(mean, diffs)`` for a list of voice tensors.

    ``diffs[i]`` is the element-wise absolute distance of voice ``i`` from the
    mean of all voices. With no voices, ``(None, [])`` is returned instead of
    letting ``torch.stack`` fail on an empty list.
    """
    if not tensors:
        return None, []
    mean = torch.stack(tensors, dim=0).mean(dim=0)
    return mean, [torch.abs(voice - mean) for voice in tensors]


def _row_scores(voice_index):
    """Collapse ``diffs[voice_index]`` to one score per leading-axis row."""
    return diffs[voice_index].sum(dim=-1)[:, 0]


def _topk_rows_between(index_a, index_b, topk):
    """Pick the ``topk`` rows where voice B deviates most relative to voice A.

    Returns ``None`` when ``topk`` is 0 (meaning "interpolate every row"),
    otherwise a 1-D tensor of row indices.
    """
    k = int(topk)
    if k <= 0:
        return None
    score_gap = _row_scores(index_b) - _row_scores(index_a)
    # torch.topk raises if k exceeds the number of candidates.
    k = min(k, score_gap.numel())
    return torch.topk(score_gap, k=k).indices


# Load voices and make the saved voices dir
model_list = []
os.makedirs("voices", exist_ok=True)
VOICE_NAME, voices = _scan_voices()


# Get top-k indices
def get_topk_indices(vector_index, k):
    """Return the ``k`` row indices of a voice that differ most from the mean.

    Args:
        vector_index: Position of the voice in ``voices`` / ``diffs``.
        k: Number of indices to return (coerced to ``int``).

    Returns:
        A plain Python list of row indices, largest difference first.
    """
    differences = _row_scores(vector_index)
    return torch.topk(differences, k=int(k)).indices.tolist()


# Visualize top-k differences
def visualize_topk(vector_index, k):
    """Return the top-``k`` differing row indices of a voice as JSON text.

    Args:
        vector_index: Voice file name as listed in ``VOICE_NAME``.
        k: Number of indices to report.

    Raises:
        ValueError: If ``vector_index`` is not a known voice name.
    """
    position = VOICE_NAME.index(vector_index)
    return json.dumps(get_topk_indices(position, k), indent=2)


# FFT interpolation
def interpolate_vectors(vec1, vec2, t, importances=None):
    """Interpolate between two vectors in the frequency domain using PyTorch.

    Both inputs are flattened to 2-D, transformed with an FFT along the first
    axis, blended in magnitude and (wrapped) phase, and transformed back.

    Parameters:
    - vec1: torch.Tensor, e.g. of shape (511, 1, 256)
    - vec2: torch.Tensor of the same shape as vec1
    - t: interpolation parameter between 0 and 1
    - importances: optional row indices (tensor or sequence of ints). When
      given, only those rows of the FFT output are blended and all other rows
      keep vec1's spectrum. When None, every row is blended.

    Returns:
    - Interpolated tensor with the same shape as vec1. Half-precision inputs
      are upcast, so the result is at least float32.
    """
    out_shape = vec1.shape

    # Half-precision FFT is not available for every device/size combination,
    # so low-precision inputs are computed in float32.
    low_precision = (torch.float16, torch.bfloat16)
    if vec1.dtype in low_precision:
        vec1 = vec1.float()
    if vec2.dtype in low_precision:
        vec2 = vec2.float()

    # Collapse trailing axes: (511, 1, 256) -> (511, 256)
    vec1 = vec1.reshape(out_shape[0], -1)
    vec2 = vec2.reshape(out_shape[0], -1)

    # Apply FFT along the first dimension
    fft1 = torch.fft.fft(vec1, dim=0)
    fft2 = torch.fft.fft(vec2, dim=0)

    mag1 = torch.abs(fft1)
    mag2 = torch.abs(fft2)
    phase1 = torch.angle(fft1)
    phase2 = torch.angle(fft2)

    if importances is not None:
        rows = torch.as_tensor(importances, dtype=torch.long, device=mag1.device)
        # Use the first voice as the base and blend only the selected rows.
        # Clone so the base magnitude is not modified through an alias.
        mag_interpolated = mag1.clone()
        mag_interpolated[rows] = (1 - t) * mag1[rows] + t * mag2[rows]
        phase_diff = torch.zeros_like(mag1)
        phase_diff[rows] = phase2[rows] - phase1[rows]
    else:
        mag_interpolated = (1 - t) * mag1 + t * mag2
        phase_diff = phase2 - phase1

    # Wrap the phase difference into (-pi, pi] so the blend takes the short
    # way around the circle.
    phase_diff_wrapped = torch.angle(torch.exp(1j * phase_diff))
    phase_interpolated = phase1 + t * phase_diff_wrapped

    # Combine magnitude and phase, then invert the FFT
    interpolated_fft = mag_interpolated * torch.exp(1j * phase_interpolated)
    interpolated_seq = torch.fft.ifft(interpolated_fft, dim=0).real

    # Reshape back to the input shape, e.g. (511, 1, 256)
    return interpolated_seq.reshape(out_shape)


print("computing diferences of each voice")
voices_mean, diffs = _voice_differences(voices)
print(f'Loaded voices: {len(voices)}')


# Function for generating audio from text
def generate_audio_from_text(text, vector_index_1, vector_index_2, slice_value, topk=0):
    """Synthesize ``text`` with a voice interpolated between two saved voices.

    Args:
        text: Text to speak.
        vector_index_1: File name of the base voice (must be in ``VOICE_NAME``).
        vector_index_2: File name of the target voice.
        slice_value: Interpolation factor ``t`` passed to
            ``interpolate_vectors``.
        topk: Number of most-differing rows to blend; 0 blends every row.

    Returns:
        ``(24000, audio)`` where ``audio`` is the flattened output of
        ``generate``.

    Raises:
        ValueError: If either voice name is unknown.
    """
    index_1 = VOICE_NAME.index(vector_index_1)
    index_2 = VOICE_NAME.index(vector_index_2)
    vector_1 = voices[index_1].clone()
    vector_2 = voices[index_2].clone()

    # Previously the computed indices were never handed to the interpolation,
    # so the top-k setting had no effect.
    tk_idx = _topk_rows_between(index_1, index_2, topk)

    interpolated_vector = interpolate_vectors(
        vector_1, vector_2, slice_value, importances=tk_idx
    )

    # Generate audio
    audio, out_ps = generate(MODEL, text, interpolated_vector)
    audio_np = audio.flatten()
    return 24000, audio_np


def reload_voices():
    """Re-read ``voices/`` and refresh the module-level voice state.

    Rebinds ``VOICE_NAME``, ``voices``, ``voices_mean``, ``diffs`` and
    ``model_list``. Gradio dropdowns that were built earlier keep the choices
    they were created with.
    """
    global VOICE_NAME, voices, voices_mean, diffs, model_list

    names, tensors = _scan_voices()
    print("computing diferences of each voice")
    mean, deltas = _voice_differences(tensors)
    # Swap everything in one statement so names and tensors stay aligned.
    VOICE_NAME, voices, voices_mean, diffs = names, tensors, mean, deltas
    model_list = list_models()


# Function to save a custom voice
def save_custom_voice(name, vector_index_1, vector_index_2, slice_value, topk=0):
    """Interpolate two voices and save the result under ``voices/``.

    Args:
        name: User-chosen prefix for the file name. Only its final path
            component is used, so it cannot point outside ``voices/``.
        vector_index_1: File name of the base voice (must be in ``VOICE_NAME``).
        vector_index_2: File name of the target voice.
        slice_value: Interpolation factor ``t``.
        topk: Number of most-differing rows to blend; 0 blends every row.

    Returns:
        ``(status_message, save_path)``.

    Raises:
        ValueError: If either voice name is unknown.
    """
    # [:-3] drops the ".pt" extension from the source file names.
    fmt = f"_mixed_{vector_index_1[:-3]}_{vector_index_2[:-3]}_ratio_{slice_value}"
    index_1 = VOICE_NAME.index(vector_index_1)
    index_2 = VOICE_NAME.index(vector_index_2)
    vector_1 = voices[index_1].clone()
    vector_2 = voices[index_2].clone()

    tk_idx = _topk_rows_between(index_1, index_2, topk)
    custom_voice = interpolate_vectors(
        vector_1, vector_2, slice_value, importances=tk_idx
    )

    # Save the custom voice
    safe_name = os.path.basename(name or "")
    save_path = f"voices/{safe_name}{fmt}.pt"
    os.makedirs("voices", exist_ok=True)
    torch.save(custom_voice.half(), save_path)

    # reload all voices again
    reload_voices()
    return f"Voice saved as {save_path}", save_path


# Function to load saved voices
def load_voices():
    """Return the names of the saved voices, without the ``.pt`` extension.

    Returns:
        A list with one entry per ``*.pt`` file directly inside ``voices/``,
        or an empty list when that directory does not exist.
    """
    if not os.path.exists("voices"):
        return []
    suffix = ".pt"
    # Strip only the trailing extension; str.replace would also remove a
    # ".pt" that appears in the middle of a name.
    return [f[:-len(suffix)] for f in os.listdir("voices") if f.endswith(suffix)]


# Gradio interface components
iface = gr.Interface(
    fn=generate_audio_from_text,
    inputs=[
        gr.Textbox(label="Text Input"),
        gr.Dropdown(choices=VOICE_NAME, label="Select Vector 1"),
        gr.Dropdown(choices=VOICE_NAME, label="Select Vector 2"),
        gr.Slider(minimum=0, maximum=1, step=0.01, label="Slice Value"),
        gr.Slider(minimum=0, maximum=511, step=1, label="top-K diferent indexes to use"),
    ],
    outputs=[
        gr.Audio(interactive=False, format="wav", label="Synthesized Audio"),
    ],
    title="Kokoro TTS voices Interpolation gui ",
    description="Select 2 voices indexes, then a interpolation alpha value, and top-k diferent indexes to merge,if 0 are selected,the voice will merge all its indexes. It uses linear interpolation (1 - alpha) * voice1 + alpha * voice2"
)

# Save and rate interface
save_iface = gr.Interface(
    fn=save_custom_voice,
    inputs=[
        gr.Textbox(label="Voice Name"),
        gr.Dropdown(choices=VOICE_NAME, label="Select Vector 1"),
        gr.Dropdown(choices=VOICE_NAME, label="Select Vector 2"),
        gr.Slider(minimum=0, maximum=1, step=0.01, label="Slice Value"),
        gr.Slider(minimum=0, maximum=511, step=1, label="top-K diferent indexes to use"),
    ],
    outputs=[
        gr.Textbox(label="Save Status"),
        gr.File(label="Download File"),
    ],
    title="Save Custom Voice",
    description="Save your custom voice with a name."
)

# Interface for visualizing top-k differences
topk_iface = gr.Interface(
    fn=visualize_topk,
    inputs=[
        gr.Dropdown(choices=VOICE_NAME, label="Select Vector"),
        gr.Slider(minimum=1, maximum=100, step=1, label="Top-k Indices")
    ],
    outputs=gr.Textbox(label="Top-k Differences"),
    title="Visualize Top-k Differences",
    description="View the indices with the largest differences for a selected vector."
)


def list_models():
    """Return the file names found under ``voices/``.

    The walk is recursive, and only bare file names (no directory part) are
    returned.
    """
    models = []
    for _dirpath, _dirnames, filenames in walk("voices/"):
        models.extend(filenames)
    return models


def list_models_update_gr():
    """Return a Gradio update that refreshes a dropdown with ``list_models()``."""
    return gr.update(choices=list_models())


# Function to return the full path of the selected model
def d_model(name):
    """Return the path of the selected voice file inside ``voices/``.

    Returns ``None`` when nothing is selected, so no path to a non-existent
    ``voices/None`` file is produced.
    """
    if not name:
        return None
    return f"voices/{name}"


# Define the Gradio interface
with gr.Blocks() as download_models_tab:
    # Dropdown to list models
    model_dropdown = gr.Dropdown(
        choices=list_models(),
        label="Select Model",
    )

    with gr.Row():
        # Button to refresh the dropdown choices
        refresh_button = gr.Button("Refresh")
        # Button to download the selected model
        download_button = gr.Button("Download")

    # Created inside the Blocks context so it is rendered below the buttons.
    download_file = gr.File(label="Download File")

    refresh_button.click(list_models_update_gr, inputs=[], outputs=model_dropdown)
    download_button.click(
        d_model,
        inputs=[model_dropdown],
        outputs=[download_file]
    )

# Combine all tabs into a single Gradio application
gr.TabbedInterface(
    [
        iface,
        save_iface,
        topk_iface,
        download_models_tab
    ],
    [
        "Interpolation",
        "Save Custom Voice",
        "Visualize Top-k",
        "Download Models"
    ]
).launch()