{"num_layers": 6, "num_single_layers": 18, "in_channels": 64, "attention_head_dim": 128, "joint_attention_dim": 1024, "num_attention_heads": 8, "audio_seq_len": 645, "max_duration": 30, "uncondition": false, "text_encoder_name": "google/flan-t5-large"}