{
  "experiment": {
    "project": "tatitok_sl128_vae",
    "name": "tatitok_sl128_vae_run1",
    "output_dir": "tatitok_sl128_vae_run1",
    "max_train_examples": 685800000,
    "save_every": 50000,
    "eval_every": 50000,
    "generate_every": 5000,
    "log_every": 50,
    "log_grad_norm_every": 1000,
    "resume": true,
    "tokenizer_checkpoint": "tatitok_sl128_vae.bin"
  },
  "model": {
    "vq_model": {
      "quantize_mode": "vae",
      "token_size": 16,
      "vit_enc_model_size": "small",
      "vit_dec_model_size": "large",
      "vit_enc_patch_size": 16,
      "vit_dec_patch_size": 16,
      "num_latent_tokens": 128,
      "finetune_decoder": false,
      "is_legacy": false
    }
  },
  "losses": {
    "discriminator_start": 200000,
    "quantizer_weight": 1.0,
    "discriminator_factor": 1.0,
    "discriminator_weight": 0.1,
    "perceptual_loss": "lpips-convnext_s-1.0-0.1",
    "perceptual_weight": 1.1,
    "reconstruction_loss": "l2",
    "reconstruction_weight": 1.0,
    "lecam_regularization_weight": 0.001,
    "kl_weight": 1e-06,
    "logvar_init": 0.0
  },
  "dataset": {
    "params": {
      "train_shards_path_or_url": "datacomp_sharded/train/datacomp-train-{000000..140089}.tar",
      "eval_shards_path_or_url": "imagenet_sharded/val/imagenet-val-{0000..0009}.tar",
      "num_workers_per_gpu": 12,
      "dataset_with_class_label": false,
      "dataset_with_text_label": true
    },
    "preprocessing": {
      "resize_shorter_edge": 256,
      "crop_size": 256,
      "random_crop": true,
      "random_flip": true,
      "res_ratio_filtering": true
    }
  },
  "optimizer": {
    "name": "adamw",
    "params": {
      "learning_rate": 0.0001,
      "discriminator_learning_rate": 0.0001,
      "beta1": 0.9,
      "beta2": 0.999,
      "weight_decay": 0.0001
    }
  },
  "lr_scheduler": {
    "scheduler": "cosine",
    "params": {
      "learning_rate": "${optimizer.params.learning_rate}",
      "warmup_steps": 10000,
      "end_lr": 1e-05
    }
  },
  "training": {
    "gradient_accumulation_steps": 1,
    "per_gpu_batch_size": 32,
    "mixed_precision": "fp16",
    "enable_tf32": true,
    "enable_wandb": true,
    "use_ema": true,
    "seed": 42,
    "max_train_steps": 650000,
    "num_generated_images": 2,
    "max_grad_norm": 1.0
  }
}