{ "model_type": "dasheng_audiogen", "architectures": [ "DashengAudioGenModel" ], "auto_map": { "AutoConfig": "configuration_dasheng_audiogen.DashengAudioGenConfig", "AutoModel": "modeling_dasheng_audiogen.DashengAudioGenModel" }, "text_encoder_name": "google/mt5-large", "tokenizer_name": "mispeech/dashengtokenizer", "use_zero_instruction": true, "instruction_seq_len": 1, "task_instruction_dim": 1024, "sample_rate": 16000, "downsampling_ratio": 640, "latent_dim": 1280, "content_dim": 1024, "frame_resolution": 0.005, "duration_offset": 1.0, "tokenizer_max_length": 512, "dit_img_size": 1000, "dit_patch_size": 1, "dit_in_chans": 1280, "dit_out_chans": 1280, "dit_input_type": "1d", "dit_embed_dim": 1536, "dit_depth": 32, "dit_num_heads": 24, "dit_mlp_ratio": 4.0, "dit_qk_norm": "layernorm", "dit_norm_layer": "layernorm", "dit_act_layer": "geglu", "dit_context_norm": true, "dit_time_fusion": "ada", "dit_ada_sola_rank": 32, "dit_ada_sola_alpha": 32, "dit_ta_context_dim": 1024, "dit_ta_context_fusion": "add", "dit_ta_context_norm": true, "dit_context_dim": 1024, "dit_context_fusion": "cross", "dit_context_pe_method": "none", "dit_pe_method": "none", "dit_rope_mode": "shared", "adapter_num_heads": 16, "adapter_dropout": 0.2, "adapter_duration_grad_scale": 0.1, "duration_predictor_filter_channels": 512, "duration_predictor_n_layers": 5, "duration_predictor_kernel_size": 3, "duration_predictor_p_dropout": 0.5, "special_tokens": [ "<|caption|>", "<|speech|>", "<|sfx|>", "<|music|>", "<|env|>", "<|asr|>", "<|speech_start|>", "<|speech_end|>" ], "train_special_tokens": true }