# HyperPyYAML hyperparameters that build a
# speechbrain.lobes.models.FastSpeech2.FastSpeech2WithAlignment model, a
# TextEncoder for its input symbols, and a Pretrainer that loads the model.
#
# NOTE(review): this file was recovered from a copy in which every line was
# wrapped in table-cell pipes and all indentation was lost. The nesting below
# (constructor arguments under `model`, `model` under `modules`, and
# `loadables` under `pretrainer`) is reconstructed -- confirm against the
# upstream recipe.

# Input symbol inventory: 39 phoneme labels followed by 11 punctuation/space
# symbols (50 entries in total).
lexicon:
    - "AA"
    - "AE"
    - "AH"
    - "AO"
    - "AW"
    - "AY"
    - "B"
    - "CH"
    - "D"
    - "DH"
    - "EH"
    - "ER"
    - "EY"
    - "F"
    - "G"
    - "HH"
    - "IH"
    - "IY"
    - "JH"
    - "K"
    - "L"
    - "M"
    - "N"
    - "NG"
    - "OW"
    - "OY"
    - "P"
    - "R"
    - "S"
    - "SH"
    - "T"
    - "TH"
    - "UH"
    - "UW"
    - "V"
    - "W"
    - "Y"
    - "Z"
    - "ZH"
    - "-"
    - "!"
    - "'"
    - "("
    - ")"
    - ","
    - "."
    - ":"
    - ";"
    - "?"
    - " "

# n_symbols is 2 larger than the 50-entry lexicon above.
# NOTE(review): presumably the extra slots are a padding symbol (see
# padding_idx) and an unknown symbol -- confirm before editing the lexicon.
n_symbols: 52
padding_idx: 0
n_mel_channels: 80

# Shared width reused below for the encoder/decoder model, key and value
# dimensions and for the aligner key channels.
hidden_channels: 512

# Encoder settings (passed to the model as enc_*).
enc_num_layers: 4
enc_num_head: 2
enc_d_model: !ref <hidden_channels>
enc_ffn_dim: 1024
enc_k_dim: !ref <hidden_channels>
enc_v_dim: !ref <hidden_channels>
enc_dropout: 0.2

# Alignment settings (passed to the model unchanged).
# in_query_channels and attn_channels are literal 80s, the same value as
# n_mel_channels, but they are not tied to it by a !ref.
in_query_channels: 80
in_key_channels: !ref <hidden_channels>
attn_channels: 80
temperature: 0.0005

# Decoder settings (passed to the model as dec_*).
dec_num_layers: 4
dec_num_head: 2
dec_d_model: !ref <hidden_channels>
dec_ffn_dim: 1024
dec_k_dim: !ref <hidden_channels>
dec_v_dim: !ref <hidden_channels>
dec_dropout: 0.2

# Postnet settings.
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.2

# Settings not specific to the encoder or decoder.
normalize_before: True
ffn_type: 1dcnn
ffn_cnn_kernel_size_list: [9, 1]

# Duration / pitch / energy predictor settings.
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5

# Model instance. Every constructor argument is a !ref to one of the values
# defined above; note that n_char and n_mels take n_symbols and
# n_mel_channels respectively.
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2WithAlignment
    enc_num_layers: !ref <enc_num_layers>
    enc_num_head: !ref <enc_num_head>
    enc_d_model: !ref <enc_d_model>
    enc_ffn_dim: !ref <enc_ffn_dim>
    enc_k_dim: !ref <enc_k_dim>
    enc_v_dim: !ref <enc_v_dim>
    enc_dropout: !ref <enc_dropout>
    in_query_channels: !ref <in_query_channels>
    in_key_channels: !ref <in_key_channels>
    attn_channels: !ref <attn_channels>
    temperature: !ref <temperature>
    dec_num_layers: !ref <dec_num_layers>
    dec_num_head: !ref <dec_num_head>
    dec_d_model: !ref <dec_d_model>
    dec_ffn_dim: !ref <dec_ffn_dim>
    dec_k_dim: !ref <dec_k_dim>
    dec_v_dim: !ref <dec_v_dim>
    dec_dropout: !ref <dec_dropout>
    normalize_before: !ref <normalize_before>
    ffn_type: !ref <ffn_type>
    ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
    n_char: !ref <n_symbols>
    n_mels: !ref <n_mel_channels>
    postnet_embedding_dim: !ref <postnet_embedding_dim>
    postnet_kernel_size: !ref <postnet_kernel_size>
    postnet_n_convolutions: !ref <postnet_n_convolutions>
    postnet_dropout: !ref <postnet_dropout>
    padding_idx: !ref <padding_idx>
    dur_pred_kernel_size: !ref <dur_pred_kernel_size>
    pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
    energy_pred_kernel_size: !ref <energy_pred_kernel_size>
    variance_predictor_dropout: !ref <variance_predictor_dropout>

# Text encoder, constructed with no arguments.
# NOTE(review): nothing in this file populates it from `lexicon`; presumably
# the consuming code does that -- verify against the caller.
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder

# Modules exposed to the consuming code under the key `model`.
modules:
    model: !ref <model>

# Pretrainer whose only loadable is the model defined above.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>