Batch Size : 16
Trainable Parameters: 25.637M
FeedForwardTransformer(
  (encoder): Encoder(
    (embed): Sequential(
      (0): Embedding(56, 256, padding_idx=0)
      (1): ScaledPositionalEncoding(
        (dropout): Dropout(p=0.2, inplace=False)
      )
    )
    (encoders): MultiSequential(
      (0): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_q): Linear(in_features=256, out_features=256, bias=True)
          (linear_k): Linear(in_features=256, out_features=256, bias=True)
          (linear_v): Linear(in_features=256, out_features=256, bias=True)
          (linear_out): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (feed_forward): MultiLayeredConv1d(
          (w_1): Conv1d(256, 1024, kernel_size=(9,), stride=(1,), padding=(4,))
          (w_2): Conv1d(1024, 256, kernel_size=(1,), stride=(1,))
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (1): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_q): Linear(in_features=256, out_features=256, bias=True)
          (linear_k): Linear(in_features=256, out_features=256, bias=True)
          (linear_v): Linear(in_features=256, out_features=256, bias=True)
          (linear_out): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (feed_forward): MultiLayeredConv1d(
          (w_1): Conv1d(256, 1024, kernel_size=(9,), stride=(1,), padding=(4,))
          (w_2): Conv1d(1024, 256, kernel_size=(1,), stride=(1,))
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (2): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_q): Linear(in_features=256, out_features=256, bias=True)
          (linear_k): Linear(in_features=256, out_features=256, bias=True)
          (linear_v): Linear(in_features=256, out_features=256, bias=True)
          (linear_out): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (feed_forward): MultiLayeredConv1d(
          (w_1): Conv1d(256, 1024, kernel_size=(9,), stride=(1,), padding=(4,))
          (w_2): Conv1d(1024, 256, kernel_size=(1,), stride=(1,))
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (3): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_q): Linear(in_features=256, out_features=256, bias=True)
          (linear_k): Linear(in_features=256, out_features=256, bias=True)
          (linear_v): Linear(in_features=256, out_features=256, bias=True)
          (linear_out): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (feed_forward): MultiLayeredConv1d(
          (w_1): Conv1d(256, 1024, kernel_size=(9,), stride=(1,), padding=(4,))
          (w_2): Conv1d(1024, 256, kernel_size=(1,), stride=(1,))
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (duration_predictor): DurationPredictor(
    (conv): ModuleList(
      (0): Sequential(
        (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): ReLU()
        (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (3): Dropout(p=0.5, inplace=False)
      )
      (1): Sequential(
        (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): ReLU()
        (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (3): Dropout(p=0.5, inplace=False)
      )
    )
    (linear): Linear(in_features=256, out_features=1, bias=True)
  )
  (energy_predictor): EnergyPredictor(
    (predictor): VariancePredictor(
      (conv): ModuleList(
        (0): Sequential(
          (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
          (1): ReLU()
          (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
          (3): Dropout(p=0.5, inplace=False)
        )
        (1): Sequential(
          (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
          (1): ReLU()
          (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
          (3): Dropout(p=0.5, inplace=False)
        )
      )
      (linear): Linear(in_features=256, out_features=1, bias=True)
    )
  )
  (energy_embed): Linear(in_features=256, out_features=256, bias=True)
  (pitch_predictor): PitchPredictor(
    (predictor): VariancePredictor(
      (conv): ModuleList(
        (0): Sequential(
          (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
          (1): ReLU()
          (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
          (3): Dropout(p=0.5, inplace=False)
        )
        (1): Sequential(
          (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
          (1): ReLU()
          (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
          (3): Dropout(p=0.5, inplace=False)
        )
      )
      (linear): Linear(in_features=256, out_features=1, bias=True)
    )
  )
  (pitch_embed): Linear(in_features=256, out_features=256, bias=True)
  (length_regulator): LengthRegulator()
  (decoder): Encoder(
    (embed): Sequential(
      (0): ScaledPositionalEncoding(
        (dropout): Dropout(p=0.2, inplace=False)
      )
    )
    (encoders): MultiSequential(
      (0): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_q): Linear(in_features=256, out_features=256, bias=True)
          (linear_k): Linear(in_features=256, out_features=256, bias=True)
          (linear_v): Linear(in_features=256, out_features=256, bias=True)
          (linear_out): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (feed_forward): MultiLayeredConv1d(
          (w_1): Conv1d(256, 1024, kernel_size=(9,), stride=(1,), padding=(4,))
          (w_2): Conv1d(1024, 256, kernel_size=(1,), stride=(1,))
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (1): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_q): Linear(in_features=256, out_features=256, bias=True)
          (linear_k): Linear(in_features=256, out_features=256, bias=True)
          (linear_v): Linear(in_features=256, out_features=256, bias=True)
          (linear_out): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (feed_forward): MultiLayeredConv1d(
          (w_1): Conv1d(256, 1024, kernel_size=(9,), stride=(1,), padding=(4,))
          (w_2): Conv1d(1024, 256, kernel_size=(1,), stride=(1,))
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (2): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_q): Linear(in_features=256, out_features=256, bias=True)
          (linear_k): Linear(in_features=256, out_features=256, bias=True)
          (linear_v): Linear(in_features=256, out_features=256, bias=True)
          (linear_out): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (feed_forward): MultiLayeredConv1d(
          (w_1): Conv1d(256, 1024, kernel_size=(9,), stride=(1,), padding=(4,))
          (w_2): Conv1d(1024, 256, kernel_size=(1,), stride=(1,))
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (3): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_q): Linear(in_features=256, out_features=256, bias=True)
          (linear_k): Linear(in_features=256, out_features=256, bias=True)
          (linear_v): Linear(in_features=256, out_features=256, bias=True)
          (linear_out): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (feed_forward): MultiLayeredConv1d(
          (w_1): Conv1d(256, 1024, kernel_size=(9,), stride=(1,), padding=(4,))
          (w_2): Conv1d(1024, 256, kernel_size=(1,), stride=(1,))
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (postnet): Postnet(
    (postnet): ModuleList(
      (0): Sequential(
        (0): Conv1d(80, 256, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
        (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): Tanh()
        (3): Dropout(p=0.5, inplace=False)
      )
      (1): Sequential(
        (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
        (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): Tanh()
        (3): Dropout(p=0.5, inplace=False)
      )
      (2): Sequential(
        (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
        (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): Tanh()
        (3): Dropout(p=0.5, inplace=False)
      )
      (3): Sequential(
        (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
        (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): Tanh()
        (3): Dropout(p=0.5, inplace=False)
      )
      (4): Sequential(
        (0): Conv1d(256, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
        (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): Dropout(p=0.5, inplace=False)
      )
    )
  )
  (feat_out): Linear(in_features=256, out_features=80, bias=True)
  (duration_criterion): DurationPredictorLoss(
    (criterion): MSELoss()
  )
  (energy_criterion): EnergyPredictorLoss(
    (criterion): MSELoss()
  )
  (pitch_criterion): PitchPredictorLoss(
    (criterion): MSELoss()
  )
  (criterion): L1Loss()
)