[DONE] Supplement the Variance Model Scaling / Retake Scaling / Conditioner cache on LYNXNet2 (#259)

KakaruHayate · web-flow · commit b0ae9ca8ba08 · 2025-05-20T23:41:58.000+08:00
* Supplement the Variance Model Scaling / Retake Scaling / Conditioner cache on LYNXNet2

* Update toplevel.py

* Remove use_retake_scaling
diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
@@ -77,6 +77,7 @@ backbone_args:
   num_layers: 6
   kernel_size: 31
   dropout_rate: 0.0
+  use_conditioner_cache: true
 main_loss_type: l2
 main_loss_log_norm: false
 schedule_type: 'linear'
diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml
@@ -83,6 +83,7 @@ backbone_args:
   num_layers: 6
   kernel_size: 31
   dropout_rate: 0.0
+  use_conditioner_cache: true
 #backbone_type: 'wavenet'
 #backbone_args:
 #  num_channels: 512
diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml
@@ -105,6 +105,7 @@ pitch_prediction_args:
     num_layers: 6
     num_channels: 512
     dropout_rate: 0.0
+    use_conditioner_cache: true
 
 variances_prediction_args:
   total_repeat_bins: 48
@@ -118,6 +119,7 @@ variances_prediction_args:
     num_layers: 6
     num_channels: 384
     dropout_rate: 0.0
+    use_conditioner_cache: true
 
 lambda_dur_loss: 1.0
 lambda_pitch_loss: 1.0
diff --git a/configs/variance.yaml b/configs/variance.yaml
@@ -71,6 +71,7 @@ pitch_prediction_args:
     num_layers: 6
     num_channels: 512
     dropout_rate: 0.0
+    use_conditioner_cache: true
 
 energy_db_min: -96.0
 energy_db_max: -12.0
@@ -94,6 +95,7 @@ variances_prediction_args:
     num_layers: 6
     num_channels: 384
     dropout_rate: 0.0
+    use_conditioner_cache: true
 
 lambda_dur_loss: 1.0
 lambda_pitch_loss: 1.0
diff --git a/deployment/modules/toplevel.py b/deployment/modules/toplevel.py
@@ -252,10 +252,16 @@ def forward_pitch_preprocess(
         base_pitch = self.smooth(frame_midi_pitch)
         if self.use_melody_encoder:
             delta_pitch = (pitch - base_pitch) * ~retake
-            pitch_cond += self.delta_pitch_embed(delta_pitch[:, :, None])
+            if self.use_variance_scaling:
+                pitch_cond += self.delta_pitch_embed(delta_pitch[:, :, None] / 12)
+            else:
+                pitch_cond += self.delta_pitch_embed(delta_pitch[:, :, None])
         else:
             base_pitch = base_pitch * retake + pitch * ~retake
-            pitch_cond += self.base_pitch_embed(base_pitch[:, :, None])
+            if self.use_variance_scaling:
+                pitch_cond += self.base_pitch_embed(base_pitch[:, :, None] / 128)
+            else:
+                pitch_cond += self.base_pitch_embed(base_pitch[:, :, None])
         if hparams['use_spk_id'] and spk_embed is not None:
             pitch_cond += spk_embed
         return pitch_cond, base_pitch
@@ -275,13 +281,16 @@ def forward_variance_preprocess(
             variances: dict = None, retake=None, spk_embed=None
     ):
         condition = self.forward_mel2x_gather(encoder_out, ph_dur, x_dim=self.hidden_size)
-        variance_cond = condition + self.pitch_embed(pitch[:, :, None])
+        if self.use_variance_scaling:
+            variance_cond = condition + self.pitch_embed(pitch[:, :, None] / 12)
+        else:
+            variance_cond = condition + self.pitch_embed(pitch[:, :, None])
         non_retake_masks = [
             v_retake.float()  # [B, T, 1]
             for v_retake in (~retake).split(1, dim=2)
         ]
         variance_embeds = [
-            self.variance_embeds[v_name](variances[v_name][:, :, None]) * v_masks
+            self.variance_embeds[v_name](variances[v_name][:, :, None]) * v_masks * self.variance_retake_scaling[v_name]
             for v_name, v_masks in zip(self.variance_prediction_list, non_retake_masks)
         ]
         variance_cond += torch.stack(variance_embeds, dim=-1).sum(-1)
diff --git a/modules/backbones/lynxnet2.py b/modules/backbones/lynxnet2.py
@@ -33,17 +33,20 @@ def forward(self, x):
 
 class LYNXNet2(nn.Module):
     def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansion_factor=1, kernel_size=31,
-                 dropout=0.0):
+                 dropout=0.0, use_conditioner_cache=False):
         """
         LYNXNet2(Linear Gated Depthwise Separable Convolution Network Version 2)
         """
         super().__init__()
         self.in_dims = in_dims
         self.n_feats = n_feats
         self.input_projection = nn.Linear(in_dims * n_feats, num_channels)
-        self.conditioner_projection = nn.Linear(hparams['hidden_size'], num_channels)
-        # It may need to be modified at some point to be compatible with the condition cache
-        # self.conditioner_projection = nn.Conv1d(hparams['hidden_size'], num_channels, 1)
+        self.use_conditioner_cache = use_conditioner_cache
+        if self.use_conditioner_cache:
+            # Kernel-size-1 Conv1d variant of the projection, selected for compatibility with the condition cache
+            self.conditioner_projection = nn.Conv1d(hparams['hidden_size'], num_channels, 1)
+        else:
+            self.conditioner_projection = nn.Linear(hparams['hidden_size'], num_channels)
         self.diffusion_embedding = nn.Sequential(
             SinusoidalPosEmb(num_channels),
             nn.Linear(num_channels, num_channels * 4),
@@ -81,9 +84,11 @@ def forward(self, spec, diffusion_step, cond):
             x = spec.flatten(start_dim=1, end_dim=2)  # [B, F x M, T]
 
         x = self.input_projection(x.transpose(1, 2)) # [B, T, F x M]
-        x = x + self.conditioner_projection(cond.transpose(1, 2))
-        # It may need to be modified at some point to be compatible with the condition cache
-        # x = x + self.conditioner_projection(cond.transpose(1, 2))
+        if self.use_conditioner_cache:
+            # Conv1d projection is applied to cond as given; only its output is transposed before the add
+            x = x + self.conditioner_projection(cond).transpose(1, 2)
+        else:
+            x = x + self.conditioner_projection(cond.transpose(1, 2))
         x = x + self.diffusion_embedding(diffusion_step).unsqueeze(1)
 
         for layer in self.residual_layers:
diff --git a/modules/toplevel.py b/modules/toplevel.py
@@ -195,6 +195,28 @@ def __init__(self, vocab_size):
             else:
                 raise NotImplementedError(self.diffusion_type)
 
+        self.use_variance_scaling = hparams.get('use_variance_scaling', False)
+        self.custom_variance_scaling_factor = {
+            'energy': 1. / 96,
+            'breathiness': 1. / 96,
+            'voicing': 1. / 96,
+            'tension': 0.1,
+            'key_shift': 1. / 12,
+            'speed': 1.
+        }
+        self.default_variance_scaling_factor = {
+            'energy': 1.,
+            'breathiness': 1.,
+            'voicing': 1.,
+            'tension': 1.,
+            'key_shift': 1.,
+            'speed': 1.
+        }
+        if self.use_variance_scaling:
+            self.variance_retake_scaling = self.custom_variance_scaling_factor
+        else:
+            self.variance_retake_scaling = self.default_variance_scaling_factor
+
     def forward(
             self, txt_tokens, midi, ph2word, ph_dur=None, word_dur=None, mel2ph=None,
             note_midi=None, note_rest=None, note_dur=None, note_glide=None, mel2note=None,
@@ -271,11 +293,17 @@ def forward(
                     delta_pitch_in = torch.zeros_like(base_pitch)
                 else:
                     delta_pitch_in = (pitch - base_pitch) * ~pitch_retake
-                pitch_cond += self.delta_pitch_embed(delta_pitch_in[:, :, None])
+                if self.use_variance_scaling:
+                    pitch_cond += self.delta_pitch_embed(delta_pitch_in[:, :, None] / 12)
+                else:
+                    pitch_cond += self.delta_pitch_embed(delta_pitch_in[:, :, None])
             else:
                 if not retake_unset:  # retake
                     base_pitch = base_pitch * pitch_retake + pitch * ~pitch_retake
-                pitch_cond += self.base_pitch_embed(base_pitch[:, :, None])
+                if self.use_variance_scaling:
+                    pitch_cond += self.base_pitch_embed(base_pitch[:, :, None] / 128)
+                else:
+                    pitch_cond += self.base_pitch_embed(base_pitch[:, :, None])
 
             if infer:
                 pitch_pred_out = self.pitch_predictor(pitch_cond, infer=True)
@@ -289,12 +317,16 @@ def forward(
 
         if pitch is None:
             pitch = base_pitch + pitch_pred_out
-        var_cond = condition + self.pitch_embed(pitch[:, :, None])
+        if self.use_variance_scaling:
+            var_cond = condition + self.pitch_embed(pitch[:, :, None] / 12)
+        else:
+            var_cond = condition + self.pitch_embed(pitch[:, :, None])
 
         variance_inputs = self.collect_variance_inputs(**kwargs)
+
         if variance_retake is not None:
             variance_embeds = [
-                self.variance_embeds[v_name](v_input[:, :, None]) * ~variance_retake[v_name][:, :, None]
+                self.variance_embeds[v_name](v_input[:, :, None]) * ~variance_retake[v_name][:, :, None] * self.variance_retake_scaling[v_name]
                 for v_name, v_input in zip(self.variance_prediction_list, variance_inputs)
             ]
             var_cond += torch.stack(variance_embeds, dim=-1).sum(-1)