import torch
import torch.nn as nn
import torch.nn.functional as F

from modules.commons.common_layers import SinusoidalPosEmb, SwiGLU, Conv1d, Transpose
from utils.hparams import hparams


class LYNXNet2Block(nn.Module):
    def __init__(self, dim, expansion_factor, kernel_size=31, dropout=0.):
        super().__init__()
        inner_dim = int(dim * expansion_factor)
        if float(dropout) > 0.:
            _dropout = nn.Dropout(dropout)
        else:
            _dropout = nn.Identity()
        self.net = nn.Sequential(
            nn.LayerNorm(dim),
            # depthwise convolution along the time axis (input is [B, T, C])
            Transpose((1, 2)),
            nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=kernel_size // 2, groups=dim),
            Transpose((1, 2)),
            # two stacked SwiGLU-gated linear layers, then projection back to dim
            nn.Linear(dim, inner_dim * 2),
            SwiGLU(),
            nn.Linear(inner_dim, inner_dim * 2),
            SwiGLU(),
            nn.Linear(inner_dim, dim),
            _dropout
        )

    def forward(self, x):
        # residual connection around the whole block
        return x + self.net(x)


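# Shape note (illustrative, not part of the original file): LYNXNet2Block is a
# residual, shape-preserving block operating on [B, T, C] tensors, e.g.
#   block = LYNXNet2Block(dim=512, expansion_factor=1)
#   block(torch.randn(2, 100, 512)).shape  # -> torch.Size([2, 100, 512])
# This assumes SwiGLU from modules.commons.common_layers halves the last
# dimension of its input, as the paired nn.Linear(dim, inner_dim * 2) layers imply.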
class LYNXNet2(nn.Module):
    def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansion_factor=1, kernel_size=31,
                 dropout=0.0):
        """
        LYNXNet2 (Linear Gated Depthwise Separable Convolution Network, version 2)
        """
        super().__init__()
        self.in_dims = in_dims
        self.n_feats = n_feats
        self.input_projection = nn.Linear(in_dims * n_feats, num_channels)
        self.conditioner_projection = nn.Linear(hparams['hidden_size'], num_channels)
        # embed the diffusion step with a sinusoidal encoding followed by a small MLP
        self.diffusion_embedding = nn.Sequential(
            SinusoidalPosEmb(num_channels),
            nn.Linear(num_channels, num_channels * 4),
            nn.GELU(),
            nn.Linear(num_channels * 4, num_channels),
        )
        self.residual_layers = nn.ModuleList(
            [
                LYNXNet2Block(
                    dim=num_channels,
                    expansion_factor=expansion_factor,
                    kernel_size=kernel_size,
                    dropout=dropout
                )
                for _ in range(num_layers)
            ]
        )
        self.norm = nn.LayerNorm(num_channels)
        self.output_projection = nn.Linear(num_channels, in_dims * n_feats)
        nn.init.kaiming_normal_(self.input_projection.weight)
        nn.init.kaiming_normal_(self.conditioner_projection.weight)
        nn.init.zeros_(self.output_projection.weight)

    def forward(self, spec, diffusion_step, cond):
        """
        :param spec: [B, F, M, T]
        :param diffusion_step: [B, 1]
        :param cond: [B, H, T]
        :return: [B, F, M, T]
        """

        if self.n_feats == 1:
            x = spec[:, 0]  # [B, M, T]
        else:
            x = spec.flatten(start_dim=1, end_dim=2)  # [B, F x M, T]

        x = self.input_projection(x.transpose(1, 2))  # [B, F x M, T] -> [B, T, C]
        x = x + self.conditioner_projection(cond.transpose(1, 2))
        x = x + self.diffusion_embedding(diffusion_step).unsqueeze(1)

        for layer in self.residual_layers:
            x = layer(x)

        # post-norm
        x = self.norm(x)

        # output projection
        x = self.output_projection(x).transpose(1, 2)  # [B, T, C] -> [B, F x M, T]

        if self.n_feats == 1:
            x = x[:, None, :, :]  # [B, 1, M, T]
        else:
            # This is a temporary solution since PyTorch 1.13
            # does not support exporting aten::unflatten to ONNX
            # x = x.unflatten(dim=1, sizes=(self.n_feats, self.in_dims))
            x = x.reshape(-1, self.n_feats, self.in_dims, x.shape[2])
        return x
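

# Minimal usage sketch (not part of the original file): feeds dummy tensors
# through the denoiser to check shapes. The hidden_size value, the 128 mel bins,
# and the 1-D diffusion-step tensor below are assumptions for illustration; in
# the real pipeline, hparams is populated from the experiment config.
if __name__ == '__main__':
    hparams['hidden_size'] = 256  # assumed conditioning width
    model = LYNXNet2(in_dims=128, n_feats=1, num_layers=6, num_channels=512)
    spec = torch.randn(2, 1, 128, 100)           # [B, F, M, T]
    step = torch.randint(0, 1000, (2,)).float()  # one diffusion step index per batch item (assumed 1-D)
    cond = torch.randn(2, 256, 100)              # [B, H, T]
    out = model(spec, step, cond)
    print(out.shape)  # expected: torch.Size([2, 1, 128, 100])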