@@ -128,6 +128,22 @@ def forward(self, x):
128128 return out * gate
129129
130130
class ATanGLUFunction(torch.autograd.Function):
    """Memory-efficient autograd kernel for ``out * atan(gate)``.

    Rather than stashing the raw ``out`` and ``gate`` inputs, the forward
    pass precomputes and saves exactly the two tensors backward needs, so
    no trigonometric work has to be redone at backward time.
    """

    @staticmethod
    def forward(ctx, out, gate):
        # atan(gate) serves double duty: the forward gating factor and
        # d(output)/d(out) for the backward pass.
        gating = torch.atan(gate)
        # out / (1 + gate^2): d(output)/d(gate), i.e. out times atan'(gate).
        gate_grad_factor = out / gate.square().add(1.0)
        ctx.save_for_backward(gate_grad_factor, gating)
        return out * gating

    @staticmethod
    def backward(ctx, grad_output):
        gate_grad_factor, gating = ctx.saved_tensors
        # Chain rule: elementwise products with the saved factors,
        # returned in (out, gate) argument order.
        return grad_output * gating, grad_output * gate_grad_factor
146+
131147class ATanGLU (nn .Module ):
132148 # ArcTan-Applies the gated linear unit function.
133149 def __init__ (self , dim = - 1 ):
@@ -136,9 +152,12 @@ def __init__(self, dim=-1):
136152
137153 def forward (self , x ):
138154 # out, gate = x.chunk(2, dim=self.dim)
139- # Using torch.split instead of chunk for ONNX export compatibility.
155+ # Using torch.split instead of chunk for ONNX export compatibility.
140156 out , gate = torch .split (x , x .size (self .dim ) // 2 , dim = self .dim )
141- return out * torch .atan (gate )
157+ if self .training :
158+ return ATanGLUFunction .apply (out , gate )
159+ else :
160+ return out * torch .atan (gate )
142161
143162
144163class KaimingNormalConv1d (torch .nn .Conv1d ):
0 commit comments