
Commit e16ba03

Merge pull request #75 from basf/models
Models
2 parents 302e739 + 5f0608c commit e16ba03

10 files changed: 133 additions & 42 deletions

mambular/arch_utils/transformer_utils.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def reglu(x):
+    a, b = x.chunk(2, dim=-1)
+    return a * F.relu(b)
+
+
+class ReGLU(nn.Module):
+    def forward(self, x):
+        return reglu(x)
+
+
+class GLU(nn.Module):
+    def __init__(self):
+        super(GLU, self).__init__()
+
+    def forward(self, x):
+        assert x.size(-1) % 2 == 0, "Input dimension must be even"
+        split_dim = x.size(-1) // 2
+        return x[..., :split_dim] * torch.sigmoid(x[..., split_dim:])
+
+
+class CustomTransformerEncoderLayer(nn.TransformerEncoderLayer):
+    def __init__(self, *args, activation=F.relu, **kwargs):
+        super(CustomTransformerEncoderLayer, self).__init__(
+            *args, activation=activation, **kwargs
+        )
+        self.custom_activation = activation
+
+        # Check if the activation function is an instance of a GLU variant
+        if activation in [ReGLU, GLU] or isinstance(activation, (ReGLU, GLU)):
+            self.linear1 = nn.Linear(
+                self.linear1.in_features,
+                self.linear1.out_features * 2,
+                bias=kwargs.get("bias", True),
+            )
+            self.linear2 = nn.Linear(
+                self.linear2.in_features,
+                self.linear2.out_features,
+                bias=kwargs.get("bias", True),
+            )
+
+    def forward(self, src, src_mask=None, src_key_padding_mask=None, is_causal=False):
+        src2 = self.self_attn(
+            src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
+        )[0]
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+
+        # Use the provided activation function
+        if self.custom_activation in [ReGLU, GLU] or isinstance(
+            self.custom_activation, (ReGLU, GLU)
+        ):
+            src2 = self.linear2(self.custom_activation(self.linear1(src)))
+        else:
+            src2 = self.linear2(self.custom_activation(self.linear1(src)))
+
+        src = src + self.dropout2(src2)
+        src = self.norm2(src)
+        return src
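
For context, a minimal usage sketch of the new layer (not part of the commit; the import path is assumed from the imports in the files below, and the shapes are illustrative). With a GLU-style activation, linear1 is rebuilt with twice the feed-forward width, and reglu halves the last dimension again, so the block still returns d_model features per position.

# Minimal usage sketch (not part of the commit); import path assumed.
import torch
from mambular.arch_utils.transformer_utils import ReGLU, CustomTransformerEncoderLayer

layer = CustomTransformerEncoderLayer(
    d_model=128,
    nhead=8,
    dim_feedforward=256,
    batch_first=True,
    activation=ReGLU(),
)

x = torch.randn(4, 10, 128)   # (batch, sequence, d_model)
print(layer(x).shape)         # torch.Size([4, 10, 128])

# ReGLU halves the last dimension: linear1 now emits 2 * dim_feedforward
# features, which chunk() splits into a value half and a gated half.
print(ReGLU()(torch.randn(4, 10, 512)).shape)  # torch.Size([4, 10, 256])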

mambular/base_models/ft_transformer.py

Lines changed: 2 additions & 1 deletion
@@ -9,6 +9,7 @@
     InstanceNorm,
     GroupNorm,
 )
+from ..arch_utils.transformer_utils import CustomTransformerEncoderLayer
 from ..configs.fttransformer_config import DefaultFTTransformerConfig
 from .basemodel import BaseModel

@@ -87,7 +88,7 @@ def __init__(
             "num_embedding_activation", config.num_embedding_activation
         )

-        encoder_layer = nn.TransformerEncoderLayer(
+        encoder_layer = CustomTransformerEncoderLayer(
             d_model=self.hparams.get("d_model", config.d_model),
             nhead=self.hparams.get("n_heads", config.n_heads),
             batch_first=True,

mambular/base_models/mambular.py

Lines changed: 27 additions & 8 deletions
@@ -174,6 +174,11 @@ def __init__(
             torch.zeros(1, 1, self.hparams.get("d_model", config.d_model))
         )

+        if self.pooling_method == "cls":
+            self.use_cls = True
+        else:
+            self.use_cls = self.hparams.get("use_cls", config.use_cls)
+
         if self.hparams.get("layer_norm_after_embedding"):
             self.embedding_norm = nn.LayerNorm(
                 self.hparams.get("d_model", config.d_model)
@@ -198,10 +203,13 @@ def forward(self, num_features, cat_features):
         Tensor
             The output predictions of the model.
         """
-        batch_size = (
-            cat_features[0].size(0) if cat_features != [] else num_features[0].size(0)
-        )
-        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        if self.use_cls:
+            batch_size = (
+                cat_features[0].size(0)
+                if cat_features != []
+                else num_features[0].size(0)
+            )
+            cls_tokens = self.cls_token.expand(batch_size, -1, -1)

         if len(self.cat_embeddings) > 0 and cat_features:
             cat_embeddings = [
@@ -225,11 +233,20 @@ def forward(self, num_features, cat_features):
             num_embeddings = None

         if cat_embeddings is not None and num_embeddings is not None:
-            x = torch.cat([cls_tokens, cat_embeddings, num_embeddings], dim=1)
+            if self.use_cls:
+                x = torch.cat([cat_embeddings, num_embeddings, cls_tokens], dim=1)
+            else:
+                x = torch.cat([cat_embeddings, num_embeddings], dim=1)
         elif cat_embeddings is not None:
-            x = torch.cat([cls_tokens, cat_embeddings], dim=1)
+            if self.use_cls:
+                x = torch.cat([cat_embeddings, cls_tokens], dim=1)
+            else:
+                x = cat_embeddings
         elif num_embeddings is not None:
-            x = torch.cat([cls_tokens, num_embeddings], dim=1)
+            if self.use_cls:
+                x = torch.cat([num_embeddings, cls_tokens], dim=1)
+            else:
+                x = num_embeddings
         else:
             raise ValueError("No features provided to the model.")

@@ -242,7 +259,9 @@ def forward(self, num_features, cat_features):
         elif self.pooling_method == "sum":
             x = torch.sum(x, dim=1)
         elif self.pooling_method == "cls_token":
-            x = x[:, 0]
+            x = x[:, -1]
+        elif self.pooling_method == "last":
+            x = x[:, -1]
         else:
             raise ValueError(f"Invalid pooling method: {self.pooling_method}")
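
The behavioural change: when a cls token is used, it is now appended after the feature embeddings rather than prepended, so cls-based pooling reads the last sequence position. A standalone sketch of that layout (illustrative tensors only, not the model's API):

# Standalone sketch of the new cls placement (illustrative only).
import torch

batch_size, n_features, d_model = 4, 6, 64
embeddings = torch.randn(batch_size, n_features, d_model)
cls_token = torch.zeros(1, 1, d_model)  # a learnable nn.Parameter in the model

# cls token appended at the end of the feature "sequence"
x = torch.cat([embeddings, cls_token.expand(batch_size, -1, -1)], dim=1)

pooled = x[:, -1]  # what pooling_method="cls_token" (or "last") now selects
print(x.shape, pooled.shape)  # torch.Size([4, 7, 64]) torch.Size([4, 64])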

mambular/base_models/tabtransformer.py

Lines changed: 2 additions & 1 deletion
@@ -11,6 +11,7 @@
 )
 from ..configs.tabtransformer_config import DefaultTabTransformerConfig
 from .basemodel import BaseModel
+from ..arch_utils.transformer_utils import CustomTransformerEncoderLayer


 class TabTransformer(BaseModel):
@@ -91,7 +92,7 @@ def __init__(
             "num_embedding_activation", config.num_embedding_activation
         )

-        encoder_layer = nn.TransformerEncoderLayer(
+        encoder_layer = CustomTransformerEncoderLayer(
             d_model=self.hparams.get("d_model", config.d_model),
             nhead=self.hparams.get("n_heads", config.n_heads),
             batch_first=True,

mambular/configs/fttransformer_config.py

Lines changed: 11 additions & 9 deletions
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 import torch.nn as nn
+from ..arch_utils.transformer_utils import ReGLU


 @dataclass
@@ -63,15 +64,15 @@ class DefaultFTTransformerConfig:
     lr_patience: int = 10
     weight_decay: float = 1e-06
     lr_factor: float = 0.1
-    d_model: int = 64
-    n_layers: int = 8
-    n_heads: int = 4
-    attn_dropout: float = 0.3
-    ff_dropout: float = 0.3
-    norm: str = "RMSNorm"
+    d_model: int = 128
+    n_layers: int = 4
+    n_heads: int = 8
+    attn_dropout: float = 0.2
+    ff_dropout: float = 0.1
+    norm: str = "LayerNorm"
     activation: callable = nn.SELU()
     num_embedding_activation: callable = nn.Identity()
-    head_layer_sizes: list = (128, 64, 32)
+    head_layer_sizes: list = ()
     head_dropout: float = 0.5
     head_skip_layers: bool = False
     head_activation: callable = nn.SELU()
@@ -80,6 +81,7 @@ class DefaultFTTransformerConfig:
     pooling_method: str = "cls"
     norm_first: bool = False
     bias: bool = True
-    transformer_activation: callable = nn.SELU()
+    transformer_activation: callable = ReGLU()
     layer_norm_eps: float = 1e-05
-    transformer_dim_feedforward: int = 512
+    transformer_dim_feedforward: int = 256
+    numerical_embedding: str = "ple"
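
A hedged example of consuming the updated defaults (import path assumed from the repository layout; the config is a plain dataclass, so fields can be overridden at construction):

# Hedged example; import path assumed.
from mambular.configs.fttransformer_config import DefaultFTTransformerConfig

cfg = DefaultFTTransformerConfig()
print(cfg.d_model, cfg.n_layers, cfg.norm)        # 128 4 LayerNorm
print(type(cfg.transformer_activation).__name__)  # ReGLU

# Individual defaults can still be overridden per experiment.
small = DefaultFTTransformerConfig(d_model=64, transformer_dim_feedforward=128)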

mambular/configs/mambular_config.py

Lines changed: 10 additions & 7 deletions
@@ -69,30 +69,32 @@ class DefaultMambularConfig:
         Whether to use bidirectional processing of the input sequences.
     use_learnable_interaction : bool, default=False
         Whether to use learnable feature interactions before passing through mamba blocks.
+    use_cls : bool, default=True
+        Whether to append a cls to the beginning of each 'sequence'.
     """

     lr: float = 1e-04
     lr_patience: int = 10
     weight_decay: float = 1e-06
     lr_factor: float = 0.1
     d_model: int = 64
-    n_layers: int = 8
+    n_layers: int = 4
     expand_factor: int = 2
     bias: bool = False
-    d_conv: int = 16
+    d_conv: int = 4
     conv_bias: bool = True
-    dropout: float = 0.05
+    dropout: float = 0.0
     dt_rank: str = "auto"
-    d_state: int = 32
+    d_state: int = 128
     dt_scale: float = 1.0
     dt_init: str = "random"
     dt_max: float = 0.1
     dt_min: float = 1e-04
     dt_init_floor: float = 1e-04
-    norm: str = "RMSNorm"
-    activation: callable = nn.SELU()
+    norm: str = "LayerNorm"
+    activation: callable = nn.SiLU()
     num_embedding_activation: callable = nn.Identity()
-    head_layer_sizes: list = (128, 64, 32)
+    head_layer_sizes: list = ()
     head_dropout: float = 0.5
     head_skip_layers: bool = False
     head_activation: callable = nn.SELU()
@@ -101,3 +103,4 @@ class DefaultMambularConfig:
     pooling_method: str = "avg"
     bidirectional: bool = False
     use_learnable_interaction: bool = False
+    use_cls: bool = False
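
Note that the docstring added above states default=True while the field added here defaults to False; per the base-model change, cls-based pooling enables the cls token regardless of this flag. A short hedged example (import path assumed):

# Hedged example; import path assumed.
from mambular.configs.mambular_config import DefaultMambularConfig

cfg = DefaultMambularConfig()
print(cfg.use_cls, cfg.pooling_method, cfg.d_state)  # False avg 128

# Append a cls token and pool on it explicitly.
cfg_cls = DefaultMambularConfig(use_cls=True, pooling_method="cls_token")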

mambular/configs/mlp_config.py

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ class DefaultMLPConfig:
     lr_patience: int = 10
     weight_decay: float = 1e-06
     lr_factor: float = 0.1
-    layer_sizes: list = (128, 128, 32)
+    layer_sizes: list = (256, 128, 32)
     activation: callable = nn.SELU()
     skip_layers: bool = False
     dropout: float = 0.5

mambular/configs/resnet_config.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ class DefaultResNetConfig:
     lr_patience: int = 10
     weight_decay: float = 1e-06
     lr_factor: float = 0.1
-    layer_sizes: list = (128, 128, 32)
+    layer_sizes: list = (256, 128, 32)
     activation: callable = nn.SELU()
     skip_layers: bool = False
     dropout: float = 0.5

mambular/configs/tabtransformer_config.py

Lines changed: 9 additions & 8 deletions
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 import torch.nn as nn
+from ..arch_utils.transformer_utils import ReGLU


 @dataclass
@@ -63,15 +64,15 @@ class DefaultTabTransformerConfig:
     lr_patience: int = 10
     weight_decay: float = 1e-06
     lr_factor: float = 0.1
-    d_model: int = 64
-    n_layers: int = 8
-    n_heads: int = 4
-    attn_dropout: float = 0.3
-    ff_dropout: float = 0.3
-    norm: str = "RMSNorm"
+    d_model: int = 128
+    n_layers: int = 4
+    n_heads: int = 8
+    attn_dropout: float = 0.2
+    ff_dropout: float = 0.1
+    norm: str = "LayerNorm"
     activation: callable = nn.SELU()
     num_embedding_activation: callable = nn.Identity()
-    head_layer_sizes: list = (128, 64, 32)
+    head_layer_sizes: list = ()
     head_dropout: float = 0.5
     head_skip_layers: bool = False
     head_activation: callable = nn.SELU()
@@ -80,6 +81,6 @@ class DefaultTabTransformerConfig:
     pooling_method: str = "avg"
     norm_first: bool = True
     bias: bool = True
-    transformer_activation: callable = nn.SELU()
+    transformer_activation: callable = ReGLU()
     layer_norm_eps: float = 1e-05
     transformer_dim_feedforward: int = 512

mambular/preprocessing/preprocessor.py

Lines changed: 7 additions & 6 deletions
@@ -227,7 +227,9 @@ def fit(self, X, y=None):
                 numeric_transformer_steps.append(("scaler", StandardScaler()))

             elif self.numerical_preprocessing == "normalization":
-                numeric_transformer_steps.append(("normalizer", MinMaxScaler()))
+                numeric_transformer_steps.append(
+                    ("normalizer", MinMaxScaler(feature_range=(-1, 1)))
+                )

             elif self.numerical_preprocessing == "quantile":
                 numeric_transformer_steps.append(
@@ -240,12 +242,15 @@
                 )

             elif self.numerical_preprocessing == "polynomial":
+                numeric_transformer_steps.append(("scaler", StandardScaler()))
                 numeric_transformer_steps.append(
                     (
                         "polynomial",
                         PolynomialFeatures(self.degree, include_bias=False),
                     )
                 )
+                # if self.degree > 10:
+                #     numeric_transformer_steps.append(("normalizer", MinMaxScaler()))

             elif self.numerical_preprocessing == "splines":
                 numeric_transformer_steps.append(
@@ -260,13 +265,9 @@
                 )

             elif self.numerical_preprocessing == "ple":
-                numeric_transformer_steps.append(("normalizer", MinMaxScaler()))
                 numeric_transformer_steps.append(
-                    ("ple", PLE(n_bins=self.n_bins, task=self.task))
+                    ("normalizer", MinMaxScaler(feature_range=(-1, 1)))
                 )
-
-            elif self.numerical_preprocessing == "ple":
-                numeric_transformer_steps.append(("normalizer", MinMaxScaler()))
                 numeric_transformer_steps.append(
                     ("ple", PLE(n_bins=self.n_bins, task=self.task))
                 )
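
The numeric pipelines now scale features to [-1, 1] before PLE (and for plain "normalization"), and the "polynomial" branch standardizes first. A minimal sketch of the new scaling step in isolation (not the library's API; the PLE step itself is omitted):

# Minimal sketch of the new scaling behaviour (not the library API).
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

X = np.random.rand(100, 1) * 50.0

# The "ple" branch now prepends this scaler; a ("ple", PLE(...)) step follows it.
pipe = Pipeline([("normalizer", MinMaxScaler(feature_range=(-1, 1)))])
Xt = pipe.fit_transform(X)
print(Xt.min(), Xt.max())  # -1.0 1.0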
