Commit 2e87e87

Merge pull request #216 from basf/johnson_su

Johnson su

2 parents: c379a7a + d6380fd

3 files changed: 148 additions & 29 deletions

mambular/arch_utils/layer_utils/embedding_layer.py

Lines changed: 5 additions & 1 deletion
@@ -156,8 +156,10 @@ def forward(self, num_features, cat_features, emb_features):
         # Process categorical embeddings
         if self.cat_embeddings and cat_features is not None:
             cat_embeddings = [
-                emb(cat_features[i]) for i, emb in enumerate(self.cat_embeddings)
+                emb(cat_features[i]) if emb(cat_features[i]).ndim == 3 else emb(cat_features[i]).unsqueeze(1)
+                for i, emb in enumerate(self.cat_embeddings)
             ]
+
             cat_embeddings = torch.stack(cat_embeddings, dim=1)
             cat_embeddings = torch.squeeze(cat_embeddings, dim=2)
             if self.layer_norm_after_embedding:
@@ -189,6 +191,7 @@ def forward(self, num_features, cat_features, emb_features):
                 ]
                 emb_embeddings = torch.stack(emb_embeddings, dim=1)
             else:
+
                 emb_embeddings = torch.stack(emb_features, dim=1)
             if self.layer_norm_after_embedding:
                 emb_embeddings = self.embedding_norm(emb_embeddings)
@@ -199,6 +202,7 @@ def forward(self, num_features, cat_features, emb_features):
 
         if embeddings:
             x = torch.cat(embeddings, dim=1) if len(embeddings) > 1 else embeddings[0]
+
         else:
             raise ValueError("No features provided to the model.")
 
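The new conditional matters because `nn.Embedding` returns a 2-D tensor when a categorical feature arrives as a flat `(batch,)` index vector but a 3-D one for `(batch, 1)` input, and `torch.stack` needs every element to share a shape. A minimal sketch of the shape normalization (toy sizes, not mambular's actual tensors):

import torch
import torch.nn as nn

batch, d = 4, 8
emb = nn.Embedding(10, d)

out_3d = emb(torch.randint(0, 10, (batch, 1)))  # (4, 1, 8): already has a token dim
out_2d = emb(torch.randint(0, 10, (batch,)))    # (4, 8): token dim missing

# Mirror of the merged expression: pad 2-D outputs to (batch, 1, d) so both
# shapes stack cleanly, then squeeze the singleton dim away as in forward().
outs = [o if o.ndim == 3 else o.unsqueeze(1) for o in (out_3d, out_2d)]
stacked = torch.squeeze(torch.stack(outs, dim=1), dim=2)
print(stacked.shape)  # torch.Size([4, 2, 8])
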
mambular/preprocessing/preprocessor.py

Lines changed: 40 additions & 19 deletions
@@ -40,6 +40,14 @@ class Preprocessor:
 
     Parameters
     ----------
+    feature_preprocessing: dict or None
+        Dictionary mapping column names to preprocessing techniques. Example:
+        {
+            "num_feature1": "minmax",
+            "num_feature2": "ple",
+            "cat_feature1": "one-hot",
+            "cat_feature2": "int"
+        }
     n_bins : int, default=50
         The number of bins to use for numerical feature binning. This parameter is relevant
         only if `numerical_preprocessing` is set to 'binning', 'ple' or 'one-hot'.
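
A hedged usage sketch of the new argument (column names hypothetical; import path assumed from the file layout). Per-column entries override the global defaults, which still apply to every unlisted column:

from mambular.preprocessing import Preprocessor

pre = Preprocessor(
    feature_preprocessing={
        "age": "minmax",    # overrides numerical_preprocessing for this column
        "city": "one-hot",  # overrides categorical_preprocessing for this column
    },
    numerical_preprocessing="ple",    # default for the remaining numeric columns
    categorical_preprocessing="int",  # default for the remaining categorical columns
)
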
@@ -94,6 +102,7 @@ class Preprocessor:
 
     def __init__(
         self,
+        feature_preprocessing=None,
         n_bins=64,
         numerical_preprocessing="ple",
         categorical_preprocessing="int",
@@ -153,6 +162,7 @@ def __init__(
         )
 
         self.use_decision_tree_bins = use_decision_tree_bins
+        self.feature_preprocessing = feature_preprocessing or {}
         self.column_transformer = None
         self.fitted = False
         self.binning_strategy = binning_strategy
@@ -300,6 +310,10 @@ def fit(self, X, y=None, embeddings=None):
 
         if numerical_features:
             for feature in numerical_features:
+                feature_preprocessing = self.feature_preprocessing.get(
+                    feature, self.numerical_preprocessing
+                )
+
                 # extended the annotation list if new transformer is added, either from sklearn or custom
                 numeric_transformer_steps: list[
                     tuple[
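
The override mechanism is plain `dict.get` with the global setting as the fallback, evaluated once per column; a toy illustration (values invented):

feature_overrides = {"age": "minmax"}  # stands in for self.feature_preprocessing
numerical_default = "ple"              # stands in for self.numerical_preprocessing

print(feature_overrides.get("age", numerical_default))     # minmax (explicit override)
print(feature_overrides.get("income", numerical_default))  # ple (falls back to default)
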
@@ -322,7 +336,7 @@ def fit(self, X, y=None, embeddings=None):
                         | SigmoidExpansion,
                     ]
                 ] = [("imputer", SimpleImputer(strategy="mean"))]
-                if self.numerical_preprocessing in ["binning", "one-hot"]:
+                if feature_preprocessing in ["binning", "one-hot"]:
                     bins = (
                         self._get_decision_tree_bins(X[[feature]], y, [feature])
                         if self.use_decision_tree_bins
@@ -356,22 +370,22 @@ def fit(self, X, y=None, embeddings=None):
                         ]
                     )
 
-                if self.numerical_preprocessing == "one-hot":
+                if feature_preprocessing == "one-hot":
                     numeric_transformer_steps.extend(
                         [
                             ("onehot_from_ordinal", OneHotFromOrdinal()),
                         ]
                     )
 
-                elif self.numerical_preprocessing == "standardization":
+                elif feature_preprocessing == "standardization":
                     numeric_transformer_steps.append(("scaler", StandardScaler()))
 
-                elif self.numerical_preprocessing == "minmax":
+                elif feature_preprocessing == "minmax":
                     numeric_transformer_steps.append(
                         ("minmax", MinMaxScaler(feature_range=(-1, 1)))
                     )
 
-                elif self.numerical_preprocessing == "quantile":
+                elif feature_preprocessing == "quantile":
                     numeric_transformer_steps.append(
                         (
                             "quantile",
@@ -381,7 +395,7 @@ def fit(self, X, y=None, embeddings=None):
                         )
                     )
 
-                elif self.numerical_preprocessing == "polynomial":
+                elif feature_preprocessing == "polynomial":
                     if self.scaling_strategy == "standardization":
                         numeric_transformer_steps.append(("scaler", StandardScaler()))
                     elif self.scaling_strategy == "minmax":
@@ -395,10 +409,10 @@ def fit(self, X, y=None, embeddings=None):
                         )
                     )
 
-                elif self.numerical_preprocessing == "robust":
+                elif feature_preprocessing == "robust":
                     numeric_transformer_steps.append(("robust", RobustScaler()))
 
-                elif self.numerical_preprocessing == "splines":
+                elif feature_preprocessing == "splines":
                     if self.scaling_strategy == "standardization":
                         numeric_transformer_steps.append(("scaler", StandardScaler()))
                     elif self.scaling_strategy == "minmax":
@@ -419,7 +433,7 @@ def fit(self, X, y=None, embeddings=None):
                         ),
                     )
 
-                elif self.numerical_preprocessing == "rbf":
+                elif feature_preprocessing == "rbf":
                     if self.scaling_strategy == "standardization":
                         numeric_transformer_steps.append(("scaler", StandardScaler()))
                     elif self.scaling_strategy == "minmax":
@@ -438,7 +452,7 @@ def fit(self, X, y=None, embeddings=None):
                         )
                     )
 
-                elif self.numerical_preprocessing == "sigmoid":
+                elif feature_preprocessing == "sigmoid":
                     if self.scaling_strategy == "standardization":
                         numeric_transformer_steps.append(("scaler", StandardScaler()))
                     elif self.scaling_strategy == "minmax":
@@ -457,15 +471,19 @@ def fit(self, X, y=None, embeddings=None):
                         )
                     )
 
-                elif self.numerical_preprocessing == "ple":
+
+                elif feature_preprocessing == "ple":
                     numeric_transformer_steps.append(
                         ("minmax", MinMaxScaler(feature_range=(-1, 1)))
                     )
                     numeric_transformer_steps.append(
                         ("ple", PLE(n_bins=self.n_bins, task=self.task))
                     )
 
-                elif self.numerical_preprocessing == "box-cox":
+                elif feature_preprocessing == "box-cox":
+                    numeric_transformer_steps.append(
+                        ("minmax", MinMaxScaler(feature_range=(1e-03, 1)))
+                    )
                     numeric_transformer_steps.append(
                         ("check_positive", MinMaxScaler(feature_range=(1e-3, 1)))
                     )
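
For context on the box-cox branch: Box-Cox is only defined for strictly positive inputs, so scikit-learn's PowerTransformer(method="box-cox") raises on zeros or negatives, and rescaling into (1e-3, 1] first makes the step safe (yeo-johnson handles non-positive values, which is why that branch has no such guard). A standalone sketch with toy data, not the library's actual pipeline:

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, PowerTransformer

X = np.array([[-3.0], [0.0], [2.5], [7.0]])  # contains non-positive values

pipe = Pipeline([
    # Shift everything into (0, 1] so Box-Cox's positivity requirement holds.
    ("check_positive", MinMaxScaler(feature_range=(1e-3, 1))),
    ("box-cox", PowerTransformer(method="box-cox", standardize=True)),
])
print(pipe.fit_transform(X).ravel())  # succeeds; box-cox alone would raise here
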
@@ -476,15 +494,15 @@ def fit(self, X, y=None, embeddings=None):
                         )
                     )
 
-                elif self.numerical_preprocessing == "yeo-johnson":
+                elif feature_preprocessing == "yeo-johnson":
                     numeric_transformer_steps.append(
                         (
                             "yeo-johnson",
                             PowerTransformer(method="yeo-johnson", standardize=True),
                         )
                     )
 
-                elif self.numerical_preprocessing == "none":
+                elif feature_preprocessing == "none":
                     numeric_transformer_steps.append(
                         (
                             "none",
@@ -498,15 +516,18 @@ def fit(self, X, y=None, embeddings=None):
 
         if categorical_features:
             for feature in categorical_features:
-                if self.categorical_preprocessing == "int":
+                feature_preprocessing = self.feature_preprocessing.get(
+                    feature, self.categorical_preprocessing
+                )
+                if feature_preprocessing == "int":
                     # Use ContinuousOrdinalEncoder for "int"
                     categorical_transformer = Pipeline(
                         [
                             ("imputer", SimpleImputer(strategy="most_frequent")),
                             ("continuous_ordinal", ContinuousOrdinalEncoder()),
                         ]
                     )
-                elif self.categorical_preprocessing == "one-hot":
+                elif feature_preprocessing == "one-hot":
                     # Use OneHotEncoder for "one-hot"
                     categorical_transformer = Pipeline(
                         [
@@ -516,15 +537,15 @@ def fit(self, X, y=None, embeddings=None):
                         ]
                     )
 
-                elif self.categorical_preprocessing == "none":
+                elif feature_preprocessing == "none":
                     # Use OneHotEncoder for "one-hot"
                     categorical_transformer = Pipeline(
                         [
                             ("imputer", SimpleImputer(strategy="most_frequent")),
                             ("none", NoTransformer()),
                         ]
                     )
-                elif self.categorical_preprocessing == "pretrained":
+                elif feature_preprocessing == "pretrained":
                     categorical_transformer = Pipeline(
                         [
                             ("imputer", SimpleImputer(strategy="most_frequent")),
@@ -533,7 +554,7 @@ def fit(self, X, y=None, embeddings=None):
                     )
                 else:
                     raise ValueError(
-                        f"Unknown categorical_preprocessing type: {self.categorical_preprocessing}"
+                        f"Unknown categorical_preprocessing type: {feature_preprocessing}"
                     )
 
                 # Append the transformer for the current categorical feature
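
Putting the two dispatch loops together, a hedged end-to-end sketch (hypothetical column names; assumes the Preprocessor API shown in this diff):

import pandas as pd
from mambular.preprocessing import Preprocessor

X = pd.DataFrame({
    "age": [23, 35, 41, 29],
    "income": [40_000, 52_000, 75_000, 61_000],
    "city": ["berlin", "munich", "berlin", "hamburg"],
})

pre = Preprocessor(
    feature_preprocessing={"age": "minmax", "city": "one-hot"},
    numerical_preprocessing="ple",    # fallback, applies to "income"
    categorical_preprocessing="int",  # fallback for unlisted categorical columns
)
pre.fit(X, y=[0, 1, 1, 0])  # y is optional; target-aware steps can make use of it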
