fix preprocessor [FAQ] sklearn raise error: ValueError: not enough values to unpack (expected 2, got 1) #236

AnFreTh · AnFreTh · commit a5beaed6e2d3 · 2025-03-09T19:06:57.000+01:00
diff --git a/mambular/preprocessing/prepro_utils.py b/mambular/preprocessing/prepro_utils.py
@@ -3,7 +3,7 @@
 from sklearn.base import BaseEstimator, TransformerMixin
 
 
-class CustomBinner(TransformerMixin):
+class CustomBinner(TransformerMixin, BaseEstimator):
     def __init__(self, bins):
         # bins can be a scalar (number of bins) or array-like (bin edges)
         self.bins = bins
@@ -30,6 +30,19 @@ def transform(self, X):
         )
         return np.expand_dims(np.array(binned_data), 1)
 
+    def get_feature_names_out(self, input_features=None):
+        """Returns the names of the transformed features.
+
+        Parameters:
+            input_features (list of str): The names of the input features.
+
+        Returns:
+            input_features (list of str): The same names that were passed in, returned unchanged as the output feature names.
+        """
+        if input_features is None:
+            raise ValueError("input_features must be specified")
+        return input_features
+
 
 class ContinuousOrdinalEncoder(BaseEstimator, TransformerMixin):
     """This encoder converts categorical features into continuous integer values. Each unique category within a feature
@@ -57,7 +70,10 @@ def fit(self, X, y=None):
             self: Returns the instance itself.
         """
         # Fit should determine the mapping from original categories to sequential integers starting from 0
-        self.mapping_ = [{category: i + 1 for i, category in enumerate(np.unique(col))} for col in X.T]
+        self.mapping_ = [
+            {category: i + 1 for i, category in enumerate(np.unique(col))}
+            for col in X.T
+        ]
         for mapping in self.mapping_:
             mapping[None] = 0  # Assign 0 to unknown values
         return self
@@ -72,7 +88,12 @@ def transform(self, X):
             X_transformed (ndarray of shape (n_samples, n_features)): The transformed data with integer values.
         """
         # Transform the categories to their mapped integer values
-        X_transformed = np.array([[self.mapping_[col].get(value, 0) for col, value in enumerate(row)] for row in X])
+        X_transformed = np.array(
+            [
+                [self.mapping_[col].get(value, 0) for col, value in enumerate(row)]
+                for row in X
+            ]
+        )
         return X_transformed
 
     def get_feature_names_out(self, input_features=None):
@@ -114,7 +135,9 @@ def fit(self, X, y=None):
         Returns:
             self: Returns the instance itself.
         """
-        self.max_bins_ = np.max(X, axis=0).astype(int) + 1  # Find the maximum bin index for each feature
+        self.max_bins_ = (
+            np.max(X, axis=0).astype(int) + 1
+        )  # Number of bins per feature: the maximum bin index plus one
         return self
 
     def transform(self, X):
@@ -197,7 +220,9 @@ def get_feature_names_out(self, input_features=None):
             feature_names (array of shape (n_features,)): The original feature names.
         """
         if input_features is None:
-            raise ValueError("input_features must be provided to generate feature names.")
+            raise ValueError(
+                "input_features must be provided to generate feature names."
+            )
         return np.array(input_features)
 
 
@@ -252,11 +277,17 @@ def transform(self, X):
         - A 2D numpy array with embeddings for each text input.
         """
         if isinstance(X, np.ndarray):
-            X = X.flatten().astype(str).tolist()  # Convert to a list of strings if passed as an array
+            X = (
+                X.flatten().astype(str).tolist()
+            )  # Convert to a list of strings if passed as an array
         elif isinstance(X, list):
             X = [str(x) for x in X]  # Ensure everything is a string
 
         if self.model is None:
-            raise ValueError("Model is not initialized. Ensure that the model is properly loaded.")
-        embeddings = self.model.encode(X, convert_to_numpy=True)  # Get sentence embeddings
+            raise ValueError(
+                "Model is not initialized. Ensure that the model is properly loaded."
+            )
+        embeddings = self.model.encode(
+            X, convert_to_numpy=True
+        )  # Get sentence embeddings
         return embeddings
diff --git a/mambular/preprocessing/preprocessor.py b/mambular/preprocessing/preprocessor.py
@@ -27,9 +27,10 @@
     OneHotFromOrdinal,
     ToFloatTransformer,
 )
+from sklearn.base import TransformerMixin
 
 
-class Preprocessor:
+class Preprocessor(TransformerMixin):
     """A comprehensive preprocessor for structured data, capable of handling both numerical and categorical features.
     It supports various preprocessing strategies for numerical data, including binning, one-hot encoding,
     standardization,and minmax. Categorical features can be transformed using continuous ordinal encoding.
@@ -120,10 +121,14 @@ def __init__(
     ):
         self.n_bins = n_bins
         self.numerical_preprocessing = (
-            numerical_preprocessing.lower() if numerical_preprocessing is not None else "none"
+            numerical_preprocessing.lower()
+            if numerical_preprocessing is not None
+            else "none"
         )
         self.categorical_preprocessing = (
-            categorical_preprocessing.lower() if categorical_preprocessing is not None else "none"
+            categorical_preprocessing.lower()
+            if categorical_preprocessing is not None
+            else "none"
         )
         if self.numerical_preprocessing not in [
             "ple",
@@ -247,13 +252,19 @@ def _detect_column_types(self, X):
                 numerical_features.append(col)
             else:
                 if isinstance(self.cat_cutoff, float):
-                    cutoff_condition = (num_unique_values / total_samples) < self.cat_cutoff
+                    cutoff_condition = (
+                        num_unique_values / total_samples
+                    ) < self.cat_cutoff
                 elif isinstance(self.cat_cutoff, int):
                     cutoff_condition = num_unique_values < self.cat_cutoff
                 else:
-                    raise ValueError("cat_cutoff should be either a float or an integer.")
+                    raise ValueError(
+                        "cat_cutoff should be either a float or an integer."
+                    )
 
-                if X[col].dtype.kind not in "iufc" or (X[col].dtype.kind == "i" and cutoff_condition):
+                if X[col].dtype.kind not in "iufc" or (
+                    X[col].dtype.kind == "i" and cutoff_condition
+                ):
                     categorical_features.append(col)
                 else:
                     numerical_features.append(col)
@@ -266,7 +277,9 @@ def _fit_embeddings(self, embeddings):
             self.embedding_dimensions = {}
             if isinstance(embeddings, np.ndarray):
                 self.embedding_dimensions["embeddings_1"] = embeddings.shape[1]
-            elif isinstance(embeddings, list) and all(isinstance(e, np.ndarray) for e in embeddings):
+            elif isinstance(embeddings, list) and all(
+                isinstance(e, np.ndarray) for e in embeddings
+            ):
                 for idx, e in enumerate(embeddings):
                     self.embedding_dimensions[f"embedding_{idx + 1}"] = e.shape[1]
         else:
@@ -298,7 +311,9 @@ def fit(self, X, y=None, embeddings=None):
 
         if numerical_features:
             for feature in numerical_features:
-                feature_preprocessing = self.feature_preprocessing.get(feature, self.numerical_preprocessing)
+                feature_preprocessing = self.feature_preprocessing.get(
+                    feature, self.numerical_preprocessing
+                )
 
                 # extended the annotation list if new transformer is added, either from sklearn or custom
                 numeric_transformer_steps: list[
@@ -328,13 +343,18 @@ def fit(self, X, y=None, embeddings=None):
                         if self.use_decision_tree_bins
                         else self.n_bins
                     )
+
                     if isinstance(bins, int):
                         numeric_transformer_steps.extend(
                             [
                                 (
                                     "discretizer",
                                     KBinsDiscretizer(
-                                        n_bins=(bins if isinstance(bins, int) else len(bins) - 1),
+                                        n_bins=(
+                                            bins
+                                            if isinstance(bins, int)
+                                            else len(bins) - 1
+                                        ),
                                         encode="ordinal",
                                         strategy=self.binning_strategy,  # type: ignore
                                         subsample=200_000 if len(X) > 200_000 else None,
@@ -343,13 +363,8 @@ def fit(self, X, y=None, embeddings=None):
                             ]
                         )
                     else:
-                        numeric_transformer_steps.append(
-                            [
-                                (
-                                    "discretizer",
-                                    CustomBinner(bins=bins),  # type: ignore
-                                ),
-                            ]
+                        numeric_transformer_steps.extend(
+                            [("CustomBinner", CustomBinner(bins=bins[0]))]
                         )
 
                     if feature_preprocessing == "one-hot":
@@ -363,21 +378,27 @@ def fit(self, X, y=None, embeddings=None):
                     numeric_transformer_steps.append(("scaler", StandardScaler()))
 
                 elif feature_preprocessing == "minmax":
-                    numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1))))
+                    numeric_transformer_steps.append(
+                        ("minmax", MinMaxScaler(feature_range=(-1, 1)))
+                    )
 
                 elif feature_preprocessing == "quantile":
                     numeric_transformer_steps.append(
                         (
                             "quantile",
-                            QuantileTransformer(n_quantiles=self.n_bins, random_state=101),
+                            QuantileTransformer(
+                                n_quantiles=self.n_bins, random_state=101
+                            ),
                         )
                     )
 
                 elif feature_preprocessing == "polynomial":
                     if self.scaling_strategy == "standardization":
                         numeric_transformer_steps.append(("scaler", StandardScaler()))
                     elif self.scaling_strategy == "minmax":
-                        numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1))))
+                        numeric_transformer_steps.append(
+                            ("minmax", MinMaxScaler(feature_range=(-1, 1)))
+                        )
                     numeric_transformer_steps.append(
                         (
                             "polynomial",
@@ -392,7 +413,9 @@ def fit(self, X, y=None, embeddings=None):
                     if self.scaling_strategy == "standardization":
                         numeric_transformer_steps.append(("scaler", StandardScaler()))
                     elif self.scaling_strategy == "minmax":
-                        numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1))))
+                        numeric_transformer_steps.append(
+                            ("minmax", MinMaxScaler(feature_range=(-1, 1)))
+                        )
                     numeric_transformer_steps.append(
                         (
                             "splines",
@@ -411,7 +434,9 @@ def fit(self, X, y=None, embeddings=None):
                     if self.scaling_strategy == "standardization":
                         numeric_transformer_steps.append(("scaler", StandardScaler()))
                     elif self.scaling_strategy == "minmax":
-                        numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1))))
+                        numeric_transformer_steps.append(
+                            ("minmax", MinMaxScaler(feature_range=(-1, 1)))
+                        )
                     numeric_transformer_steps.append(
                         (
                             "rbf",
@@ -428,7 +453,9 @@ def fit(self, X, y=None, embeddings=None):
                     if self.scaling_strategy == "standardization":
                         numeric_transformer_steps.append(("scaler", StandardScaler()))
                     elif self.scaling_strategy == "minmax":
-                        numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1))))
+                        numeric_transformer_steps.append(
+                            ("minmax", MinMaxScaler(feature_range=(-1, 1)))
+                        )
                     numeric_transformer_steps.append(
                         (
                             "sigmoid",
@@ -442,8 +469,12 @@ def fit(self, X, y=None, embeddings=None):
                     )
 
                 elif feature_preprocessing == "ple":
-                    numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1))))
-                    numeric_transformer_steps.append(("ple", PLE(n_bins=self.n_bins, task=self.task)))
+                    numeric_transformer_steps.append(
+                        ("minmax", MinMaxScaler(feature_range=(-1, 1)))
+                    )
+                    numeric_transformer_steps.append(
+                        ("ple", PLE(n_bins=self.n_bins, task=self.task))
+                    )
 
                 elif feature_preprocessing == "box-cox":
                     numeric_transformer_steps.append(
@@ -481,7 +512,9 @@ def fit(self, X, y=None, embeddings=None):
 
         if categorical_features:
             for feature in categorical_features:
-                feature_preprocessing = self.feature_preprocessing.get(feature, self.categorical_preprocessing)
+                feature_preprocessing = self.feature_preprocessing.get(
+                    feature, self.categorical_preprocessing
+                )
                 if feature_preprocessing == "int":
                     # Use ContinuousOrdinalEncoder for "int"
                     categorical_transformer = Pipeline(
@@ -516,12 +549,18 @@ def fit(self, X, y=None, embeddings=None):
                         ]
                     )
                 else:
-                    raise ValueError(f"Unknown categorical_preprocessing type: {feature_preprocessing}")
+                    raise ValueError(
+                        f"Unknown categorical_preprocessing type: {feature_preprocessing}"
+                    )
 
                 # Append the transformer for the current categorical feature
-                transformers.append((f"cat_{feature}", categorical_transformer, [feature]))
+                transformers.append(
+                    (f"cat_{feature}", categorical_transformer, [feature])
+                )
 
-        self.column_transformer = ColumnTransformer(transformers=transformers, remainder="passthrough")
+        self.column_transformer = ColumnTransformer(
+            transformers=transformers, remainder="passthrough"
+        )
         self.column_transformer.fit(X, y)
 
         self.fitted = True
@@ -547,13 +586,17 @@ def _get_decision_tree_bins(self, X, y, numerical_features):
         bins = []
         for feature in numerical_features:
             tree_model = (
-                DecisionTreeClassifier(max_depth=3) if y.dtype.kind in "bi" else DecisionTreeRegressor(max_depth=3)
+                DecisionTreeClassifier(max_depth=5)
+                if y.dtype.kind in "bi"
+                else DecisionTreeRegressor(max_depth=5)
             )
             tree_model.fit(X[[feature]], y)
             thresholds = tree_model.tree_.threshold[tree_model.tree_.feature != -2]  # type: ignore
             bin_edges = np.sort(np.unique(thresholds))
 
-            bins.append(np.concatenate(([X[feature].min()], bin_edges, [X[feature].max()])))
+            bins.append(
+                np.concatenate(([X[feature].min()], bin_edges, [X[feature].max()]))
+            )
         return bins
 
     def transform(self, X, embeddings=None):
@@ -597,7 +640,9 @@ def transform(self, X, embeddings=None):
                         f"but got {embeddings.shape[1]}"
                     )
                 transformed_dict["embedding_1"] = embeddings.astype(np.float32)
-            elif isinstance(embeddings, list) and all(isinstance(e, np.ndarray) for e in embeddings):
+            elif isinstance(embeddings, list) and all(
+                isinstance(e, np.ndarray) for e in embeddings
+            ):
                 for idx, e in enumerate(embeddings):
                     key = f"embedding_{idx + 1}"
                     if self.embedding_dimensions[key] != e.shape[1]:
@@ -607,7 +652,9 @@ def transform(self, X, embeddings=None):
                     transformed_dict[key] = e.astype(np.float32)
         else:
             if self.embeddings is not False:
-                raise ValueError("self.embeddings should be False when embeddings are None.")
+                raise ValueError(
+                    "self.embeddings should be False when embeddings are None."
+                )
             self.embeddings = False
 
         return transformed_dict
@@ -740,7 +787,9 @@ def get_feature_info(self, verbose=True):
                         "categories": None,
                     }
                     if verbose:
-                        print(f"Numerical Feature: {feature_name}, Info: {numerical_feature_info[feature_name]}")
+                        print(
+                            f"Numerical Feature: {feature_name}, Info: {numerical_feature_info[feature_name]}"
+                        )
 
                 elif "continuous_ordinal" in steps:
                     step = transformer_pipeline.named_steps["continuous_ordinal"]
@@ -790,7 +839,9 @@ def get_feature_info(self, verbose=True):
                             "categories": None,
                         }
                     if verbose:
-                        print(f"Feature: {feature_name}, Info: {preprocessing_type}, Dimension: {dimension}")
+                        print(
+                            f"Feature: {feature_name}, Info: {preprocessing_type}, Dimension: {dimension}"
+                        )
 
                 if verbose:
                     print("-" * 50)