Skip to content

Commit a5beaed

Browse files
committed
fix preprocessor — [FAQ] sklearn raises ValueError: not enough values to unpack (expected 2, got 1) (#236)
1 parent 9acb5b1 commit a5beaed

2 files changed

Lines changed: 124 additions & 42 deletions

File tree

mambular/preprocessing/prepro_utils.py

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from sklearn.base import BaseEstimator, TransformerMixin
44

55

6-
class CustomBinner(TransformerMixin):
6+
class CustomBinner(TransformerMixin, BaseEstimator):
77
def __init__(self, bins):
88
# bins can be a scalar (number of bins) or array-like (bin edges)
99
self.bins = bins
@@ -30,6 +30,19 @@ def transform(self, X):
3030
)
3131
return np.expand_dims(np.array(binned_data), 1)
3232

33+
def get_feature_names_out(self, input_features=None):
34+
"""Returns the names of the transformed features.
35+
36+
Parameters:
37+
input_features (list of str): The names of the input features.
38+
39+
Returns:
40+
input_features (array of shape (n_features,)): The names of the output features after transformation.
41+
"""
42+
if input_features is None:
43+
raise ValueError("input_features must be specified")
44+
return input_features
45+
3346

3447
class ContinuousOrdinalEncoder(BaseEstimator, TransformerMixin):
3548
"""This encoder converts categorical features into continuous integer values. Each unique category within a feature
@@ -57,7 +70,10 @@ def fit(self, X, y=None):
5770
self: Returns the instance itself.
5871
"""
5972
# Fit should determine the mapping from original categories to sequential integers starting from 0
60-
self.mapping_ = [{category: i + 1 for i, category in enumerate(np.unique(col))} for col in X.T]
73+
self.mapping_ = [
74+
{category: i + 1 for i, category in enumerate(np.unique(col))}
75+
for col in X.T
76+
]
6177
for mapping in self.mapping_:
6278
mapping[None] = 0 # Assign 0 to unknown values
6379
return self
@@ -72,7 +88,12 @@ def transform(self, X):
7288
X_transformed (ndarray of shape (n_samples, n_features)): The transformed data with integer values.
7389
"""
7490
# Transform the categories to their mapped integer values
75-
X_transformed = np.array([[self.mapping_[col].get(value, 0) for col, value in enumerate(row)] for row in X])
91+
X_transformed = np.array(
92+
[
93+
[self.mapping_[col].get(value, 0) for col, value in enumerate(row)]
94+
for row in X
95+
]
96+
)
7697
return X_transformed
7798

7899
def get_feature_names_out(self, input_features=None):
@@ -114,7 +135,9 @@ def fit(self, X, y=None):
114135
Returns:
115136
self: Returns the instance itself.
116137
"""
117-
self.max_bins_ = np.max(X, axis=0).astype(int) + 1 # Find the maximum bin index for each feature
138+
self.max_bins_ = (
139+
np.max(X, axis=0).astype(int) + 1
140+
) # Find the maximum bin index for each feature
118141
return self
119142

120143
def transform(self, X):
@@ -197,7 +220,9 @@ def get_feature_names_out(self, input_features=None):
197220
feature_names (array of shape (n_features,)): The original feature names.
198221
"""
199222
if input_features is None:
200-
raise ValueError("input_features must be provided to generate feature names.")
223+
raise ValueError(
224+
"input_features must be provided to generate feature names."
225+
)
201226
return np.array(input_features)
202227

203228

@@ -252,11 +277,17 @@ def transform(self, X):
252277
- A 2D numpy array with embeddings for each text input.
253278
"""
254279
if isinstance(X, np.ndarray):
255-
X = X.flatten().astype(str).tolist() # Convert to a list of strings if passed as an array
280+
X = (
281+
X.flatten().astype(str).tolist()
282+
) # Convert to a list of strings if passed as an array
256283
elif isinstance(X, list):
257284
X = [str(x) for x in X] # Ensure everything is a string
258285

259286
if self.model is None:
260-
raise ValueError("Model is not initialized. Ensure that the model is properly loaded.")
261-
embeddings = self.model.encode(X, convert_to_numpy=True) # Get sentence embeddings
287+
raise ValueError(
288+
"Model is not initialized. Ensure that the model is properly loaded."
289+
)
290+
embeddings = self.model.encode(
291+
X, convert_to_numpy=True
292+
) # Get sentence embeddings
262293
return embeddings

mambular/preprocessing/preprocessor.py

Lines changed: 85 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,10 @@
2727
OneHotFromOrdinal,
2828
ToFloatTransformer,
2929
)
30+
from sklearn.base import TransformerMixin
3031

3132

32-
class Preprocessor:
33+
class Preprocessor(TransformerMixin):
3334
"""A comprehensive preprocessor for structured data, capable of handling both numerical and categorical features.
3435
It supports various preprocessing strategies for numerical data, including binning, one-hot encoding,
3536
standardization,and minmax. Categorical features can be transformed using continuous ordinal encoding.
@@ -120,10 +121,14 @@ def __init__(
120121
):
121122
self.n_bins = n_bins
122123
self.numerical_preprocessing = (
123-
numerical_preprocessing.lower() if numerical_preprocessing is not None else "none"
124+
numerical_preprocessing.lower()
125+
if numerical_preprocessing is not None
126+
else "none"
124127
)
125128
self.categorical_preprocessing = (
126-
categorical_preprocessing.lower() if categorical_preprocessing is not None else "none"
129+
categorical_preprocessing.lower()
130+
if categorical_preprocessing is not None
131+
else "none"
127132
)
128133
if self.numerical_preprocessing not in [
129134
"ple",
@@ -247,13 +252,19 @@ def _detect_column_types(self, X):
247252
numerical_features.append(col)
248253
else:
249254
if isinstance(self.cat_cutoff, float):
250-
cutoff_condition = (num_unique_values / total_samples) < self.cat_cutoff
255+
cutoff_condition = (
256+
num_unique_values / total_samples
257+
) < self.cat_cutoff
251258
elif isinstance(self.cat_cutoff, int):
252259
cutoff_condition = num_unique_values < self.cat_cutoff
253260
else:
254-
raise ValueError("cat_cutoff should be either a float or an integer.")
261+
raise ValueError(
262+
"cat_cutoff should be either a float or an integer."
263+
)
255264

256-
if X[col].dtype.kind not in "iufc" or (X[col].dtype.kind == "i" and cutoff_condition):
265+
if X[col].dtype.kind not in "iufc" or (
266+
X[col].dtype.kind == "i" and cutoff_condition
267+
):
257268
categorical_features.append(col)
258269
else:
259270
numerical_features.append(col)
@@ -266,7 +277,9 @@ def _fit_embeddings(self, embeddings):
266277
self.embedding_dimensions = {}
267278
if isinstance(embeddings, np.ndarray):
268279
self.embedding_dimensions["embeddings_1"] = embeddings.shape[1]
269-
elif isinstance(embeddings, list) and all(isinstance(e, np.ndarray) for e in embeddings):
280+
elif isinstance(embeddings, list) and all(
281+
isinstance(e, np.ndarray) for e in embeddings
282+
):
270283
for idx, e in enumerate(embeddings):
271284
self.embedding_dimensions[f"embedding_{idx + 1}"] = e.shape[1]
272285
else:
@@ -298,7 +311,9 @@ def fit(self, X, y=None, embeddings=None):
298311

299312
if numerical_features:
300313
for feature in numerical_features:
301-
feature_preprocessing = self.feature_preprocessing.get(feature, self.numerical_preprocessing)
314+
feature_preprocessing = self.feature_preprocessing.get(
315+
feature, self.numerical_preprocessing
316+
)
302317

303318
# extended the annotation list if new transformer is added, either from sklearn or custom
304319
numeric_transformer_steps: list[
@@ -328,13 +343,18 @@ def fit(self, X, y=None, embeddings=None):
328343
if self.use_decision_tree_bins
329344
else self.n_bins
330345
)
346+
331347
if isinstance(bins, int):
332348
numeric_transformer_steps.extend(
333349
[
334350
(
335351
"discretizer",
336352
KBinsDiscretizer(
337-
n_bins=(bins if isinstance(bins, int) else len(bins) - 1),
353+
n_bins=(
354+
bins
355+
if isinstance(bins, int)
356+
else len(bins) - 1
357+
),
338358
encode="ordinal",
339359
strategy=self.binning_strategy, # type: ignore
340360
subsample=200_000 if len(X) > 200_000 else None,
@@ -343,13 +363,8 @@ def fit(self, X, y=None, embeddings=None):
343363
]
344364
)
345365
else:
346-
numeric_transformer_steps.append(
347-
[
348-
(
349-
"discretizer",
350-
CustomBinner(bins=bins), # type: ignore
351-
),
352-
]
366+
numeric_transformer_steps.extend(
367+
[("CustomBinner", CustomBinner(bins=bins[0]))]
353368
)
354369

355370
if feature_preprocessing == "one-hot":
@@ -363,21 +378,27 @@ def fit(self, X, y=None, embeddings=None):
363378
numeric_transformer_steps.append(("scaler", StandardScaler()))
364379

365380
elif feature_preprocessing == "minmax":
366-
numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1))))
381+
numeric_transformer_steps.append(
382+
("minmax", MinMaxScaler(feature_range=(-1, 1)))
383+
)
367384

368385
elif feature_preprocessing == "quantile":
369386
numeric_transformer_steps.append(
370387
(
371388
"quantile",
372-
QuantileTransformer(n_quantiles=self.n_bins, random_state=101),
389+
QuantileTransformer(
390+
n_quantiles=self.n_bins, random_state=101
391+
),
373392
)
374393
)
375394

376395
elif feature_preprocessing == "polynomial":
377396
if self.scaling_strategy == "standardization":
378397
numeric_transformer_steps.append(("scaler", StandardScaler()))
379398
elif self.scaling_strategy == "minmax":
380-
numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1))))
399+
numeric_transformer_steps.append(
400+
("minmax", MinMaxScaler(feature_range=(-1, 1)))
401+
)
381402
numeric_transformer_steps.append(
382403
(
383404
"polynomial",
@@ -392,7 +413,9 @@ def fit(self, X, y=None, embeddings=None):
392413
if self.scaling_strategy == "standardization":
393414
numeric_transformer_steps.append(("scaler", StandardScaler()))
394415
elif self.scaling_strategy == "minmax":
395-
numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1))))
416+
numeric_transformer_steps.append(
417+
("minmax", MinMaxScaler(feature_range=(-1, 1)))
418+
)
396419
numeric_transformer_steps.append(
397420
(
398421
"splines",
@@ -411,7 +434,9 @@ def fit(self, X, y=None, embeddings=None):
411434
if self.scaling_strategy == "standardization":
412435
numeric_transformer_steps.append(("scaler", StandardScaler()))
413436
elif self.scaling_strategy == "minmax":
414-
numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1))))
437+
numeric_transformer_steps.append(
438+
("minmax", MinMaxScaler(feature_range=(-1, 1)))
439+
)
415440
numeric_transformer_steps.append(
416441
(
417442
"rbf",
@@ -428,7 +453,9 @@ def fit(self, X, y=None, embeddings=None):
428453
if self.scaling_strategy == "standardization":
429454
numeric_transformer_steps.append(("scaler", StandardScaler()))
430455
elif self.scaling_strategy == "minmax":
431-
numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1))))
456+
numeric_transformer_steps.append(
457+
("minmax", MinMaxScaler(feature_range=(-1, 1)))
458+
)
432459
numeric_transformer_steps.append(
433460
(
434461
"sigmoid",
@@ -442,8 +469,12 @@ def fit(self, X, y=None, embeddings=None):
442469
)
443470

444471
elif feature_preprocessing == "ple":
445-
numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1))))
446-
numeric_transformer_steps.append(("ple", PLE(n_bins=self.n_bins, task=self.task)))
472+
numeric_transformer_steps.append(
473+
("minmax", MinMaxScaler(feature_range=(-1, 1)))
474+
)
475+
numeric_transformer_steps.append(
476+
("ple", PLE(n_bins=self.n_bins, task=self.task))
477+
)
447478

448479
elif feature_preprocessing == "box-cox":
449480
numeric_transformer_steps.append(
@@ -481,7 +512,9 @@ def fit(self, X, y=None, embeddings=None):
481512

482513
if categorical_features:
483514
for feature in categorical_features:
484-
feature_preprocessing = self.feature_preprocessing.get(feature, self.categorical_preprocessing)
515+
feature_preprocessing = self.feature_preprocessing.get(
516+
feature, self.categorical_preprocessing
517+
)
485518
if feature_preprocessing == "int":
486519
# Use ContinuousOrdinalEncoder for "int"
487520
categorical_transformer = Pipeline(
@@ -516,12 +549,18 @@ def fit(self, X, y=None, embeddings=None):
516549
]
517550
)
518551
else:
519-
raise ValueError(f"Unknown categorical_preprocessing type: {feature_preprocessing}")
552+
raise ValueError(
553+
f"Unknown categorical_preprocessing type: {feature_preprocessing}"
554+
)
520555

521556
# Append the transformer for the current categorical feature
522-
transformers.append((f"cat_{feature}", categorical_transformer, [feature]))
557+
transformers.append(
558+
(f"cat_{feature}", categorical_transformer, [feature])
559+
)
523560

524-
self.column_transformer = ColumnTransformer(transformers=transformers, remainder="passthrough")
561+
self.column_transformer = ColumnTransformer(
562+
transformers=transformers, remainder="passthrough"
563+
)
525564
self.column_transformer.fit(X, y)
526565

527566
self.fitted = True
@@ -547,13 +586,17 @@ def _get_decision_tree_bins(self, X, y, numerical_features):
547586
bins = []
548587
for feature in numerical_features:
549588
tree_model = (
550-
DecisionTreeClassifier(max_depth=3) if y.dtype.kind in "bi" else DecisionTreeRegressor(max_depth=3)
589+
DecisionTreeClassifier(max_depth=5)
590+
if y.dtype.kind in "bi"
591+
else DecisionTreeRegressor(max_depth=5)
551592
)
552593
tree_model.fit(X[[feature]], y)
553594
thresholds = tree_model.tree_.threshold[tree_model.tree_.feature != -2] # type: ignore
554595
bin_edges = np.sort(np.unique(thresholds))
555596

556-
bins.append(np.concatenate(([X[feature].min()], bin_edges, [X[feature].max()])))
597+
bins.append(
598+
np.concatenate(([X[feature].min()], bin_edges, [X[feature].max()]))
599+
)
557600
return bins
558601

559602
def transform(self, X, embeddings=None):
@@ -597,7 +640,9 @@ def transform(self, X, embeddings=None):
597640
f"but got {embeddings.shape[1]}"
598641
)
599642
transformed_dict["embedding_1"] = embeddings.astype(np.float32)
600-
elif isinstance(embeddings, list) and all(isinstance(e, np.ndarray) for e in embeddings):
643+
elif isinstance(embeddings, list) and all(
644+
isinstance(e, np.ndarray) for e in embeddings
645+
):
601646
for idx, e in enumerate(embeddings):
602647
key = f"embedding_{idx + 1}"
603648
if self.embedding_dimensions[key] != e.shape[1]:
@@ -607,7 +652,9 @@ def transform(self, X, embeddings=None):
607652
transformed_dict[key] = e.astype(np.float32)
608653
else:
609654
if self.embeddings is not False:
610-
raise ValueError("self.embeddings should be False when embeddings are None.")
655+
raise ValueError(
656+
"self.embeddings should be False when embeddings are None."
657+
)
611658
self.embeddings = False
612659

613660
return transformed_dict
@@ -740,7 +787,9 @@ def get_feature_info(self, verbose=True):
740787
"categories": None,
741788
}
742789
if verbose:
743-
print(f"Numerical Feature: {feature_name}, Info: {numerical_feature_info[feature_name]}")
790+
print(
791+
f"Numerical Feature: {feature_name}, Info: {numerical_feature_info[feature_name]}"
792+
)
744793

745794
elif "continuous_ordinal" in steps:
746795
step = transformer_pipeline.named_steps["continuous_ordinal"]
@@ -790,7 +839,9 @@ def get_feature_info(self, verbose=True):
790839
"categories": None,
791840
}
792841
if verbose:
793-
print(f"Feature: {feature_name}, Info: {preprocessing_type}, Dimension: {dimension}")
842+
print(
843+
f"Feature: {feature_name}, Info: {preprocessing_type}, Dimension: {dimension}"
844+
)
794845

795846
if verbose:
796847
print("-" * 50)

0 commit comments

Comments (0)