2727 OneHotFromOrdinal ,
2828 ToFloatTransformer ,
2929)
30+ from sklearn .base import TransformerMixin
3031
3132
32- class Preprocessor :
33+ class Preprocessor ( TransformerMixin ) :
3334 """A comprehensive preprocessor for structured data, capable of handling both numerical and categorical features.
3435 It supports various preprocessing strategies for numerical data, including binning, one-hot encoding,
3536 standardization, and minmax. Categorical features can be transformed using continuous ordinal encoding.
@@ -120,10 +121,14 @@ def __init__(
120121 ):
121122 self .n_bins = n_bins
122123 self .numerical_preprocessing = (
123- numerical_preprocessing .lower () if numerical_preprocessing is not None else "none"
124+ numerical_preprocessing .lower ()
125+ if numerical_preprocessing is not None
126+ else "none"
124127 )
125128 self .categorical_preprocessing = (
126- categorical_preprocessing .lower () if categorical_preprocessing is not None else "none"
129+ categorical_preprocessing .lower ()
130+ if categorical_preprocessing is not None
131+ else "none"
127132 )
128133 if self .numerical_preprocessing not in [
129134 "ple" ,
@@ -247,13 +252,19 @@ def _detect_column_types(self, X):
247252 numerical_features .append (col )
248253 else :
249254 if isinstance (self .cat_cutoff , float ):
250- cutoff_condition = (num_unique_values / total_samples ) < self .cat_cutoff
255+ cutoff_condition = (
256+ num_unique_values / total_samples
257+ ) < self .cat_cutoff
251258 elif isinstance (self .cat_cutoff , int ):
252259 cutoff_condition = num_unique_values < self .cat_cutoff
253260 else :
254- raise ValueError ("cat_cutoff should be either a float or an integer." )
261+ raise ValueError (
262+ "cat_cutoff should be either a float or an integer."
263+ )
255264
256- if X [col ].dtype .kind not in "iufc" or (X [col ].dtype .kind == "i" and cutoff_condition ):
265+ if X [col ].dtype .kind not in "iufc" or (
266+ X [col ].dtype .kind == "i" and cutoff_condition
267+ ):
257268 categorical_features .append (col )
258269 else :
259270 numerical_features .append (col )
@@ -266,7 +277,9 @@ def _fit_embeddings(self, embeddings):
266277 self .embedding_dimensions = {}
267278 if isinstance (embeddings , np .ndarray ):
268279 self .embedding_dimensions ["embeddings_1" ] = embeddings .shape [1 ]
269- elif isinstance (embeddings , list ) and all (isinstance (e , np .ndarray ) for e in embeddings ):
280+ elif isinstance (embeddings , list ) and all (
281+ isinstance (e , np .ndarray ) for e in embeddings
282+ ):
270283 for idx , e in enumerate (embeddings ):
271284 self .embedding_dimensions [f"embedding_{ idx + 1 } " ] = e .shape [1 ]
272285 else :
@@ -298,7 +311,9 @@ def fit(self, X, y=None, embeddings=None):
298311
299312 if numerical_features :
300313 for feature in numerical_features :
301- feature_preprocessing = self .feature_preprocessing .get (feature , self .numerical_preprocessing )
314+ feature_preprocessing = self .feature_preprocessing .get (
315+ feature , self .numerical_preprocessing
316+ )
302317
303318 # extend the annotation list if a new transformer is added, either from sklearn or custom
304319 numeric_transformer_steps : list [
@@ -328,13 +343,18 @@ def fit(self, X, y=None, embeddings=None):
328343 if self .use_decision_tree_bins
329344 else self .n_bins
330345 )
346+
331347 if isinstance (bins , int ):
332348 numeric_transformer_steps .extend (
333349 [
334350 (
335351 "discretizer" ,
336352 KBinsDiscretizer (
337- n_bins = (bins if isinstance (bins , int ) else len (bins ) - 1 ),
353+ n_bins = (
354+ bins
355+ if isinstance (bins , int )
356+ else len (bins ) - 1
357+ ),
338358 encode = "ordinal" ,
339359 strategy = self .binning_strategy , # type: ignore
340360 subsample = 200_000 if len (X ) > 200_000 else None ,
@@ -343,13 +363,8 @@ def fit(self, X, y=None, embeddings=None):
343363 ]
344364 )
345365 else :
346- numeric_transformer_steps .append (
347- [
348- (
349- "discretizer" ,
350- CustomBinner (bins = bins ), # type: ignore
351- ),
352- ]
366+ numeric_transformer_steps .extend (
367+ [("CustomBinner" , CustomBinner (bins = bins [0 ]))]
353368 )
354369
355370 if feature_preprocessing == "one-hot" :
@@ -363,21 +378,27 @@ def fit(self, X, y=None, embeddings=None):
363378 numeric_transformer_steps .append (("scaler" , StandardScaler ()))
364379
365380 elif feature_preprocessing == "minmax" :
366- numeric_transformer_steps .append (("minmax" , MinMaxScaler (feature_range = (- 1 , 1 ))))
381+ numeric_transformer_steps .append (
382+ ("minmax" , MinMaxScaler (feature_range = (- 1 , 1 )))
383+ )
367384
368385 elif feature_preprocessing == "quantile" :
369386 numeric_transformer_steps .append (
370387 (
371388 "quantile" ,
372- QuantileTransformer (n_quantiles = self .n_bins , random_state = 101 ),
389+ QuantileTransformer (
390+ n_quantiles = self .n_bins , random_state = 101
391+ ),
373392 )
374393 )
375394
376395 elif feature_preprocessing == "polynomial" :
377396 if self .scaling_strategy == "standardization" :
378397 numeric_transformer_steps .append (("scaler" , StandardScaler ()))
379398 elif self .scaling_strategy == "minmax" :
380- numeric_transformer_steps .append (("minmax" , MinMaxScaler (feature_range = (- 1 , 1 ))))
399+ numeric_transformer_steps .append (
400+ ("minmax" , MinMaxScaler (feature_range = (- 1 , 1 )))
401+ )
381402 numeric_transformer_steps .append (
382403 (
383404 "polynomial" ,
@@ -392,7 +413,9 @@ def fit(self, X, y=None, embeddings=None):
392413 if self .scaling_strategy == "standardization" :
393414 numeric_transformer_steps .append (("scaler" , StandardScaler ()))
394415 elif self .scaling_strategy == "minmax" :
395- numeric_transformer_steps .append (("minmax" , MinMaxScaler (feature_range = (- 1 , 1 ))))
416+ numeric_transformer_steps .append (
417+ ("minmax" , MinMaxScaler (feature_range = (- 1 , 1 )))
418+ )
396419 numeric_transformer_steps .append (
397420 (
398421 "splines" ,
@@ -411,7 +434,9 @@ def fit(self, X, y=None, embeddings=None):
411434 if self .scaling_strategy == "standardization" :
412435 numeric_transformer_steps .append (("scaler" , StandardScaler ()))
413436 elif self .scaling_strategy == "minmax" :
414- numeric_transformer_steps .append (("minmax" , MinMaxScaler (feature_range = (- 1 , 1 ))))
437+ numeric_transformer_steps .append (
438+ ("minmax" , MinMaxScaler (feature_range = (- 1 , 1 )))
439+ )
415440 numeric_transformer_steps .append (
416441 (
417442 "rbf" ,
@@ -428,7 +453,9 @@ def fit(self, X, y=None, embeddings=None):
428453 if self .scaling_strategy == "standardization" :
429454 numeric_transformer_steps .append (("scaler" , StandardScaler ()))
430455 elif self .scaling_strategy == "minmax" :
431- numeric_transformer_steps .append (("minmax" , MinMaxScaler (feature_range = (- 1 , 1 ))))
456+ numeric_transformer_steps .append (
457+ ("minmax" , MinMaxScaler (feature_range = (- 1 , 1 )))
458+ )
432459 numeric_transformer_steps .append (
433460 (
434461 "sigmoid" ,
@@ -442,8 +469,12 @@ def fit(self, X, y=None, embeddings=None):
442469 )
443470
444471 elif feature_preprocessing == "ple" :
445- numeric_transformer_steps .append (("minmax" , MinMaxScaler (feature_range = (- 1 , 1 ))))
446- numeric_transformer_steps .append (("ple" , PLE (n_bins = self .n_bins , task = self .task )))
472+ numeric_transformer_steps .append (
473+ ("minmax" , MinMaxScaler (feature_range = (- 1 , 1 )))
474+ )
475+ numeric_transformer_steps .append (
476+ ("ple" , PLE (n_bins = self .n_bins , task = self .task ))
477+ )
447478
448479 elif feature_preprocessing == "box-cox" :
449480 numeric_transformer_steps .append (
@@ -481,7 +512,9 @@ def fit(self, X, y=None, embeddings=None):
481512
482513 if categorical_features :
483514 for feature in categorical_features :
484- feature_preprocessing = self .feature_preprocessing .get (feature , self .categorical_preprocessing )
515+ feature_preprocessing = self .feature_preprocessing .get (
516+ feature , self .categorical_preprocessing
517+ )
485518 if feature_preprocessing == "int" :
486519 # Use ContinuousOrdinalEncoder for "int"
487520 categorical_transformer = Pipeline (
@@ -516,12 +549,18 @@ def fit(self, X, y=None, embeddings=None):
516549 ]
517550 )
518551 else :
519- raise ValueError (f"Unknown categorical_preprocessing type: { feature_preprocessing } " )
552+ raise ValueError (
553+ f"Unknown categorical_preprocessing type: { feature_preprocessing } "
554+ )
520555
521556 # Append the transformer for the current categorical feature
522- transformers .append ((f"cat_{ feature } " , categorical_transformer , [feature ]))
557+ transformers .append (
558+ (f"cat_{ feature } " , categorical_transformer , [feature ])
559+ )
523560
524- self .column_transformer = ColumnTransformer (transformers = transformers , remainder = "passthrough" )
561+ self .column_transformer = ColumnTransformer (
562+ transformers = transformers , remainder = "passthrough"
563+ )
525564 self .column_transformer .fit (X , y )
526565
527566 self .fitted = True
@@ -547,13 +586,17 @@ def _get_decision_tree_bins(self, X, y, numerical_features):
547586 bins = []
548587 for feature in numerical_features :
549588 tree_model = (
550- DecisionTreeClassifier (max_depth = 3 ) if y .dtype .kind in "bi" else DecisionTreeRegressor (max_depth = 3 )
589+ DecisionTreeClassifier (max_depth = 5 )
590+ if y .dtype .kind in "bi"
591+ else DecisionTreeRegressor (max_depth = 5 )
551592 )
552593 tree_model .fit (X [[feature ]], y )
553594 thresholds = tree_model .tree_ .threshold [tree_model .tree_ .feature != - 2 ] # type: ignore
554595 bin_edges = np .sort (np .unique (thresholds ))
555596
556- bins .append (np .concatenate (([X [feature ].min ()], bin_edges , [X [feature ].max ()])))
597+ bins .append (
598+ np .concatenate (([X [feature ].min ()], bin_edges , [X [feature ].max ()]))
599+ )
557600 return bins
558601
559602 def transform (self , X , embeddings = None ):
@@ -597,7 +640,9 @@ def transform(self, X, embeddings=None):
597640 f"but got { embeddings .shape [1 ]} "
598641 )
599642 transformed_dict ["embedding_1" ] = embeddings .astype (np .float32 )
600- elif isinstance (embeddings , list ) and all (isinstance (e , np .ndarray ) for e in embeddings ):
643+ elif isinstance (embeddings , list ) and all (
644+ isinstance (e , np .ndarray ) for e in embeddings
645+ ):
601646 for idx , e in enumerate (embeddings ):
602647 key = f"embedding_{ idx + 1 } "
603648 if self .embedding_dimensions [key ] != e .shape [1 ]:
@@ -607,7 +652,9 @@ def transform(self, X, embeddings=None):
607652 transformed_dict [key ] = e .astype (np .float32 )
608653 else :
609654 if self .embeddings is not False :
610- raise ValueError ("self.embeddings should be False when embeddings are None." )
655+ raise ValueError (
656+ "self.embeddings should be False when embeddings are None."
657+ )
611658 self .embeddings = False
612659
613660 return transformed_dict
@@ -740,7 +787,9 @@ def get_feature_info(self, verbose=True):
740787 "categories" : None ,
741788 }
742789 if verbose :
743- print (f"Numerical Feature: { feature_name } , Info: { numerical_feature_info [feature_name ]} " )
790+ print (
791+ f"Numerical Feature: { feature_name } , Info: { numerical_feature_info [feature_name ]} "
792+ )
744793
745794 elif "continuous_ordinal" in steps :
746795 step = transformer_pipeline .named_steps ["continuous_ordinal" ]
@@ -790,7 +839,9 @@ def get_feature_info(self, verbose=True):
790839 "categories" : None ,
791840 }
792841 if verbose :
793- print (f"Feature: { feature_name } , Info: { preprocessing_type } , Dimension: { dimension } " )
842+ print (
843+ f"Feature: { feature_name } , Info: { preprocessing_type } , Dimension: { dimension } "
844+ )
794845
795846 if verbose :
796847 print ("-" * 50 )
0 commit comments