|
| 1 | +import pandas as pd |
| 2 | +import numpy as np |
| 3 | +from sklearn.base import TransformerMixin, BaseEstimator |
| 4 | + |
| 5 | + |
class CustomBinner(TransformerMixin):
    """Discretize a (single-column) input into integer bin indices.

    Parameters:
        bins (int or array-like): If an int, the number of equal-width bins
            to derive from the data's range at transform time. Otherwise,
            explicit bin edges used as-is.
    """

    def __init__(self, bins):
        # bins can be a scalar (number of bins) or array-like (bin edges)
        self.bins = bins

    def fit(self, X, y=None):
        """No-op: binning relies entirely on the user-supplied `bins`.

        Returns:
            self
        """
        return self

    def transform(self, X):
        """Bin the data and return integer bin labels.

        Parameters:
            X (array-like): Input data; squeezed to 1-D before binning,
                so it is expected to hold a single feature.

        Returns:
            ndarray of shape (n_samples, 1): Bin index per sample. Values
                falling outside the edges become NaN (pandas behaviour).
        """
        if isinstance(self.bins, int):
            # Derive equal-width edges from the observed range of the data.
            _, edges = pd.cut(X.squeeze(), bins=self.bins, retbins=True)
        else:
            # Use the predefined edges.
            edges = self.bins

        # np.unique both deduplicates and sorts, so no extra sort is needed.
        binned_data = pd.cut(
            X.squeeze(),
            bins=np.unique(edges),
            labels=False,
            include_lowest=True,
        )
        return np.expand_dims(np.asarray(binned_data), 1)
| 32 | + |
| 33 | + |
class ContinuousOrdinalEncoder(BaseEstimator, TransformerMixin):
    """
    Encode categorical features as integers.

    Each unique category within a feature is assigned an integer according to
    its sorted order, starting from 1; the value 0 is reserved for categories
    not seen during fitting. This transformation is useful for models that can
    only handle continuous data.

    Attributes:
        mapping_ (list of dicts): A list where each element is a dictionary
            mapping original categories to integers for a single feature.

    Methods:
        fit(X, y=None): Learns the mapping from original categories to integers.
        transform(X): Applies the learned mapping to the data.
        get_feature_names_out(input_features=None): Returns the input features after transformation.
    """

    def fit(self, X, y=None):
        """
        Learns the mapping from original categories to integers for each feature.

        Parameters:
            X (array-like of shape (n_samples, n_features)): The input data to fit.
            y (ignored): Not used, present for API consistency by convention.

        Returns:
            self: Returns the instance itself.
        """
        # Codes start at 1 (sorted category order); 0 is reserved for
        # categories unseen at fit time.
        self.mapping_ = [
            {category: code for code, category in enumerate(np.unique(column), start=1)}
            for column in X.T
        ]
        for mapping in self.mapping_:
            mapping[None] = 0  # explicit entry for missing/unknown values
        return self

    def transform(self, X):
        """
        Transforms the categories in X to their corresponding integer values based on the learned mapping.

        Parameters:
            X (array-like of shape (n_samples, n_features)): The input data to transform.

        Returns:
            X_transformed (ndarray of shape (n_samples, n_features)): The transformed data
                with integer values; unseen categories map to 0.
        """
        # .get(value, 0) routes any category absent from the fitted mapping
        # to the reserved "unknown" code 0.
        X_transformed = np.array(
            [
                [self.mapping_[col].get(value, 0) for col, value in enumerate(row)]
                for row in X
            ]
        )
        return X_transformed

    def get_feature_names_out(self, input_features=None):
        """
        Returns the names of the transformed features.

        Parameters:
            input_features (list of str): The names of the input features.

        Returns:
            input_features (array of shape (n_features,)): The names of the output
                features after transformation (unchanged — this encoding is 1:1).

        Raises:
            ValueError: If input_features is not provided.
        """
        if input_features is None:
            raise ValueError("input_features must be specified")
        return input_features
| 102 | + |
| 103 | + |
class OneHotFromOrdinal(TransformerMixin, BaseEstimator):
    """
    A transformer that takes ordinal-encoded features and converts them into one-hot encoded format. This is useful
    in scenarios where features have been pre-encoded with ordinal encoding and a one-hot representation is required
    for model training.

    Attributes:
        max_bins_ (ndarray of shape (n_features,)): An array containing the maximum bin index for each feature,
            determining the size of the one-hot encoded array for that feature.

    Methods:
        fit(X, y=None): Learns the maximum bin index for each feature.
        transform(X): Converts ordinal-encoded features into one-hot format.
        get_feature_names_out(input_features=None): Returns the feature names after one-hot encoding.
    """

    def fit(self, X, y=None):
        """
        Learns the maximum bin index for each feature from the data.

        Parameters:
            X (array-like of shape (n_samples, n_features)): The input data to fit, containing ordinal-encoded features.
            y (ignored): Not used, present for API consistency by convention.

        Returns:
            self: Returns the instance itself.
        """
        # +1 because ordinal codes start at 0: a column whose max code is k
        # needs k + 1 one-hot slots.
        self.max_bins_ = np.max(X, axis=0).astype(int) + 1
        return self

    def transform(self, X):
        """
        Transforms ordinal-encoded features into one-hot encoded format based on the `max_bins_` learned during fitting.

        Parameters:
            X (array-like of shape (n_samples, n_features)): The input data to transform, containing ordinal-encoded features.

        Returns:
            X_one_hot (ndarray of shape (n_samples, n_output_features)): The one-hot encoded features.
        """
        one_hot_encoded = []
        for i, max_bins in enumerate(self.max_bins_):
            # Row j of np.eye(max_bins) is the one-hot vector for code j, so
            # fancy-indexing with the codes one-hot encodes the whole column.
            feature_one_hot = np.eye(max_bins)[X[:, i].astype(int)]
            one_hot_encoded.append(feature_one_hot)
        # Concatenate the per-feature one-hot blocks horizontally.
        return np.hstack(one_hot_encoded)

    def get_feature_names_out(self, input_features=None):
        """
        Generates feature names for the one-hot encoded features based on the input feature names and the number of bins.

        Parameters:
            input_features (list of str): The names of the input features that were ordinal-encoded.

        Returns:
            feature_names (array of shape (n_output_features,)): The names of the one-hot encoded features.

        Raises:
            ValueError: If input_features is not provided.
        """
        # Explicit error for consistency with ContinuousOrdinalEncoder,
        # instead of an opaque TypeError from indexing None below.
        if input_features is None:
            raise ValueError("input_features must be specified")
        feature_names = []
        for i, max_bins in enumerate(self.max_bins_):
            feature_names.extend(
                [f"{input_features[i]}_bin_{j}" for j in range(int(max_bins))]
            )
        return np.array(feature_names)
0 commit comments