Commit c91175b

created preprocessing folder
1 parent ac6501f commit c91175b

4 files changed

Lines changed: 833 additions & 0 deletions

File tree

mambular/preprocessing/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
from .preprocessor import Preprocessor

__all__ = ["Preprocessor"]
Lines changed: 156 additions & 0 deletions
@@ -0,0 +1,156 @@
import re

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, _tree


def tree_to_code(tree, feature_names):
    """
    Convert a scikit-learn decision tree into a list of conditions.

    Args:
        tree (sklearn.tree.DecisionTreeRegressor or sklearn.tree.DecisionTreeClassifier):
            The decision tree model to be converted.
        feature_names (list of str): The names of the features used in the tree.

    Returns:
        list of str: A list of conditions representing the decision tree paths.

    Example:
        # Convert a decision tree into a list of conditions
        tree_conditions = tree_to_code(tree_model, feature_names)
    """
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    pathto = dict()
    my_list = []

    def recurse(node, depth, parent):
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]

            # Left branch: feature <= threshold
            s = "{} <= {}".format(name, threshold)
            if node == 0:
                pathto[node] = "(" + s + ")"
            else:
                pathto[node] = "(" + pathto[parent] + ")" + " & " + "(" + s + ")"
            recurse(tree_.children_left[node], depth + 1, node)

            # Right branch: feature > threshold
            s = "{} > {}".format(name, threshold)
            if node == 0:
                pathto[node] = s
            else:
                pathto[node] = "(" + pathto[parent] + ")" + " & " + "(" + s + ")"
            recurse(tree_.children_right[node], depth + 1, node)
        else:
            # Leaf node: record the full condition path leading to it
            my_list.append(pathto[parent])

    recurse(0, 1, 0)

    return my_list


class PLE(BaseEstimator, TransformerMixin):
    def __init__(
        self, n_bins=20, tree_params={}, task="regression", conditions=None, **kwargs
    ):
        super(PLE, self).__init__(**kwargs)

        self.task = task
        self.tree_params = tree_params
        self.n_bins = n_bins
        self.conditions = conditions
        # Matches integers and floats, including scientific notation
        self.pattern = r"-?\d+\.?\d*[eE]?[+-]?\d*"

    def fit(self, feature, target):
        if self.task == "regression":
            dt = DecisionTreeRegressor(max_leaf_nodes=self.n_bins)
        elif self.task == "classification":
            dt = DecisionTreeClassifier(max_leaf_nodes=self.n_bins)
        else:
            raise ValueError("This task is not supported")

        dt.fit(feature, target)

        # The condition strings refer to the variable name "feature"
        self.conditions = tree_to_code(dt, ["feature"])
        return self

    def transform(self, feature):
        if feature.shape == (feature.shape[0], 1):
            feature = np.squeeze(feature, axis=1)

        # Evaluate each leaf condition as a boolean mask, scale it by its
        # (1-based) bin index, and collapse to a single bin index per sample.
        result_list = []
        for idx, cond in enumerate(self.conditions):
            result_list.append(eval(cond) * (idx + 1))

        encoded_feature = np.expand_dims(np.sum(np.stack(result_list).T, axis=1), 1)
        encoded_feature = np.array(encoded_feature - 1, dtype=np.int64)

        # Extract the numeric thresholds from the condition strings
        locations = []
        for string in self.conditions:
            matches = re.findall(self.pattern, string)
            locations.extend(matches)

        locations = [float(number) for number in locations]
        locations = list(set(locations))
        locations = np.sort(locations)

        ple_encoded_feature = np.zeros((len(feature), locations.shape[0] + 1))
        if locations[-1] > np.max(feature):
            locations[-1] = np.max(feature)

        for idx in range(len(encoded_feature)):
            if feature[idx] >= locations[-1]:
                ple_encoded_feature[idx][encoded_feature[idx]] = feature[idx]
                ple_encoded_feature[idx, : encoded_feature[idx][0]] = 1
            elif feature[idx] <= locations[0]:
                ple_encoded_feature[idx][encoded_feature[idx]] = feature[idx]
            else:
                # Linear interpolation within the bin's threshold interval
                ple_encoded_feature[idx][encoded_feature[idx]] = (
                    feature[idx] - locations[(encoded_feature[idx] - 1)[0]]
                ) / (
                    locations[(encoded_feature[idx])[0]]
                    - locations[(encoded_feature[idx] - 1)[0]]
                )
                ple_encoded_feature[idx, : encoded_feature[idx][0]] = 1

        if ple_encoded_feature.shape[1] == 1:
            return np.zeros([len(feature), self.n_bins])
        else:
            return np.array(ple_encoded_feature, dtype=np.float32)

    def get_feature_names_out(self, input_features=None):
        if input_features is None:
            raise ValueError("input_features must be specified")
        return input_features
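
For orientation, the PLE transformer above can be exercised on a single numeric column roughly as follows. This is a minimal sketch, not part of the commit: the random data, target, and variable names are invented, and `PLE` is assumed to be importable from the module added here. Note that `fit` passes the literal name "feature" to `tree_to_code`, which is why `transform` can evaluate the generated condition strings against its `feature` argument.

    import numpy as np

    # Hypothetical data: one numeric feature and a noisy linear target
    x = np.random.rand(200, 1)
    y = 3.0 * x[:, 0] + 0.1 * np.random.randn(200)

    ple = PLE(n_bins=5, task="regression")
    ple.fit(x, y)

    encoded = ple.transform(x)
    # One column per learned bin; roughly (200, 5) when the tree finds 5 leaves
    print(encoded.shape)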
Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator


class CustomBinner(TransformerMixin):
    def __init__(self, bins):
        # bins can be a scalar (number of bins) or array-like (bin edges)
        self.bins = bins

    def fit(self, X, y=None):
        # Fit doesn't need to do anything as we are directly using the provided bins
        return self

    def transform(self, X):
        if isinstance(self.bins, int):
            # Calculate equal-width bins based on the range of the data and the number of bins
            _, bins = pd.cut(X.squeeze(), bins=self.bins, retbins=True)
        else:
            # Use predefined bins
            bins = self.bins

        # Apply the bins to the data
        binned_data = pd.cut(
            X.squeeze(),
            bins=np.sort(np.unique(bins)),
            labels=False,
            include_lowest=True,
        )
        return np.expand_dims(np.array(binned_data), 1)


class ContinuousOrdinalEncoder(BaseEstimator, TransformerMixin):
    """
    This encoder converts categorical features into continuous integer values. Each unique category within a feature
    is assigned a unique integer based on its sorted order within that feature. This transformation is useful for
    models that can only handle continuous data.

    Attributes:
        mapping_ (list of dicts): A list where each element is a dictionary mapping original categories to integers
            for a single feature.

    Methods:
        fit(X, y=None): Learns the mapping from original categories to integers.
        transform(X): Applies the learned mapping to the data.
        get_feature_names_out(input_features=None): Returns the input features after transformation.
    """

    def fit(self, X, y=None):
        """
        Learns the mapping from original categories to integers for each feature.

        Parameters:
            X (array-like of shape (n_samples, n_features)): The input data to fit.
            y (ignored): Not used, present for API consistency by convention.

        Returns:
            self: Returns the instance itself.
        """
        # Map each feature's categories to sequential integers starting from 1,
        # reserving 0 for unknown values
        self.mapping_ = [
            {category: i + 1 for i, category in enumerate(np.unique(col))}
            for col in X.T
        ]
        for mapping in self.mapping_:
            mapping[None] = 0  # Assign 0 to unknown values
        return self

    def transform(self, X):
        """
        Transforms the categories in X to their corresponding integer values based on the learned mapping.

        Parameters:
            X (array-like of shape (n_samples, n_features)): The input data to transform.

        Returns:
            X_transformed (ndarray of shape (n_samples, n_features)): The transformed data with integer values.
        """
        # Transform the categories to their mapped integer values; unseen categories map to 0
        X_transformed = np.array(
            [
                [self.mapping_[col].get(value, 0) for col, value in enumerate(row)]
                for row in X
            ]
        )
        return X_transformed

    def get_feature_names_out(self, input_features=None):
        """
        Returns the names of the transformed features.

        Parameters:
            input_features (list of str): The names of the input features.

        Returns:
            input_features (array of shape (n_features,)): The names of the output features after transformation.
        """
        if input_features is None:
            raise ValueError("input_features must be specified")
        return input_features


class OneHotFromOrdinal(TransformerMixin, BaseEstimator):
    """
    A transformer that takes ordinal-encoded features and converts them into one-hot encoded format. This is useful
    in scenarios where features have been pre-encoded with ordinal encoding and a one-hot representation is required
    for model training.

    Attributes:
        max_bins_ (ndarray of shape (n_features,)): The number of one-hot columns (maximum bin index + 1) for each
            feature, determining the size of the one-hot encoded array for that feature.

    Methods:
        fit(X, y=None): Learns the maximum bin index for each feature.
        transform(X): Converts ordinal-encoded features into one-hot format.
        get_feature_names_out(input_features=None): Returns the feature names after one-hot encoding.
    """

    def fit(self, X, y=None):
        """
        Learns the maximum bin index for each feature from the data.

        Parameters:
            X (array-like of shape (n_samples, n_features)): The input data to fit, containing ordinal-encoded features.
            y (ignored): Not used, present for API consistency by convention.

        Returns:
            self: Returns the instance itself.
        """
        self.max_bins_ = (
            np.max(X, axis=0).astype(int) + 1
        )  # Number of one-hot columns needed for each feature
        return self

    def transform(self, X):
        """
        Transforms ordinal-encoded features into one-hot encoded format based on the `max_bins_` learned during fitting.

        Parameters:
            X (array-like of shape (n_samples, n_features)): The input data to transform, containing ordinal-encoded features.

        Returns:
            X_one_hot (ndarray of shape (n_samples, n_output_features)): The one-hot encoded features.
        """
        # Initialize an empty list to hold the one-hot encoded arrays
        one_hot_encoded = []
        for i, max_bins in enumerate(self.max_bins_):
            # Convert each feature to one-hot using its max_bins
            feature_one_hot = np.eye(max_bins)[X[:, i].astype(int)]
            one_hot_encoded.append(feature_one_hot)
        # Concatenate the one-hot encoded features horizontally
        return np.hstack(one_hot_encoded)

    def get_feature_names_out(self, input_features=None):
        """
        Generates feature names for the one-hot encoded features based on the input feature names and the number of bins.

        Parameters:
            input_features (list of str): The names of the input features that were ordinal-encoded.

        Returns:
            feature_names (array of shape (n_output_features,)): The names of the one-hot encoded features.
        """
        feature_names = []
        for i, max_bins in enumerate(self.max_bins_):
            feature_names.extend(
                [f"{input_features[i]}_bin_{j}" for j in range(int(max_bins))]
            )
        return np.array(feature_names)
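
As a quick illustration of how these helpers compose, a numeric column can be pushed through `CustomBinner` and then `OneHotFromOrdinal`. This is a hypothetical sketch, not part of the commit; the data and the feature name are invented, and the final shape assumes every bin ends up occupied by at least one sample.

    import numpy as np

    # Hypothetical numeric column with 10 samples
    X_num = np.random.rand(10, 1)

    binner = CustomBinner(bins=4).fit(X_num)
    X_binned = binner.transform(X_num)   # integer bin index per row, shape (10, 1)

    one_hot = OneHotFromOrdinal().fit(X_binned)
    X_oh = one_hot.transform(X_binned)   # shape (10, 4) if all four bins are occupied
    print(one_hot.get_feature_names_out(["num_feature"]))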
