Commit c91175b

created preprocessing folder
1 parent ac6501f commit c91175b

4 files changed

Lines changed: 833 additions & 0 deletions

File tree

mambular/preprocessing/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
from .preprocessor import Preprocessor

__all__ = ["Preprocessor"]
Lines changed: 156 additions & 0 deletions
@@ -0,0 +1,156 @@
import re

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, _tree


def tree_to_code(tree, feature_names):
    """
    Convert a scikit-learn decision tree into a list of conditions.

    Args:
        tree (sklearn.tree.DecisionTreeRegressor or sklearn.tree.DecisionTreeClassifier):
            The decision tree model to be converted.
        feature_names (list of str): The names of the features used in the tree.

    Returns:
        list of str: A list of conditions representing the decision tree paths.

    Example:
        # Convert a decision tree into a list of conditions
        tree_conditions = tree_to_code(tree_model, feature_names)
    """
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    pathto = dict()
    my_list = []

    def recurse(node, depth, parent):
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]

            # Left branch: feature <= threshold
            s = "{} <= {}".format(name, threshold)
            if node == 0:
                pathto[node] = "(" + s + ")"
            else:
                pathto[node] = "(" + pathto[parent] + ")" + " & " + "(" + s + ")"
            recurse(tree_.children_left[node], depth + 1, node)

            # Right branch: feature > threshold
            s = "{} > {}".format(name, threshold)
            if node == 0:
                pathto[node] = s
            else:
                pathto[node] = "(" + pathto[parent] + ")" + " & " + "(" + s + ")"
            recurse(tree_.children_right[node], depth + 1, node)
        else:
            # Leaf node: record the full condition path leading to it
            my_list.append(pathto[parent])

    recurse(0, 1, 0)

    return my_list


class PLE(BaseEstimator, TransformerMixin):
    def __init__(
        self, n_bins=20, tree_params={}, task="regression", conditions=None, **kwargs
    ):
        super(PLE, self).__init__(**kwargs)

        self.task = task
        self.tree_params = tree_params
        self.n_bins = n_bins
        self.conditions = conditions
        # Matches integers and floats, including scientific notation
        self.pattern = r"-?\d+\.?\d*[eE]?[+-]?\d*"

    def fit(self, feature, target):
        if self.task == "regression":
            dt = DecisionTreeRegressor(max_leaf_nodes=self.n_bins)
        elif self.task == "classification":
            dt = DecisionTreeClassifier(max_leaf_nodes=self.n_bins)
        else:
            raise ValueError("This task is not supported")

        dt.fit(feature, target)

        # The condition strings refer to the variable name "feature"
        self.conditions = tree_to_code(dt, ["feature"])
        return self

    def transform(self, feature):
        if feature.shape == (feature.shape[0], 1):
            feature = np.squeeze(feature, axis=1)

        # Evaluate each leaf condition as a boolean mask, scale it by its
        # (1-based) bin index, and collapse to a single bin index per sample.
        result_list = []
        for idx, cond in enumerate(self.conditions):
            result_list.append(eval(cond) * (idx + 1))

        encoded_feature = np.expand_dims(np.sum(np.stack(result_list).T, axis=1), 1)
        encoded_feature = np.array(encoded_feature - 1, dtype=np.int64)

        # Extract the numeric thresholds from the condition strings
        locations = []
        for string in self.conditions:
            matches = re.findall(self.pattern, string)
            locations.extend(matches)

        locations = [float(number) for number in locations]
        locations = list(set(locations))
        locations = np.sort(locations)

        ple_encoded_feature = np.zeros((len(feature), locations.shape[0] + 1))
        if locations[-1] > np.max(feature):
            locations[-1] = np.max(feature)

        for idx in range(len(encoded_feature)):
            if feature[idx] >= locations[-1]:
                ple_encoded_feature[idx][encoded_feature[idx]] = feature[idx]
                ple_encoded_feature[idx, : encoded_feature[idx][0]] = 1
            elif feature[idx] <= locations[0]:
                ple_encoded_feature[idx][encoded_feature[idx]] = feature[idx]
            else:
                # Linear interpolation within the bin's threshold interval
                ple_encoded_feature[idx][encoded_feature[idx]] = (
                    feature[idx] - locations[(encoded_feature[idx] - 1)[0]]
                ) / (
                    locations[(encoded_feature[idx])[0]]
                    - locations[(encoded_feature[idx] - 1)[0]]
                )
                ple_encoded_feature[idx, : encoded_feature[idx][0]] = 1

        if ple_encoded_feature.shape[1] == 1:
            return np.zeros([len(feature), self.n_bins])
        else:
            return np.array(ple_encoded_feature, dtype=np.float32)

    def get_feature_names_out(self, input_features=None):
        if input_features is None:
            raise ValueError("input_features must be specified")
        return input_features
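
For orientation, the PLE transformer above can be exercised on a single numeric column roughly as follows. This is a minimal sketch, not part of the commit: the random data, target, and variable names are invented, and `PLE` is assumed to be importable from the module added here. Note that `fit` passes the literal name "feature" to `tree_to_code`, which is why `transform` can evaluate the generated condition strings against its `feature` argument.

    import numpy as np

    # Hypothetical data: one numeric feature and a noisy linear target
    x = np.random.rand(200, 1)
    y = 3.0 * x[:, 0] + 0.1 * np.random.randn(200)

    ple = PLE(n_bins=5, task="regression")
    ple.fit(x, y)

    encoded = ple.transform(x)
    # One column per learned bin; roughly (200, 5) when the tree finds 5 leaves
    print(encoded.shape)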
Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator


class CustomBinner(TransformerMixin):
    def __init__(self, bins):
        # bins can be a scalar (number of bins) or array-like (bin edges)
        self.bins = bins

    def fit(self, X, y=None):
        # Fit doesn't need to do anything as we are directly using the provided bins
        return self

    def transform(self, X):
        if isinstance(self.bins, int):
            # Calculate equal-width bins based on the range of the data and the number of bins
            _, bins = pd.cut(X.squeeze(), bins=self.bins, retbins=True)
        else:
            # Use predefined bins
            bins = self.bins

        # Apply the bins to the data
        binned_data = pd.cut(
            X.squeeze(),
            bins=np.sort(np.unique(bins)),
            labels=False,
            include_lowest=True,
        )
        return np.expand_dims(np.array(binned_data), 1)


class ContinuousOrdinalEncoder(BaseEstimator, TransformerMixin):
    """
    This encoder converts categorical features into continuous integer values. Each unique category within a feature
    is assigned a unique integer based on its sorted order within that feature. This transformation is useful for
    models that can only handle continuous data.

    Attributes:
        mapping_ (list of dicts): A list where each element is a dictionary mapping original categories to integers
            for a single feature.

    Methods:
        fit(X, y=None): Learns the mapping from original categories to integers.
        transform(X): Applies the learned mapping to the data.
        get_feature_names_out(input_features=None): Returns the input features after transformation.
    """

    def fit(self, X, y=None):
        """
        Learns the mapping from original categories to integers for each feature.

        Parameters:
            X (array-like of shape (n_samples, n_features)): The input data to fit.
            y (ignored): Not used, present for API consistency by convention.

        Returns:
            self: Returns the instance itself.
        """
        # Map each feature's categories to sequential integers starting from 1,
        # reserving 0 for unknown values
        self.mapping_ = [
            {category: i + 1 for i, category in enumerate(np.unique(col))}
            for col in X.T
        ]
        for mapping in self.mapping_:
            mapping[None] = 0  # Assign 0 to unknown values
        return self

    def transform(self, X):
        """
        Transforms the categories in X to their corresponding integer values based on the learned mapping.

        Parameters:
            X (array-like of shape (n_samples, n_features)): The input data to transform.

        Returns:
            X_transformed (ndarray of shape (n_samples, n_features)): The transformed data with integer values.
        """
        # Transform the categories to their mapped integer values; unseen categories map to 0
        X_transformed = np.array(
            [
                [self.mapping_[col].get(value, 0) for col, value in enumerate(row)]
                for row in X
            ]
        )
        return X_transformed

    def get_feature_names_out(self, input_features=None):
        """
        Returns the names of the transformed features.

        Parameters:
            input_features (list of str): The names of the input features.

        Returns:
            input_features (array of shape (n_features,)): The names of the output features after transformation.
        """
        if input_features is None:
            raise ValueError("input_features must be specified")
        return input_features


class OneHotFromOrdinal(TransformerMixin, BaseEstimator):
    """
    A transformer that takes ordinal-encoded features and converts them into one-hot encoded format. This is useful
    in scenarios where features have been pre-encoded with ordinal encoding and a one-hot representation is required
    for model training.

    Attributes:
        max_bins_ (ndarray of shape (n_features,)): The number of one-hot columns (maximum bin index + 1) for each
            feature, determining the size of the one-hot encoded array for that feature.

    Methods:
        fit(X, y=None): Learns the maximum bin index for each feature.
        transform(X): Converts ordinal-encoded features into one-hot format.
        get_feature_names_out(input_features=None): Returns the feature names after one-hot encoding.
    """

    def fit(self, X, y=None):
        """
        Learns the maximum bin index for each feature from the data.

        Parameters:
            X (array-like of shape (n_samples, n_features)): The input data to fit, containing ordinal-encoded features.
            y (ignored): Not used, present for API consistency by convention.

        Returns:
            self: Returns the instance itself.
        """
        self.max_bins_ = (
            np.max(X, axis=0).astype(int) + 1
        )  # Number of one-hot columns needed for each feature
        return self

    def transform(self, X):
        """
        Transforms ordinal-encoded features into one-hot encoded format based on the `max_bins_` learned during fitting.

        Parameters:
            X (array-like of shape (n_samples, n_features)): The input data to transform, containing ordinal-encoded features.

        Returns:
            X_one_hot (ndarray of shape (n_samples, n_output_features)): The one-hot encoded features.
        """
        # Initialize an empty list to hold the one-hot encoded arrays
        one_hot_encoded = []
        for i, max_bins in enumerate(self.max_bins_):
            # Convert each feature to one-hot using its max_bins
            feature_one_hot = np.eye(max_bins)[X[:, i].astype(int)]
            one_hot_encoded.append(feature_one_hot)
        # Concatenate the one-hot encoded features horizontally
        return np.hstack(one_hot_encoded)

    def get_feature_names_out(self, input_features=None):
        """
        Generates feature names for the one-hot encoded features based on the input feature names and the number of bins.

        Parameters:
            input_features (list of str): The names of the input features that were ordinal-encoded.

        Returns:
            feature_names (array of shape (n_output_features,)): The names of the one-hot encoded features.
        """
        feature_names = []
        for i, max_bins in enumerate(self.max_bins_):
            feature_names.extend(
                [f"{input_features[i]}_bin_{j}" for j in range(int(max_bins))]
            )
        return np.array(feature_names)
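
As a quick illustration of how these helpers compose, a numeric column can be pushed through `CustomBinner` and then `OneHotFromOrdinal`. This is a hypothetical sketch, not part of the commit; the data and the feature name are invented, and the final shape assumes every bin ends up occupied by at least one sample.

    import numpy as np

    # Hypothetical numeric column with 10 samples
    X_num = np.random.rand(10, 1)

    binner = CustomBinner(bins=4).fit(X_num)
    X_binned = binner.transform(X_num)   # integer bin index per row, shape (10, 1)

    one_hot = OneHotFromOrdinal().fit(X_binned)
    X_oh = one_hot.transform(X_binned)   # shape (10, 4) if all four bins are occupied
    print(one_hot.get_feature_names_out(["num_feature"]))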
