forked from h2oai/driverlessai-recipes
-
Notifications
You must be signed in to change notification settings - Fork 1
/
log_scale_target_encoding.py
55 lines (43 loc) · 1.84 KB
/
log_scale_target_encoding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
"""Target-encode numbers by their logarithm"""
import math
from h2oaicore.transformer_utils import CustomTransformer
import datatable as dt
import numpy as np
from h2oaicore.transformers import CVTargetEncodeTransformer
from sklearn.preprocessing import LabelEncoder
class LogScaleBinner:
    """Turn a single numeric column into string bin labels based on its logarithm.

    The binner is stateless: nothing is learned from the data, so fitting
    and transforming are the same operation.
    """

    def fit_transform(self, X: dt.Frame):
        """Return ``transform(X)``; there is no state to fit."""
        return self.transform(X)

    def transform(self, X: dt.Frame):
        """Map the only column of ``X`` to the string form of its int32-cast log.

        :param X: frame that must contain exactly one column
        :return: one-column frame of str32 labels
        :raises AssertionError: if ``X`` does not have exactly one column
        """
        assert X.ncols == 1
        # Build the column expression step by step: log -> int32 -> str32.
        log_of_value = dt.log(dt.f[0])
        exponent_as_int = dt.stype.int32(log_of_value)
        bin_label = dt.stype.str32(exponent_as_int)
        return X[:, bin_label]
class LogScaleTargetEncodingTransformer(CustomTransformer):
    """Target-encode one numeric column after binning it on a log scale.

    The input column is first converted by ``LogScaleBinner`` into string
    labels, and those labels are then passed to ``CVTargetEncodeTransformer``
    as a categorical column.
    """

    _testing_can_skip_failure = False  # ensure tested as if shouldn't fail
    _unsupervised = False  # uses target
    _uses_target = True  # uses target
    _target_encoding_based = True

    @staticmethod
    def get_default_properties():
        """Request exactly one numeric input column for this transformer."""
        return dict(col_type="numeric", min_cols=1, max_cols=1, relative_importance=1)

    def fit_transform(self, X: dt.Frame, y: np.array = None):
        """Fit the binner and the target encoder, and return the encoded frame.

        :param X: one-column numeric frame
        :param y: target values; label-encoded first when ``self.labels`` is set
        :return: frame of target-encoded values with +/-inf replaced by NA
        """
        # Roughly: Convert numbers to a string of their exponent
        self.binner = LogScaleBinner()
        X = self.binner.fit_transform(X)

        # Compute mean target (out of fold) per same string
        self.cvte = CVTargetEncodeTransformer(cat_cols=X.names)

        if self.labels is not None:
            # for classification, always turn y into numeric form, even if already integer
            y = dt.Frame(LabelEncoder().fit(self.labels).transform(y))

        X = dt.Frame(self.cvte.fit_transform(X, y))

        # Don't leave inf/-inf. Frame.replace operates in place on the whole
        # frame, so one call is enough - no per-column loop is needed.
        X.replace([math.inf, -math.inf], None)
        return X

    def transform(self, X: dt.Frame):
        """Apply the binner and target encoder created by ``fit_transform``.

        :param X: one-column numeric frame
        :return: frame of target-encoded values with +/-inf replaced by NA
        """
        X = self.binner.transform(X)
        X = dt.Frame(self.cvte.transform(X))

        # Don't leave inf/-inf. Frame.replace operates in place on the whole
        # frame, so one call is enough - no per-column loop is needed.
        X.replace([math.inf, -math.inf], None)
        return X