Here is my target encoder ...

Unfortunately, it seems that `fit` is only called with `X` (the target `y` is not passed), so this will not work.

import numpy as np
import pandas as pd
import sklearn

class TargetEncoder(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Smoothed target (mean) encoder for one categorical ``pandas.Series``.

    For every category seen during ``fit`` the encoder stores a blend of the
    category's target mean and the global target mean (the *prior*)::

        weight  = 1 / (1 + exp(-(count - min_samples_leaf) / smoothing))
        encoded = prior * (1 - weight) + category_mean * weight

    so the more rows a category has, the less the prior is taken into
    account. ``transform`` replaces each value by its encoded number; values
    that were not seen during ``fit`` (including missing values) receive the
    prior.

    Parameters
    ----------
    min_samples_leaf : number, default 1
        Category count at which the category mean and the prior are weighted
        equally.
    smoothing : number, default 1
        Scale of the sigmoid that turns the category count into a weight.
    noise_level : number, default 0
        Stored on the instance only; this implementation never applies it.

    Attributes
    ----------
    dict_averages : dict
        Maps the name of each fitted series to a one-column DataFrame
        (indexed by category) holding the encoded values.
    dict_priors : dict
        Maps the name of each fitted series to its global target mean.
    y_col : object
        Name of the target series passed to the most recent ``fit`` call.
    """

    def __init__(self, min_samples_leaf=1, smoothing=1, noise_level=0):
        # Fitted state is keyed by series name, so one instance can be fitted
        # on several differently named series one after another.
        self.dict_averages = {}
        self.dict_priors = {}
        self.min_samples_leaf = min_samples_leaf
        self.smoothing = smoothing
        self.noise_level = noise_level

    def fit(self, X, y=None):
        """Learn the smoothed per-category target means of ``X``.

        Parameters
        ----------
        X : pandas.Series
            Categorical feature; its ``name`` is the key under which the
            fitted statistics are stored.
        y : pandas.Series
            Target values, aligned with ``X`` on the index.

        Returns
        -------
        TargetEncoder
            ``self``, to allow call chaining.

        Raises
        ------
        ValueError
            If ``y`` is not provided.
        """
        # Explicit check instead of ``assert``: asserts vanish under ``-O``.
        if y is None:
            raise ValueError("TargetEncoder.fit requires the target `y`; got None.")

        self.y_col = y.name
        col = X.name

        # Group the target by the feature directly; unlike concatenating the
        # two into a frame, this does not depend on the series having
        # distinct (or any) names.
        stats = y.groupby(X).agg(["mean", "count"])

        # Sigmoid weight: approaches 1 for frequent categories, 0 for rare ones.
        weight = 1 / (1 + np.exp(-(stats["count"] - self.min_samples_leaf) / self.smoothing))

        # Global target mean, used as fallback and as the shrinkage target.
        prior = y.mean()

        # The bigger the count, the less the prior is taken into account.
        encoded = prior * (1 - weight) + stats["mean"] * weight

        # Stored as a one-column frame named after the target, indexed by category.
        self.dict_averages[col] = encoded.rename(y.name).to_frame()
        self.dict_priors[col] = prior
        return self

    def transform(self, X):
        """Replace each category of ``X`` by its fitted encoding.

        Parameters
        ----------
        X : pandas.Series
            Series whose ``name`` matches a series previously passed to
            ``fit``.

        Returns
        -------
        pandas.Series
            Float series with the index and name of ``X``; values unseen
            during ``fit`` are filled with the stored prior.

        Raises
        ------
        KeyError
            If no statistics were fitted for a series named ``X.name``.
        """
        col = X.name
        if col not in self.dict_averages:
            raise KeyError(f"TargetEncoder has not been fitted on a series named {col!r}")

        # Take the single stored column by position so the lookup does not
        # depend on ``self.y_col``, which every ``fit`` call overwrites.
        lookup = self.dict_averages[col].iloc[:, 0]

        # ``map`` keeps the index and name of X, so no merge/re-index is needed.
        return X.map(lookup).astype(float).fillna(self.dict_priors[col])

# Build an encoder with the default settings (min_samples_leaf=1, smoothing=1, noise_level=0).
processor = TargetEncoder()