Source code for nyaggle.feature.category_encoder.target_encoder

from typing import List, Optional, Iterable, Union

import category_encoders as ce
import numpy as np
import pandas as pd
from category_encoders.utils import convert_input, convert_input_vector
from sklearn.base import BaseEstimator, clone
from sklearn.model_selection import BaseCrossValidator

from nyaggle.feature.base import BaseFeaturizer
from nyaggle.validation.split import check_cv


[docs]class KFoldEncoderWrapper(BaseFeaturizer): """KFold Wrapper for sklearn like interface This class wraps sklearn's TransformerMixIn (object that has fit/transform/fit_transform methods), and call it as K-fold manner. Args: base_transformer: Transformer object to be wrapped. cv: int, cross-validation generator or an iterable which determines the cross-validation splitting strategy. - None, to use the default ``KFold(5, random_state=0, shuffle=True)``, - integer, to specify the number of folds in a ``(Stratified)KFold``, - CV splitter (the instance of ``BaseCrossValidator``), - An iterable yielding (train, test) splits as arrays of indices. groups: Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``). return_same_type: If True, `transform` and `fit_transform` return the same type as X. If False, these APIs always return a numpy array, similar to sklearn's API. """ def __init__(self, base_transformer: BaseEstimator, cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None, return_same_type: bool = True, groups: Optional[pd.Series] = None): self.cv = cv self.base_transformer = base_transformer self.n_splits = None self.transformers = None self.return_same_type = return_same_type self.groups = groups def _pre_train(self, y): self.cv = check_cv(self.cv, y) self.n_splits = self.cv.get_n_splits() self.transformers = [clone(self.base_transformer) for _ in range(self.n_splits + 1)] def _fit_train(self, X: pd.DataFrame, y: Optional[pd.Series], **fit_params) -> pd.DataFrame: if y is None: X_ = self.transformers[-1].transform(X) return self._post_transform(X_) X_ = X.copy() for i, (train_index, test_index) in enumerate(self.cv.split(X_, y, self.groups)): self.transformers[i].fit(X.iloc[train_index], y.iloc[train_index], **fit_params) X_.iloc[test_index, :] = self.transformers[i].transform(X.iloc[test_index]) self.transformers[-1].fit(X, y, **fit_params) return X_ def _post_fit(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame: return X def _post_transform(self, X: pd.DataFrame) -> pd.DataFrame: return X
[docs] def fit(self, X: pd.DataFrame, y: pd.Series): """ Fit models for each fold. Args: X: Data y: Target Returns: returns the transformer object. """ self._post_fit(self.fit_transform(X, y), y) return self
[docs] def transform(self, X: Union[pd.DataFrame, np.ndarray]) -> Union[pd.DataFrame, np.ndarray]: """ Transform X Args: X: Data Returns: Transformed version of X. It will be pd.DataFrame If X is `pd.DataFrame` and return_same_type is True. """ is_pandas = isinstance(X, pd.DataFrame) X_ = self._fit_train(X, None) X_ = self._post_transform(X_) return X_ if self.return_same_type and is_pandas else X_.values
[docs] def fit_transform(self, X: Union[pd.DataFrame, np.ndarray], y: pd.Series = None, **fit_params) \ -> Union[pd.DataFrame, np.ndarray]: """ Fit models for each fold, then transform X Args: X: Data y: Target fit_params: Additional parameters passed to models Returns: Transformed version of X. It will be pd.DataFrame If X is `pd.DataFrame` and return_same_type is True. """ assert len(X) == len(y) self._pre_train(y) is_pandas = isinstance(X, pd.DataFrame) X = convert_input(X) y = convert_input_vector(y, X.index) if y.isnull().sum() > 0: # y == null is regarded as test data X_ = X.copy() X_.loc[~y.isnull(), :] = self._fit_train(X[~y.isnull()], y[~y.isnull()], **fit_params) X_.loc[y.isnull(), :] = self._fit_train(X[y.isnull()], None, **fit_params) else: X_ = self._fit_train(X, y, **fit_params) X_ = self._post_transform(self._post_fit(X_, y)) return X_ if self.return_same_type and is_pandas else X_.values
[docs]class TargetEncoder(KFoldEncoderWrapper): """Target Encoder KFold version of category_encoders.TargetEncoder in https://contrib.scikit-learn.org/categorical-encoding/targetencoder.html. Args: cv: int, cross-validation generator or an iterable which determines the cross-validation splitting strategy. - None, to use the default ``KFold(5, random_state=0, shuffle=True)``, - integer, to specify the number of folds in a ``(Stratified)KFold``, - CV splitter (the instance of ``BaseCrossValidator``), - An iterable yielding (train, test) splits as arrays of indices. groups: Group labels for the samples. Only used in conjunction with a “Group” cv instance (e.g., ``GroupKFold``). cols: A list of columns to encode, if None, all string columns will be encoded. drop_invariant: Boolean for whether or not to drop columns with 0 variance. handle_missing: Options are ‘error’, ‘return_nan’ and ‘value’, defaults to ‘value’, which returns the target mean. handle_unknown: Options are ‘error’, ‘return_nan’ and ‘value’, defaults to ‘value’, which returns the target mean. min_samples_leaf: Minimum samples to take category average into account. smoothing: Smoothing effect to balance categorical average vs prior. Higher value means stronger regularization. The value must be strictly bigger than 0. return_same_type: If True, ``transform`` and ``fit_transform`` return the same type as X. If False, these APIs always return a numpy array, similar to sklearn's API. """ def __init__(self, cv: Optional[Union[Iterable, BaseCrossValidator]] = None, groups: Optional[pd.Series] = None, cols: List[str] = None, drop_invariant: bool = False, handle_missing: str = 'value', handle_unknown: str = 'value', min_samples_leaf: int = 20, smoothing: float = 10.0, return_same_type: bool = True): e = ce.TargetEncoder(cols=cols, drop_invariant=drop_invariant, return_df=True, handle_missing=handle_missing, handle_unknown=handle_unknown, min_samples_leaf=min_samples_leaf, smoothing=smoothing) super().__init__(e, cv, return_same_type, groups) def _post_transform(self, X: pd.DataFrame) -> pd.DataFrame: cols = self.transformers[0].cols for c in cols: X[c] = X[c].astype(float) return X