from collections import namedtuple
from typing import Iterable, List, Optional, Union
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, BaseCrossValidator
from nyaggle.environment import requires_lightgbm
from nyaggle.util import is_instance
from nyaggle.validation.cross_validate import cross_validate
from nyaggle.validation.split import Take
# Return type of adversarial_validate:
#   auc        -- float, ROC AUC score of the adversarial model
#   importance -- pandas DataFrame of per-feature importance,
#                 averaged over folds and sorted descending
ADVResult = namedtuple('ADVResult', ['auc', 'importance'])
def adversarial_validate(X_train: pd.DataFrame,
                         X_test: pd.DataFrame,
                         importance_type: str = 'gain',
                         estimator: Optional[BaseEstimator] = None,
                         categorical_feature: Optional[List[str]] = None,
                         cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None) -> ADVResult:
    """
    Perform adversarial validation between X_train and X_test.

    A binary classifier is trained to distinguish training rows (label 1)
    from test rows (label 0). An AUC close to 0.5 means the two sets are
    indistinguishable; a high AUC indicates distribution shift, and the
    feature importance shows which columns drive the difference.

    Args:
        X_train:
            Training data
        X_test:
            Test data
        importance_type:
            The type of feature importance calculated.
        estimator:
            The custom estimator. If None, LGBMClassifier is automatically used.
            Only LGBMModel or CatBoost instances are supported.
        categorical_feature:
            List of categorical column names. If ``None``, categorical columns are automatically determined by dtype.
        cv:
            Cross validation split. If ``None``, the first fold out of 5 fold is used as validation.
    Returns:
        Namedtuple with following members

        * auc:
            float, ROC AUC score of adversarial validation.
        * importance:
            pandas DataFrame, feature importance of adversarial model (order by importance)

    Example:
        >>> from sklearn.model_selection import train_test_split
        >>> from nyaggle.testing import make_regression_df
        >>> from nyaggle.validation import adversarial_validate

        >>> X, y = make_regression_df(n_samples=8)
        >>> X_train, X_test, y_train, y_test = train_test_split(X, y)
        >>> auc, importance = adversarial_validate(X_train, X_test)

        >>> print(auc)
        0.51078231
        >>> importance.head()
        feature importance
        col_1   231.5827204
        col_5   207.1837266
        col_7   188.6920685
        col_4   174.5668498
        col_9   170.6438643
    """
    # Stack train on top of test; label train rows 1 and test rows 0.
    # reset_index avoids duplicate indices after concatenation.
    concat = pd.concat([X_train, X_test]).copy().reset_index(drop=True)
    y = np.array([1] * len(X_train) + [0] * len(X_test))

    if estimator is None:
        requires_lightgbm()
        from lightgbm import LGBMClassifier
        estimator = LGBMClassifier(n_estimators=10000, objective='binary', importance_type=importance_type,
                                   random_state=0)
    else:
        assert is_instance(estimator, ('lightgbm.sklearn.LGBMModel', 'catboost.core.CatBoost')), \
            'Only CatBoostClassifier or LGBMClassifier is allowed'

    if cv is None:
        # Default: evaluate on only the first of 5 shuffled folds.
        cv = Take(1, KFold(5, shuffle=True, random_state=0))

    fit_params = {'verbose': -1}
    if categorical_feature:
        fit_params['categorical_feature'] = categorical_feature

    result = cross_validate(estimator, concat, y, None, cv=cv,
                            eval_func=roc_auc_score, fit_params=fit_params, importance_type=importance_type)

    # Average per-fold importance for each feature, then sort descending.
    importance = pd.concat(result.importance)
    importance = importance.groupby('feature')['importance'].mean().reset_index()
    importance.sort_values(by='importance', ascending=False, inplace=True)
    importance.reset_index(drop=True, inplace=True)

    return ADVResult(result.scores[-1], importance)