Source code for nyaggle.experiment.hyperparameter_tuner

import copy
import warnings
from typing import Dict, Iterable, Optional, Union

import optuna.integration.lightgbm as optuna_lgb
import pandas as pd
import sklearn.utils.multiclass as multiclass
from sklearn.model_selection import BaseCrossValidator

from nyaggle.validation.split import check_cv


def find_best_lgbm_parameter(base_param: Dict, X: pd.DataFrame, y: pd.Series,
                             cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
                             groups: Optional[pd.Series] = None,
                             time_budget: Optional[int] = None,
                             type_of_target: str = 'auto') -> Dict:
    """
    Search hyperparameters for LightGBM using Optuna.

    Args:
        base_param:
            Base parameters passed to lgb.train.
        X:
            Training data.
        y:
            Target.
        cv:
            int, cross-validation generator or an iterable which determines
            the cross-validation splitting strategy.
        groups:
            Group labels for the samples. Only used in conjunction with a
            "Group" cv instance (e.g., ``GroupKFold``).
        time_budget:
            Time budget for tuning (in seconds).
        type_of_target:
            The type of target variable. If ``auto``, the type is inferred by
            ``sklearn.utils.multiclass.type_of_target``. Otherwise, ``binary``,
            ``continuous``, or ``multiclass`` are supported.

    Returns:
        The best parameters found.
    """
    cv = check_cv(cv, y)

    if type_of_target == 'auto':
        type_of_target = multiclass.type_of_target(y)

    # Tune on the first fold only.
    train_index, test_index = next(cv.split(X, y, groups))

    dtrain = optuna_lgb.Dataset(X.iloc[train_index], y.iloc[train_index])
    dvalid = optuna_lgb.Dataset(X.iloc[test_index], y.iloc[test_index])

    params = copy.deepcopy(base_param)

    # Enable early stopping so each trial stops instead of training for the
    # full number of iterations.
    if 'early_stopping_rounds' not in params:
        params['early_stopping_rounds'] = 100

    # feature_pre_filter prevents min_data_in_leaf from being tuned.
    if params.get('feature_pre_filter'):
        warnings.warn("feature_pre_filter will be set to False to tune min_data_in_leaf.")
    params['feature_pre_filter'] = False

    # Cap the number of boosting rounds unless the caller already set one of
    # LightGBM's aliases for it.
    if not any(p in params for p in ('num_iterations', 'num_iteration',
                                     'num_trees', 'num_tree',
                                     'num_rounds', 'num_round')):
        params['num_iterations'] = params.get('n_estimators', 10000)

    # Infer the objective from the target type when not given explicitly.
    if 'objective' not in params:
        tot_to_objective = {
            'binary': 'binary',
            'continuous': 'regression',
            'multiclass': 'multiclass'
        }
        params['objective'] = tot_to_objective[type_of_target]

    # Derive a default metric from the objective when not given explicitly.
    if 'metric' not in params and 'objective' in params:
        if params['objective'] in ['regression', 'regression_l2', 'l2', 'mean_squared_error', 'mse',
                                   'l2_root', 'root_mean_squared_error', 'rmse']:
            params['metric'] = 'l2'
        if params['objective'] in ['regression_l1', 'l1', 'mean_absolute_error', 'mae']:
            params['metric'] = 'l1'
        if params['objective'] in ['binary']:
            params['metric'] = 'binary_logloss'
        if params['objective'] in ['multiclass']:
            params['metric'] = 'multi_logloss'

    # Silence LightGBM unless verbosity was set explicitly.
    if not any(p in params for p in ('verbose', 'verbosity')):
        params['verbosity'] = -1

    model = optuna_lgb.train(params, dtrain, valid_sets=[dvalid],
                             verbose_eval=0, time_budget=time_budget)

    return model.params
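
A minimal usage sketch (not part of the module source). The synthetic dataset, column names, and base parameters below are illustrative assumptions, not nyaggle defaults.

# Usage sketch, assuming nyaggle and its LightGBM/Optuna dependencies are installed.
import pandas as pd
from sklearn.datasets import make_classification

from nyaggle.experiment.hyperparameter_tuner import find_best_lgbm_parameter

if __name__ == '__main__':
    # Build a small synthetic binary classification problem (illustrative only).
    X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
    X = pd.DataFrame(X, columns=[f'col_{i}' for i in range(X.shape[1])])
    y = pd.Series(y)

    base_param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'learning_rate': 0.1
    }

    # Stop tuning after roughly 60 seconds; tuning steps past the budget are skipped.
    best_params = find_best_lgbm_parameter(base_param, X, y, cv=5, time_budget=60)
    print(best_params)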