Source code for nyaggle.util.submission

from typing import Optional

import numpy as np
import pandas as pd


def _set_column_by_position(df: pd.DataFrame, position: int, values: np.ndarray) -> None:
    """Replace the column at ``position`` of ``df`` with ``values``, adopting the dtype of ``values``."""
    # ``df.iloc[:, position] = values`` writes into the existing column in place; when that
    # column's dtype cannot hold the values (e.g. integer placeholders receiving float
    # predictions) recent pandas versions warn about, or reject, the implicit upcast.
    # Replacing the whole column avoids depending on that behaviour.
    if hasattr(pd.DataFrame, 'isetitem'):
        df.isetitem(position, values)
    else:
        # Fallback for pandas versions without ``isetitem``; assumes unique column labels.
        df[df.columns[position]] = values


def make_submission_df(test_prediction: np.ndarray, sample_submission: Optional[pd.DataFrame] = None,
                       y: Optional[pd.Series] = None) -> pd.DataFrame:
    """
    Make a dataframe formatted as a kaggle competition style.

    Args:
        test_prediction:
            A test prediction to be formatted. Either one value per row (1-D, or 2-D with a
            single column) or one column per target/class (2-D with several columns).
        sample_submission:
            A sample dataframe aligned with test data (Usually in Kaggle, it is available as
            sample_submission.csv). The submission file will be created with the same schema
            as this dataframe. The passed dataframe itself is not modified.
        y:
            Target variables which is used for inferring the column name.
            Ignored if ``sample_submission`` is passed.

    Returns:
        The formatted dataframe

    Raises:
        ValueError:
            If ``sample_submission`` has no columns or fewer columns than ``test_prediction``,
            or if the number of distinct values in ``y`` differs from the number of
            prediction columns.
    """
    is_multi_column = test_prediction.ndim > 1 and test_prediction.shape[1] > 1

    if sample_submission is not None:
        submit_df = sample_submission.copy()
        n_sample_cols = submit_df.shape[1]
        if n_sample_cols == 0:
            raise ValueError('sample_submission must have at least one column')

        if is_multi_column:
            n_targets = test_prediction.shape[1]
            # The trailing ``n_targets`` columns receive predictions; everything before is ids.
            n_id_cols = n_sample_cols - n_targets
            if n_id_cols < 0:
                # A negative offset would wrap around and silently overwrite the wrong columns.
                raise ValueError(
                    'test_prediction has {} columns but sample_submission has only {}'.format(
                        n_targets, n_sample_cols))
            for i in range(n_targets):
                _set_column_by_position(submit_df, n_id_cols + i, test_prediction[:, i])
        else:
            # reshape(-1) turns an (n, 1) prediction into a plain 1-D column.
            _set_column_by_position(submit_df, n_sample_cols - 1, test_prediction.reshape(-1))
        return submit_df

    submit_df = pd.DataFrame()
    id_col_name = y.index.name if y is not None and y.index.name else 'id'
    submit_df[id_col_name] = np.arange(len(test_prediction))

    if is_multi_column:
        n_targets = test_prediction.shape[1]
        if y is not None:
            # One column per distinct label, in sorted label order.
            tgt_col_names = sorted(y.unique())
            if len(tgt_col_names) != n_targets:
                raise ValueError(
                    'y has {} distinct values but test_prediction has {} columns'.format(
                        len(tgt_col_names), n_targets))
        else:
            tgt_col_names = [str(i) for i in range(n_targets)]
        for i, col_name in enumerate(tgt_col_names):
            submit_df[col_name] = test_prediction[:, i]
    else:
        tgt_col_name = y.name if y is not None and y.name else 'target'
        submit_df[tgt_col_name] = test_prediction.reshape(-1)

    return submit_df