# ssa_everyone/regressors/stacking_regressor.py

from normal_use import *


# ----------------------------- stacking settings -----------------------------
# Shared module state: ``stacking_train`` assigns both names before any fold
# is fitted, and ``get_oof`` reads ``train_test_data`` afterwards.
train_test_data = None
target_cols = None

# Fold splitter used for the base learners (``get_oof``) and for the meta
# learner (``stack_model``).
kf = KFold(n_splits=5, shuffle=True)

# Base learners and their constructor keyword arguments, paired by position.
stacking_model_regressors = [
    CatBoostRegressor,
    LGBMRegressor,
    XGBRegressor,
    RandomForestRegressor,
]
stacking_model_params = [
    {"silent": True},
    {},
    {},
    {},
]

# Meta learner fitted on the stacked base-learner predictions. Swap in any
# regressor class you like, e.g. LinearRegression, Ridge, XGBRegressor,
# CatBoostRegressor, LGBMRegressor or deepforest.CascadeForestRegressor.
REG_TOTAL = Ridge


class SklearnWrapper:
    """Adapter exposing ``train`` / ``predict`` on top of an estimator class.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is called as ``clf(**params)``.
    seed : int, default 0
        Passed to the estimator as ``random_state``. It overrides any
        ``random_state`` entry already present in ``params``.
    params : dict, optional
        Extra keyword arguments for the estimator. The mapping is copied, so
        the caller's dict is left untouched.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a private copy: writing ``random_state`` into the incoming
        # dict would leak into the caller's configuration (and into a shared
        # default argument).
        kwargs = dict(params) if params is not None else {}
        kwargs['random_state'] = seed
        self.clf = clf(**kwargs)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)


def get_oof(clf, col_name):
    """
    Description
    -----------
    Produce out-of-fold predictions of one base learner for one target column.

    The learner is refitted on every training split yielded by the module-level
    ``kf``; its predictions on the held-out split fill the matching rows of the
    train output, and its predictions on the test features are averaged over
    all folds.

    Parameters
    ----------
    clf : object
        Learner exposing ``train(x, y)`` and ``predict(x)``
        (e.g. ``SklearnWrapper``).
    col_name : hashable
        Column of ``train_test_data['y_train']`` to fit against.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_train, oof_test)``, each reshaped to a single column.

    """
    x_train = train_test_data['X_train']
    y_train = train_test_data['y_train'][col_name]
    x_test = train_test_data['X_test']

    oof_train = np.zeros((x_train.shape[0],))
    # Collect one test prediction per fold instead of pre-allocating a
    # fixed-size buffer, so the average always matches the number of folds
    # that ``kf`` actually produced.
    fold_test_preds = []

    for train_index, valid_index in kf.split(x_train, y_train):
        clf.train(x_train.iloc[train_index], y_train.iloc[train_index])
        oof_train[valid_index] = clf.predict(x_train.iloc[valid_index])
        fold_test_preds.append(clf.predict(x_test))

    # Stack as float64 before averaging so learners that return float32 are
    # accumulated at full precision.
    oof_test = np.asarray(fold_test_preds, dtype=float).mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


def stack_model(train_stack, test_stack, y):
    """
    Description
    -----------
    Fit the meta learner ``REG_TOTAL`` on stacked base-learner predictions.

    A fresh ``REG_TOTAL()`` is fitted on every training split yielded by the
    module-level ``kf``. Its predictions on the held-out split fill the
    out-of-fold vector, and its predictions on the stacked test features are
    averaged over all folds.

    Parameters
    ----------
    train_stack : sequence of numpy.ndarray
        Per-learner train predictions; joined column-wise with ``np.hstack``.
    test_stack : sequence of numpy.ndarray
        Per-learner test predictions; joined column-wise with ``np.hstack``.
    y : pandas.Series
        Training target, indexed positionally through ``.iloc``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof, predictions)``: ``oof`` is 1-D with one value per training
        row, ``predictions`` is a single column with one row per test row.

    """
    train_stack = np.hstack(train_stack)
    test_stack = np.hstack(test_stack)

    oof = np.zeros((train_stack.shape[0],))
    fold_predictions = []

    for trn_idx, val_idx in kf.split(train_stack, y):
        clf = REG_TOTAL()
        clf.fit(train_stack[trn_idx], y.iloc[trn_idx])

        # Record the held-out predictions so the returned ``oof`` is
        # meaningful rather than a vector of zeros.
        oof[val_idx] = np.asarray(clf.predict(train_stack[val_idx])).reshape(-1)

        # Some meta learners (e.g. deepforest.CascadeForestRegressor) return a
        # column vector while others return 1-D output; normalise both to a
        # single column.
        fold_pred = np.asarray(clf.predict(test_stack), dtype=float)
        fold_predictions.append(fold_pred.reshape(-1, 1))

    # Average over however many folds ``kf`` produced.
    predictions = np.mean(fold_predictions, axis=0)

    return oof, predictions


def stacking_train(trainset):
    """
    Description
    -----------
    Run the stacking pipeline for every target column and print the scores.

    For each column of ``trainset['y_train']`` every base learner in
    ``stacking_model_regressors`` contributes out-of-fold predictions, the meta
    learner ``REG_TOTAL`` is fitted on top of them, and RMSE / R^2 are computed
    against ``trainset['y_test']``.

    Parameters
    ----------
    trainset : dict
        Data set with the keys ``'X_train'``, ``'y_train'``, ``'X_test'`` and
        ``'y_test'``. ``'y_train'`` must expose ``.columns``; it and
        ``'y_test'`` are indexed by column name.

    Returns
    -------
    None
        The per-column metrics table and the average R^2 are printed.

    """
    # ``get_oof`` reads the data set from module scope, so publish it there.
    global target_cols, train_test_data
    train_test_data = trainset
    target_cols = train_test_data['y_train'].columns

    evals = []
    for col_name in target_cols:
        y_train = train_test_data['y_train'][col_name]

        oof_train = []
        oof_test = []
        for reg, param in zip(stacking_model_regressors, stacking_model_params):
            regressor = SklearnWrapper(reg, params=param)
            t_train, t_test = get_oof(regressor, col_name=col_name)
            oof_train.append(t_train)
            oof_test.append(t_test)
        _, prediction_stack = stack_model(oof_train, oof_test, y_train)

        y_hat = prediction_stack  # predicted values
        y = train_test_data['y_test'][col_name]  # ground truth
        # Take the square root explicitly: the ``squared=False`` keyword of
        # ``mean_squared_error`` is deprecated/removed in recent scikit-learn.
        rmse = np.sqrt(metrics.mean_squared_error(y, y_hat))
        r2 = metrics.r2_score(y, y_hat)
        eval_dict = {'Error': col_name, 'RMSE': rmse, 'R^2': r2}
        evals.append(eval_dict)
        print(f"{col_name} finished")
    print(f"Stacking -- {REG_TOTAL.__name__}")
    print(pd.DataFrame(evals))
    print("Average R2:  ", average_R2(evals))