from normal_use import *  # expected to provide the regressors, metrics, pd/np, joblib, train_test_split, check_X_y, Union and average_R2 used below

# Base regressors to stack and their per-regressor constructor kwargs.
sumRegressors = [LGBMRegressor, RandomForestRegressor, XGBRegressor, CatBoostRegressor]
sumRegressor = Union[tuple(sumRegressors)]  # any of the base regressor types; used to annotate regType below
sumParams = [{}, {}, {}, {"silent": True}]

# Reserved for a weighted-sum variant; not used below.
weight = [0.1, 0.2, 0.3, 0.4]
Sums = {}
train_test_data = None
out_weights = []

# Per-target lists of prediction columns produced by the base regressors.
oof_train = {}
oof_test = {}

# Meta-regressor used to combine the base predictions. Alternatives tried:
# LinearRegression, Ridge, XGBRegressor, CatBoostRegressor, LGBMRegressor,
# deepforest.CascadeForestRegressor
REG_TOTAL = Ridge
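
# Stacking layout: for each target column, every base regressor appends one
# (n, 1) prediction column to oof_train[col] / oof_test[col]; np.hstack later
# turns each list into an (n, len(sumRegressors)) meta-feature matrix that the
# REG_TOTAL combiner is fit on in train_linear_sumer().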


def get_random_small_train(X, y, Percentage=0.8, seed=0):
    # Draw a random sub-sample of the training data (Percentage kept) and
    # return only the X_train / y_train parts of the split.
    data_keys = ['X_train', 'X_test', 'y_train', 'y_test']
    data_vals = train_test_split(X, y, random_state=seed, test_size=(1 - Percentage))
    data = dict(zip(data_keys, data_vals))
    return data['X_train'], data['y_train']


def train_one_regressor(id, regType: sumRegressor, use_RFsample=False, seed=0):
    full_X, full_ys = train_test_data['X_train'], train_test_data['y_train']
    tX, tys = train_test_data['X_test'], train_test_data['y_test']

    X, ys = full_X, full_ys
    if use_RFsample:
        # Bagging-style sub-sampling: fit this regressor on a random subset.
        X, ys = get_random_small_train(X, ys, seed=seed)

    check_X_y(X, ys, multi_output=True)

    models = {}
    evals = []
    for target_col in ys.columns:
        y = ys[target_col]
        reg = regType(**sumParams[id])
        reg.fit(X, y)
        models[target_col] = reg

        # Predict on the full training split (meta-features for the combiner).
        y_hat = reg.predict(full_X)
        oof_train[target_col].append(y_hat.reshape(-1, 1))

        # Predict on the held-out test split.
        ty_hat = reg.predict(tX)
        oof_test[target_col].append(ty_hat.reshape(-1, 1))

        # Per-target evaluation on the test split.
        ty = tys[target_col]
        rmse = metrics.mean_squared_error(ty, ty_hat, squared=False)
        r2 = metrics.r2_score(ty, ty_hat)
        evals.append({'Error': target_col, 'RMSE': rmse, 'R^2': r2})

    print(regType.__name__)
    print(pd.DataFrame(evals))
    print("Average R2: ", average_R2(evals))

    joblib.dump(models, f"linear/{regType.__name__}_study_{id}.model")


def train_linear_sumer():
    ys = train_test_data['y_train']
    tys = train_test_data['y_test']  # ground truth on the held-out split
    evals = []
    for target_col in oof_train:
        # Stack the base-regressor prediction columns into meta-feature matrices.
        X = np.hstack(oof_train[target_col])
        tX = np.hstack(oof_test[target_col])
        print(ys.shape, X.shape, tys.shape, tX.shape)

        y = ys[target_col]
        ty = tys[target_col]

        clf = REG_TOTAL()
        clf.fit(X, y)
        ty_hat = clf.predict(tX)  # combined prediction on the test split

        rmse = metrics.mean_squared_error(ty, ty_hat, squared=False)
        r2 = metrics.r2_score(ty, ty_hat)
        evals.append({'Error': target_col, 'RMSE': rmse, 'R^2': r2})

    print("linear *study* for {} regressors!".format(len(sumRegressors)))
    print(pd.DataFrame(evals))
    print("Average R2: ", average_R2(evals))


def study_linear(trainset):
    """
    Description
    -----------
    Create a linear combination of the base regressors (stacking).
    Edit `sumRegressors`, `sumParams` and `REG_TOTAL` above to change the ensemble.

    Parameters
    ----------
    trainset : dict
        Train/test split with keys 'X_train', 'X_test', 'y_train', 'y_test'.

    Returns
    -------
    None
        Results are printed to the screen.
    """
    global train_test_data
    train_test_data = trainset

    for target_col in train_test_data['y_train'].columns:
        oof_train[target_col] = []
        oof_test[target_col] = []

    for i, reg in enumerate(sumRegressors):
        train_one_regressor(i, reg, use_RFsample=True, seed=1024)

    train_linear_sumer()
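

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): build a small
# hypothetical multi-target dataset and run the stacking study end to end.
# It assumes `normal_use` re-exports train_test_split / pd / np as the code
# above already relies on, that the base regressors are installed, and that a
# `linear/` directory exists for the dumped models.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_demo = pd.DataFrame(rng.normal(size=(500, 8)),
                          columns=[f"f{i}" for i in range(8)])
    y_demo = pd.DataFrame({
        "err_a": 2.0 * X_demo["f0"] + rng.normal(scale=0.1, size=500),
        "err_b": X_demo["f1"] - X_demo["f2"] + rng.normal(scale=0.1, size=500),
    })
    keys = ['X_train', 'X_test', 'y_train', 'y_test']
    vals = train_test_split(X_demo, y_demo, random_state=0, test_size=0.2)
    study_linear(dict(zip(keys, vals)))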