from normal_use import *  # expected to provide the regressors, metrics, pd/np, joblib, train_test_split, check_X_y, Union and average_R2 used below

# Base regressors to stack and their per-regressor constructor kwargs.
sumRegressors = [LGBMRegressor, RandomForestRegressor, XGBRegressor, CatBoostRegressor]
sumRegressor = Union[tuple(sumRegressors)]  # any of the base regressor types; used to annotate regType below
sumParams = [{}, {}, {}, {"silent": True}]

# Reserved for a weighted-sum variant; not used below.
weight = [0.1, 0.2, 0.3, 0.4]
Sums = {}
train_test_data = None
out_weights = []

# Per-target lists of prediction columns produced by the base regressors.
oof_train = {}
oof_test = {}

# Meta-regressor used to combine the base predictions. Alternatives tried:
# LinearRegression, Ridge, XGBRegressor, CatBoostRegressor, LGBMRegressor,
# deepforest.CascadeForestRegressor
REG_TOTAL = Ridge
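
# Stacking layout: for each target column, every base regressor appends one
# (n, 1) prediction column to oof_train[col] / oof_test[col]; np.hstack later
# turns each list into an (n, len(sumRegressors)) meta-feature matrix that the
# REG_TOTAL combiner is fit on in train_linear_sumer().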


def get_random_small_train(X, y, Percentage=0.8, seed=0):
    # Draw a random sub-sample of the training data (Percentage kept) and
    # return only the X_train / y_train parts of the split.
    data_keys = ['X_train', 'X_test', 'y_train', 'y_test']
    data_vals = train_test_split(X, y, random_state=seed, test_size=(1 - Percentage))
    data = dict(zip(data_keys, data_vals))
    return data['X_train'], data['y_train']


def train_one_regressor(id, regType: sumRegressor, use_RFsample=False, seed=0):
    full_X, full_ys = train_test_data['X_train'], train_test_data['y_train']
    tX, tys = train_test_data['X_test'], train_test_data['y_test']

    X, ys = full_X, full_ys
    if use_RFsample:
        # Bagging-style sub-sampling: fit this regressor on a random subset.
        X, ys = get_random_small_train(X, ys, seed=seed)

    check_X_y(X, ys, multi_output=True)

    models = {}
    evals = []
    for target_col in ys.columns:
        y = ys[target_col]
        reg = regType(**sumParams[id])
        reg.fit(X, y)
        models[target_col] = reg

        # Predict on the full training split (meta-features for the combiner).
        y_hat = reg.predict(full_X)
        oof_train[target_col].append(y_hat.reshape(-1, 1))

        # Predict on the held-out test split.
        ty_hat = reg.predict(tX)
        oof_test[target_col].append(ty_hat.reshape(-1, 1))

        # Per-target evaluation on the test split.
        ty = tys[target_col]
        rmse = metrics.mean_squared_error(ty, ty_hat, squared=False)
        r2 = metrics.r2_score(ty, ty_hat)
        evals.append({'Error': target_col, 'RMSE': rmse, 'R^2': r2})

    print(regType.__name__)
    print(pd.DataFrame(evals))
    print("Average R2: ", average_R2(evals))

    joblib.dump(models, f"linear/{regType.__name__}_study_{id}.model")


def train_linear_sumer():
    ys = train_test_data['y_train']
    tys = train_test_data['y_test']  # ground truth on the held-out split
    evals = []
    for target_col in oof_train:
        # Stack the base-regressor prediction columns into meta-feature matrices.
        X = np.hstack(oof_train[target_col])
        tX = np.hstack(oof_test[target_col])
        print(ys.shape, X.shape, tys.shape, tX.shape)

        y = ys[target_col]
        ty = tys[target_col]

        clf = REG_TOTAL()
        clf.fit(X, y)
        ty_hat = clf.predict(tX)  # combined prediction on the test split

        rmse = metrics.mean_squared_error(ty, ty_hat, squared=False)
        r2 = metrics.r2_score(ty, ty_hat)
        evals.append({'Error': target_col, 'RMSE': rmse, 'R^2': r2})

    print("linear *study* for {} regressors!".format(len(sumRegressors)))
    print(pd.DataFrame(evals))
    print("Average R2: ", average_R2(evals))


def study_linear(trainset):
    """
    Description
    -----------
    Create a linear combination of the base regressors (stacking).
    Edit `sumRegressors`, `sumParams` and `REG_TOTAL` above to change the ensemble.

    Parameters
    ----------
    trainset : dict
        Train/test split with keys 'X_train', 'X_test', 'y_train', 'y_test'.

    Returns
    -------
    None
        Results are printed to the screen.
    """
    global train_test_data
    train_test_data = trainset

    for target_col in train_test_data['y_train'].columns:
        oof_train[target_col] = []
        oof_test[target_col] = []

    for i, reg in enumerate(sumRegressors):
        train_one_regressor(i, reg, use_RFsample=True, seed=1024)

    train_linear_sumer()
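

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): build a small
# hypothetical multi-target dataset and run the stacking study end to end.
# It assumes `normal_use` re-exports train_test_split / pd / np as the code
# above already relies on, that the base regressors are installed, and that a
# `linear/` directory exists for the dumped models.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_demo = pd.DataFrame(rng.normal(size=(500, 8)),
                          columns=[f"f{i}" for i in range(8)])
    y_demo = pd.DataFrame({
        "err_a": 2.0 * X_demo["f0"] + rng.normal(scale=0.1, size=500),
        "err_b": X_demo["f1"] - X_demo["f2"] + rng.normal(scale=0.1, size=500),
    })
    keys = ['X_train', 'X_test', 'y_train', 'y_test']
    vals = train_test_split(X_demo, y_demo, random_state=0, test_size=0.2)
    study_linear(dict(zip(keys, vals)))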