You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
120 lines
3.6 KiB
Python
120 lines
3.6 KiB
Python
from normal_use import *
|
|
|
|
|
|
# -------------------------- here is stacking method --------------------------

# Cross-validator shared by get_oof() and stack_model().
# random_state pins the shuffled folds so the stacked features and the
# final blend are reproducible (base models are already seeded via
# SklearnWrapper's seed=0 default; the splitter was the only
# unseeded source of randomness).
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Filled in by stacking_train(); dict with keys 'X_train', 'y_train',
# 'X_test', 'y_test' — presumably pandas objects (iloc is used on
# them downstream); TODO confirm against the caller.
train_test_data = None

# Columns of train_test_data['y_train']; set by stacking_train().
target_cols = None

# First-level (base) regressors and their constructor kwargs,
# zipped pairwise in stacking_train().
stacking_model_regressors = [CatBoostRegressor, LGBMRegressor, XGBRegressor, RandomForestRegressor]
stacking_model_params = [{"silent": True}, {}, {}, {}]

# --------- change the stacking model here, use anything you want! --------- #
# LinearRegression, Ridge, XGBRegressor, CatBoostRegressor, LGBMRegressor
# deepforest.CascadeForestRegressor
REG_TOTAL = Ridge
|
|
|
|
|
|
class SklearnWrapper:
    """Thin adapter giving any sklearn-style regressor class a common
    train()/predict() interface for the stacking loop.

    Parameters
    ----------
    clf : type
        Regressor class (not an instance); it is constructed here.
    seed : int, default 0
        Forwarded to the estimator as ``random_state``.
    params : dict, optional
        Extra constructor kwargs. The dict is copied before use, so
        the caller's mapping is never mutated. (The original code used
        a mutable default ``params={}`` and wrote ``random_state`` into
        it, clobbering the shared dicts in ``stacking_model_params``.)
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy to avoid mutating the caller's dict / the shared
        # mutable default argument.
        params = dict(params or {})
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return predictions from the wrapped estimator."""
        return self.clf.predict(x)
|
|
|
|
|
|
def get_oof(clf, col_name):
    """Generate out-of-fold (OOF) features for one base model.

    Uses the module-level ``kf`` splitter and ``train_test_data``
    (populated by stacking_train). For each fold, the model is fit on
    the training part, predicts the held-out rows (collected into
    ``oof_train``) and the full test set (averaged into ``oof_test``).

    Parameters
    ----------
    clf : SklearnWrapper
        Base model exposing train()/predict().
    col_name : str
        Target column of train_test_data['y_train'] to fit against.

    Returns
    -------
    (ndarray, ndarray)
        Column vectors of shape (n_train, 1) and (n_test, 1), ready to
        be hstacked by stack_model().
    """
    x_train = train_test_data['X_train']
    y_train = train_test_data['y_train'][col_name]
    x_test = train_test_data['X_test']

    # Follow the splitter's configuration instead of hard-coding 5,
    # so changing KFold(n_splits=...) cannot silently break this.
    n_splits = kf.get_n_splits()
    oof_train = np.zeros((x_train.shape[0],))
    oof_test = np.zeros((x_test.shape[0],))
    oof_test_skf = np.empty((n_splits, x_test.shape[0]))

    for i, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):
        trn_x = x_train.iloc[train_index]
        trn_y = y_train.iloc[train_index]
        val_x = x_train.iloc[valid_index]

        clf.train(trn_x, trn_y)
        oof_train[valid_index] = clf.predict(val_x)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
|
|
|
|
|
|
def stack_model(train_stack, test_stack, y):
    """Second level of the stack: blend the base models' OOF features.

    Parameters
    ----------
    train_stack, test_stack : list of ndarray
        One (n, 1) OOF column per base model; hstacked here into the
        second-level feature matrices.
    y : pandas Series
        True target values for the training rows.

    Returns
    -------
    (ndarray, ndarray)
        ``oof``: (n_train,) out-of-fold predictions of the blender.
        (BUG FIX: the original allocated this array but never wrote to
        it, so it always returned zeros.)
        ``predictions``: (n_test, 1) test predictions averaged over the
        folds — kept 2-D because deepforest.CascadeForestRegressor
        needs that shape, per the original comment.
    """
    train_stack = np.hstack(train_stack)
    test_stack = np.hstack(test_stack)

    # Follow the splitter's configuration instead of hard-coding 5.
    n_splits = kf.get_n_splits()
    oof = np.zeros((train_stack.shape[0],))
    predictions = np.zeros((test_stack.shape[0], 1))

    for trn_idx, val_idx in kf.split(train_stack, y):
        trn_data, trn_y = train_stack[trn_idx], y.iloc[trn_idx]
        val_data = train_stack[val_idx]

        clf = REG_TOTAL()
        clf.fit(trn_data, trn_y)

        # Fill the out-of-fold predictions (missing in the original).
        oof[val_idx] = np.asarray(clf.predict(val_data)).reshape(-1)

        # Average each fold's test prediction into the final blend.
        predictions += np.asarray(clf.predict(test_stack)).reshape(-1, 1) / n_splits

    return oof, predictions
|
|
|
|
|
|
def stacking_train(trainset):
    """
    Description
    -----------
    Run the full two-level stacking pipeline: build OOF features with
    every base regressor, blend them with REG_TOTAL, and print a
    per-target evaluation table (RMSE and R^2 on the test split).

    Parameters
    ----------
    trainset : dict
        Train/test data set with keys 'X_train', 'y_train', 'X_test',
        'y_test'; the y_* entries have one column per target.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # get_oof() and stack_model() read these module-level globals.
    global target_cols, train_test_data
    train_test_data = trainset
    target_cols = train_test_data['y_train'].columns

    evals = []
    for col_name in target_cols:
        oof_train = []
        oof_test = []
        y_train = train_test_data['y_train'][col_name]

        # Level 1: each base model contributes one OOF feature column.
        for reg, param in zip(stacking_model_regressors, stacking_model_params):
            regressor = SklearnWrapper(reg, params=param)
            t_train, t_test = get_oof(regressor, col_name=col_name)
            oof_train.append(t_train)
            oof_test.append(t_test)

        # Level 2: blend the OOF columns with REG_TOTAL.
        oof_stack, prediction_stack = stack_model(oof_train, oof_test, y_train)

        y_hat = prediction_stack  # fake
        y = train_test_data['y_test'][col_name]  # real
        # NOTE(review): squared=False is deprecated in recent sklearn
        # (root_mean_squared_error replaces it) — confirm the pinned
        # version before upgrading.
        rmse = metrics.mean_squared_error(y, y_hat, squared=False)
        r2 = metrics.r2_score(y, y_hat)
        evals.append({'Error': col_name, 'RMSE': rmse, 'R^2': r2})
        print(f"{col_name} finished")

    print(f"Stacking -- {REG_TOTAL.__name__}")
    print(pd.DataFrame(evals))
    print("Average R2: ", average_R2(evals))
|
|
|