You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

120 lines
3.6 KiB
Python

from normal_use import *
# ----------------------------- stacking set-up -----------------------------
# One shuffled 5-fold splitter, shared by the base-model pass (get_oof) and
# the meta-model pass (stack_model).
kf = KFold(shuffle=True, n_splits=5)

# Module-level state; both names are assigned by stacking_train().
train_test_data = target_cols = None

# Base learners and their constructor keyword arguments, paired by position.
stacking_model_regressors = [
    CatBoostRegressor,
    LGBMRegressor,
    XGBRegressor,
    RandomForestRegressor,
]
stacking_model_params = [
    {"silent": True},
    {},
    {},
    {},
]

# Meta-learner fitted on the stacked base-model predictions. Swap in any
# regressor class here, e.g. LinearRegression, Ridge, XGBRegressor,
# CatBoostRegressor, LGBMRegressor or deepforest.CascadeForestRegressor.
REG_TOTAL = Ridge
class SklearnWrapper:
    """Adapter that gives an estimator class a uniform train/predict API.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is called as ``clf(**params)``.
    seed : int, default 0
        Forwarded to the estimator as ``random_state``. It overrides any
        ``random_state`` entry already present in *params*.
    params : dict, optional
        Extra keyword arguments for the estimator. The mapping is copied,
        so the caller's dict is never modified.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: writing ``random_state`` into the caller's dict
        # (or into a shared default dict) would leak state between wrappers.
        kwargs = dict(params) if params else {}
        kwargs['random_state'] = seed
        self.clf = clf(**kwargs)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given features and targets."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for *x*."""
        return self.clf.predict(x)
def get_oof(clf, col_name, data=None, cv=None):
    """Compute out-of-fold predictions of one base model for one target.

    The model is re-fitted on every fold. Each training row is predicted
    by the fit that did not see it, and the test set is predicted by every
    fit and then averaged over the folds.

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.
    col_name : hashable
        Column of ``data['y_train']`` to use as the target.
    data : dict, optional
        Mapping with ``'X_train'``, ``'y_train'`` and ``'X_test'`` entries
        that support ``.iloc`` / column indexing. Defaults to the
        module-level ``train_test_data``.
    cv : splitter, optional
        Object with ``split(X, y)`` and ``get_n_splits(X, y)``. Defaults to
        the module-level ``kf``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_train, oof_test)`` with shapes ``(n_train, 1)`` and
        ``(n_test, 1)``.
    """
    if data is None:
        data = train_test_data
    if cv is None:
        cv = kf

    x_train = data['X_train']
    y_train = data['y_train'][col_name]
    x_test = data['X_test']

    # Size the per-fold buffer from the splitter itself; a literal fold
    # count would silently average uninitialised rows (or overflow) as soon
    # as the splitter is configured with a different number of splits.
    n_folds = cv.get_n_splits(x_train, y_train)
    oof_train = np.zeros((x_train.shape[0],))
    oof_test_skf = np.empty((n_folds, x_test.shape[0]))

    for i, (train_index, valid_index) in enumerate(cv.split(x_train, y_train)):
        clf.train(x_train.iloc[train_index], y_train.iloc[train_index])
        # ravel() so models that return column vectors are accepted too.
        oof_train[valid_index] = np.ravel(clf.predict(x_train.iloc[valid_index]))
        oof_test_skf[i, :] = np.ravel(clf.predict(x_test))

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
def stack_model(train_stack, test_stack, y, reg_cls=None, cv=None):
    """Fit the meta-model on stacked base-model predictions.

    Parameters
    ----------
    train_stack : sequence of numpy.ndarray
        Per-base-model out-of-fold predictions for the training rows; they
        are concatenated column-wise with ``np.hstack``.
    test_stack : sequence of numpy.ndarray
        Per-base-model predictions for the test rows, concatenated the
        same way.
    y : pandas.Series
        Training target; indexed positionally via ``.iloc``.
    reg_cls : callable, optional
        Zero-argument factory for the meta-model (needs ``fit`` and
        ``predict``). Defaults to the module-level ``REG_TOTAL``.
    cv : splitter, optional
        Object with ``split(X, y)`` and ``get_n_splits(X, y)``. Defaults to
        the module-level ``kf``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof, predictions)``: out-of-fold meta-model predictions of shape
        ``(n_train,)`` and fold-averaged test predictions of shape
        ``(n_test, 1)``.
    """
    if reg_cls is None:
        reg_cls = REG_TOTAL
    if cv is None:
        cv = kf

    train_stack = np.hstack(train_stack)
    test_stack = np.hstack(test_stack)

    n_folds = cv.get_n_splits(train_stack, y)
    oof = np.zeros((train_stack.shape[0],))
    # Kept as a column vector: some meta-models (e.g.
    # deepforest.CascadeForestRegressor) return 2-D predictions.
    predictions = np.zeros((test_stack.shape[0], 1))

    for trn_idx, val_idx in cv.split(train_stack, y):
        model = reg_cls()
        model.fit(train_stack[trn_idx], y.iloc[trn_idx])
        # Record the held-out predictions; without this the returned
        # out-of-fold vector would stay all zeros.
        oof[val_idx] = np.ravel(model.predict(train_stack[val_idx]))
        fold_pred = np.asarray(model.predict(test_stack)).reshape(-1, 1)
        # Average over the actual number of folds, not a literal.
        predictions += fold_pred / n_folds

    return oof, predictions
def stacking_train(trainset):
    """
    Description
    -----------
    Run the full stacking pipeline and print the evaluation.

    For every target column, each base regressor produces out-of-fold
    predictions via ``get_oof``; ``stack_model`` then fits the meta-model
    on them and predicts the test set, which is scored against
    ``trainset['y_test']``.

    Parameters
    ----------
    trainset : dict
        Must provide ``'X_train'``, ``'y_train'``, ``'X_test'`` and
        ``'y_test'``. ``'y_train'`` needs a ``.columns`` attribute, and
        ``'y_test'`` is indexed by the same column names.

    Returns
    -------
    None
        Results are printed: a per-column table of RMSE and R^2, followed
        by the value of ``average_R2(evals)``.

    Notes
    -----
    Rebinds the module-level ``train_test_data`` and ``target_cols``.
    """
    global target_cols, train_test_data
    train_test_data = trainset
    target_cols = train_test_data['y_train'].columns

    evals = []
    for col_name in target_cols:
        y_train = train_test_data['y_train'][col_name]

        # Level 0: one column of out-of-fold predictions per base model.
        oof_train = []
        oof_test = []
        for reg, param in zip(stacking_model_regressors, stacking_model_params):
            regressor = SklearnWrapper(reg, params=param)
            t_train, t_test = get_oof(regressor, col_name=col_name)
            oof_train.append(t_train)
            oof_test.append(t_test)

        # Level 1: meta-model on the stacked predictions.
        _, prediction_stack = stack_model(oof_train, oof_test, y_train)

        y_hat = prediction_stack  # predicted
        y = train_test_data['y_test'][col_name]  # ground truth
        # Take the root of the MSE directly rather than passing
        # ``squared=False``: that keyword was deprecated and later removed
        # in newer scikit-learn releases (assuming ``metrics`` is
        # ``sklearn.metrics``). For a single target column both are equal.
        rmse = metrics.mean_squared_error(y, y_hat) ** 0.5
        r2 = metrics.r2_score(y, y_hat)
        evals.append({'Error': col_name, 'RMSE': rmse, 'R^2': r2})
        print(f"{col_name} finished")

    print(f"Stacking -- {REG_TOTAL.__name__}")
    print(pd.DataFrame(evals))
    print("Average R2: ", average_R2(evals))