from normal_use import *
# `normal_use` is expected to provide (at least): np, pd, metrics, KFold, Ridge,
# CatBoostRegressor, LGBMRegressor, XGBRegressor, RandomForestRegressor, average_R2

# ------------------------------ stacking method ------------------------------
kf = KFold(n_splits=5, shuffle=True)
train_test_data = None
target_cols = None

# level-1 (base) regressors and their constructor parameters
stacking_model_regressors = [CatBoostRegressor, LGBMRegressor, XGBRegressor, RandomForestRegressor]
stacking_model_params = [{"silent": True}, {}, {}, {}]

# --------- change the stacking (level-2) model here, use anything you want! --------- #
# LinearRegression, Ridge, XGBRegressor, CatBoostRegressor, LGBMRegressor,
# deepforest.CascadeForestRegressor
REG_TOTAL = Ridge


class SklearnWrapper:
    """Thin wrapper that builds an sklearn-style regressor with a fixed random seed."""

    def __init__(self, clf, seed=0, params=None):
        params = dict(params) if params else {}  # copy so the caller's dict is not mutated
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)


def get_oof(clf, col_name):
    """Return out-of-fold train predictions and fold-averaged test predictions for one base model."""
    x_train = train_test_data['X_train']
    y_train = train_test_data['y_train'][col_name]
    x_test = train_test_data['X_test']
    oof_train = np.zeros((x_train.shape[0],))
    oof_test = np.zeros((x_test.shape[0],))
    oof_test_skf = np.empty((kf.get_n_splits(), x_test.shape[0]))

    for i, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):
        trn_x, trn_y = x_train.iloc[train_index], y_train.iloc[train_index]
        val_x, val_y = x_train.iloc[valid_index], y_train.iloc[valid_index]
        clf.train(trn_x, trn_y)
        oof_train[valid_index] = clf.predict(val_x)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)  # average the test predictions over the folds
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


def stack_model(train_stack, test_stack, y):
    """Train the level-2 model (REG_TOTAL) on the stacked out-of-fold features."""
    train_stack = np.hstack(train_stack)
    test_stack = np.hstack(test_stack)
    oof = np.zeros((train_stack.shape[0],))
    # usually we use this
    # predictions = np.zeros((test_stack.shape[0],))
    # deepforest.CascadeForestRegressor needs the 2-D shape below
    predictions = np.zeros((test_stack.shape[0], 1))
    scores = []

    for fold_, (trn_idx, val_idx) in enumerate(kf.split(train_stack, y)):
        trn_data, trn_y = train_stack[trn_idx], y.iloc[trn_idx]
        val_data, val_y = train_stack[val_idx], y.iloc[val_idx]
        clf = REG_TOTAL()
        clf.fit(trn_data, trn_y)
        oof[val_idx] = clf.predict(val_data).reshape(-1)  # out-of-fold predictions of the level-2 model
        tmp = clf.predict(test_stack)
        tmp = tmp.reshape(-1, 1)
        predictions += tmp / kf.get_n_splits()  # average test predictions over the folds

    return oof, predictions


def stacking_train(trainset):
    """
    Description
    -----------
    Run the stacking training and evaluation loop for every target column.

    Parameters
    ----------
    trainset : dict
        Train/test data set of pandas DataFrames keyed by
        'X_train', 'y_train', 'X_test', 'y_test'.

    Returns
    -------
    None
        Prints the per-target RMSE / R^2 table and the average R^2.
    """
    global target_cols, train_test_data
    train_test_data = trainset
    target_cols = train_test_data['y_train'].columns

    evals = []
    for col_name in target_cols:
        oof_train = []
        oof_test = []
        y_train = train_test_data['y_train'][col_name]

        # level 1: out-of-fold predictions from every base regressor
        for reg, param in zip(stacking_model_regressors, stacking_model_params):
            regressor = SklearnWrapper(reg, params=param)
            t_train, t_test = get_oof(regressor, col_name=col_name)
            oof_train.append(t_train)
            oof_test.append(t_test)

        # level 2: fit REG_TOTAL on the stacked out-of-fold features
        oof_stack, prediction_stack = stack_model(oof_train, oof_test, y_train)
        y_hat = prediction_stack                    # stacked prediction
        y = train_test_data['y_test'][col_name]     # ground truth
        rmse = metrics.mean_squared_error(y, y_hat, squared=False)
        r2 = metrics.r2_score(y, y_hat)
        eval_dict = {'Error': col_name, 'RMSE': rmse, 'R^2': r2}
        evals.append(eval_dict)
        print(f"{col_name} finished")

    print(f"Stacking -- {REG_TOTAL.__name__}")
    print(pd.DataFrame(evals))
    print("Average R2: ", average_R2(evals))
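

# ------------------------------- usage sketch -------------------------------
# A minimal, hypothetical example of how stacking_train() could be invoked.
# The tiny random data set below is fabricated purely for illustration, and the
# snippet assumes `normal_use` re-exports numpy as `np` and pandas as `pd`;
# replace the synthetic frames with the project's real train/test split.
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    feature_cols = [f"f{i}" for i in range(6)]

    trainset = {
        "X_train": pd.DataFrame(rng.normal(size=(200, 6)), columns=feature_cols),
        "X_test": pd.DataFrame(rng.normal(size=(50, 6)), columns=feature_cols),
        # y_train / y_test are DataFrames: one column per regression target
        "y_train": pd.DataFrame(rng.normal(size=(200, 2)), columns=["target_a", "target_b"]),
        "y_test": pd.DataFrame(rng.normal(size=(50, 2)), columns=["target_a", "target_b"]),
    }

    stacking_train(trainset)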