normal update, not many things
parent
7bbb84215e
commit
386efc5523
@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"python.analysis.extraPaths": [
|
||||||
|
"./codes",
|
||||||
|
"./regressors"
|
||||||
|
]
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@ -0,0 +1,56 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from itertools import product
|
||||||
|
import joblib
|
||||||
|
|
||||||
|
|
||||||
|
def get_state_vect_cols(prefix=''):
    """Return the six state-vector column names: r_x, r_y, r_z, v_x, v_y, v_z.

    Parameters
    ----------
    prefix : str, optional
        When non-empty, each name is prefixed with ``"<prefix>_"``
        (e.g. ``'start'`` -> ``'start_r_x'``).

    Returns
    -------
    list of str
        Column names in position-before-velocity, x/y/z order.
    """
    if prefix:
        prefix += '_'
    # Nested comprehension walks (r, v) x (x, y, z) in the same order as
    # itertools.product would.
    return [f'{prefix}{vec}_{axis}'
            for vec in ('r', 'v')
            for axis in ('x', 'y', 'z')]
|
||||||
|
|
||||||
|
|
||||||
|
def create_train_data(seed = 0, test_size = 0.2):
    """
    Description
    -----------
    Create a new train/test split from the dataset (.parquet) using ``seed``.

    Parameters
    ----------
    seed : int (default=0)
        Seed for train_test_split; seed == 0 means random (no fixed
        random_state, so a different split on every call).

    test_size : float (default=0.2)
        test_size for train_test_split.

    Returns
    -------
    None, but dumps the split dict (keys X_train/X_test/y_train/y_test)
    to "create_datas/seed_{seed}.td" via joblib.
    """
    df = pd.read_parquet("traindata/physics_preds.parquet")

    # Features: elapsed time plus the physics-model predicted state vector
    # and the starting state vector.
    feature_cols = [
        'elapsed_seconds'
    ] + get_state_vect_cols('physics_pred') + get_state_vect_cols('start')
    print(feature_cols)

    # The target values are the errors between the physical model predictions
    # and the ground truth observations
    target_cols = get_state_vect_cols('physics_err')
    print(target_cols)

    # Create feature and target matrices
    X = df[feature_cols]
    y = df[target_cols]

    data_keys = ['X_train', 'X_test', 'y_train', 'y_test']
    if seed == 0:
        # seed == 0 is the "fully random" sentinel: no random_state passed.
        data_vals = train_test_split(X, y, test_size=test_size)
    else:
        data_vals = train_test_split(X, y, test_size=test_size, random_state=seed)
    train_test_data = dict(zip(data_keys, data_vals))

    joblib.dump(train_test_data, f"create_datas/seed_{seed}.td")
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
# wait for tomorrow!
|
||||||
@ -0,0 +1,90 @@
|
|||||||
|
from normal_use import *
|
||||||
|
|
||||||
|
# Base regressors combined linearly, their constructor kwargs, and the fixed
# combination weight assigned to each (index-aligned; weights sum to 1.0).
sumRegressors = [LGBMRegressor, RandomForestRegressor, XGBRegressor, CatBoostRegressor]
# BUG FIX: Union[type(sumRegressors)] evaluated to Union[list] (type() of a
# list is list) — the intent is a Union over the regressor classes themselves
# for use as a parameter annotation.
sumRegressor = Union[tuple(sumRegressors)]
sumParams = [{},{},{},{"silent": True}]
weight = [0.1, 0.2, 0.3, 0.4]
# Weighted running sums of predictions, keyed by target column
# (filled by eval_linear).
Sums = {}
# Set by only_linear() before any training/evaluation happens.
train_test_data = None
# One {'Regressor', 'Weight'} dict per evaluated regressor, for reporting.
out_weights = []
|
||||||
|
|
||||||
|
def get_random_small_train(X, y, Percentage = 0.8):
    """Randomly keep a ``Percentage`` fraction of (X, y) and return it as
    (X_train, y_train); the remainder is discarded."""
    # train_test_split returns (X_train, X_test, y_train, y_test);
    # we only need the two train parts.
    X_kept, _, y_kept, _ = train_test_split(X, y, test_size=(1-Percentage))
    return X_kept, y_kept
|
||||||
|
|
||||||
|
|
||||||
|
def train_linear(id, regType: sumRegressor, use_RFsample = False):
    """Fit one ``regType`` model per target column on the shared train split
    and dump the dict of fitted models to linear/{name}_{id}.model."""
    X, ys = train_test_data['X_train'], train_test_data['y_train']
    if use_RFsample:
        # X, ys = get_random_small_train(X, ys)
        # which xxx_moon?
        pass
    # Sanity-check shapes/values before the per-column fits.
    check_X_y(X, ys, multi_output=True)
    models = {}
    for target_col in ys.columns:
        reg = regType(**sumParams[id])
        reg.fit(X, ys[target_col])
        models[target_col] = reg
    joblib.dump(models, f"linear/{regType.__name__}_{id}.model")
|
||||||
|
|
||||||
|
|
||||||
|
def eval_linear(id, regType: sumRegressor):
    """Score the saved ``regType`` models on the test split, print RMSE/R^2
    per target, and fold the weighted predictions into module-level ``Sums``."""
    models = joblib.load(f"linear/{regType.__name__}_{id}.model")
    X, ys = train_test_data['X_test'], train_test_data['y_test']
    evals = []
    # Record this regressor's weight for the final report.
    out_weights.append({'Regressor': regType.__name__, 'Weight': weight[id]})
    for target_col, reg in models.items():
        y_hat = reg.predict(X)  # fake
        y = ys[target_col]  # real
        evals.append({
            'Error': target_col,
            'RMSE': metrics.mean_squared_error(y, y_hat, squared=False),
            'R^2': metrics.r2_score(y, y_hat),
        })
        # Accumulate weight[id] * prediction into the running linear sum.
        if Sums.get(target_col) is None:
            Sums[target_col] = weight[id] * y_hat
        else:
            Sums[target_col] += weight[id] * y_hat
    print(f"{regType.__name__}_{id}")
    print(pd.DataFrame(evals))
|
||||||
|
|
||||||
|
|
||||||
|
def only_linear(trainset):
    """
    Description
    -----------
    Train every regressor in ``sumRegressors``, evaluate each one, then score
    the fixed-weight linear combination of their predictions. Edit ``weight``
    and ``sumRegressors`` at module level to change the mix.

    Parameters
    ----------
    trainset : dict
        train_data_set

    Returns
    -------
    print result on screen
    """
    global train_test_data
    train_test_data = trainset

    # Train all base models first, then evaluate (evaluation also fills Sums).
    for idx, regressor in enumerate(sumRegressors):
        train_linear(idx, regressor)
    for idx, regressor in enumerate(sumRegressors):
        eval_linear(idx, regressor)

    ys = train_test_data['y_test']
    evals = []
    # Score the accumulated weighted-sum prediction per target column.
    for target_col in Sums:
        y_hat = Sums[target_col]  # fake
        y = ys[target_col]  # real
        evals.append({
            'Error': target_col,
            'RMSE': metrics.mean_squared_error(y, y_hat, squared=False),
            'R^2': metrics.r2_score(y, y_hat),
        })
    print("linear sum for {} regressors!".format(len(sumRegressors)))
    print(pd.DataFrame(out_weights))
    print(pd.DataFrame(evals))
    print("Average R2: ", average_R2(evals))
|
||||||
@ -0,0 +1,23 @@
|
|||||||
|
from ngboost import NGBRegressor
|
||||||
|
from sklearn.ensemble import RandomForestRegressor
|
||||||
|
from catboost import CatBoostRegressor
|
||||||
|
from lightgbm import LGBMRegressor
|
||||||
|
from xgboost import XGBRegressor
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.linear_model import LinearRegression
|
||||||
|
from sklearn.linear_model import Ridge
|
||||||
|
from sklearn.model_selection import KFold
|
||||||
|
import deepforest
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from typing import Union
|
||||||
|
from sklearn import metrics
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.utils.validation import check_X_y
|
||||||
|
import joblib
|
||||||
|
|
||||||
|
def average_R2(evals):
    """Return the mean of the 'R^2' entries in a list of evaluation dicts.

    Parameters
    ----------
    evals : list of dict
        Each dict must contain an 'R^2' key.

    Returns
    -------
    float
        The arithmetic mean of the R^2 values.

    Raises
    ------
    ZeroDivisionError
        If ``evals`` is empty.
    """
    # Use the builtin sum() instead of shadowing it with a local accumulator.
    return sum(item['R^2'] for item in evals) / len(evals)
|
||||||
@ -0,0 +1,64 @@
|
|||||||
|
from normal_use import *
|
||||||
|
|
||||||
|
|
||||||
|
# Regressors evaluated individually and their constructor kwargs (index-aligned).
Regressors = [NGBRegressor, RandomForestRegressor, CatBoostRegressor, LGBMRegressor, XGBRegressor]
Params = [{}, {}, {"silent": True},{},{}]
# BUG FIX: Union[type(Regressors)] evaluated to Union[list] (type() of a list
# is list) — the intent is a Union over the regressor classes for annotations.
Regressor = Union[tuple(Regressors)]
# Set by train_one_models() before any training/evaluation happens.
train_test_data = None
|
||||||
|
|
||||||
|
|
||||||
|
def train_model(id, regType: Regressor):
    """Fit one ``regType`` model per target column on the shared train split
    and dump the dict of fitted models to models/{name}.model."""
    X, ys = train_test_data['X_train'], train_test_data['y_train']
    # Sanity-check shapes/values before the per-column fits.
    check_X_y(X, ys, multi_output=True)
    models = {}
    for target_col in ys.columns:
        reg = regType(**Params[id])
        reg.fit(X, ys[target_col])
        models[target_col] = reg
        print(regType.__name__, target_col)
    joblib.dump(models, f"models/{regType.__name__}.model")
|
||||||
|
|
||||||
|
|
||||||
|
def eval_model(regType: Regressor):
    """Load the saved ``regType`` models, score them on the test split, and
    print per-target RMSE/R^2 plus the average R^2."""
    models = joblib.load(f"models/{regType.__name__}.model")
    X, ys = train_test_data['X_test'], train_test_data['y_test']
    evals = []
    for target_col, reg in models.items():
        y_hat = reg.predict(X)  # fake
        y = ys[target_col]  # real
        evals.append({
            'Error': target_col,
            'RMSE': metrics.mean_squared_error(y, y_hat, squared=False),
            'R^2': metrics.r2_score(y, y_hat),
        })
    print(regType.__name__)
    print(pd.DataFrame(evals))
    print("Average R2: ", average_R2(evals))
|
||||||
|
|
||||||
|
|
||||||
|
def train_one_models(trainsets):
    """
    Description
    -----------
    Call this to start training each regressor in ``Regressors`` and then
    evaluate every one of them on the test split.

    Parameters
    ----------
    trainsets : dict
        Use joblib to extract the target dataset (create_datas) and put it
        in here (keys X_train/X_test/y_train/y_test).

    Returns
    -------
    No returns, but models are written to folder "models" and R2 is printed
    on screen.
    """
    global train_test_data
    train_test_data = trainsets

    for i, reg in enumerate(Regressors):
        train_model(i, reg)

    for reg in Regressors:
        eval_model(reg)
|
||||||
|
|
||||||
|
|
||||||
@ -0,0 +1,119 @@
|
|||||||
|
from normal_use import *
|
||||||
|
|
||||||
|
|
||||||
|
# -------------------------- here is stacking method --------------------------
# 5-fold CV used to build out-of-fold predictions. shuffle=True with no fixed
# random_state — NOTE(review): folds differ between runs, so results are not
# reproducible; confirm whether that is intended.
kf = KFold(n_splits=5, shuffle=True)
# Both are set by stacking_train() before anything else runs.
train_test_data = None
target_cols = None

# Level-0 (base) regressors and their constructor kwargs, index-aligned.
stacking_model_regressors = [CatBoostRegressor, LGBMRegressor, XGBRegressor, RandomForestRegressor]
stacking_model_params = [{"silent": True}, {}, {}, {}]

# --------- change the stacking model here, use anything you want! --------- #
# LinearRegression, Ridge, XGBRegressor, CatBoostRegressor, LGBMRegressor
# deepforest.CascadeForestRegressor
REG_TOTAL = Ridge
|
||||||
|
|
||||||
|
|
||||||
|
class SklearnWrapper:
    """Thin adapter giving any sklearn-style estimator class a uniform
    train/predict interface, with a seed injected as ``random_state``."""

    def __init__(self, clf, seed=0, params=None):
        """Instantiate ``clf`` with ``params`` plus a fixed random_state.

        BUG FIX: the original signature used a mutable default (``params={}``)
        and wrote 'random_state' directly into the caller's dict, leaking the
        seed into shared param lists and across calls. We now copy the dict
        before mutating it; passing params={} explicitly still works.
        """
        params = {} if params is None else dict(params)
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)
|
||||||
|
|
||||||
|
|
||||||
|
def get_oof(clf, col_name):
    """Compute out-of-fold train predictions and averaged test predictions
    for one base learner ``clf`` (a SklearnWrapper) on target ``col_name``.

    Uses the module-level ``kf`` (5-fold) and ``train_test_data``. Returns a
    pair of column vectors: (oof_train (n_train, 1), oof_test (n_test, 1)).
    """
    x_train = train_test_data['X_train']
    y_train = train_test_data['y_train'][col_name]
    x_test = train_test_data['X_test']

    # oof_train holds each row's prediction from the fold where it was held
    # out; oof_test_skf holds one full test prediction per fold.
    oof_train = np.zeros((x_train.shape[0],))
    oof_test = np.zeros((x_test.shape[0],))
    oof_test_skf = np.empty((5, x_test.shape[0]))

    for i, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):
        trn_x, trn_y, val_x, val_y = x_train.iloc[train_index], y_train.iloc[
            train_index], x_train.iloc[valid_index], y_train.iloc[valid_index]

        # Fit on this fold's train part, predict its held-out part and the
        # full test set.
        clf.train(trn_x, trn_y)
        oof_train[valid_index] = clf.predict(val_x)
        oof_test_skf[i, :] = clf.predict(x_test)

    # Average the 5 per-fold test predictions into one.
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def stack_model(train_stack, test_stack, y):
    """Fit the level-1 model REG_TOTAL on stacked base-learner predictions
    and return (oof, predictions) where predictions is the fold-averaged
    test prediction (shape (n_test, 1)).

    NOTE(review): ``oof`` is returned but never filled (stays all zeros),
    ``scores`` is never appended to, and ``val_data``/``val_y`` are unused —
    looks like per-fold validation scoring was planned but not finished.
    The sole caller ignores the returned ``oof``.
    """
    # Concatenate each base learner's (n, 1) prediction columns side by side.
    train_stack = np.hstack(train_stack)
    test_stack = np.hstack(test_stack)

    oof = np.zeros((train_stack.shape[0],))

    # usually we use this
    # predictions = np.zeros((test_stack.shape[0],))
    # deepforest.CascadeForestRegressor needed below
    predictions = np.zeros((test_stack.shape[0],1))

    scores = []
    for fold_, (trn_idx, val_idx) in enumerate(kf.split(train_stack, y)):
        trn_data, trn_y = train_stack[trn_idx], y.iloc[trn_idx]
        val_data, val_y = train_stack[val_idx], y.iloc[val_idx]

        # Fresh level-1 model per fold, fit on this fold's training rows.
        clf = REG_TOTAL()
        clf.fit(trn_data, trn_y)

        # Average the per-fold test predictions (5 folds -> divide by 5).
        tmp = clf.predict(test_stack)
        tmp = tmp.reshape(-1,1)
        predictions += tmp/5

    return oof, predictions
|
||||||
|
|
||||||
|
|
||||||
|
def stacking_train(trainset):
    """
    Description
    -----------
    start stacking train: for each target column, build out-of-fold
    predictions from every base regressor, stack them, fit REG_TOTAL on the
    stack, and report RMSE/R^2 against the test split.

    Parameters
    ----------
    trainset : dict
        train_data_set (keys X_train/X_test/y_train/y_test)

    Returns
    -------
    print result

    """
    global target_cols, train_test_data
    train_test_data = trainset
    target_cols = train_test_data['y_train'].columns

    evals = []
    # Stack per target column independently (one level-1 model per target).
    for col_name in target_cols:
        oof_train = []
        oof_test = []
        y_train = train_test_data['y_train'][col_name]
        # One out-of-fold prediction column per base regressor.
        for i, (reg, param) in enumerate(zip(stacking_model_regressors, stacking_model_params)):
            regressor = SklearnWrapper(reg, params=param)
            t_train, t_test = get_oof(regressor, col_name=col_name)
            oof_train.append(t_train)
            oof_test.append(t_test)
        # Level-1 fit on the stacked columns; oof_stack is unused here.
        oof_stack, prediction_stack = stack_model(oof_train, oof_test, y_train)

        y_hat = prediction_stack  # fake
        y = train_test_data['y_test'][col_name]  # real
        rmse = metrics.mean_squared_error(y, y_hat, squared=False)
        r2 = metrics.r2_score(y, y_hat)
        eval_dict = {'Error': col_name, 'RMSE': rmse, 'R^2': r2}
        evals.append(eval_dict)
        print(f"{col_name} finished")
    print(f"Stacking -- {REG_TOTAL.__name__}")
    print(pd.DataFrame(evals))
    print("Average R2: ", average_R2(evals))
|
||||||
|
|
||||||
@ -0,0 +1,27 @@
|
|||||||
|
# %%
# Driver notebook-style script: builds a train/test split, then exercises the
# three modelling strategies (single regressors, stacking, linear combination)
# on the same data. Run cell by cell.
import joblib
import sys
# Make the project's module folders importable.
sys.path.append("./codes")
sys.path.append("./regressors")
import create_traindata
import one_regressor
import stacking_regressor
import linear_sum_regressor

# %%
# create train data
seed = 514
create_traindata.create_train_data(seed=seed)
# Load the split that create_train_data just dumped.
train_test_data = joblib.load(f"create_datas/seed_{seed}.td")

# %%
# test one regressor
one_regressor.train_one_models(train_test_data)

# %%
# test stacking method
stacking_regressor.stacking_train(train_test_data)

# %%
# test linear combination
linear_sum_regressor.only_linear(train_test_data)
|
||||||
Binary file not shown.
Binary file not shown.
Loading…
Reference in new issue