normal update, not many things

master
lj020 4 years ago
parent 7bbb84215e
commit 386efc5523

@@ -0,0 +1,6 @@
{
    "python.analysis.extraPaths": [
        "./codes",
        "./regressors"
    ]
}

@@ -1,3 +1,22 @@
# ssa_everyone
more test, more hope
Right now each newly added regressor technique lives in a folder of the same name, which is a bit awkward to use, because tuning happens in the corresponding Python file rather than in one overall test. Will fix that tomorrow.
**Try test_full.py by opening the project right here at the repo root (master); otherwise the relative paths may get confused.**
New regressors go into that folder from now on; they are finally separated out, at last.
Details may still need some polishing. Averaging is in, a seed is now optional, and the trained models are too large to push.
Tomorrow: consider adding an NN architecture and more tests.
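
A minimal sketch of that workflow, assuming the repo root as the working directory (it mirrors test_full.py, included further down in this commit):

import joblib
import sys
sys.path.append("./codes")
sys.path.append("./regressors")
import create_traindata
import one_regressor

seed = 0  # seed = 0 means a fully random split in create_train_data
create_traindata.create_train_data(seed=seed)
train_test_data = joblib.load(f"create_datas/seed_{seed}.td")
one_regressor.train_one_models(train_test_data)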

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,56 @@
import pandas as pd
from sklearn.model_selection import train_test_split
from itertools import product
import joblib


def get_state_vect_cols(prefix=''):
    if prefix:
        prefix += '_'
    vectors = ['r', 'v']
    components = ['x', 'y', 'z']
    col_names = [f'{prefix}{v}_{c}' for v, c in product(vectors, components)]
    return col_names


def create_train_data(seed=0, test_size=0.2):
    """
    Description
    -----------
    create a new train/test split from the dataset (.parquet) using a seed
    Parameters
    ----------
    seed : int (default=0)
        seed for train_test_split; seed = 0 means a fully random split
    test_size : float (default=0.2)
        test_size for train_test_split
    Returns
    -------
    None, but saves the split in folder "create_datas" as "seed_{seed}.td"
    """
    df = pd.read_parquet("traindata/physics_preds.parquet")
    feature_cols = [
        'elapsed_seconds'
    ] + get_state_vect_cols('physics_pred') + get_state_vect_cols('start')
    print(feature_cols)
    # The target values are the errors between the physical model predictions
    # and the ground truth observations
    target_cols = get_state_vect_cols('physics_err')
    print(target_cols)
    # Create feature and target matrices
    X = df[feature_cols]
    y = df[target_cols]
    data_keys = ['X_train', 'X_test', 'y_train', 'y_test']
    if seed == 0:
        data_vals = train_test_split(X, y, test_size=test_size)
    else:
        data_vals = train_test_split(X, y, test_size=test_size, random_state=seed)
    train_test_data = dict(zip(data_keys, data_vals))
    joblib.dump(train_test_data, f"create_datas/seed_{seed}.td")
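
A quick worked example for the get_state_vect_cols helper above: product() iterates 'r' before 'v', with 'x', 'y', 'z' inside each vector (assumes ./codes is on sys.path so create_traindata imports):

from create_traindata import get_state_vect_cols

print(get_state_vect_cols('start'))
# ['start_r_x', 'start_r_y', 'start_r_z', 'start_v_x', 'start_v_y', 'start_v_z']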

Binary file not shown.

Binary file not shown.

@@ -0,0 +1 @@
# wait for tomorrow!

@@ -0,0 +1,90 @@
from normal_use import *

sumRegressors = [LGBMRegressor, RandomForestRegressor, XGBRegressor, CatBoostRegressor]
# type alias for "one of the regressor classes above"
sumRegressor = Type[Union[tuple(sumRegressors)]]
sumParams = [{}, {}, {}, {"silent": True}]
weight = [0.1, 0.2, 0.3, 0.4]
Sums = {}
train_test_data = None
out_weights = []


def get_random_small_train(X, y, percentage=0.8):
    # return a random X_train & y_train subsample
    data_keys = ['X_train', 'X_test', 'y_train', 'y_test']
    data_vals = train_test_split(X, y, test_size=(1 - percentage))
    sub_data = dict(zip(data_keys, data_vals))
    return sub_data['X_train'], sub_data['y_train']


def train_linear(id, regType: sumRegressor, use_RFsample=False):
    X, ys = train_test_data['X_train'], train_test_data['y_train']
    if use_RFsample:
        # TODO: optionally subsample the training set (bagging-style)
        # X, ys = get_random_small_train(X, ys)
        pass
    check_X_y(X, ys, multi_output=True)
    models = {}
    for target_col in ys.columns:
        y = ys[target_col]
        reg = regType(**sumParams[id])
        reg.fit(X, y)
        models[target_col] = reg
    joblib.dump(models, f"linear/{regType.__name__}_{id}.model")


def eval_linear(id, regType: sumRegressor):
    models = joblib.load(f"linear/{regType.__name__}_{id}.model")
    X, ys = train_test_data['X_test'], train_test_data['y_test']
    evals = []
    out_w_dict = {'Regressor': regType.__name__, 'Weight': weight[id]}
    out_weights.append(out_w_dict)
    for target_col, reg in models.items():
        y_hat = reg.predict(X)  # predicted
        y = ys[target_col]  # ground truth
        rmse = metrics.mean_squared_error(y, y_hat, squared=False)
        r2 = metrics.r2_score(y, y_hat)
        eval_dict = {'Error': target_col, 'RMSE': rmse, 'R^2': r2}
        evals.append(eval_dict)
        # accumulate this regressor's weighted predictions for the combination
        if Sums.get(target_col) is None:
            Sums[target_col] = weight[id] * y_hat
        else:
            Sums[target_col] += weight[id] * y_hat
    print(f"{regType.__name__}_{id}")
    print(pd.DataFrame(evals))


def only_linear(trainset):
    """
    Description
    -----------
    evaluate a weighted linear combination; edit `weight` and `sumRegressors`
    above to change the mix
    Parameters
    ----------
    trainset : dict
        the train/test data dict
    Returns
    -------
    None; prints results on screen
    """
    global train_test_data
    train_test_data = trainset
    for i, reg in enumerate(sumRegressors):
        train_linear(i, reg)
    for i, reg in enumerate(sumRegressors):
        eval_linear(i, reg)
    ys = train_test_data['y_test']
    evals = []
    for target_col in Sums:
        y_hat = Sums[target_col]  # combined prediction
        y = ys[target_col]  # ground truth
        rmse = metrics.mean_squared_error(y, y_hat, squared=False)
        r2 = metrics.r2_score(y, y_hat)
        eval_dict = {'Error': target_col, 'RMSE': rmse, 'R^2': r2}
        evals.append(eval_dict)
    print(f"linear sum of {len(sumRegressors)} regressors!")
    print(pd.DataFrame(out_weights))
    print(pd.DataFrame(evals))
    print("Average R2: ", average_R2(evals))

@@ -0,0 +1,23 @@
from ngboost import NGBRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
import deepforest
import numpy as np
import pandas as pd
from typing import Type, Union
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_X_y
import joblib


def average_R2(evals):
    # mean of the 'R^2' entries in a list of eval dicts
    total = 0
    for item in evals:
        total += item['R^2']
    return total / len(evals)

@@ -0,0 +1,64 @@
from normal_use import *

Regressors = [NGBRegressor, RandomForestRegressor, CatBoostRegressor, LGBMRegressor, XGBRegressor]
Params = [{}, {}, {"silent": True}, {}, {}]
# type alias for "one of the regressor classes above"
Regressor = Type[Union[tuple(Regressors)]]
train_test_data = None


def train_model(id, regType: Regressor):
    X, ys = train_test_data['X_train'], train_test_data['y_train']
    check_X_y(X, ys, multi_output=True)
    models = {}
    for target_col in ys.columns:
        y = ys[target_col]
        reg = regType(**Params[id])
        reg.fit(X, y)
        models[target_col] = reg
        print(regType.__name__, target_col)
    joblib.dump(models, f"models/{regType.__name__}.model")


def eval_model(regType: Regressor):
    models = joblib.load(f"models/{regType.__name__}.model")
    X, ys = train_test_data['X_test'], train_test_data['y_test']
    evals = []
    for target_col, reg in models.items():
        y_hat = reg.predict(X)  # predicted
        y = ys[target_col]  # ground truth
        rmse = metrics.mean_squared_error(y, y_hat, squared=False)
        r2 = metrics.r2_score(y, y_hat)
        eval_dict = {'Error': target_col, 'RMSE': rmse, 'R^2': r2}
        evals.append(eval_dict)
    print(regType.__name__)
    print(pd.DataFrame(evals))
    print("Average R2: ", average_R2(evals))


def train_one_models(trainsets):
    """
    Description
    -----------
    call this to start training each regressor
    Parameters
    ----------
    trainsets : dict
        the train/test dict loaded with joblib from "create_datas"
    Returns
    -------
    None, but saves models in folder "models" and prints RMSE/R^2 on screen
    """
    global train_test_data
    train_test_data = trainsets
    for i, reg in enumerate(Regressors):
        train_model(i, reg)
    for reg in Regressors:
        eval_model(reg)
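
Because train_model stores one single-output regressor per physics_err target in a dict, a saved bundle can be inspected after the fact. A sketch ("models/XGBRegressor.model" is one of the files train_model writes):

import joblib

models = joblib.load("models/XGBRegressor.model")
for target_col, reg in models.items():
    print(target_col, type(reg).__name__)  # e.g. physics_err_r_x XGBRegressor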

@@ -0,0 +1,119 @@
from normal_use import *

# -------------------------- here is the stacking method --------------------------
kf = KFold(n_splits=5, shuffle=True)
train_test_data = None
target_cols = None
stacking_model_regressors = [CatBoostRegressor, LGBMRegressor, XGBRegressor, RandomForestRegressor]
stacking_model_params = [{"silent": True}, {}, {}, {}]
# --------- change the stacking (meta) model here, use anything you want! --------- #
# LinearRegression, Ridge, XGBRegressor, CatBoostRegressor, LGBMRegressor
# deepforest.CascadeForestRegressor
REG_TOTAL = Ridge


class SklearnWrapper:
    def __init__(self, clf, seed=0, params=None):
        # copy the params dict so a shared default is never mutated
        params = dict(params or {})
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)


def get_oof(clf, col_name):
    # out-of-fold (OOF) predictions: each training row is predicted by the
    # fold model that did not see it; test predictions are averaged over folds
    x_train = train_test_data['X_train']
    y_train = train_test_data['y_train'][col_name]
    x_test = train_test_data['X_test']
    oof_train = np.zeros((x_train.shape[0],))
    oof_test = np.zeros((x_test.shape[0],))
    oof_test_skf = np.empty((kf.n_splits, x_test.shape[0]))
    for i, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):
        trn_x, trn_y = x_train.iloc[train_index], y_train.iloc[train_index]
        val_x, val_y = x_train.iloc[valid_index], y_train.iloc[valid_index]
        clf.train(trn_x, trn_y)
        oof_train[valid_index] = clf.predict(val_x)
        oof_test_skf[i, :] = clf.predict(x_test)
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


def stack_model(train_stack, test_stack, y):
    train_stack = np.hstack(train_stack)
    test_stack = np.hstack(test_stack)
    oof = np.zeros((train_stack.shape[0],))
    # usually we use this
    # predictions = np.zeros((test_stack.shape[0],))
    # deepforest.CascadeForestRegressor needs the 2-D shape below
    predictions = np.zeros((test_stack.shape[0], 1))
    for fold_, (trn_idx, val_idx) in enumerate(kf.split(train_stack, y)):
        trn_data, trn_y = train_stack[trn_idx], y.iloc[trn_idx]
        val_data, val_y = train_stack[val_idx], y.iloc[val_idx]
        clf = REG_TOTAL()
        clf.fit(trn_data, trn_y)
        # keep the meta-model's out-of-fold predictions as well
        oof[val_idx] = np.asarray(clf.predict(val_data)).reshape(-1)
        # average the test-set predictions over the folds
        predictions += np.asarray(clf.predict(test_stack)).reshape(-1, 1) / kf.n_splits
    return oof, predictions


def stacking_train(trainset):
    """
    Description
    -----------
    run stacking training and evaluation
    Parameters
    ----------
    trainset : dict
        the train/test data dict
    Returns
    -------
    None; prints results on screen
    """
    global target_cols, train_test_data
    train_test_data = trainset
    target_cols = train_test_data['y_train'].columns
    evals = []
    for col_name in target_cols:
        oof_train = []
        oof_test = []
        y_train = train_test_data['y_train'][col_name]
        for i, (reg, param) in enumerate(zip(stacking_model_regressors, stacking_model_params)):
            regressor = SklearnWrapper(reg, params=param)
            t_train, t_test = get_oof(regressor, col_name=col_name)
            oof_train.append(t_train)
            oof_test.append(t_test)
        oof_stack, prediction_stack = stack_model(oof_train, oof_test, y_train)
        y_hat = prediction_stack  # predicted
        y = train_test_data['y_test'][col_name]  # ground truth
        rmse = metrics.mean_squared_error(y, y_hat, squared=False)
        r2 = metrics.r2_score(y, y_hat)
        eval_dict = {'Error': col_name, 'RMSE': rmse, 'R^2': r2}
        evals.append(eval_dict)
        print(f"{col_name} finished")
    print(f"Stacking -- {REG_TOTAL.__name__}")
    print(pd.DataFrame(evals))
    print("Average R2: ", average_R2(evals))

@@ -0,0 +1,27 @@
# %%
import joblib
import sys
sys.path.append("./codes")
sys.path.append("./regressors")
import create_traindata
import one_regressor
import stacking_regressor
import linear_sum_regressor
# %%
# create train data
seed = 514
create_traindata.create_train_data(seed=seed)
train_test_data = joblib.load(f"create_datas/seed_{seed}.td")
# %%
# test one regressor
one_regressor.train_one_models(train_test_data)
# %%
# test stacking method
stacking_regressor.stacking_train(train_test_data)
# %%
# test linear combination
linear_sum_regressor.only_linear(train_test_data)

Binary file not shown.

Binary file not shown.