You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
115 lines
3.0 KiB
Python
115 lines
3.0 KiB
Python
# %%
|
|
from typing import Union
|
|
import pandas as pd
|
|
|
|
# %%
|
|
from itertools import product
|
|
|
|
|
|
def get_state_vect_cols(prefix=''):
    """Return the six state-vector column names, optionally prefixed.

    The names are built by crossing the vectors ``'r'`` and ``'v'``
    (presumably position and velocity) with the components ``'x'``,
    ``'y'``, ``'z'``, giving ``r_x, r_y, r_z, v_x, v_y, v_z`` in that order.

    Args:
        prefix: Optional label prepended as ``'<prefix>_'``. Any falsy
            value (``''`` or ``None``) means "no prefix".

    Returns:
        A list of six column-name strings.
    """
    # Normalise falsy prefixes to '' so that None is not formatted into the
    # names as the literal text 'None'.
    lead = f'{prefix}_' if prefix else ''
    vectors = ['r', 'v']
    components = ['x', 'y', 'z']
    return [f'{lead}{v}_{c}' for v, c in product(vectors, components)]
|
|
|
|
|
|
# %%
|
|
# Load the physics-model predictions and derive a hold-out and a training view.
df = pd.read_parquet("traindata/physics_preds.parquet")

# Hold-out view: every row whose aso_id is "05277".
test_set = df[df['aso_id'] == "05277"]

# Training view: each aso_id group minus its last three rows.
# head(-3) drops the final three rows directly.  The earlier
# head(count()[0] - 3) evaluated head(-1) for a two-row group (keeping a row
# that should have been dropped) and counted non-null cells of the first
# column instead of rows.
train_set = df.groupby('aso_id').apply(lambda x: x.head(-3))

# Row counts of the full, training and hold-out frames.  len() counts rows and
# avoids positional [0] indexing on the label-indexed Series from count().
print(len(df), len(train_set), len(test_set))

# Bare expression so the notebook cell displays the hold-out frame.
test_set
|
|
|
|
# %%
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
# Model inputs: elapsed time, followed by the physics-predicted state vector
# and the starting state vector.
feature_cols = [
    'elapsed_seconds',
    *get_state_vect_cols('physics_pred'),
    *get_state_vect_cols('start'),
]
print(feature_cols)

# Model targets: the errors between the physical model's predictions and the
# ground-truth observations.
target_cols = get_state_vect_cols('physics_err')
print(target_cols)

# Feature matrix and target matrix, both taken from the full frame.
X, y = df[feature_cols], df[target_cols]

# Random 80/20 split; the four resulting pieces are stored under named keys.
data_keys = ['X_train', 'X_test', 'y_train', 'y_test']
data_vals = train_test_split(X, y, test_size=0.2)
train_test_data = {key: val for key, val in zip(data_keys, data_vals)}
|
|
# train_test_data['X_test'] = test_set[feature_cols]
|
|
# train_test_data['y_test'] = test_set[target_cols]
|
|
# train_test_data = {
|
|
# 'X_train': train_set[feature_cols],
|
|
# 'y_train': train_set[target_cols],
|
|
# 'X_test': test_set[feature_cols],
|
|
# 'y_test': test_set[target_cols],
|
|
# }
|
|
|
|
# %%
|
|
from sklearn.utils.validation import check_X_y
|
|
import joblib
|
|
|
|
from catboost import CatBoostRegressor
|
|
from sklearn.ensemble import RandomForestRegressor
|
|
|
|
# Alias used to annotate the `regType` parameter of train_model / eval_model.
# NOTE(review): those functions call `regType()` and read `regType.__name__`,
# i.e. they receive the regressor class itself rather than an instance, so the
# annotation is looser than the actual usage.
Regressor = Union[CatBoostRegressor, RandomForestRegressor]
|
|
|
|
|
|
def train_model(regType: Regressor):
    """Fit one regressor per target column and save them all to one file.

    Reads the training split from the module-level ``train_test_data`` dict,
    fits a fresh ``regType()`` on the features for each target column, and
    dumps the resulting ``{target_col: fitted_model}`` dict with joblib to
    ``models/<regType.__name__>.model``.

    Args:
        regType: The regressor *class* (not an instance). It is called with
            no arguments to build each model, and its ``__name__`` is used
            as the output file name.
    """
    import os

    X, ys = train_test_data['X_train'], train_test_data['y_train']
    # scikit-learn input validation; used for its checks only, so the
    # returned arrays are intentionally discarded.
    check_X_y(X, ys, multi_output=True)
    models = {}

    for target_col in ys.columns:
        y1 = ys[target_col]
        print(X.shape, y1.shape)
        reg = regType()
        reg.fit(X, y1)
        models[target_col] = reg
        print(target_col)

    # Create the output directory up front so that a missing "models/" folder
    # cannot make the dump fail after all the training work has been done.
    os.makedirs("models", exist_ok=True)
    joblib.dump(models, f"models/{regType.__name__}.model")
|
|
|
|
|
|
# Train and save a model set for every enabled regressor class.
# Candidate list kept for reference (only CatBoost is enabled):
# CatBoostRegressor, LGBMRegressor, XGBRegressor, RandomForestRegressor
for reg in (CatBoostRegressor,):
    train_model(reg)
|
|
|
|
# %%
|
|
from sklearn import metrics
|
|
|
|
|
|
def eval_model(regType: Regressor):
    """Load the saved per-target models and print their test-set metrics.

    Loads ``models/<regType.__name__>.model`` (a ``{target_col: model}``
    dict), predicts each target column from the test features held in the
    module-level ``train_test_data`` dict, and prints the class name followed
    by a DataFrame with one row per target: RMSE, R^2 and the max / min /
    mean absolute error.

    Args:
        regType: The regressor *class* whose ``__name__`` identifies the
            saved model file.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files produced by this script.
    models = joblib.load(f"models/{regType.__name__}.model")
    X, ys = train_test_data['X_test'], train_test_data['y_test']
    evals = []

    for target_col, reg in models.items():
        y_hat = reg.predict(X)  # model prediction
        y = ys[target_col]  # held-out true value
        dy = (y - y_hat).abs()
        # RMSE as the square root of the MSE: the `squared=False` keyword was
        # deprecated and later removed from scikit-learn, whereas this form
        # works on every version.
        rmse = metrics.mean_squared_error(y, y_hat) ** 0.5
        r2 = metrics.r2_score(y, y_hat)
        eval_dict = {
            'Error': target_col,
            'RMSE': rmse,
            'R^2': r2,
            "err_max": dy.max(),
            "err_min": dy.min(),
            "err_mean": dy.mean(),
        }
        evals.append(eval_dict)

    print(regType.__name__)
    print(pd.DataFrame(evals))
|
|
|
|
|
|
# Print the evaluation table for every enabled regressor class.
for reg in (CatBoostRegressor,):
    eval_model(reg)
|
|
# %%
|