# ssa/train_model.py

# %%
from typing import Union
import pandas as pd

# %%
from itertools import product


def get_state_vect_cols(prefix=''):
    """Build the six state-vector column names, optionally prefixed.

    The names pair each vector (``r``, ``v``) with each component
    (``x``, ``y``, ``z``) as ``<vector>_<component>``, ordered
    vector-first: ``r_x, r_y, r_z, v_x, v_y, v_z``.

    Args:
        prefix: Optional label placed in front of every name and joined
            to it with an underscore. A falsy prefix adds no underscore.

    Returns:
        A list of six column-name strings.
    """
    lead = prefix + '_' if prefix else prefix
    return [
        f'{lead}{vec}_{axis}'
        for vec in ('r', 'v')
        for axis in ('x', 'y', 'z')
    ]


# %%
# Load the physics-model predictions table.
df = pd.read_parquet("traindata/physics_preds.parquet")
# Candidate held-out set: every row whose aso_id is "05277".
test_set = df[df['aso_id'] == "05277"]

# Candidate training set: each aso_id group without its last three rows.
# `iloc[:-3]` replaces `head(count()[0] - 3)`: `count()[0]` indexed a
# string-labelled Series by position (deprecated in pandas) and counted
# non-null values of the first column instead of rows, and for a group with
# fewer than three rows the resulting negative `head` argument kept rows
# instead of dropping them all.
train_set = df.groupby('aso_id').apply(lambda x: x.iloc[:-3])
# Row counts of the full, training and held-out frames.
print(len(df), len(train_set), len(test_set))
# Bare expression so a notebook cell displays the held-out frame.
test_set

# %%
from sklearn.model_selection import train_test_split

# Model inputs: the `elapsed_seconds` column followed by the
# physics-predicted and the starting state-vector columns.
feature_cols = [
    'elapsed_seconds',
    *get_state_vect_cols('physics_pred'),
    *get_state_vect_cols('start'),
]
print(feature_cols)
# Model outputs: the errors between the physical model predictions and
# the ground truth observations.
target_cols = get_state_vect_cols('physics_err')
print(target_cols)
# Names under which the four pieces of the split are stored, in the order
# train_test_split returns them.
data_keys = ['X_train', 'X_test', 'y_train', 'y_test']
# Slice the feature and target matrices out of the full frame and hold
# out a random 20% of the rows for testing.
X = df[feature_cols]
y = df[target_cols]
data_vals = train_test_split(X, y, test_size=0.2)
train_test_data = dict(zip(data_keys, data_vals))
# Disabled alternatives that evaluate on (or train and evaluate with) the
# per-aso_id train_set / test_set frames instead of the random split:
# train_test_data['X_test'] = test_set[feature_cols]
# train_test_data['y_test'] = test_set[target_cols]
# train_test_data = {
#     'X_train': train_set[feature_cols],
#     'y_train': train_set[target_cols],
#     'X_test': test_set[feature_cols],
#     'y_test': test_set[target_cols],
# }

# %%
from sklearn.utils.validation import check_X_y
import joblib

from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor

# Alias covering the two regressor types imported above. train_model and
# eval_model use it to annotate `regType`, which they treat as the class
# itself: it is instantiated with `regType()` and read via `regType.__name__`.
Regressor = Union[CatBoostRegressor, RandomForestRegressor]


def train_model(regType: Regressor):
    """Fit one regressor per target column and save them all to disk.

    Training data is read from the module-level ``train_test_data`` dict
    (keys ``X_train`` and ``y_train``). A fresh ``regType()`` with default
    settings is fitted for every column of ``y_train``. The resulting
    ``{target column: fitted model}`` dict is written with joblib to
    ``models/<regType.__name__>.model``.

    Args:
        regType: The regressor *class* (not an instance) to instantiate
            for each target column.
    """
    import os

    X, ys = train_test_data['X_train'], train_test_data['y_train']
    # Validation only: raises if X and ys are inconsistent; result unused.
    check_X_y(X, ys, multi_output=True)
    # Create the output directory before training so a missing `models/`
    # folder cannot make joblib.dump fail after all the fitting is done.
    os.makedirs("models", exist_ok=True)
    models = {}

    for target_col in ys.columns:
        y1 = ys[target_col]
        print(X.shape, y1.shape)
        reg = regType()
        reg.fit(X, y1)
        models[target_col] = reg
        # Progress marker: this target's model has finished fitting.
        print(target_col)
    joblib.dump(models, f"models/{regType.__name__}.model")


# Train and save models for every regressor class in this tuple. Only
# CatBoostRegressor is enabled; the previously commented-out list also named
# LGBMRegressor, XGBRegressor and RandomForestRegressor.
for reg in (CatBoostRegressor,):
    train_model(reg)

# %%
from sklearn import metrics


def eval_model(regType: Regressor):
    """Print per-target test metrics for the saved models of ``regType``.

    Loads the ``{target column: fitted model}`` dict from
    ``models/<regType.__name__>.model`` and scores each model on the
    module-level ``train_test_data`` test split (keys ``X_test`` and
    ``y_test``). For every target it computes RMSE, R^2 and the max, min
    and mean absolute error, then prints the class name followed by a
    DataFrame with one row per target.

    Args:
        regType: The regressor *class* whose saved models are evaluated;
            only its ``__name__`` is used, to locate the model file.
    """
    models = joblib.load(f"models/{regType.__name__}.model")
    X, ys = train_test_data['X_test'], train_test_data['y_test']
    evals = []
    for target_col, reg in models.items():
        y_pred = reg.predict(X)  # model output for the test features
        y_true = ys[target_col]  # held-out target values
        abs_err = (y_true - y_pred).abs()
        # Take the square root of the MSE instead of passing
        # `squared=False`, which newer scikit-learn releases no longer
        # accept; the value is the same RMSE.
        rmse = metrics.mean_squared_error(y_true, y_pred) ** 0.5
        r2 = metrics.r2_score(y_true, y_pred)
        evals.append({
            'Error': target_col,
            'RMSE': rmse,
            'R^2': r2,
            "err_max": abs_err.max(),
            "err_min": abs_err.min(),
            "err_mean": abs_err.mean(),
        })
    print(regType.__name__)
    print(pd.DataFrame(evals))


# Evaluate the saved models of every regressor class in this tuple.
for reg in (CatBoostRegressor,):
    eval_model(reg)
# %%