# %%
from pathlib import Path
from typing import Type, Union

import pandas as pd

# %%
from itertools import product


def get_state_vect_cols(prefix=''):
    """Return the six state-vector column names for a given prefix.

    The names are built as ``{prefix}_{vector}_{component}`` for every
    combination of vector (``r``, ``v``) and component (``x``, ``y``, ``z``),
    in that order: ``r_x, r_y, r_z, v_x, v_y, v_z``.
    (``r``/``v`` are presumably position and velocity -- confirm against
    the dataset schema.)

    Args:
        prefix: Optional column-name prefix. When non-empty, it is joined to
            the rest of the name with an underscore.

    Returns:
        A list of six column-name strings.
    """
    if prefix:
        prefix += '_'
    vectors = ['r', 'v']
    components = ['x', 'y', 'z']
    col_names = [f'{prefix}{v}_{c}' for v, c in product(vectors, components)]
    return col_names


# %%
df = pd.read_parquet("traindata/physics_preds.parquet")

# Hold-out candidates: every row of one specific object, and (for training)
# every row of each object except its last three.
test_set = df[df['aso_id'] == "05277"]
train_set = df.groupby('aso_id').apply(
    # Clamp at 0: a negative argument to ``head`` means "all but the last
    # |n| rows", so a group with fewer than 3 rows would otherwise keep rows
    # instead of contributing none.
    lambda grp: grp.head(max(grp.count().iloc[0] - 3, 0))
)

# ``count()`` yields per-column non-null counts; ``.iloc[0]`` takes the first
# column's count explicitly (plain ``[0]`` on a label-indexed Series is a
# deprecated positional lookup).
print(
    df.count().iloc[0],
    train_set.count().iloc[0],
    test_set.count().iloc[0],
)
test_set  # displayed as the cell output when run interactively

# %%
from sklearn.model_selection import train_test_split

feature_cols = [
    'elapsed_seconds'
] + get_state_vect_cols('physics_pred') + get_state_vect_cols('start')
print(feature_cols)

# The target values are the errors between the physical model predictions
# and the ground truth observations
target_cols = get_state_vect_cols('physics_err')
print(target_cols)

# Create feature and target matrices
X = df[feature_cols]
y = df[target_cols]

# Fixed seed: the models are persisted to disk and evaluated after reloading,
# so the split must be identical between runs -- otherwise a re-run could
# evaluate a saved model on rows it was trained on.
SPLIT_SEED = 42

data_keys = ['X_train', 'X_test', 'y_train', 'y_test']
data_vals = train_test_split(X, y, test_size=0.2, random_state=SPLIT_SEED)
train_test_data = dict(zip(data_keys, data_vals))

# Alternative split (disabled) based on train_set / test_set from the
# previous cell:
# train_test_data['X_test'] = test_set[feature_cols]
# train_test_data['y_test'] = test_set[target_cols]
# train_test_data = {
#     'X_train': train_set[feature_cols],
#     'y_train': train_set[target_cols],
#     'X_test': test_set[feature_cols],
#     'y_test': test_set[target_cols],
# }

# %%
from sklearn.utils.validation import check_X_y
import joblib
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor

Regressor = Union[CatBoostRegressor, RandomForestRegressor]


def train_model(regType: Type[Regressor]):
    """Fit one regressor per target column and save them to disk.

    A fresh ``regType()`` instance (default hyper-parameters) is fitted on the
    training features for each column of the training targets. The resulting
    ``{target_col: fitted_regressor}`` dict is written with joblib to
    ``models/<regType.__name__>.model``.

    Args:
        regType: The regressor *class* (not an instance) to instantiate.
    """
    X_train = train_test_data['X_train']
    y_train = train_test_data['y_train']
    # Validates shapes/values up front; the converted arrays are discarded.
    check_X_y(X_train, y_train, multi_output=True)

    models = {}
    for target_col in y_train.columns:
        y_col = y_train[target_col]
        print(X_train.shape, y_col.shape)
        reg = regType()
        reg.fit(X_train, y_col)
        models[target_col] = reg
        print(target_col)

    # joblib.dump does not create missing parent directories.
    Path("models").mkdir(parents=True, exist_ok=True)
    joblib.dump(models, f"models/{regType.__name__}.model")


for reg in [
    #CatBoostRegressor, LGBMRegressor, XGBRegressor, RandomForestRegressor,
    CatBoostRegressor
]:
    train_model(reg)

# %%
from sklearn import metrics


def eval_model(regType: Type[Regressor]):
    """Load the saved models for ``regType`` and print test-set metrics.

    Reads the ``{target_col: regressor}`` dict from
    ``models/<regType.__name__>.model`` and, for every target column, prints
    RMSE, R^2 and the max/min/mean absolute error on the test split.

    Args:
        regType: The regressor class whose saved models should be evaluated.
    """
    # NOTE: joblib.load unpickles the file; only load model files that were
    # produced by this script / a trusted source.
    models = joblib.load(f"models/{regType.__name__}.model")
    X_test = train_test_data['X_test']
    y_test = train_test_data['y_test']

    evals = []
    for target_col, reg in models.items():
        y_hat = reg.predict(X_test)  # predicted
        y_true = y_test[target_col]  # observed
        abs_err = (y_true - y_hat).abs()
        # Take the root explicitly: the ``squared=False`` keyword was
        # deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = metrics.mean_squared_error(y_true, y_hat) ** 0.5
        r2 = metrics.r2_score(y_true, y_hat)
        evals.append({
            'Error': target_col,
            'RMSE': rmse,
            'R^2': r2,
            "err_max": abs_err.max(),
            "err_min": abs_err.min(),
            "err_mean": abs_err.mean(),
        })

    print(regType.__name__)
    print(pd.DataFrame(evals))


for reg in [
    CatBoostRegressor,
]:
    eval_model(reg)

# %%