Sat, 16 Oct 2021 20:49:55 GMT
parent
5b65d871a5
commit
1135697a8a
@ -0,0 +1,3 @@
|
||||
{
|
||||
"python.pythonPath": "D:\\PortableApps\\Python\\3.9\\Scripts\\python.exe"
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,95 @@
|
||||
# To add a new cell, type '# %%'
|
||||
# To add a new markdown cell, type '# %% [markdown]'
|
||||
# %%
|
||||
import pandas as pd
|
||||
|
||||
# %%
|
||||
from itertools import product
|
||||
|
||||
|
||||
def get_state_vect_cols(prefix=''):
    """Return the six state-vector column names (r/v crossed with x/y/z).

    If *prefix* is non-empty it is prepended with an underscore, e.g.
    prefix='physics_pred' -> ['physics_pred_r_x', ..., 'physics_pred_v_z'].
    """
    label = f'{prefix}_' if prefix else ''
    return [
        f'{label}{vect}_{axis}'
        for vect in ('r', 'v')          # position, then velocity
        for axis in ('x', 'y', 'z')     # cartesian components
    ]
|
||||
|
||||
|
||||
# %%
# Load the physics-model predictions and carve out train/test sets.
df = pd.read_parquet("traindata/physics_preds.parquet")

# Hold out every observation of a single ASO as the test set.
test_set = df[df['aso_id'] == "05277"]

# Per-ASO alternative train split: all but the last 3 rows of each object.
# Use len(x) rather than x.count()[0]: DataFrame.count() skips NaNs in the
# first column and would under-count rows when values are missing.
train_set = df.groupby('aso_id').apply(lambda x: x.head(len(x) - 3))

print(len(df), len(train_set), len(test_set))
test_set
|
||||
|
||||
# %%
from sklearn.model_selection import train_test_split

# Features: elapsed time plus the physics-model predicted state vector and
# the starting state vector.
feature_cols = [
    'elapsed_seconds'
] + get_state_vect_cols('physics_pred') + get_state_vect_cols('start')
print(feature_cols)

# The target values are the errors between the physical model predictions
# and the ground truth observations
target_cols = get_state_vect_cols('physics_err')
print(target_cols)

# Exclude the held-out ASO before splitting: the previous version split the
# full frame, so rows of the test object leaked into the training set.
train_df = df[df['aso_id'] != "05277"]

# Create feature and target matrices
X = train_df[feature_cols]
y = train_df[target_cols]

data_keys = ['X_train', 'X_test', 'y_train', 'y_test']
data_vals = train_test_split(X, y, test_size=0.2)
train_test_data = dict(zip(data_keys, data_vals))

# Evaluation always uses the held-out ASO, not the random split's test fold.
train_test_data['X_test'] = test_set[feature_cols]
train_test_data['y_test'] = test_set[target_cols]
|
||||
|
||||
# %%
from sklearn.utils.validation import check_X_y
import pickle

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor


def train_model(regType):
    """Fit one regressor per target column and pickle the dict of models.

    regType: a scikit-learn-style regressor class, instantiated with no
    arguments. One independent single-output model is trained per error
    component, then the {column_name: fitted_model} dict is written to
    models/model_<ClassName>.pickle.
    """
    X, ys = train_test_data['X_train'], train_test_data['y_train']
    # Validate shapes/dtypes up front (multi_output: ys has six columns).
    check_X_y(X, ys, multi_output=True)
    models = {}
    for target_col in ys.columns:
        y = ys[target_col]
        reg = regType()
        # NOTE(review): verbose=False is accepted by CatBoost/XGBoost fit;
        # confirm the installed LightGBM version still takes it.
        reg.fit(X, y, verbose=False)
        models[target_col] = reg
        print(target_col)
    # (removed a stray no-op expression statement `CatBoostRegressor.__name__`
    # that evaluated the class name and discarded it)
    with open(f"models/model_{regType.__name__}.pickle", "wb") as f:
        pickle.dump(models, f)


for reg in [CatBoostRegressor, LGBMRegressor, XGBRegressor]:
    train_model(reg)
|
||||
|
||||
# %%
from sklearn import metrics

# Load the CatBoost models saved by train_model above. The previous path
# "cat.pickle" could never match the f"models/model_{...}.pickle" naming
# used when the models were written.
with open("models/model_CatBoostRegressor.pickle", "rb") as f:
    models = pickle.load(f)

X, ys = train_test_data['X_test'], train_test_data['y_test']
evals = []
for target_col, reg in models.items():
    y_hat = reg.predict(X)  # model-predicted physics error
    y = ys[target_col]      # ground-truth physics error
    # squared=False makes mean_squared_error return the RMSE directly.
    rmse = metrics.mean_squared_error(y, y_hat, squared=False)
    r2 = metrics.r2_score(y, y_hat)
    evals.append({'Error': target_col, 'RMSE': rmse, 'R^2': r2})

pd.DataFrame(evals)
|
||||
|
||||
# %%
|
||||
Loading…
Reference in new issue