You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
57 lines
1.6 KiB
Python
57 lines
1.6 KiB
Python
import pandas as pd
|
|
from sklearn.model_selection import train_test_split
|
|
from itertools import product
|
|
import joblib
|
|
|
|
|
|
def get_state_vect_cols(prefix=''):
    """Return the six state-vector column names, optionally prefixed.

    The names combine each vector ('r', 'v') with each component
    ('x', 'y', 'z') in that order: r_x, r_y, r_z, v_x, v_y, v_z.

    Parameters
    ----------
    prefix : str (default='')
        When non-empty, it is joined to every name with an underscore,
        e.g. prefix='start' gives 'start_r_x'.

    Returns
    -------
    list of str
        The six column names.
    """
    # Only add the separating underscore when there is a prefix to separate.
    lead = prefix + '_' if prefix else prefix
    return [
        f'{lead}{vec}_{axis}'
        for vec in ('r', 'v')
        for axis in ('x', 'y', 'z')
    ]
|
|
|
|
|
|
def create_train_data(seed=0, test_size=0.2):
    """
    Description
    -----------
    Create a train/test split from the dataset (.parquet) and save it.

    Reads "traindata/physics_preds.parquet", takes 'elapsed_seconds' plus
    the 'physics_pred' and 'start' state-vector columns as features and the
    'physics_err' state-vector columns as targets, splits them with
    train_test_split and dumps the result with joblib.

    Parameters
    ----------
    seed : int (default=0)
        random_state for train_test_split. seed = 0 means "no fixed seed":
        the split is random and differs from call to call.

    test_size : float (default=0.2)
        test_size for train_test_split

    Returns
    -------
    None
        The split is written to "create_datas/seed_{seed}.td" as a dict
        with the keys 'X_train', 'X_test', 'y_train' and 'y_test'.
    """
    import os

    df = pd.read_parquet("traindata/physics_preds.parquet")

    feature_cols = (
        ['elapsed_seconds']
        + get_state_vect_cols('physics_pred')
        + get_state_vect_cols('start')
    )
    print(feature_cols)

    # The target values are the errors between the physical model predictions
    # and the ground truth observations
    target_cols = get_state_vect_cols('physics_err')
    print(target_cols)

    # Create feature and target matrices
    X = df[feature_cols]
    y = df[target_cols]

    # seed 0 is the "unseeded" sentinel; random_state=None lets
    # train_test_split draw a fresh random split.
    random_state = None if seed == 0 else seed
    data_vals = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # train_test_split returns the pieces in exactly this order.
    data_keys = ['X_train', 'X_test', 'y_train', 'y_test']
    train_test_data = dict(zip(data_keys, data_vals))

    out_path = f"create_datas/seed_{seed}.td"
    # joblib.dump does not create missing directories, so make sure the
    # output folder exists before writing.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    joblib.dump(train_test_data, out_path)
|