import pandas as pd
from sklearn.model_selection import train_test_split
from itertools import product

import joblib


def get_state_vect_cols(prefix=''):
    """Return the position/velocity state-vector column names, optionally prefixed."""
    if prefix:
        prefix += '_'
    vectors = ['r', 'v']
    components = ['x', 'y', 'z']
    col_names = [f'{prefix}{v}_{c}' for v, c in product(vectors, components)]
    return col_names


def create_train_data(seed=0, test_size=0.2):
    """
    Description
    -----------
    Create a new train/test split from the dataset (.parquet) using the given seed.

    Parameters
    ----------
    seed : int (default=0)
        Seed for train_test_split; seed = 0 means the split is random
        (no fixed random_state is passed).
    test_size : float (default=0.2)
        test_size for train_test_split.

    Returns
    -------
    None
        Saves the train/test data in the folder "create_datas" as "seed_{seed}.td".
    """
    df = pd.read_parquet("traindata/physics_preds.parquet")

    feature_cols = [
        'elapsed_seconds'
    ] + get_state_vect_cols('physics_pred') + get_state_vect_cols('start')
    print(feature_cols)

    # The target values are the errors between the physical model predictions
    # and the ground truth observations.
    target_cols = get_state_vect_cols('physics_err')
    print(target_cols)

    # Create feature and target matrices.
    X = df[feature_cols]
    y = df[target_cols]

    data_keys = ['X_train', 'X_test', 'y_train', 'y_test']
    if seed == 0:
        data_vals = train_test_split(X, y, test_size=test_size)
    else:
        data_vals = train_test_split(X, y, test_size=test_size, random_state=seed)
    train_test_data = dict(zip(data_keys, data_vals))

    joblib.dump(train_test_data, f"create_datas/seed_{seed}.td")
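

# Usage sketch (illustrative addition, not part of the original module): running the
# file directly builds one split and reloads it with joblib. It assumes the folders
# "traindata/" and "create_datas/" already exist and that the parquet file contains
# the expected columns; the seed value 42 is arbitrary.
if __name__ == "__main__":
    create_train_data(seed=42, test_size=0.2)

    # The dumped object is a dict keyed by 'X_train', 'X_test', 'y_train', 'y_test'.
    split = joblib.load("create_datas/seed_42.td")
    print(split['X_train'].shape, split['y_train'].shape)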