diff --git a/ml/basic_tree_based_models.py b/ml/basic_tree_based_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb3081c000ffbec0843f038ef72f4f89817c24ea
--- /dev/null
+++ b/ml/basic_tree_based_models.py
@@ -0,0 +1,101 @@
+from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
+from sklearn.model_selection import train_test_split, cross_val_score
+import numpy as np
+import matplotlib.pyplot as plot
+
+####################################################################################
+#
+#   MAKE SURE THAT rootToNumpy.py RUNS
+#   DO NOT RUN THIS FILE UNTIL BOTH .npy
+#   FILES HAVE BEEN CREATED BY rootToNumpy.py
+#
+####################################################################################
+
+def main():
+    """Load the side A / side C arrays and evaluate boosted trees on each.
+
+    Each array is evaluated twice: with feature columns 0-4 ("WITH RPD") and
+    with column 4 removed ("NO RPD", presumably dropping the RPD feature).
+    Column 5 of the loaded array is the regression target in both cases.
+    """
+    zdc_sideA_withRPD = np.load("zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA.npy")
+    zdc_sideC_withRPD = np.load("zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC.npy")
+
+    # np.delete returns a new array, so the "no RPD" variants are derived from
+    # the arrays already in memory instead of loading each file a second time.
+    zdc_sideA_noRPD = np.delete(zdc_sideA_withRPD, 4, axis=1)
+    zdc_sideC_noRPD = np.delete(zdc_sideC_withRPD, 4, axis=1)
+
+    # print(len(zdc_sideA_noRPD))
+
+    # rf_cross_val(zdc_sideA_withRPD[:10000, :5], zdc_sideA_withRPD[:10000, 5], 4)
+    # rf_cross_val(zdc_sideC_withRPD[:10000, :5], zdc_sideC_withRPD[:10000, 5], 4)
+    # rf_cross_val(zdc_sideA_noRPD[:10000, :4], zdc_sideA_noRPD[:10000, 4], 4)
+    # rf_cross_val(zdc_sideC_noRPD[:10000, :4], zdc_sideC_noRPD[:10000, 4], 4)
+
+    # xg_cross_val(zdc_sideA_withRPD[:10000, :5], zdc_sideA_withRPD[:10000, 5], 4)
+    # xg_cross_val(zdc_sideC_withRPD[:10000, :5], zdc_sideC_withRPD[:10000, 5], 4)
+    # xg_cross_val(zdc_sideA_noRPD[:10000, :4], zdc_sideA_noRPD[:10000, 4], 4)
+    # xg_cross_val(zdc_sideC_noRPD[:10000, :4], zdc_sideC_noRPD[:10000, 4], 4)
+
+    tree_depth = 4
+    training_ratio = 0.05
+
+    xg_train_and_test(zdc_sideA_withRPD[:, :5], zdc_sideA_withRPD[:, 5], tree_depth, training_ratio, "Side A (WITH RPD)")
+    xg_train_and_test(zdc_sideC_withRPD[:, :5], zdc_sideC_withRPD[:, 5], tree_depth, training_ratio, "Side C (WITH RPD)")
+    xg_train_and_test(zdc_sideA_noRPD[:, :4], zdc_sideA_noRPD[:, 4], tree_depth, training_ratio, "Side A (NO RPD)")
+    xg_train_and_test(zdc_sideC_noRPD[:, :4], zdc_sideC_noRPD[:, 4], tree_depth, training_ratio, "Side C (NO RPD)")
+
+def rf_cross_val(X, y, tree_depth):
+    """Print the 5-fold cross-validation scores of a random forest regressor."""
+    # cross_val_score clones and refits the estimator for every fold, so the
+    # estimator is passed in unfitted; a prior fit on all data would be wasted.
+    regr = RandomForestRegressor(max_depth = tree_depth, random_state = 2)
+
+    print(cross_val_score(regr, X, y, cv = 5))
+
+def xg_cross_val(X, y, tree_depth):
+    """Print the 5-fold cross-validation scores of a histogram gradient-boosting regressor."""
+    # No fit beforehand: cross_val_score clones and refits the estimator per fold.
+    regr = HistGradientBoostingRegressor(max_depth = tree_depth)
+
+    print(cross_val_score(regr, X, y, cv = 5))
+
+def xg_train_and_test(X, y, tree_depth, training_ratio, name):
+    """Fit a gradient-boosting regressor on a random split and print its scores.
+
+    Args:
+        X: feature matrix, one row per sample.
+        y: regression targets aligned with the rows of X.
+        tree_depth: max_depth of the HistGradientBoostingRegressor.
+        training_ratio: train_size handed to train_test_split.
+        name: label printed before the results.
+    """
+
+    print(name)
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = training_ratio)
+    model = HistGradientBoostingRegressor(max_depth = tree_depth)
+
+    model.fit(X_train, y_train)
+
+    # Evaluate Model Against Test Set
+    y_pred = model.predict(X_test)
+
+    ######## R^2 value = 1 - SSE/SST
+    # Vectorised with NumPy: the held-out set can be large (main() trains on
+    # only 5% of the rows), so element-wise Python loops would be slow.
+    sse = np.sum((y_test - y_pred) ** 2)
+    sst = np.sum((y_test - np.mean(y_test)) ** 2)
+    accuracy = 1 - sse / sst
+    print("Test accuracy:", accuracy)
+
+    # Cross-validation
+    scores = cross_val_score(model, X, y, cv=5)
+    print("Cross-validation scores:", scores)
+    print("Mean cross-validation score:", scores.mean())
+
+    print()
+
+if __name__ == "__main__":
+    main()
diff --git a/ml/rootToNumpy.py b/ml/rootToNumpy.py
index 27c662332069ce649ad59bef1d196f1f2005b8d6..036d155c667676cfee000bea26f849262ef11098 100644
--- a/ml/rootToNumpy.py
+++ b/ml/rootToNumpy.py
@@ -2,8 +2,9 @@ import ROOT
 import numpy as np
 from json import dumps
 
-FILE_GLOB = "../data/zdcTopoAnalysis_1N.root"
+FILE_GLOB = "./data/zdcTopoAnalysis_1N.root"
 
+# To run, call python3 ml/rootToNumpy.py in command prompt from top of ml4zdc repo
 
 def main():
     """
@@ -29,7 +30,7 @@ def main():
     """
 
     dataframe = ROOT.RDataFrame("zdcTree", FILE_GLOB)
-
+    print(dataframe)
     # print all columns and their types
     columns = [str(col) for col in dataframe.GetColumnNames()]
     columns_and_types = {col: dataframe.GetColumnType(col) for col in columns}
@@ -70,6 +71,10 @@ def main():
 
     print(zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA.shape)
     print(zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA)
+    np.save("zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC.npy", zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC)
+    np.save("zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA.npy", zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA)
+
+    return
 
 if __name__ == "__main__":
     main()