diff --git a/ml/basic_tree_based_models.py b/ml/basic_tree_based_models.py new file mode 100644 index 0000000000000000000000000000000000000000..1e27e791adbe6672052e5ee846df1cbcd8369299 --- /dev/null +++ b/ml/basic_tree_based_models.py @@ -0,0 +1,164 @@ +from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor +from sklearn.model_selection import train_test_split, cross_val_score +from sklearn.tree import export_graphviz, plot_tree +from sklearn.inspection import permutation_importance +import graphviz +import numpy as np +import matplotlib.pyplot as plot + +def main(): + zdc_sideA_withRPD = np.load("zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA.npy") + zdc_sideC_withRPD = np.load("zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC.npy") + zdc_sideA_noRPD = np.load("zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA.npy") + zdc_sideC_noRPD = np.load("zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC.npy") + + zdc_sideA_noRPD = np.delete(zdc_sideA_noRPD, 4, 1) + zdc_sideC_noRPD = np.delete(zdc_sideC_noRPD, 4, 1) + + # print(len(zdc_sideA_noRPD)) + + # rf_cross_val(zdc_sideA_withRPD[:10000, :5], zdc_sideA_withRPD[:10000, 5], 4) + # rf_cross_val(zdc_sideC_withRPD[:10000, :5], zdc_sideC_withRPD[:10000, 5], 4) + # rf_cross_val(zdc_sideA_noRPD[:10000, :4], zdc_sideA_noRPD[:10000, 4], 4) + # rf_cross_val(zdc_sideC_noRPD[:10000, :4], zdc_sideC_noRPD[:10000, 4], 4) + + # xg_cross_val(zdc_sideA_withRPD[:, :5], zdc_sideA_withRPD[:, 5], 4) + # xg_cross_val(zdc_sideC_withRPD[:, :5], zdc_sideC_withRPD[:, 5], 4) + # xg_cross_val(zdc_sideA_noRPD[:, :4], zdc_sideA_noRPD[:, 4], 4) + # xg_cross_val(zdc_sideC_noRPD[:, :4], zdc_sideC_noRPD[:, 4], 4) + + hist_tree_depth = -1 + hist_training_ratio = 0.8 + + vis_tree_depth = 4 + vis_training_ratio = 0.05 + + xg_train_and_test(zdc_sideA_withRPD[:, :5], zdc_sideA_withRPD[:, 5], hist_tree_depth, hist_training_ratio, "Side A (WITH RPD)") + # xg_visualize(zdc_sideA_withRPD[:, :5], zdc_sideA_withRPD[:, 5], vis_tree_depth, vis_training_ratio, "Side A (WITH RPD)") + + xg_train_and_test(zdc_sideC_withRPD[:, :5], zdc_sideC_withRPD[:, 5], hist_tree_depth, hist_training_ratio, "Side C (WITH RPD)") + # xg_visualize(zdc_sideC_withRPD[:, :5], zdc_sideC_withRPD[:, 5], vis_tree_depth, vis_training_ratio, "Side C (WITH RPD)") + + xg_train_and_test(zdc_sideA_noRPD[:, :4], zdc_sideA_noRPD[:, 4], hist_tree_depth, hist_training_ratio, "Side A (NO RPD)") + # xg_visualize(zdc_sideA_noRPD[:, :5], zdc_sideA_noRPD[:, 5], vis_tree_depth, vis_training_ratio, "Side A (NO RPD)") + + xg_train_and_test(zdc_sideC_noRPD[:, :4], zdc_sideC_noRPD[:, 4], hist_tree_depth, hist_training_ratio, "Side C (NO RPD)") + # xg_visualize(zdc_sideC_noRPD[:, :5], zdc_sideC_noRPD[:, 5], vis_tree_depth, vis_training_ratio, "Side C (NO RPD)") + + +############################################# +# Creates a HistGradientBoostingRegressor +# Any allowed decision tree heights +# 80/20 Train-Test Split +# Actual model used in predicting BRAN +############################################# +def xg_train_and_test(X, y, tree_depth, training_ratio, name): + + print(name) + + X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = training_ratio) + model = 0 + if tree_depth < 1: + model = HistGradientBoostingRegressor() + else: + model = HistGradientBoostingRegressor(max_depth = tree_depth) + + model.fit(X_train, y_train) + + # Evaluate Model Against Test Set + y_pred = model.predict(X_test) + y_test_mean = sum(v for v in y_test) / len(y_test) + + ######## Test R^2 value = 1 - SSE/SST + accuracy = 1 - sum( (y_test[i] - y_pred[i])**2 for i in range(len(y_test)) ) / sum( (y_test[i] - y_test_mean)**2 for i in range(len(y_test)) ) + print("Test accuracy:", accuracy) + + # Cross-validation + scores = cross_val_score(model, X, y, cv=5) + print("Cross-validation scores:", scores) + print("Mean cross-validation score:", scores.mean()) + + # Feature Importances + + result = permutation_importance(model, X_test, y_test, scoring='neg_mean_squared_error', n_repeats=30) + feature_importances = result.importances_mean + + feature_importances = np.array(feature_importances) / sum(feature_importances) + + x_labs_WITHRPD = ["EM", "HAD1", "HAD2", "HAD3", "RPD"] + x_labs_NORPD = ["EM", "HAD1", "HAD2", "HAD3"] + x_labs = x_labs_NORPD if len(feature_importances) == 4 else x_labs_WITHRPD + + plot.bar(x_labs, feature_importances, color='blue') + plot.title(f"{name} Importance Metrics of Modules in Predicting BRAN Energy", fontsize=8) + plot.xlabel("Modules") + plot.ylabel("Importance") + plot.savefig(f"{name}ImportanceBars.png") + plot.clf() + + print() + +######################################################################## +# Creates a GradientBoostingRegressor +# Any allowed decision tree heights BUT 4-6 range is ideal for speed +# 0.05-0.2 Training Ratio is ideal for speed +# Primarily for visualizing individual decision trees +# since HistGradientBoostingRegressor's trees can't be viewed +######################################################################## +def xg_visualize(X, y, tree_depth, training_ratio, name): + + print(name) + + X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = training_ratio) + model = 0 + if tree_depth < 1: + model = GradientBoostingRegressor() + else: + model = GradientBoostingRegressor(max_depth = tree_depth) + + model.fit(X_train, y_train) + + # Evaluate Model Against Test Set + y_pred = model.predict(X_test) + y_test_mean = sum(v for v in y_test) / len(y_test) + + ######## Test R^2 value = 1 - SSE/SST + accuracy = 1 - sum( (y_test[i] - y_pred[i])**2 for i in range(len(y_test)) ) / sum( (y_test[i] - y_test_mean)**2 for i in range(len(y_test)) ) + print("Test accuracy:", accuracy) + + print() + + estimators = model.estimators_ + feat_names = ["EM", "HAD1", "HAD2", "HAD3", "RPD"] + + for i in range(3): + visualize_tree(estimators, feat_names, i, name) + +def visualize_tree(estimators, feat_names, idx, name): + estimator = estimators[idx][0] + export_graphviz(estimator, out_file=f"{name}_tree{idx}.dot", feature_names=feat_names, filled=True, rounded=True, special_characters=True) + with open(f"{name}_tree{idx}.dot") as f: + dot_graph = f.read() + graph = graphviz.Source(dot_graph) + graph.format = "png" + graph.render(f"{name}_tree{idx}") + +############################################################################### + +def rf_cross_val(X, y, tree_depth): + # X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8) + regr = RandomForestRegressor(max_depth = tree_depth, random_state = 2) + regr.fit(X, y) + + print(cross_val_score(regr, X, y, cv = 5)) + +def xg_cross_val(X, y, tree_depth): + regr = HistGradientBoostingRegressor(max_depth = tree_depth) + regr.fit(X, y) + + print(cross_val_score(regr, X, y, cv = 5)) + +############################################################################### + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/ml/ml_diagnostic_plots/A_no_RPD.png b/ml/ml_diagnostic_plots/A_no_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..4cae4a43ad5e531693647a85233072e0a2d9d37f Binary files /dev/null and b/ml/ml_diagnostic_plots/A_no_RPD.png differ diff --git a/ml/ml_diagnostic_plots/A_w_RPD.png b/ml/ml_diagnostic_plots/A_w_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..0d5bec0304072257ff625a23fcbeeb21a429e730 Binary files /dev/null and b/ml/ml_diagnostic_plots/A_w_RPD.png differ diff --git a/ml/ml_diagnostic_plots/C_no_RPD.png b/ml/ml_diagnostic_plots/C_no_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..6165eb5cecfe09468a40c04f28f45f0d092f33ba Binary files /dev/null and b/ml/ml_diagnostic_plots/C_no_RPD.png differ diff --git a/ml/ml_diagnostic_plots/C_w_RPD.png b/ml/ml_diagnostic_plots/C_w_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..a3bcfc3c3fb3531531c27938927ce554a8e71842 Binary files /dev/null and b/ml/ml_diagnostic_plots/C_w_RPD.png differ diff --git a/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots_RPD_A_w_RPD (Truncated).png b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots_RPD_A_w_RPD (Truncated).png new file mode 100644 index 0000000000000000000000000000000000000000..f085c8d81ccd2447e4d35235d5bdf2d35049c732 Binary files /dev/null and b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots_RPD_A_w_RPD (Truncated).png differ diff --git a/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots_RPD_A_w_RPD.png b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots_RPD_A_w_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..ebb696398fca6261179c7710fc926cb2bed5e99f Binary files /dev/null and b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots_RPD_A_w_RPD.png differ diff --git a/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots_RPD_C_w_RPD (Truncated).png b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots_RPD_C_w_RPD (Truncated).png new file mode 100644 index 0000000000000000000000000000000000000000..45c45a2001df599d167c2334a096bba00d0446ce Binary files /dev/null and b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots_RPD_C_w_RPD (Truncated).png differ diff --git a/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots_RPD_C_w_RPD.png b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots_RPD_C_w_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..5e579dd73c2eb3c71463ade61492418301c1c4d5 Binary files /dev/null and b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots_RPD_C_w_RPD.png differ diff --git a/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots__A_w_RPD_true_BRAN (Truncated).png b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots__A_w_RPD_true_BRAN (Truncated).png new file mode 100644 index 0000000000000000000000000000000000000000..2371f7b244322ca2c60fafcfad48de0c306b4f66 Binary files /dev/null and b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots__A_w_RPD_true_BRAN (Truncated).png differ diff --git a/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots__A_w_RPD_true_BRAN.png b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots__A_w_RPD_true_BRAN.png new file mode 100644 index 0000000000000000000000000000000000000000..066cf34e5d05ac9e2d9182de2f957f0a772d0461 Binary files /dev/null and b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots__A_w_RPD_true_BRAN.png differ diff --git a/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots__C_w_RPD_true_BRAN (Truncated).png b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots__C_w_RPD_true_BRAN (Truncated).png new file mode 100644 index 0000000000000000000000000000000000000000..d7795d661f7ac90ba22d522fd202efd54c6f43ff Binary files /dev/null and b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots__C_w_RPD_true_BRAN (Truncated).png differ diff --git a/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots__C_w_RPD_true_BRAN.png b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots__C_w_RPD_true_BRAN.png new file mode 100644 index 0000000000000000000000000000000000000000..34b366e81c539d796480f0c17358fb9f53af945b Binary files /dev/null and b/ml/ml_diagnostic_plots/Energy_Dists/EnergyFreqPlots__C_w_RPD_true_BRAN.png differ diff --git a/ml/ml_diagnostic_plots/hist_A_no_RPD.png b/ml/ml_diagnostic_plots/hist_A_no_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..f6e91b48925838259ce126c3bba63035da926388 Binary files /dev/null and b/ml/ml_diagnostic_plots/hist_A_no_RPD.png differ diff --git a/ml/ml_diagnostic_plots/hist_A_w_RPD.png b/ml/ml_diagnostic_plots/hist_A_w_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..ce7740cf5c36e2808d4b613f2ac00650b28e5498 Binary files /dev/null and b/ml/ml_diagnostic_plots/hist_A_w_RPD.png differ diff --git a/ml/ml_diagnostic_plots/hist_C_no_RPD.png b/ml/ml_diagnostic_plots/hist_C_no_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..eb0bcc65b653b48f952ccebd170d341e4a61b9af Binary files /dev/null and b/ml/ml_diagnostic_plots/hist_C_no_RPD.png differ diff --git a/ml/ml_diagnostic_plots/hist_C_w_RPD.png b/ml/ml_diagnostic_plots/hist_C_w_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..808c119431d2505bcf08fc5d7a113d120d21a438 Binary files /dev/null and b/ml/ml_diagnostic_plots/hist_C_w_RPD.png differ diff --git a/ml/ml_diagnostic_plots/log_A_no_RPD.png b/ml/ml_diagnostic_plots/log_A_no_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..3f60990b214dfb6df5cbdd519d154453324ab225 Binary files /dev/null and b/ml/ml_diagnostic_plots/log_A_no_RPD.png differ diff --git a/ml/ml_diagnostic_plots/log_A_w_RPD.png b/ml/ml_diagnostic_plots/log_A_w_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..464f99adb34ac5374f82fca68850680fefae5fee Binary files /dev/null and b/ml/ml_diagnostic_plots/log_A_w_RPD.png differ diff --git a/ml/ml_diagnostic_plots/log_C_no_RPD.png b/ml/ml_diagnostic_plots/log_C_no_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..9481dbf75a308eeacede3a9ab17fdcaed9887dc5 Binary files /dev/null and b/ml/ml_diagnostic_plots/log_C_no_RPD.png differ diff --git a/ml/ml_diagnostic_plots/log_C_w_RPD.png b/ml/ml_diagnostic_plots/log_C_w_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..3a930d7a35baa7c822cfba34d4a087ab4707719f Binary files /dev/null and b/ml/ml_diagnostic_plots/log_C_w_RPD.png differ diff --git a/ml/ml_diagnostic_plots/res_A_no_RPD.png b/ml/ml_diagnostic_plots/res_A_no_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..cfcda526f58b69dea5f194501bb1fdbf3761ef2a Binary files /dev/null and b/ml/ml_diagnostic_plots/res_A_no_RPD.png differ diff --git a/ml/ml_diagnostic_plots/res_A_w_RPD.png b/ml/ml_diagnostic_plots/res_A_w_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..589418249018010927826e86149dde423a770c0f Binary files /dev/null and b/ml/ml_diagnostic_plots/res_A_w_RPD.png differ diff --git a/ml/ml_diagnostic_plots/res_C_no_RPD.png b/ml/ml_diagnostic_plots/res_C_no_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..f34c84839c32bd5a1ad1f4f409cb1fa08973bfd6 Binary files /dev/null and b/ml/ml_diagnostic_plots/res_C_no_RPD.png differ diff --git a/ml/ml_diagnostic_plots/res_C_w_RPD.png b/ml/ml_diagnostic_plots/res_C_w_RPD.png new file mode 100644 index 0000000000000000000000000000000000000000..c4eb451cf88a2293a378a4fb90eb62e42ce3417c Binary files /dev/null and b/ml/ml_diagnostic_plots/res_C_w_RPD.png differ diff --git a/ml/rootToNumpy.py b/ml/rootToNumpy.py index 27c662332069ce649ad59bef1d196f1f2005b8d6..036d155c667676cfee000bea26f849262ef11098 100644 --- a/ml/rootToNumpy.py +++ b/ml/rootToNumpy.py @@ -2,8 +2,9 @@ import ROOT import numpy as np from json import dumps -FILE_GLOB = "../data/zdcTopoAnalysis_1N.root" +FILE_GLOB = "./data/zdcTopoAnalysis_1N.root" +# To run, call python3 ml/rootToNumpy.py in command prompt from top of ml4zdc repo def main(): """ @@ -29,7 +30,7 @@ def main(): """ dataframe = ROOT.RDataFrame("zdcTree", FILE_GLOB) - + print(dataframe) # print all columns and their types columns = [str(col) for col in dataframe.GetColumnNames()] columns_and_types = {col: dataframe.GetColumnType(col) for col in columns} @@ -70,6 +71,10 @@ def main(): print(zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA.shape) print(zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA) + np.save("zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC.npy", zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC) + np.save("zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA.npy", zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA) + + return if __name__ == "__main__": main() diff --git a/ml/tree_model_predict.py b/ml/tree_model_predict.py new file mode 100644 index 0000000000000000000000000000000000000000..5a034d13a25d727ce9fa4ef4a2da2007bb685a8f --- /dev/null +++ b/ml/tree_model_predict.py @@ -0,0 +1,204 @@ +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.model_selection import train_test_split +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.mlab as mlab +# from scipy import stats +from scipy.stats import norm + +def main(): + zdc_sideA_withRPD = np.load("zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA.npy") + zdc_sideC_withRPD = np.load("zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC.npy") + zdc_sideA_noRPD = np.load("zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA.npy") + zdc_sideC_noRPD = np.load("zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC.npy") + + zdc_sideA_noRPD = np.delete(zdc_sideA_noRPD, 4, 1) + zdc_sideC_noRPD = np.delete(zdc_sideC_noRPD, 4, 1) + + tree_depth = -1 + training_ratio = 0.8 + + A_w_RPD = xg(zdc_sideA_withRPD[:, :5], zdc_sideA_withRPD[:, 5], tree_depth, training_ratio) + C_w_RPD = xg(zdc_sideC_withRPD[:, :5], zdc_sideC_withRPD[:, 5], tree_depth, training_ratio) + A_no_RPD = xg(zdc_sideA_noRPD[:, :4], zdc_sideA_noRPD[:, 4], tree_depth, training_ratio) + C_no_RPD = xg(zdc_sideC_noRPD[:, :4], zdc_sideC_noRPD[:, 4], tree_depth, training_ratio) + + # TotE_A_w_RPD = np.sum(zdc_sideA_withRPD, axis = 1) + # TotE_C_w_RPD = np.sum(zdc_sideC_withRPD, axis = 1) + # TotE_A_no_RPD = np.sum(zdc_sideA_noRPD, axis = 1) + # TotE_C_no_RPD = np.sum(zdc_sideC_noRPD, axis = 1) + + pred_BRAN_A_w_RPD = A_w_RPD.predict(zdc_sideA_withRPD[:, :5]) + pred_BRAN_C_w_RPD = C_w_RPD.predict(zdc_sideC_withRPD[:, :5]) + pred_BRAN_A_no_RPD = A_no_RPD.predict(zdc_sideA_noRPD[:, :4]) + pred_BRAN_C_no_RPD = C_no_RPD.predict(zdc_sideC_noRPD[:, :4]) + + BRAN_A_w_RPD = zdc_sideA_withRPD[:, 5] + BRAN_C_w_RPD = zdc_sideC_withRPD[:, 5] + BRAN_A_no_RPD = zdc_sideA_noRPD[:, 4] + BRAN_C_no_RPD = zdc_sideC_noRPD[:, 4] + + # res_BRAN_A_w_RPD = BRAN_A_w_RPD - pred_BRAN_A_w_RPD + # res_BRAN_C_w_RPD = BRAN_C_w_RPD - pred_BRAN_C_w_RPD + # res_BRAN_A_no_RPD = BRAN_A_no_RPD - pred_BRAN_A_no_RPD + # res_BRAN_C_no_RPD = BRAN_C_no_RPD - pred_BRAN_C_no_RPD + + # makePlots(TotE_A_w_RPD, BRAN_A_w_RPD, pred_BRAN_A_w_RPD, res_BRAN_A_w_RPD, "Side A (RPD) - BRAN vs. Total Energy", "A_w_RPD") + # makePlots(TotE_C_w_RPD, BRAN_C_w_RPD, pred_BRAN_C_w_RPD, res_BRAN_C_w_RPD, "Side C (RPD) - BRAN vs. Total Energy", "C_w_RPD") + # makePlots(TotE_A_no_RPD, BRAN_A_no_RPD, pred_BRAN_A_no_RPD, res_BRAN_A_no_RPD, "Side A (NO RPD) - BRAN vs. Total Energy", "A_no_RPD") + # makePlots(TotE_C_no_RPD, BRAN_C_no_RPD, pred_BRAN_C_no_RPD, res_BRAN_C_no_RPD, "Side C (NO RPD) - BRAN vs. Total Energy", "C_no_RPD") + + # makeLogPlots(TotE_A_w_RPD, BRAN_A_w_RPD, pred_BRAN_A_w_RPD, res_BRAN_A_w_RPD, "Side A (RPD) - BRAN vs. Total Energy", "log_A_w_RPD") + # makeLogPlots(TotE_C_w_RPD, BRAN_C_w_RPD, pred_BRAN_C_w_RPD, res_BRAN_C_w_RPD, "Side C (RPD) - BRAN vs. Total Energy", "log_C_w_RPD") + # makeLogPlots(TotE_A_no_RPD, BRAN_A_no_RPD, pred_BRAN_A_no_RPD, res_BRAN_A_no_RPD, "Side A (NO RPD) - BRAN vs. Total Energy", "log_A_no_RPD") + # makeLogPlots(TotE_C_no_RPD, BRAN_C_no_RPD, pred_BRAN_C_no_RPD, res_BRAN_C_no_RPD, "Side C (NO RPD) - BRAN vs. Total Energy", "log_C_no_RPD") + + # plotBRANS(BRAN_A_w_RPD, pred_BRAN_A_w_RPD, "Side A (RPD) - True vs. Predicted BRANs", "hist_A_w_RPD") + # plotBRANS(BRAN_C_w_RPD, pred_BRAN_C_w_RPD, "Side C (RPD) - True vs. Predicted BRANs", "hist_C_w_RPD") + # plotBRANS(BRAN_A_no_RPD, pred_BRAN_A_no_RPD, "Side A (NO RPD) - True vs. Predicted BRANs", "hist_A_no_RPD") + # plotBRANS(BRAN_C_no_RPD, pred_BRAN_C_no_RPD, "Side C (NO RPD) - True vs. Predicted BRANs", "hist_C_no_RPD") + + # plotSpike(zdc_sideA_noRPD[:, :4], pred_BRAN_A_w_RPD, "Side A (RPD) - Energy Frequency Plots", "EnergyFreqPlots_A_w_RPD", "") + # plotSpike(zdc_sideC_noRPD[:, :4], pred_BRAN_C_w_RPD, "Side C (RPD) - Energy Frequency Plots", "EnergyFreqPlots_C_w_RPD", "") + # plotSpike(zdc_sideA_noRPD[:, :4], pred_BRAN_A_no_RPD, "Side A (NO RPD) - Energy Frequency Plots", "EnergyFreqPlots_A_no_RPD", "") + # plotSpike(zdc_sideC_noRPD[:, :4], pred_BRAN_C_no_RPD, "Side C (NO RPD) - Energy Frequency Plots", "EnergyFreqPlots_C_no_RPD", "") + plotSpike(zdc_sideA_withRPD[:, :5], pred_BRAN_A_w_RPD, "Side A - Energy Frequency Plots with RPD and predicted BRAN", "EnergyFreqPlots_RPD_A_w_RPD", "RPD + ") + plotSpike(zdc_sideC_withRPD[:, :5], pred_BRAN_C_w_RPD, "Side C - Energy Frequency Plots with RPD and predicted BRAN", "EnergyFreqPlots_RPD_C_w_RPD", "RPD + ") + plotSpike(zdc_sideA_withRPD[:, :5], zdc_sideA_withRPD[:, 5], "Side A - Energy Frequency Plots with RPD and reconstructed BRAN", "EnergyFreqPlots__A_w_RPD_true_BRAN", "RPD + ") + plotSpike(zdc_sideC_withRPD[:, :5], zdc_sideC_withRPD[:, 5], "Side C - Energy Frequency Plots with RPD and reconstructed BRAN", "EnergyFreqPlots__C_w_RPD_true_BRAN", "RPD + ") + +def plotSpike(predictor_data, response_data, title1, docname1, includeRPD): + predSum = np.sum(predictor_data, axis = 1) + totSum = predSum + response_data + + meanPred = np.average(predSum) + meanTot = np.average(totSum) + stdPred = np.std(predSum) + stdTot = np.std(totSum) + + c = 2 + highPred, lowPred = meanPred + c*stdPred, meanPred - c*stdPred + highTot, lowTot = meanTot + c*stdTot, meanTot - c*stdTot + + plt.hist(predSum, label = f"EM + {includeRPD}HAD123", bins = 1000) + plt.hist(totSum, label = f"EM + BRAN + {includeRPD}HAD123", bins = 1000, alpha = 0.5) + plt.axvline(x = meanPred, color = "blue", label = "Mean w/o BRAN") + plt.axvline(x = lowPred, color = 'blue', label = f"-{c} sigma w/o BRAN") + plt.axvline(x = highPred, color = 'blue', label = f"+{c} sigma w/o BRAN") + plt.axvline(x = meanTot, color = "orange", label = "Mean with BRAN") + plt.axvline(x = lowTot, color = 'orange', label = f"-{c} sigma with BRAN") + plt.axvline(x = highTot, color = 'orange', label = f"+{c} sigma with BRAN") + + plt.title(title1) + plt.xlabel("Total Energy") + plt.ylabel("Frequency") + plt.legend(loc = "upper left", prop = {'size': 8}) + plt.savefig(f"ml_diagnostic_plots/Energy_Dists/{docname1}.png") + plt.clf() + + # Cutoffs for +/- 2 sigma + cond1 = predSum > lowPred + cond2 = predSum < highPred + cond = np.logical_and(cond1, cond2) + gausPred = predSum[cond] + + cond1 = totSum > lowTot + cond2 = totSum < highTot + cond = np.logical_and(cond1, cond2) + gausTot = totSum[cond] + + (newMeanPred, newStdPred) = norm.fit(gausPred) + (newMeanTot, newStdTot) = norm.fit(gausTot) + + print(title1) + print("Energy without BRAN") + print(f'mean: {round(newMeanPred, 3)}', f'sigma: {round(newStdPred, 3)}', f'resolution: {round(newStdPred/newMeanPred, 3)}') + print("Energy with BRAN") + print(f'mean: {round(newMeanTot, 3)}', f'sigma: {round(newStdTot, 3)}', f'resolution: {round(newStdTot / newMeanTot, 3)}') + print((newStdPred/newMeanPred - newStdTot/newMeanTot)/(newStdPred/newMeanPred)) + print(((stdPred/meanPred - stdTot/meanTot)/(stdPred/meanPred))) + print() + + fact = 1400 + if "reconstructed" in title1: + fact += 50 + + nP, binsP, patchesP = plt.hist(gausPred, label = f"EM + {includeRPD}HAD123", bins = 1000) + nT, binsT, patchesT = plt.hist(gausTot, label = f"EM + BRAN + {includeRPD}HAD123", bins = 1000, alpha = 0.5) + yP = 428800*fact*norm.pdf(binsP, newMeanPred, newStdPred) + lP = plt.plot(binsP, yP, 'b--', linewidth=2) + yT = 428800*fact*norm.pdf(binsT, newMeanTot, newStdTot) + lT = plt.plot(binsT, yT, 'r--', linewidth=2) + plt.axvline(x = newMeanPred, color = "blue", label = "Mean w/o BRAN") + plt.axvline(x = newMeanTot, color = "orange", label = "Mean with BRAN") + plt.title(f"{title1} (Trunc)") + plt.xlabel("Total Energy") + plt.ylabel("Frequency") + plt.legend(loc = "upper left", prop = {'size': 8}) + plt.savefig(f"ml_diagnostic_plots/Energy_Dists/{docname1} (Truncated).png") + plt.clf() + +def makePlots(x_data, y_data, pred_data, res_data, title, docname): + plt.scatter(x_data, y_data, label = "True") + plt.scatter(x_data, pred_data, label = "Predicted") + plt.title(title) + plt.xlabel("Total Energy") + plt.ylabel("BRAN Energy") + plt.legend() + plt.savefig(f"ml_diagostic_plots/{docname}.png") + plt.clf() + + plt.scatter(x_data, res_data, label = "Errors") + plt.title(f"Error of {title}") + plt.xlabel("Total Energy") + plt.ylabel("Error in BRAN Energy") + plt.legend() + plt.savefig(f"ml_diagostic_plots/res_{docname}.png") + plt.clf() + +def plotBRANS(true_bran, pred_bran, title, docname): + bin_width = 10000 + bins = np.arange(np.min([true_bran, pred_bran]), + np.max([true_bran, pred_bran]) + bin_width, bin_width) + + plt.hist(true_bran, bins = bins, label = "True BRAN Energies") + plt.hist(pred_bran, weights = -np.ones_like(pred_bran), bins = bins, label = "Predicted BRAN Energies") + + plt.title(title) + plt.xlabel("BRAN Energy") + # plt.yscale('log') + plt.ylabel("Count") + plt.axhline(0, color = 'k') + plt.legend() + plt.savefig(f"ml_diagostic_plots/{docname}.png") + plt.clf() + +def makeLogPlots(x_data, y_data, pred_data, res_data, title, docname): + plt.scatter(x_data, y_data, label = "True") + plt.scatter(x_data, pred_data, label = "Predicted") + plt.title(title) + plt.xlabel("Total Energy") + plt.ylabel("BRAN Energy") + # plt.xscale("log") + plt.yscale("log") + plt.legend() + plt.savefig(f"ml_diagostic_plots/{docname}.png") + plt.clf() + +def xg(X, y, tree_depth, training_ratio): + + X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = training_ratio) + model = 0 + if tree_depth < 1: + model = HistGradientBoostingRegressor() + else: + model = HistGradientBoostingRegressor(max_depth = tree_depth) + + model.fit(X_train, y_train) + + return model + +################################################################################################## + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/ml/zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA.npy b/ml/zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA.npy new file mode 100644 index 0000000000000000000000000000000000000000..325ae74775f2fb2ee8f26a8613ded45df3c3e0e4 Binary files /dev/null and b/ml/zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA.npy differ diff --git a/ml/zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC.npy b/ml/zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC.npy new file mode 100644 index 0000000000000000000000000000000000000000..deba13767c6953ed08fe4b2b398bce7292225d99 Binary files /dev/null and b/ml/zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC.npy differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/pred_A_10N (Truncated).png b/ml_many_neutrons/ml_diagnostic_plots/pred_A_10N (Truncated).png new file mode 100644 index 0000000000000000000000000000000000000000..4f3ff4e2d20e3f095cbf1986d88c610130264853 Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/pred_A_10N (Truncated).png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/pred_A_10N.png b/ml_many_neutrons/ml_diagnostic_plots/pred_A_10N.png new file mode 100644 index 0000000000000000000000000000000000000000..2355a2e7c746e8b36174556e1360ec851613b71a Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/pred_A_10N.png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/pred_A_5N (Truncated).png b/ml_many_neutrons/ml_diagnostic_plots/pred_A_5N (Truncated).png new file mode 100644 index 0000000000000000000000000000000000000000..98967af8f1afdba2c8c4c27fe5b8c21d3caa1338 Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/pred_A_5N (Truncated).png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/pred_A_5N.png b/ml_many_neutrons/ml_diagnostic_plots/pred_A_5N.png new file mode 100644 index 0000000000000000000000000000000000000000..6df9596c76fd70bcea4dbe11096d7c7e682a6a56 Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/pred_A_5N.png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/pred_C_10N (Truncated).png b/ml_many_neutrons/ml_diagnostic_plots/pred_C_10N (Truncated).png new file mode 100644 index 0000000000000000000000000000000000000000..d744005d2dcc9837d1e36d595cd2a9ea79fc6398 Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/pred_C_10N (Truncated).png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/pred_C_10N.png b/ml_many_neutrons/ml_diagnostic_plots/pred_C_10N.png new file mode 100644 index 0000000000000000000000000000000000000000..fe87109c6d99dbeacbc75a7435a56dd2be40e580 Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/pred_C_10N.png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/pred_C_5N (Truncated).png b/ml_many_neutrons/ml_diagnostic_plots/pred_C_5N (Truncated).png new file mode 100644 index 0000000000000000000000000000000000000000..dc7d97c6a4b90a46af0f8bf534960168a59e9451 Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/pred_C_5N (Truncated).png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/pred_C_5N.png b/ml_many_neutrons/ml_diagnostic_plots/pred_C_5N.png new file mode 100644 index 0000000000000000000000000000000000000000..2f7ca5d95a3a97d8ae9e801e54a57f8214ec4553 Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/pred_C_5N.png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/recon_A_10N (Truncated).png b/ml_many_neutrons/ml_diagnostic_plots/recon_A_10N (Truncated).png new file mode 100644 index 0000000000000000000000000000000000000000..e9106d205cee172401fafeec141d9fe0e87b29ee Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/recon_A_10N (Truncated).png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/recon_A_10N.png b/ml_many_neutrons/ml_diagnostic_plots/recon_A_10N.png new file mode 100644 index 0000000000000000000000000000000000000000..bbee77c388e6ceda2d2da467512f5a1729ed0356 Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/recon_A_10N.png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/recon_A_5N (Truncated).png b/ml_many_neutrons/ml_diagnostic_plots/recon_A_5N (Truncated).png new file mode 100644 index 0000000000000000000000000000000000000000..37c148ac318c362cebbce09f5ddd9e932dc6d89c Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/recon_A_5N (Truncated).png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/recon_A_5N.png b/ml_many_neutrons/ml_diagnostic_plots/recon_A_5N.png new file mode 100644 index 0000000000000000000000000000000000000000..12511764e24ed2d996e3de648ae8471f39df4030 Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/recon_A_5N.png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/recon_C_10N (Truncated).png b/ml_many_neutrons/ml_diagnostic_plots/recon_C_10N (Truncated).png new file mode 100644 index 0000000000000000000000000000000000000000..7ee5b2b76ee54baac84ab0ab51a2e328d7b41ff0 Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/recon_C_10N (Truncated).png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/recon_C_10N.png b/ml_many_neutrons/ml_diagnostic_plots/recon_C_10N.png new file mode 100644 index 0000000000000000000000000000000000000000..b23c4c2f638440bdaffbd698f5fef3417b48c663 Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/recon_C_10N.png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/recon_C_5N (Truncated).png b/ml_many_neutrons/ml_diagnostic_plots/recon_C_5N (Truncated).png new file mode 100644 index 0000000000000000000000000000000000000000..114e31a3b4c9f696959dbb0b362c196d6ea73e8a Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/recon_C_5N (Truncated).png differ diff --git a/ml_many_neutrons/ml_diagnostic_plots/recon_C_5N.png b/ml_many_neutrons/ml_diagnostic_plots/recon_C_5N.png new file mode 100644 index 0000000000000000000000000000000000000000..f5ea5a095a79b14c756e2b2f457f726d54667828 Binary files /dev/null and b/ml_many_neutrons/ml_diagnostic_plots/recon_C_5N.png differ diff --git a/ml_many_neutrons/rootToNumpy.py b/ml_many_neutrons/rootToNumpy.py new file mode 100644 index 0000000000000000000000000000000000000000..6c42ae5e87713682bf1a23950d8b24a9489e2c2e --- /dev/null +++ b/ml_many_neutrons/rootToNumpy.py @@ -0,0 +1,125 @@ +import ROOT +import numpy as np +from json import dumps + +FILE_GLOB_5N = "./data/zdcTopoAnalysis_5N.root" +FILE_GLOB_10N = "./data/zdcTopoAnalysis_10N.root" +# To run, call python3 ml/rootToNumpy.py in command prompt from top of ml4zdc repo + +def main(): + """ + this script reads a ROOT file with a TTree with RDataFrame and numpy-fies some of its data + based on: https://root.cern/doc/master/tmva101__Training_8py.html + and https://root.cern/doc/master/df026__AsNumpyArrays_8py.html + see also (RDataFrame): https://root.cern/doc/master/classROOT_1_1RDataFrame.html + this is also helpful (cppyy, Python-C++ bindings used by PyROOT): https://cppyy.readthedocs.io/en/latest/stl.html + + the training will be done in Python using Pytorch, TensorFlow, etc + once we have a model, we will want to see what it's doing by making plots + this could be done with matplotlib.pyplot, but if we will be showing plots to the ZDC group, + we'll need to have plots made with ROOT + we can do this with PyROOT, but I think it would be ideal to load the (trained) model in C++ + and then analyze the data as we've been doing with our scripts since eventually this is what + we'd like to do with real data + models can be exported in ONNX (Open Neural Network eXchange) format, e.g., https://pytorch.org/docs/stable/onnx.html + then, we can load the model in Python or C++ with ROOT tools: + https://indico.cern.ch/event/1176076/contributions/4939648/attachments/2474114/4245117/SOFIE@ICHEP.pdf + + it may or may not be worth it to extract the data we need (the TTree contains much more than that) + and save to a file (.npy, pytables, h5py), depending on the performance of reading directly from TTree + """ + + dataframe = ROOT.RDataFrame("zdcTree", FILE_GLOB_5N) + print(dataframe) + # print all columns and their types + columns = [str(col) for col in dataframe.GetColumnNames()] + columns_and_types = {col: dataframe.GetColumnType(col) for col in columns} + print("all branches and types:") + print(dumps(columns_and_types, indent=2)) + + # zdc_ZdcModuleTruthTotal is an option, but it includes "invisible" and "escaped" energy, + # which can't be seen in our detectors, so we'll instead sum the "EM" and "non EM" energies per module + + # unlike in C++, we can't pass a callable to Define(), but we can pass a string + # like this, which I guess will evaluate to ROOT::VecOps::operator+() + dataframe = dataframe.Define( + "zdcTruth_5N", "zdc_ZdcModuleTruthEM + zdc_ZdcModuleTruthNonEM" + ) + + # now get zdcTruth_5N branch into a numpy array + numpy_data = dataframe.AsNumpy(columns=["zdcTruth_5N"]) + zdcTruth_5N_halfNumpy = numpy_data["zdcTruth_5N"] + print("half numpy-fied zdcTruth_5N:") + print(zdcTruth_5N_halfNumpy.shape) + print(zdcTruth_5N_halfNumpy) + + print("full numpy-fied zdcTruth_5N:") + zdcTruth_5N_fullNumpy = np.stack(zdcTruth_5N_halfNumpy) + print(zdcTruth_5N_fullNumpy.shape) + print(zdcTruth_5N_fullNumpy) + + # split the data into the different sides + # zdcTruth_5N is a vector with length 14; the two sides are concatenated + # the last entry in each side is unused + # for one side, the order is ["EM", "HAD1" ,"HAD2" ,"HAD3" ,"RPD" ,"BRAN", (unused)] + zdcTruth_5N_fullNumpy_sideC = zdcTruth_5N_fullNumpy[:, :6] + print("side C numpy-fied zdcTruth_5N:") + print(zdcTruth_5N_fullNumpy_sideC.shape) + print(zdcTruth_5N_fullNumpy_sideC) + zdcTruth_5N_fullNumpy_sideA = zdcTruth_5N_fullNumpy[:, 7:13] + print("side A numpy-fied zdcTruth_5N:") + print(zdcTruth_5N_fullNumpy_sideA.shape) + print(zdcTruth_5N_fullNumpy_sideA) + + np.save("zdcTruth_5N_fullNumpy_sideC.npy", zdcTruth_5N_fullNumpy_sideC) + np.save("zdcTruth_5N_fullNumpy_sideA.npy", zdcTruth_5N_fullNumpy_sideA) + + dataframe = ROOT.RDataFrame("zdcTree", FILE_GLOB_10N) + print(dataframe) + # print all columns and their types + columns = [str(col) for col in dataframe.GetColumnNames()] + columns_and_types = {col: dataframe.GetColumnType(col) for col in columns} + print("all branches and types:") + print(dumps(columns_and_types, indent=2)) + + # zdc_ZdcModuleTruthTotal is an option, but it includes "invisible" and "escaped" energy, + # which can't be seen in our detectors, so we'll instead sum the "EM" and "non EM" energies per module + + # unlike in C++, we can't pass a callable to Define(), but we can pass a string + # like this, which I guess will evaluate to ROOT::VecOps::operator+() + dataframe = dataframe.Define( + "zdcTruth_10N", "zdc_ZdcModuleTruthEM + zdc_ZdcModuleTruthNonEM" + ) + + # now get zdcTruth_10N branch into a numpy array + numpy_data = dataframe.AsNumpy(columns=["zdcTruth_10N"]) + zdcTruth_10N_halfNumpy = numpy_data["zdcTruth_10N"] + print("half numpy-fied zdcTruth_10N:") + print(zdcTruth_10N_halfNumpy.shape) + print(zdcTruth_10N_halfNumpy) + + print("full numpy-fied zdcTruth_10N:") + zdcTruth_10N_fullNumpy = np.stack(zdcTruth_10N_halfNumpy) + print(zdcTruth_10N_fullNumpy.shape) + print(zdcTruth_10N_fullNumpy) + + # split the data into the different sides + # zdcTruth_10N is a vector with length 14; the two sides are concatenated + # the last entry in each side is unused + # for one side, the order is ["EM", "HAD1" ,"HAD2" ,"HAD3" ,"RPD" ,"BRAN", (unused)] + zdcTruth_10N_fullNumpy_sideC = zdcTruth_10N_fullNumpy[:, :6] + print("side C numpy-fied zdcTruth_10N:") + print(zdcTruth_10N_fullNumpy_sideC.shape) + print(zdcTruth_10N_fullNumpy_sideC) + zdcTruth_10N_fullNumpy_sideA = zdcTruth_10N_fullNumpy[:, 7:13] + print("side A numpy-fied zdcTruth_10N:") + print(zdcTruth_10N_fullNumpy_sideA.shape) + print(zdcTruth_10N_fullNumpy_sideA) + + np.save("zdcTruth_10N_fullNumpy_sideC.npy", zdcTruth_10N_fullNumpy_sideC) + np.save("zdcTruth_10N_fullNumpy_sideA.npy", zdcTruth_10N_fullNumpy_sideA) + + return + +if __name__ == "__main__": + main() diff --git a/ml_many_neutrons/tree_model_predict.py b/ml_many_neutrons/tree_model_predict.py new file mode 100644 index 0000000000000000000000000000000000000000..5738fc36b72d5ba49a26e6c0e0d3eda84a7e9d45 --- /dev/null +++ b/ml_many_neutrons/tree_model_predict.py @@ -0,0 +1,132 @@ +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.model_selection import train_test_split +import numpy as np +import matplotlib.pyplot as plt +from scipy.stats import norm + +def main(): + zdc_sideA_5N = np.load("./zdcTruth_5N_fullNumpy_sideA.npy") + zdc_sideC_5N = np.load("./zdcTruth_5N_fullNumpy_sideC.npy") + zdc_sideA_10N = np.load("./zdcTruth_10N_fullNumpy_sideA.npy") + zdc_sideC_10N = np.load("./zdcTruth_10N_fullNumpy_sideC.npy") + + print(zdc_sideA_5N[:5]) + print(zdc_sideC_5N[:5]) + print(zdc_sideA_10N[:5]) + print(zdc_sideC_10N[:5]) + + tree_depth = 4 + training_ratio = 0.8 + + A_5N = xg(zdc_sideA_5N[:, :5], zdc_sideA_5N[:, 5], tree_depth, training_ratio) + C_5N = xg(zdc_sideC_5N[:, :5], zdc_sideC_5N[:, 5], tree_depth, training_ratio) + A_10N = xg(zdc_sideA_10N[:, :5], zdc_sideA_10N[:, 5], tree_depth, training_ratio) + C_10N = xg(zdc_sideC_10N[:, :5], zdc_sideC_10N[:, 5], tree_depth, training_ratio) + + pred_BRAN_A_5N = A_5N.predict(zdc_sideA_5N[:, :5]) + pred_BRAN_C_5N = C_5N.predict(zdc_sideC_5N[:, :5]) + pred_BRAN_A_10N = A_10N.predict(zdc_sideA_10N[:, :5]) + pred_BRAN_C_10N = C_10N.predict(zdc_sideC_10N[:, :5]) + + BRAN_A_5N = zdc_sideA_5N[:, 5] + BRAN_C_5N = zdc_sideC_5N[:, 5] + BRAN_A_10N = zdc_sideA_10N[:, 5] + BRAN_C_10N = zdc_sideC_10N[:, 5] + + plotSpike(zdc_sideA_5N[:, :5], pred_BRAN_A_5N, "Side A, 5N, predicted energy freq", "pred_A_5N") + plotSpike(zdc_sideC_5N[:, :5], pred_BRAN_C_5N, "Side C, 5N, predicted energy freq", "pred_C_5N") + plotSpike(zdc_sideA_10N[:, :5], pred_BRAN_A_10N, "Side A, 10N, predicted energy freq", "pred_A_10N") + plotSpike(zdc_sideC_10N[:, :5], pred_BRAN_C_10N, "Side C, 10N, predicted energy freq", "pred_C_10N") + + plotSpike(zdc_sideA_5N[:, :5], BRAN_A_5N, "Side A, 5N, truth energy freq", "recon_A_5N") + plotSpike(zdc_sideC_5N[:, :5], BRAN_C_5N, "Side C, 5N, truth energy freq", "recon_C_5N") + plotSpike(zdc_sideA_10N[:, :5], BRAN_A_10N, "Side A, 10N, truth energy freq", "recon_A_10N") + plotSpike(zdc_sideC_10N[:, :5], BRAN_C_10N, "Side C, 10N, truth energy freq", "recon_C_10N") + + +def xg(X, y, tree_depth, training_ratio): + + X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = training_ratio) + model = 0 + if tree_depth < 1: + model = HistGradientBoostingRegressor() + else: + model = HistGradientBoostingRegressor(max_depth = tree_depth) + + model.fit(X_train, y_train) + + return model + +def plotSpike(predictor_data, response_data, title1, docname1,): + predSum = np.sum(predictor_data, axis = 1) + totSum = predSum + response_data + + meanPred = np.average(predSum) + meanTot = np.average(totSum) + stdPred = np.std(predSum) + stdTot = np.std(totSum) + + c = 3 + highPred, lowPred = meanPred + c*stdPred, meanPred - c*stdPred + highTot, lowTot = meanTot + c*stdTot, meanTot - c*stdTot + + plt.hist(predSum, label = f"EM + RPD + HAD123", bins = 1000) + plt.hist(totSum, label = f"EM + BRAN + RPD + HAD123", bins = 1000, alpha = 0.5) + plt.axvline(x = meanPred, color = "blue", label = "Mean w/o BRAN") + plt.axvline(x = lowPred, color = 'blue', label = f"-{c} sigma w/o BRAN") + plt.axvline(x = highPred, color = 'blue', label = f"+{c} sigma w/o BRAN") + plt.axvline(x = meanTot, color = "orange", label = "Mean with BRAN") + plt.axvline(x = lowTot, color = 'orange', label = f"-{c} sigma with BRAN") + plt.axvline(x = highTot, color = 'orange', label = f"+{c} sigma with BRAN") + + plt.title(title1) + plt.xlabel("Total Energy") + plt.ylabel("Frequency") + plt.legend(loc = "upper left", prop = {'size': 8}) + plt.savefig(f"ml_diagnostic_plots/{docname1}.png") + plt.clf() + + # Cutoffs for +/- 2 sigma + cond1 = predSum > lowPred + cond2 = predSum < highPred + cond = np.logical_and(cond1, cond2) + gausPred = predSum[cond] + + cond1 = totSum > lowTot + cond2 = totSum < highTot + cond = np.logical_and(cond1, cond2) + gausTot = totSum[cond] + + (newMeanPred, newStdPred) = norm.fit(gausPred) + (newMeanTot, newStdTot) = norm.fit(gausTot) + + print(title1) + print("Energy without BRAN") + print(f'mean: {round(newMeanPred, 3)}', f'sigma: {round(newStdPred, 3)}', f'resolution: {round(newStdPred/newMeanPred, 3)}') + print("Energy with BRAN") + print(f'mean: {round(newMeanTot, 3)}', f'sigma: {round(newStdTot, 3)}', f'resolution: {round(newStdTot / newMeanTot, 3)}') + print((newStdPred/newMeanPred - newStdTot/newMeanTot)/(newStdPred/newMeanPred)) + print(((stdPred/meanPred - stdTot/meanTot)/(stdPred/meanPred))) + print() + + fact = 900 + + nP, binsP, patchesP = plt.hist(gausPred, label = f"EM + RPD + HAD123", bins = 1000) + nT, binsT, patchesT = plt.hist(gausTot, label = f"EM + BRAN + RPD + HAD123", bins = 1000, alpha = 0.5) + yP = 428800*fact*norm.pdf(binsP, newMeanPred, newStdPred) + lP = plt.plot(binsP, yP, 'b--', linewidth=2) + yT = 428800*fact*norm.pdf(binsT, newMeanTot, newStdTot) + lT = plt.plot(binsT, yT, 'r--', linewidth=2) + plt.axvline(x = newMeanPred, color = "blue", label = "Mean w/o BRAN") + plt.axvline(x = newMeanTot, color = "orange", label = "Mean with BRAN") + plt.title(f"{title1} (Trunc)") + plt.xlabel("Total Energy") + plt.ylabel("Frequency") + plt.legend(loc = "upper left", prop = {'size': 8}) + plt.savefig(f"ml_diagnostic_plots/{docname1} (Truncated).png") + plt.clf() + +################################################################################################## + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/ml_many_neutrons/zdcTruth_10N_fullNumpy_sideA.npy b/ml_many_neutrons/zdcTruth_10N_fullNumpy_sideA.npy new file mode 100644 index 0000000000000000000000000000000000000000..4182aee5774f4b2cd1bd852c5cdc6446c8cf48f6 Binary files /dev/null and b/ml_many_neutrons/zdcTruth_10N_fullNumpy_sideA.npy differ diff --git a/ml_many_neutrons/zdcTruth_10N_fullNumpy_sideC.npy b/ml_many_neutrons/zdcTruth_10N_fullNumpy_sideC.npy new file mode 100644 index 0000000000000000000000000000000000000000..3e79decafdfe52c9f93a1aff6177c2cde95b0741 Binary files /dev/null and b/ml_many_neutrons/zdcTruth_10N_fullNumpy_sideC.npy differ diff --git a/ml_many_neutrons/zdcTruth_5N_fullNumpy_sideA.npy b/ml_many_neutrons/zdcTruth_5N_fullNumpy_sideA.npy new file mode 100644 index 0000000000000000000000000000000000000000..fec31c9216be4a2232c210a4a04b7eba9db8a031 Binary files /dev/null and b/ml_many_neutrons/zdcTruth_5N_fullNumpy_sideA.npy differ diff --git a/ml_many_neutrons/zdcTruth_5N_fullNumpy_sideC.npy b/ml_many_neutrons/zdcTruth_5N_fullNumpy_sideC.npy new file mode 100644 index 0000000000000000000000000000000000000000..e6b59952c151ee4a85ceedb92072cc0c87d5018e Binary files /dev/null and b/ml_many_neutrons/zdcTruth_5N_fullNumpy_sideC.npy differ diff --git a/plots/all_vs_bran.root b/plots/all_vs_bran.root index 331fd2e0ec05e97be2153e79aa28c98950893a4c..79e3e014c6d2247072812d80eb6e11117fcd9765 100644 Binary files a/plots/all_vs_bran.root and b/plots/all_vs_bran.root differ diff --git a/plots/energy_dists.root b/plots/energy_dists.root index 0a50325ee94d0d2ee6037fb94d3d9fdeb23171f0..41dfc1188f847468c833f36ac3a6b5e186411fdc 100644 Binary files a/plots/energy_dists.root and b/plots/energy_dists.root differ diff --git a/scripts/plot_all_vs_bran.cpp b/scripts/plot_all_vs_bran.cpp index 33aece4b650356ff4e5ee2fd46f2c551f37316c5..5166fce32a8ecc2868a66a6836a06d8bfe6eb0ea 100644 --- a/scripts/plot_all_vs_bran.cpp +++ b/scripts/plot_all_vs_bran.cpp @@ -6,8 +6,8 @@ #include "../include/ZDCModule.hpp" #include "../include/Axis.hpp" -std::string const SIM_FILE_PATH = "../data/SingleNeutronNew.2024.05.19_NTUP.root"; -std::string const OUT_FILE_PATH = "../plots/all_vs_bran.root"; +std::string const SIM_FILE_PATH = "./data/zdcTopoAnalysis_1N.root"; +std::string const OUT_FILE_PATH = "./plots/all_vs_bran.root"; /** * @brief plot EM module truth vs BRAN truth diff --git a/scripts/plot_energy_dists.cpp b/scripts/plot_energy_dists.cpp index 7b1258ac38f063fe87d5cc71bacc462515b0f493..518c6ebc807f0a2a0d2a1a2cd65adb97abea2476 100644 --- a/scripts/plot_energy_dists.cpp +++ b/scripts/plot_energy_dists.cpp @@ -5,12 +5,13 @@ #include "TCanvas.h" #include "TLegend.h" #include "TLegendEntry.h" +#include <iostream> #include "../include/ZDCModule.hpp" #include "../include/Axis.hpp" -std::string const SIM_FILE_PATH = "../data/SingleNeutronNew.2024.05.19_NTUP.root"; -std::string const OUT_FILE_PATH = "../plots/energy_dists.root"; +std::string const SIM_FILE_PATH = "./data/SingleNeutronNew.2024.05.19_NTUP.root"; +std::string const OUT_FILE_PATH = "./plots/energy_dists.root"; namespace axisConfig = axis::singleNeutron; @@ -114,6 +115,9 @@ inline void plot_energy_dists() { leg->Draw(); } TFile* plotFile = TFile::Open(OUT_FILE_PATH.c_str(), "RECREATE"); + if (!plotFile) { + std::cout << "plotFile is null" << std::endl; + } for (auto const& side : SIDES) { overlays.at(side)->Write(); }