Commit 54a3c7cf authored by Mason Housenga

Merge branch 'rootToNumpy' into 'main'

merge ml start, read TTree and export to numpy into main

See merge request !4
parents 85d27a7f 6117f255
import ROOT
import numpy as np
from json import dumps
FILE_GLOB = "../data/zdcTopoAnalysis_1N.root"
def main():
    """
    Read a ROOT file containing a TTree with RDataFrame and numpy-fy some of its data.

    Based on: https://root.cern/doc/master/tmva101__Training_8py.html
    and https://root.cern/doc/master/df026__AsNumpyArrays_8py.html
    See also (RDataFrame): https://root.cern/doc/master/classROOT_1_1RDataFrame.html
    Also helpful (cppyy, the Python-C++ bindings used by PyROOT): https://cppyy.readthedocs.io/en/latest/stl.html

    The training will be done in Python using PyTorch, TensorFlow, etc.
    Once we have a model, we will want to see what it is doing by making plots.
    This could be done with matplotlib.pyplot, but if we will be showing plots to the ZDC group,
    we will need plots made with ROOT. We can do that with PyROOT, but I think it would be ideal
    to load the (trained) model in C++ and then analyze the data as we have been doing with our
    scripts, since eventually this is what we would like to do with real data.
    Models can be exported in ONNX (Open Neural Network eXchange) format, e.g.,
    https://pytorch.org/docs/stable/onnx.html (see the export_to_onnx_sketch() sketch below).
    The model can then be loaded in Python or C++ with ROOT tools (SOFIE):
    https://indico.cern.ch/event/1176076/contributions/4939648/attachments/2474114/4245117/SOFIE@ICHEP.pdf

    It may or may not be worth extracting only the data we need (the TTree contains much more
    than that) and saving it to a file (.npy, pytables, h5py; see save_and_reload_numpy_sketch()
    below), depending on the performance of reading directly from the TTree.
    """
    dataframe = ROOT.RDataFrame("zdcTree", FILE_GLOB)

    # print all columns and their types
    columns = [str(col) for col in dataframe.GetColumnNames()]
    columns_and_types = {col: dataframe.GetColumnType(col) for col in columns}
    print("all branches and types:")
    print(dumps(columns_and_types, indent=2))

    # zdc_ZdcModuleTruthTotal is an option, but it includes "invisible" and "escaped" energy,
    # which can't be seen in our detectors, so we'll instead sum the "EM" and "non EM" energies per module
    # unlike in C++, we can't pass a callable to Define(), but we can pass a string
    # like this, which I guess will evaluate to ROOT::VecOps::operator+()
    dataframe = dataframe.Define(
        "zdc_ZdcModuleTruthEMNonEM", "zdc_ZdcModuleTruthEM + zdc_ZdcModuleTruthNonEM"
    )

    # now get the zdc_ZdcModuleTruthEMNonEM branch into a numpy array
    numpy_data = dataframe.AsNumpy(columns=["zdc_ZdcModuleTruthEMNonEM"])
    zdc_ZdcModuleTruthEMNonEM_halfNumpy = numpy_data["zdc_ZdcModuleTruthEMNonEM"]
    print("half numpy-fied zdc_ZdcModuleTruthEMNonEM:")
    print(zdc_ZdcModuleTruthEMNonEM_halfNumpy.shape)
    print(zdc_ZdcModuleTruthEMNonEM_halfNumpy)

    print("full numpy-fied zdc_ZdcModuleTruthEMNonEM:")
    zdc_ZdcModuleTruthEMNonEM_fullNumpy = np.stack(zdc_ZdcModuleTruthEMNonEM_halfNumpy)
    print(zdc_ZdcModuleTruthEMNonEM_fullNumpy.shape)
    print(zdc_ZdcModuleTruthEMNonEM_fullNumpy)

    # split the data into the different sides
    # zdc_ZdcModuleTruthEMNonEM is a vector with length 14; the two sides are concatenated
    # the last entry in each side is unused
    # for one side, the order is ["EM", "HAD1", "HAD2", "HAD3", "RPD", "BRAN", (unused)]
    zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC = zdc_ZdcModuleTruthEMNonEM_fullNumpy[:, :6]
    print("side C numpy-fied zdc_ZdcModuleTruthEMNonEM:")
    print(zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC.shape)
    print(zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideC)

    zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA = zdc_ZdcModuleTruthEMNonEM_fullNumpy[:, 7:13]
    print("side A numpy-fied zdc_ZdcModuleTruthEMNonEM:")
    print(zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA.shape)
    print(zdc_ZdcModuleTruthEMNonEM_fullNumpy_sideA)
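# Hedged sketch of the ONNX export mentioned in main()'s docstring, assuming the model ends up
# being a PyTorch module. `model`, `example_input`, and "zdc_model.onnx" are placeholders, and
# torch is not otherwise a dependency of this script. Nothing above calls this function.
def export_to_onnx_sketch(model, example_input, path="zdc_model.onnx"):
    """Export a trained torch.nn.Module to ONNX so it could later be loaded from C++ (e.g., with SOFIE)."""
    import torch  # local import so the rest of the script runs without PyTorch installed

    torch.onnx.export(
        model,          # the trained torch.nn.Module
        example_input,  # a representative input tensor, used to trace the model
        path,
        input_names=["zdc_modules"],
        output_names=["prediction"],
    )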
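# Hedged sketch, also not called above: if reading straight from the TTree turns out to be slow,
# the numpy-fied branch could be cached to disk as suggested in main()'s docstring.
# The file name "zdc_truthEMNonEM.npy" is a placeholder, not something this repo defines.
def save_and_reload_numpy_sketch(array, path="zdc_truthEMNonEM.npy"):
    """Save a numpy-fied branch to a .npy file and read it back, skipping the ROOT step next time."""
    np.save(path, array)  # writes a binary .npy file
    return np.load(path)  # later runs could load this instead of re-reading the TTree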
if __name__ == "__main__":
    main()