diff --git a/Cargo.lock b/Cargo.lock
index 6642aef7745bf91996cb283de2c20c34953b3e90..0835939abb2314765a7ba7ee4ef90eefcd8e3144 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1366,7 +1366,29 @@ dependencies = [
 ]
 
 [[package]]
-name = "juno_grape_reduction"
+name = "juno_grape_fft"
+version = "0.1.0"
+dependencies = [
+ "async-std",
+ "grape_sim",
+ "hercules_rt",
+ "juno_build",
+ "with_builtin_macros",
+]
+
+[[package]]
+name = "juno_grape_reduction_host"
+version = "0.1.0"
+dependencies = [
+ "async-std",
+ "grape_sim",
+ "hercules_rt",
+ "juno_build",
+ "with_builtin_macros",
+]
+
+[[package]]
+name = "juno_grape_reduction_tree"
 version = "0.1.0"
 dependencies = [
  "async-std",
diff --git a/Cargo.toml b/Cargo.toml
index fd37ab44b5fe189caa1611962a040e7c836b6bfe..75521f82e1b6bf1f11c4cc6834918a8eea679db2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,9 +36,10 @@ members = [
 	"juno_samples/schedule_test",
 	"juno_samples/simple3",
 	"juno_samples/grape",
-	"juno_samples/grape_reduction",
+	"juno_samples/grape_reduction_tree",
 	"juno_samples/grape_conv",
-	"juno_samples/grape_reduction",
+	"juno_samples/grape_fft",
+	"juno_samples/grape_reduction_host",
 	"juno_scheduler",
 	"juno_utils",
 ]
diff --git a/grape_sim/src/lib.rs b/grape_sim/src/lib.rs
index 06541a8a5e70105320517d71e0a35afe5aa590d0..9ca5469ebafd3742659debce7a66e66e9801a338 100644
--- a/grape_sim/src/lib.rs
+++ b/grape_sim/src/lib.rs
@@ -65,7 +65,8 @@ pub fn compute() -> [i16; 8] {
                 println!("row: {}, col: {}, ins: {:?} op: {:?}", row, col, ins, op);
 
                 let result = match op {
-                    hercules_cg::FuOp::Add => ins.0 + ins.1,
+                    hercules_cg::FuOp::Add => ins.0.wrapping_add(*ins.1),
+                    hercules_cg::FuOp::Sub => ins.0.wrapping_sub(*ins.1),
                     hercules_cg::FuOp::Mult => ins.0 * ins.1,
                     hercules_cg::FuOp::PassA => *ins.0,
                     hercules_cg::FuOp::Default => {
@@ -96,7 +97,7 @@ pub unsafe extern "C" fn await_valid() {}
 pub unsafe extern "C" fn read_data(outputs: *mut i16) {
     // This forces the computation.
     let out = compute();
-    print!("out: {:?}", out);
+    print!("out: {:?}\n", out);
     for i in 0..7 {
         *outputs.add(i) = out[i];
     }
diff --git a/hercules_cg/src/grape.rs b/hercules_cg/src/grape.rs
index a7093f777a034be0a89044360fa04c169e14b6ea..085e1f9d53d461bbebde4a5c6f95e6bf31b1d3e7 100644
--- a/hercules_cg/src/grape.rs
+++ b/hercules_cg/src/grape.rs
@@ -74,6 +74,7 @@ impl FunctionalUnit {
 
         let vec = match self.op_type {
             FuOp::Add => vec![0, 0, 0, 0],
+            FuOp::Sub => vec![0, 1, 1, 1],
             FuOp::Mult => vec![0, 0, 1, 0],
             FuOp::PassA => vec![1, 1, 1, 1],
             FuOp::Default => vec![1, 1, 1, 1],
@@ -133,6 +134,7 @@ where
 #[derive(Clone, Debug, Copy, PartialEq, Serialize, Deserialize)]
 pub enum FuOp {
     Add,
+    Sub,
     Mult,
     PassA,
     Default, // ...
@@ -441,6 +443,8 @@ where
         // let binary_string: String = "010100101".to_owned();
         std::fs::write("debug.txt", debug_str);
 
+        // todo!();
+
         // write!(output, "{}", string);
 
         // todo!();
@@ -1199,7 +1203,7 @@ where
         for (i, item) in input_nodes.iter().enumerate() {
             param_map.insert(*item, i);
         }
-        return self.schedule_row_recursive(0, param_map, input_mapping, config);
+        return self.schedule_row_recursive(0, param_map, input_mapping, config, HashSet::new());
     }
 
     fn schedule_row_recursive(
@@ -1208,6 +1212,7 @@ where
         prev_mapping: HashMap<NodeID, usize>, // A node mapping for the previous row
         mut input_mapping: Vec<NodeID>,
         mut config: SliceDesc<H, W>,
+        mut computed_nodes: HashSet<NodeID>,
     ) -> Result<
         (SliceDesc<H, W>, HashMap<NodeID, usize>, Vec<NodeID>),
         (SliceDesc<H, W>, HashMap<NodeID, usize>, Vec<NodeID>),
@@ -1326,6 +1331,23 @@ where
         dedup(&mut choices);
         // Remove duplicates without changing order.
 
+        // Drop killed nodes: a candidate whose users have all already been
+        // computed is filtered out of `choices`.
+        // NOTE: `all` is true for an empty user list, so user-less nodes are dropped too.
+        println!("computed_nodes: {:?}", computed_nodes);
+
+        choices = choices
+            .iter()
+            .filter(|choice| {
+                !self
+                    .def_use_map
+                    .get_users(**choice)
+                    .iter()
+                    .all(|user| computed_nodes.contains(user))
+            })
+            .cloned()
+            .collect();
+
         println!("num choices: {:?}", choices.clone().len());
         println!("choices: {:?}", choices.clone());
 
@@ -1333,6 +1355,7 @@ where
         for combo in choices.iter().combinations(W.min(choices.len())) {
             println!("combo: {:?}", combo);
             let mut next_mapping = HashMap::new();
+            let mut next_computed_nodes = computed_nodes.clone();
 
             // Heurestic (Correctness): Just assume the first 8 are good choices.
             for (col, node) in combo.iter().take(W).enumerate() {
@@ -1361,6 +1384,7 @@ where
                 } else {
                     // compute it
                     next_mapping.insert(**node, col);
+                    next_computed_nodes.insert(**node);
 
                     // Collect inputs
                     let inputs: Vec<usize> = get_uses(&self.function.nodes[node.idx()])
@@ -1388,7 +1412,7 @@ where
                     let op_type = match self.function.nodes[node.idx()] {
                         Node::Binary { left, right, op } => match op {
                             BinaryOperator::Add => FuOp::Add,
-                            BinaryOperator::Sub => todo!(),
+                            BinaryOperator::Sub => FuOp::Sub,
                             BinaryOperator::Mul => FuOp::Mult,
                             BinaryOperator::Div => todo!(),
                             BinaryOperator::Rem => todo!(),
@@ -1410,8 +1434,13 @@ where
                 }
             }
 
-            let schedule_attempt =
-                self.schedule_row_recursive(row + 1, next_mapping, input_mapping.clone(), config);
+            let schedule_attempt = self.schedule_row_recursive(
+                row + 1,
+                next_mapping,
+                input_mapping.clone(),
+                config,
+                next_computed_nodes.clone(),
+            );
 
             if schedule_attempt.is_ok() {
                 return schedule_attempt;
diff --git a/hercules_rt/src/grape_header.rs b/hercules_rt/src/grape_header.rs
index 63467b92a6e0fa8a53e403b671d8600212d204e5..93a93f2cf29108d923f46fcc2d0cb06145f72041 100644
--- a/hercules_rt/src/grape_header.rs
+++ b/hercules_rt/src/grape_header.rs
@@ -1,10 +1,9 @@
 extern "C" {
     fn program_bitstream(ptr: *const u8, len_bits: usize);
 
-    fn send_data(inputs: *const i16); // len is 16 
+    fn send_data(inputs: *const i16); // len is 16
 
     fn await_valid();
 
     fn read_data(outputs: *mut i16); // len is 7
-
-}
\ No newline at end of file
+}
diff --git a/hercules_rt/src/grape_lib.rs b/hercules_rt/src/grape_lib.rs
index b2be97bd4391a15df37fe784f68be30d8f74ef92..5a625ce1083b26eb8f74d9471643069a8b0c43d3 100644
--- a/hercules_rt/src/grape_lib.rs
+++ b/hercules_rt/src/grape_lib.rs
@@ -1,20 +1,7 @@
 // Grape stuff
 
 #[no_mangle]
-pub unsafe extern "C" fn program_bitstream(ptr: *const u8, len: usize) {
-    // Example: read the bytes from the pointer
-    if ptr.is_null() {
-        panic!("Null pointer passed to program_bitstream");
-    }
-
-    let slice = std::slice::from_raw_parts(ptr, len);
-
-    println!("Received bitstream of length {}", slice.len());
-
-    for (i, byte) in slice.iter().enumerate() {
-        println!("Byte {}: {:08b}", i, byte);
-    }
-}
+pub unsafe extern "C" fn program_bitstream(ptr: *const u8, len: usize) {} // no-op stub: both arguments are ignored
 
 #[no_mangle]
 pub unsafe extern "C" fn send_data(inputs: *const i16) {} // len is 16
diff --git a/juno_samples/grape_conv/src/grape.sch b/juno_samples/grape_conv/src/grape.sch
index 3aef5b9af52356456d080dcdf5ce0bca1d2263ee..664c0725c2776bf1d4fcbd88bc929fcd8caee0ee 100644
--- a/juno_samples/grape_conv/src/grape.sch
+++ b/juno_samples/grape_conv/src/grape.sch
@@ -26,5 +26,5 @@ ccp(*);
 simplify-cfg(*);
 dce(*);
 grape(wrapper);
-xdot[true](*);
+xdot[true](wrapper);
 gcm(*);
diff --git a/juno_samples/grape_conv/src/main.rs b/juno_samples/grape_conv/src/main.rs
index bf509d5d118324ef0982cbd1a5ad6cf847c34c6f..cad2a6ec5e96a682370a3238b22f630dccc79ee8 100644
--- a/juno_samples/grape_conv/src/main.rs
+++ b/juno_samples/grape_conv/src/main.rs
@@ -1,5 +1,7 @@
 #![feature(concat_idents)]
 
+use std::time::Instant;
+
 #[cfg(feature = "cuda")]
 use hercules_rt::CUDABox;
 
@@ -49,10 +51,14 @@ fn main() {
         {
             let a = HerculesCPURef::from_slice(&a);
             let b = HerculesCPURef::from_slice(&b);
+            let start = Instant::now(); // timed region also covers the debug print below
             let mut r = runner!(entry);
             let c = r.run(a, b).await;
             print!("{:?}", c);
+            let duration = start.elapsed();
+            println!("Time elapsed in some_function() is: {:?}", duration);
 
+            // Verify the run's outputs against the expected `result` values.
             assert_eq!(c, (result[0], result[1], result[2], result[3]));
         }
         #[cfg(feature = "cuda")]
diff --git a/juno_samples/grape_reduction/Cargo.toml b/juno_samples/grape_fft/Cargo.toml
similarity index 88%
rename from juno_samples/grape_reduction/Cargo.toml
rename to juno_samples/grape_fft/Cargo.toml
index 683268b6368948bc6986d344641ae9ba02f79cfe..7b479813af19ffd84a6e6e153ebb750c3a28ae01 100644
--- a/juno_samples/grape_reduction/Cargo.toml
+++ b/juno_samples/grape_fft/Cargo.toml
@@ -1,11 +1,11 @@
 [package]
-name = "juno_grape_reduction"
+name = "juno_grape_fft"
 version = "0.1.0"
 authors = ["Xavier Routh <xrouth2@illinois.edu>"]
 edition = "2021"
 
 [[bin]]
-name = "juno_grape_reduction"
+name = "juno_grape_fft"
 path = "src/main.rs"
 
 [features]
diff --git a/juno_samples/grape_fft/build.rs b/juno_samples/grape_fft/build.rs
new file mode 100644
index 0000000000000000000000000000000000000000..2897d0e019e4b1983d5b3db4b4210d0f2ce80fe9
--- /dev/null
+++ b/juno_samples/grape_fft/build.rs
@@ -0,0 +1,35 @@
+use juno_build::JunoCompiler;
+
+fn main() { // Compiles `fft.jn` with the schedule selected by the enabled cargo features.
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("fft.jn")
+            .unwrap()
+            .schedule_in_src("cpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "grape")] // NOTE(review): also runs after the CPU block when `cuda` is off
+    {
+        JunoCompiler::new()
+            .file_in_src("fft.jn")
+            .unwrap()
+            .schedule_in_src("grape.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    // NOTE(review): this change adds no `gpu.sch` for the sample; confirm before enabling `cuda`.
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("fft.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+}
diff --git a/juno_samples/grape_reduction/src/cpu.sch b/juno_samples/grape_fft/src/cpu.sch
similarity index 100%
rename from juno_samples/grape_reduction/src/cpu.sch
rename to juno_samples/grape_fft/src/cpu.sch
diff --git a/juno_samples/grape_fft/src/fft.jn b/juno_samples/grape_fft/src/fft.jn
new file mode 100644
index 0000000000000000000000000000000000000000..f8d428d43154426df96f23e6fee6052463442d56
--- /dev/null
+++ b/juno_samples/grape_fft/src/fft.jn
@@ -0,0 +1,46 @@
+fn fft(a : i16[8]) -> i16, i16, i16, i16, i16, i16, i16, i16 { // returns out[0]..out[7] as eight scalars
+  let out : i16[8];
+  let r0 = a[0]; // a[0]..a[3] -> r0..r3 (presumably the real parts; TODO confirm)
+  let r1 = a[1];
+  let r2 = a[2];
+  let r3 = a[3];
+  let i0 = a[4]; // a[4]..a[7] -> i0..i3 (presumably the imaginary parts; TODO confirm)
+  let i1 = a[5];
+  let i2 = a[6];
+  let i3 = a[7];
+  // Stage 1: sums and differences of elements 0/2 and 1/3, separately for r and i.
+  let r02 = r0 + r2;
+  let r13 = r1 + r3;
+  let i02 = i0 + i2;
+  let i13 = i1 + i3;
+  let r0m2 = r0 - r2;
+  let r1m3 = r1 - r3;
+  let i0m2 = i0 - i2;
+  let i1m3 = i1 - i3;
+  // Stage 2: combine the stage-1 terms into the eight outputs.
+  out[0] = r02 + r13;
+  out[4] = i02 + i13;
+  out[2] = r02 - r13;
+  out[6] = i02 - i13;
+  out[1] = r0m2 + i1m3; // out[1]/out[3] mix an r difference with an i difference
+  out[5] = i0m2 - r1m3; // out[5]/out[7] mix an i difference with an r difference
+  out[3] = r0m2 - i1m3;
+  out[7] = i0m2 + r1m3;
+  return out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7];
+}
+
+#[entry]
+fn entry(a: i16[8]) -> i16[8] { // repacks the eight scalars returned by fft into an array
+    let out: i16[8];
+    let a, b, c, d, e, f, g, h = fft(a); // note: the first result reuses the parameter name `a`
+    out[0] = a;
+    out[1] = b;
+    out[2] = c;
+    out[3] = d;
+    out[4] = e;
+    out[5] = f;
+    out[6] = g;
+    out[7] = h;
+    return out;
+}
+
diff --git a/juno_samples/grape_fft/src/grape.sch b/juno_samples/grape_fft/src/grape.sch
new file mode 100644
index 0000000000000000000000000000000000000000..49486d91178dce73fc41f9a6ef64162d273926df
--- /dev/null
+++ b/juno_samples/grape_fft/src/grape.sch
@@ -0,0 +1,31 @@
+gvn(*);
+phi-elim(*);
+ccp(*);
+simplify-cfg(*);
+dce(*);
+
+delete-uncalled(*);
+
+fixpoint stop after 10 {
+  forkify(*);
+  fork-guard-elim(*);
+  fork-unroll(*);
+  predication(*);
+  gvn(*);
+  phi-elim(*);
+  ccp(*);
+  simplify-cfg(*);
+  dce(*);
+  lift-dc-math(*);
+}
+
+a2p(*);
+sroa(*);
+gvn(*);
+phi-elim(*);
+ccp(*);
+simplify-cfg(*);
+dce(*);
+grape(fft);
+xdot[true](*);
+gcm(*);
diff --git a/juno_samples/grape_fft/src/main.rs b/juno_samples/grape_fft/src/main.rs
new file mode 100644
index 0000000000000000000000000000000000000000..947be6431397ae1ccbc271abcbe9843ea2e06c9c
--- /dev/null
+++ b/juno_samples/grape_fft/src/main.rs
@@ -0,0 +1,45 @@
+#![feature(concat_idents)]
+
+use std::time::Instant;
+
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
+
+use hercules_rt::{runner, HerculesCPURef};
+
+juno_build::juno!("fft");
+
+#[cfg(feature = "grape")]
+use grape_sim::*;
+/// Runs the compiled `entry` once on the inputs 1..=8, then prints the elapsed time and result.
+fn main() {
+    async_std::task::block_on(async {
+        let a: Box<[i16]> = (1..=8).collect::<Vec<_>>().into_boxed_slice();
+
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a = HerculesCPURef::from_slice(&a);
+            // Timed region: runner construction plus one run of `entry`.
+            let start = Instant::now();
+            let mut r = runner!(entry);
+            let c = r.run(a).await;
+            let duration = start.elapsed();
+            println!("Time elapsed in some_function() is: {:?}", duration);
+
+            println!("{:?}", c);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            // `fft.jn` only defines `fft` and `entry`, and `entry` takes the
+            // single input array, so run it the same way as the CPU path.
+            let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a));
+            let mut r = runner!(entry);
+            let _c = r.run(a.get_ref()).await;
+        }
+    });
+}
+
+#[test]
+fn simple3_test() { // NOTE(review): name looks copied from the `simple3` sample; consider renaming
+    main();
+}
diff --git a/juno_samples/grape_reduction_host/Cargo.toml b/juno_samples/grape_reduction_host/Cargo.toml
new file mode 100644
index 0000000000000000000000000000000000000000..f38a566c11282b4dc1413f6add670723b4bddf29
--- /dev/null
+++ b/juno_samples/grape_reduction_host/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "juno_grape_reduction_host"
+version = "0.1.0"
+authors = ["Xavier Routh <xrouth2@illinois.edu>"]
+edition = "2021"
+
+[[bin]]
+name = "juno_grape_reduction_host"
+path = "src/main.rs"
+
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+grape = []
+
+[build-dependencies]
+juno_build = { path = "../../juno_build" }
+
+[dependencies]
+juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
+grape_sim = { path = "../../grape_sim" }
+with_builtin_macros = "0.1.0"
+async-std = "*"
diff --git a/juno_samples/grape_reduction/build.rs b/juno_samples/grape_reduction_host/build.rs
similarity index 100%
rename from juno_samples/grape_reduction/build.rs
rename to juno_samples/grape_reduction_host/build.rs
diff --git a/juno_samples/grape_reduction_host/src/cpu.sch b/juno_samples/grape_reduction_host/src/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..7934b277d2ccc3daa385f95182ad6555dd4040f1
--- /dev/null
+++ b/juno_samples/grape_reduction_host/src/cpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+
+gcm(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/grape_reduction/src/grape.sch b/juno_samples/grape_reduction_host/src/grape.sch
similarity index 95%
rename from juno_samples/grape_reduction/src/grape.sch
rename to juno_samples/grape_reduction_host/src/grape.sch
index d098a3327e7790ccbea13a38e3abe7d1f2c77ef0..0265e729f52a2108cc6c07bba7fbe2bb6649f27c 100644
--- a/juno_samples/grape_reduction/src/grape.sch
+++ b/juno_samples/grape_reduction_host/src/grape.sch
@@ -8,7 +8,7 @@ inline(fake_entry);
 delete-uncalled(*);
 
 forkify(*);
-fork-tile[4, 0, false, true](*);
+fork-tile[8, 0, false, true](*);
 let a = fork-split(*);
 print[a._1_fake_entry.fj1]();
 let inner = outline(a._1_fake_entry.fj1);
@@ -53,7 +53,7 @@ fork-unroll(a._1_fake_entry.fj0);
 // xdot[true](*);
 
 
-reassociate(inner);
+// reassociate(inner);
 
 
 
diff --git a/juno_samples/grape_reduction_host/src/grape3.sch b/juno_samples/grape_reduction_host/src/grape3.sch
new file mode 100644
index 0000000000000000000000000000000000000000..248254316f8209ed3a4800c070636892e19ac07a
--- /dev/null
+++ b/juno_samples/grape_reduction_host/src/grape3.sch
@@ -0,0 +1,45 @@
+gvn(*);
+phi-elim(*);
+ccp(*);
+simplify-cfg(*);
+dce(*);
+
+inline(fake_entry);
+delete-uncalled(*);
+
+forkify(*);
+fork-guard-elim(*);
+dce(*);
+
+
+fixpoint stop after 10 {
+  forkify(*);
+  fork-guard-elim(*);
+  fork-unroll(*);
+  predication(*);
+  gvn(*);
+  phi-elim(*);
+  ccp(*);
+  simplify-cfg(*);
+  dce(*);
+  lift-dc-math(*);
+}
+// xdot[true](*);
+
+a2p(*);
+sroa(*);
+xdot[true](*);
+
+// reassociate go brr
+reassociate(*);
+xdot[true](*);
+
+gvn(*);
+phi-elim(*);
+ccp(*);
+simplify-cfg(*);
+dce(*);
+grape(fake_entry);
+xdot[true](*);
+gcm(*);
+xdot[true](*);
diff --git a/juno_samples/grape_reduction_host/src/main.rs b/juno_samples/grape_reduction_host/src/main.rs
new file mode 100644
index 0000000000000000000000000000000000000000..81e5ec0cdf12a2ca3ef365d552c4fd0841f12e33
--- /dev/null
+++ b/juno_samples/grape_reduction_host/src/main.rs
@@ -0,0 +1,50 @@
+#![feature(concat_idents)]
+
+use std::time::Instant;
+
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
+
+use hercules_rt::{runner, HerculesCPURef};
+
+juno_build::juno!("simple");
+
+#[cfg(feature = "grape")]
+use grape_sim::*;
+
+/// Sums the integers 1..=64 with the compiled `entry` and checks it against a host-side sum.
+fn main() {
+    async_std::task::block_on(async {
+        let size = 64;
+        let a: Box<[i16]> = (1..=size).collect::<Vec<_>>().into_boxed_slice();
+        let sum: i16 = a.iter().sum();
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a = HerculesCPURef::from_slice(&a);
+            // Timed region: runner construction plus one run of `entry`.
+            let start = Instant::now();
+            let mut r = runner!(entry);
+            let c = r.run(a).await;
+            let duration = start.elapsed();
+
+            println!("Time elapsed in some_function() is: {:?}", duration);
+            println!("{:?}", c);
+
+            assert_eq!(c, sum);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            // `simple.jn` exposes `entry(a: i16[64]) -> i16`; there is no second
+            // input and no `simple` function, so mirror the CPU path.
+            let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a));
+            let mut r = runner!(entry);
+            let c = r.run(a.get_ref()).await;
+            assert_eq!(c, sum);
+        }
+    });
+}
+
+#[test]
+fn simple3_test() { // NOTE(review): name looks copied from the `simple3` sample; consider renaming
+    main();
+}
diff --git a/juno_samples/grape_reduction_host/src/simple.jn b/juno_samples/grape_reduction_host/src/simple.jn
new file mode 100644
index 0000000000000000000000000000000000000000..bd94baf7c0c581038e85a83930e56b83defd91ed
--- /dev/null
+++ b/juno_samples/grape_reduction_host/src/simple.jn
@@ -0,0 +1,27 @@
+fn reduce<n: usize>(a : i16[n]) -> i16 { // adds up the elements of `a` visited by the loop below
+  let acc: i16 = 0;
+
+  for i = 0 to n {
+    acc += a[i]; // running sum is kept in an i16
+  }
+  
+  return acc;
+}
+
+fn fake_entry(a: i16[64]) -> i16 { // wrapper that calls reduce with n fixed to 64
+  @this { // the body is labeled `this`
+  let r = reduce::<64>(a);
+
+  return r;
+  }
+}
+
+#[entry]
+fn entry(a: i16[64]) -> i16 { // marked #[entry]; forwards its argument to fake_entry
+  return fake_entry(a);
+}
+
+
+
+
+
diff --git a/juno_samples/grape_reduction_tree/Cargo.toml b/juno_samples/grape_reduction_tree/Cargo.toml
new file mode 100644
index 0000000000000000000000000000000000000000..a20c606ba90d814bbd49d9783613d89d558a97c7
--- /dev/null
+++ b/juno_samples/grape_reduction_tree/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "juno_grape_reduction_tree"
+version = "0.1.0"
+authors = ["Xavier Routh <xrouth2@illinois.edu>"]
+edition = "2021"
+
+[[bin]]
+name = "juno_grape_reduction_tree"
+path = "src/main.rs"
+
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+grape = []
+
+[build-dependencies]
+juno_build = { path = "../../juno_build" }
+
+[dependencies]
+juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
+grape_sim = { path = "../../grape_sim" }
+with_builtin_macros = "0.1.0"
+async-std = "*"
diff --git a/juno_samples/grape_reduction_tree/build.rs b/juno_samples/grape_reduction_tree/build.rs
new file mode 100644
index 0000000000000000000000000000000000000000..5ee28f2ccace52074541d72014082dbe056f1f14
--- /dev/null
+++ b/juno_samples/grape_reduction_tree/build.rs
@@ -0,0 +1,35 @@
+use juno_build::JunoCompiler;
+
+fn main() { // Compiles `simple.jn` with the schedule selected by the enabled cargo features.
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("simple.jn")
+            .unwrap()
+            .schedule_in_src("cpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "grape")] // NOTE(review): also runs after the CPU block when `cuda` is off
+    {
+        JunoCompiler::new()
+            .file_in_src("simple.jn")
+            .unwrap()
+            .schedule_in_src("grape.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    // NOTE(review): this change adds no `gpu.sch` for the sample; confirm before enabling `cuda`.
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("simple.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+}
diff --git a/juno_samples/grape_reduction_tree/src/cpu.sch b/juno_samples/grape_reduction_tree/src/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..7934b277d2ccc3daa385f95182ad6555dd4040f1
--- /dev/null
+++ b/juno_samples/grape_reduction_tree/src/cpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+
+gcm(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/grape_reduction_tree/src/grape.sch b/juno_samples/grape_reduction_tree/src/grape.sch
new file mode 100644
index 0000000000000000000000000000000000000000..248254316f8209ed3a4800c070636892e19ac07a
--- /dev/null
+++ b/juno_samples/grape_reduction_tree/src/grape.sch
@@ -0,0 +1,45 @@
+gvn(*);
+phi-elim(*);
+ccp(*);
+simplify-cfg(*);
+dce(*);
+
+inline(fake_entry);
+delete-uncalled(*);
+
+forkify(*);
+fork-guard-elim(*);
+dce(*);
+
+
+fixpoint stop after 10 {
+  forkify(*);
+  fork-guard-elim(*);
+  fork-unroll(*);
+  predication(*);
+  gvn(*);
+  phi-elim(*);
+  ccp(*);
+  simplify-cfg(*);
+  dce(*);
+  lift-dc-math(*);
+}
+// xdot[true](*);
+
+a2p(*);
+sroa(*);
+xdot[true](*);
+
+// reassociate go brr
+reassociate(*);
+xdot[true](*);
+
+gvn(*);
+phi-elim(*);
+ccp(*);
+simplify-cfg(*);
+dce(*);
+grape(fake_entry);
+xdot[true](*);
+gcm(*);
+xdot[true](*);
diff --git a/juno_samples/grape_reduction/src/main.rs b/juno_samples/grape_reduction_tree/src/main.rs
similarity index 82%
rename from juno_samples/grape_reduction/src/main.rs
rename to juno_samples/grape_reduction_tree/src/main.rs
index 1c209ca181ad5e258cfe6ead86aceb19d534c9d9..d113aa0abc20ce07200273a019e0774e535c3faa 100644
--- a/juno_samples/grape_reduction/src/main.rs
+++ b/juno_samples/grape_reduction_tree/src/main.rs
@@ -1,5 +1,7 @@
 #![feature(concat_idents)]
 
+use std::time::Instant;
+
 #[cfg(feature = "cuda")]
 use hercules_rt::CUDABox;
 
@@ -17,10 +19,15 @@ fn main() {
         #[cfg(not(feature = "cuda"))]
         {
             let a = HerculesCPURef::from_slice(&a);
+            let start = Instant::now(); // timed region also covers the debug print below
+
             let mut r = runner!(entry);
             let c = r.run(a).await;
             print!("{:?}", c);
+            let duration = start.elapsed();
+            println!("Time elapsed in some_function() is: {:?}", duration);
 
+            // Verify the run's result against `sum`.
             assert_eq!(c, sum);
         }
         #[cfg(feature = "cuda")]
diff --git a/juno_samples/grape_reduction/src/simple.jn b/juno_samples/grape_reduction_tree/src/simple.jn
similarity index 100%
rename from juno_samples/grape_reduction/src/simple.jn
rename to juno_samples/grape_reduction_tree/src/simple.jn