diff --git a/Cargo.lock b/Cargo.lock
index 61cde7f161b7c4177cb781addeb2f484af3b7477..ccd42c296c953d24a081c13a4725f6a4449aa15a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1301,6 +1301,7 @@ dependencies = [
  "hercules_rt",
  "juno_build",
  "nom 8.0.0",
+ "rayon",
  "with_builtin_macros",
 ]
 
@@ -1497,6 +1498,7 @@ dependencies = [
  "hercules_rt",
  "juno_build",
  "nom 8.0.0",
+ "rayon",
  "with_builtin_macros",
 ]
 
diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index d94f0e19191a028dfaded785c460164513b712a4..5c5d35629a42bb2465b1558f5aed37fc20a1237a 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -160,7 +160,7 @@ impl<'a> RTContext<'a> {
         // Dump the function signature.
         write!(
             w,
-            "#[allow(unused_assignments,unused_variables,unused_mut,unused_parens,unused_unsafe,non_snake_case)]async unsafe fn {}_{}(",
+            "#[allow(unused_assignments,unused_variables,unused_mut,unused_parens,unused_unsafe,non_snake_case)]unsafe fn {}_{}(",
             self.module_name,
             func.name
         )?;
@@ -198,7 +198,7 @@ impl<'a> RTContext<'a> {
         }
         write!(w, ") -> ")?;
         self.write_rust_return_type(w, &func.return_types)?;
-        write!(w, " {{")?;
+        write!(w, " {{use rayon::prelude::*;")?;
 
         // Dump signatures for called device functions.
         // For single-return functions we directly expose the device function
@@ -407,9 +407,14 @@ impl<'a> RTContext<'a> {
 
                 // Emit loops for the thread IDs.
                 for (idx, factor) in factors.into_iter().enumerate() {
-                    write!(prologue, "for tid_{}_{} in 0..", id.idx(), idx)?;
+                    write!(prologue, "(0..")?;
                     self.codegen_dynamic_constant(*factor, prologue)?;
-                    write!(prologue, " {{")?;
+                    write!(
+                        prologue,
+                        ").into_par_iter().for_each(|tid_{}_{}| {{",
+                        id.idx(),
+                        idx
+                    )?;
                 }
 
                 // Emit clones of arcs used inside the fork-join.
@@ -422,13 +427,6 @@ impl<'a> RTContext<'a> {
                     }
                 }
 
-                // Spawn an async closure and push its future to a Vec.
-                write!(
-                    prologue,
-                    "fork_{}.push(::async_std::task::spawn(async move {{",
-                    id.idx()
-                )?;
-
                 // Open a new environment.
                 self.codegen_open_environment(id, prologue)?;
 
@@ -454,22 +452,15 @@ impl<'a> RTContext<'a> {
 
                 // Close the branch inside the async closure.
                 let epilogue = &mut blocks.get_mut(&id).unwrap().epilogue;
-                write!(
-                    epilogue,
-                    "::std::sync::atomic::fence(::std::sync::atomic::Ordering::Release);return;}}"
-                )?;
+                write!(epilogue, "return;}}")?;
 
                 // Close the fork's environment.
                 self.codegen_close_environment(epilogue)?;
 
-                // Close the async closure and push statement from the
-                // fork.
-                write!(epilogue, "}}));")?;
-
                 // Close the loops emitted by the fork node.
                 let fork = self.join_fork_map[&id];
                 for _ in 0..func.nodes[fork.idx()].try_fork().unwrap().1.len() {
-                    write!(epilogue, "}}")?;
+                    write!(epilogue, "}});")?;
                 }
 
                 // Close the branch for the fork outside the async closure.
@@ -479,14 +470,6 @@ impl<'a> RTContext<'a> {
                 let succ = self.control_subgraph.succs(id).next().unwrap();
                 write!(epilogue, "{} => {{", id.idx())?;
 
-                // Await the empty futures for the fork-joins, waiting for them
-                // to complete.
-                write!(
-                    epilogue,
-                    "for fut in fork_{}.drain(..) {{ fut.await; }}; ::std::sync::atomic::fence(::std::sync::atomic::Ordering::Acquire);",
-                    fork.idx(),
-                )?;
-
                 // Emit the assignments to the reduce variables in the
                 // surrounding context. It's very unfortunate that we have to do
                 // it while lowering the join node (rather than the reduce nodes
@@ -669,9 +652,9 @@ impl<'a> RTContext<'a> {
                     }
                 };
                 let postfix = match (device, is_async) {
-                    (Device::AsyncRust, false) => ".await",
+                    (Device::AsyncRust, false) => "",
                     (_, false) => "",
-                    (Device::AsyncRust, true) => ".await}))",
+                    (Device::AsyncRust, true) => "}))",
                     (_, true) => "}))",
                 };
                 write!(
@@ -1264,11 +1247,6 @@ impl<'a> RTContext<'a> {
             }
         }
 
-        // Declare Vecs for storing futures of fork-joins.
-        for fork in self.fork_tree[&root].iter() {
-            write!(w, "let mut fork_{} = vec![];", fork.idx())?;
-        }
-
         // The core executor is a Rust loop. We literally run a "control token"
         // as described in the original sea of nodes paper through the basic
         // blocks to drive execution.
@@ -1358,7 +1336,7 @@ impl<'a> RTContext<'a> {
             })
             .collect();
 
-        write!(w, "async fn run<'runner:")?;
+        write!(w, "fn run<'runner:")?;
         for (ret_idx, origins) in returned_origins.iter().enumerate() {
             if origins.iter().any(|origin| !origin.is_parameter()) {
                 write!(w, " 'r{} +", ret_idx)?;
@@ -1482,7 +1460,7 @@ impl<'a> RTContext<'a> {
         for idx in 0..func.param_types.len() {
             write!(w, "p{}, ", idx)?;
         }
-        write!(w, ").await;")?;
+        write!(w, ");")?;
         // Return the result, appropriately wrapping pointers
         if num_returns == 1 {
             if self.module.types[func.return_types[0].idx()].is_primitive() {
diff --git a/juno_samples/rodinia/cfd/Cargo.toml b/juno_samples/rodinia/cfd/Cargo.toml
index 6720b5275381594a63f31571ccf6266ebe4e46f4..e39d16ab522a208bab816ca7c70a8ef64f856499 100644
--- a/juno_samples/rodinia/cfd/Cargo.toml
+++ b/juno_samples/rodinia/cfd/Cargo.toml
@@ -21,6 +21,7 @@ juno_build = { path = "../../../juno_build" }
 juno_build = { path = "../../../juno_build" }
 hercules_rt = { path = "../../../hercules_rt" }
 async-std = "*"
+rayon = "*"
 clap = { version = "*", features = ["derive"] }
 with_builtin_macros = "0.1.0"
 nom = "*"
diff --git a/juno_samples/rodinia/cfd/benches/cfd_bench.rs b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
index 5fc73db9d369be5ef9695f37ad1f39122facf991..8be321d17e18e1c146082787c2ae873460a536ba 100644
--- a/juno_samples/rodinia/cfd/benches/cfd_bench.rs
+++ b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
@@ -46,40 +46,37 @@ fn cfd_bench(c: &mut Criterion) {
 
     group.bench_function("cfd bench euler", |b| {
         b.iter(|| {
-            async_std::task::block_on(async {
-                r.run(
-                    nelr as u64,
-                    iterations as u64,
-                    v_density.to(),
-                    v_momentum_x.to(),
-                    v_momentum_y.to(),
-                    v_momentum_z.to(),
-                    v_energy.to(),
-                    areas.to(),
-                    elements_surrounding_elements.to(),
-                    normals_x.to(),
-                    normals_y.to(),
-                    normals_z.to(),
-                    ff_variable.density,
-                    ff_variable.momentum.x,
-                    ff_variable.momentum.y,
-                    ff_variable.momentum.z,
-                    ff_variable.energy,
-                    ff_fc_density_energy.x,
-                    ff_fc_density_energy.y,
-                    ff_fc_density_energy.z,
-                    ff_fc_momentum_x.x,
-                    ff_fc_momentum_x.y,
-                    ff_fc_momentum_x.z,
-                    ff_fc_momentum_y.x,
-                    ff_fc_momentum_y.y,
-                    ff_fc_momentum_y.z,
-                    ff_fc_momentum_z.x,
-                    ff_fc_momentum_z.y,
-                    ff_fc_momentum_z.z,
-                )
-                .await
-            });
+            r.run(
+                nelr as u64,
+                iterations as u64,
+                v_density.to(),
+                v_momentum_x.to(),
+                v_momentum_y.to(),
+                v_momentum_z.to(),
+                v_energy.to(),
+                areas.to(),
+                elements_surrounding_elements.to(),
+                normals_x.to(),
+                normals_y.to(),
+                normals_z.to(),
+                ff_variable.density,
+                ff_variable.momentum.x,
+                ff_variable.momentum.y,
+                ff_variable.momentum.z,
+                ff_variable.energy,
+                ff_fc_density_energy.x,
+                ff_fc_density_energy.y,
+                ff_fc_density_energy.z,
+                ff_fc_momentum_x.x,
+                ff_fc_momentum_x.y,
+                ff_fc_momentum_x.z,
+                ff_fc_momentum_y.x,
+                ff_fc_momentum_y.y,
+                ff_fc_momentum_y.z,
+                ff_fc_momentum_z.x,
+                ff_fc_momentum_z.y,
+                ff_fc_momentum_z.z,
+            );
         })
     });
 
@@ -118,40 +115,37 @@ fn cfd_bench(c: &mut Criterion) {
 
     group.bench_function("cfd bench pre-euler", |b| {
         b.iter(|| {
-            async_std::task::block_on(async {
-                r.run(
-                    nelr as u64,
-                    iterations as u64,
-                    v_density.to(),
-                    v_momentum_x.to(),
-                    v_momentum_y.to(),
-                    v_momentum_z.to(),
-                    v_energy.to(),
-                    areas.to(),
-                    elements_surrounding_elements.to(),
-                    normals_x.to(),
-                    normals_y.to(),
-                    normals_z.to(),
-                    ff_variable.density,
-                    ff_variable.momentum.x,
-                    ff_variable.momentum.y,
-                    ff_variable.momentum.z,
-                    ff_variable.energy,
-                    ff_fc_density_energy.x,
-                    ff_fc_density_energy.y,
-                    ff_fc_density_energy.z,
-                    ff_fc_momentum_x.x,
-                    ff_fc_momentum_x.y,
-                    ff_fc_momentum_x.z,
-                    ff_fc_momentum_y.x,
-                    ff_fc_momentum_y.y,
-                    ff_fc_momentum_y.z,
-                    ff_fc_momentum_z.x,
-                    ff_fc_momentum_z.y,
-                    ff_fc_momentum_z.z,
-                )
-                .await
-            });
+            r.run(
+                nelr as u64,
+                iterations as u64,
+                v_density.to(),
+                v_momentum_x.to(),
+                v_momentum_y.to(),
+                v_momentum_z.to(),
+                v_energy.to(),
+                areas.to(),
+                elements_surrounding_elements.to(),
+                normals_x.to(),
+                normals_y.to(),
+                normals_z.to(),
+                ff_variable.density,
+                ff_variable.momentum.x,
+                ff_variable.momentum.y,
+                ff_variable.momentum.z,
+                ff_variable.energy,
+                ff_fc_density_energy.x,
+                ff_fc_density_energy.y,
+                ff_fc_density_energy.z,
+                ff_fc_momentum_x.x,
+                ff_fc_momentum_x.y,
+                ff_fc_momentum_x.z,
+                ff_fc_momentum_y.x,
+                ff_fc_momentum_y.y,
+                ff_fc_momentum_y.z,
+                ff_fc_momentum_z.x,
+                ff_fc_momentum_z.y,
+                ff_fc_momentum_z.z,
+            );
         })
     });
 }
diff --git a/juno_samples/rodinia/cfd/src/lib.rs b/juno_samples/rodinia/cfd/src/lib.rs
index d61df4c5b7bb0c7915d8605debd72e375a5482b0..bb8bf079c67f26c75ea0d83cbfc7037577754ee8 100644
--- a/juno_samples/rodinia/cfd/src/lib.rs
+++ b/juno_samples/rodinia/cfd/src/lib.rs
@@ -48,41 +48,37 @@ fn run_euler(
     let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
     let mut runner = runner!(euler);
-    let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async {
-        runner
-            .run(
-                nelr as u64,
-                iterations as u64,
-                v_density.to(),
-                v_momentum_x.to(),
-                v_momentum_y.to(),
-                v_momentum_z.to(),
-                v_energy.to(),
-                areas.to(),
-                elements_surrounding_elements.to(),
-                normals_x.to(),
-                normals_y.to(),
-                normals_z.to(),
-                ff_variable.density,
-                ff_variable.momentum.x,
-                ff_variable.momentum.y,
-                ff_variable.momentum.z,
-                ff_variable.energy,
-                ff_fc_density_energy.x,
-                ff_fc_density_energy.y,
-                ff_fc_density_energy.z,
-                ff_fc_momentum_x.x,
-                ff_fc_momentum_x.y,
-                ff_fc_momentum_x.z,
-                ff_fc_momentum_y.x,
-                ff_fc_momentum_y.y,
-                ff_fc_momentum_y.z,
-                ff_fc_momentum_z.x,
-                ff_fc_momentum_z.y,
-                ff_fc_momentum_z.z,
-            )
-            .await
-    });
+    let (density, momentum_x, momentum_y, momentum_z, energy) = runner.run(
+        nelr as u64,
+        iterations as u64,
+        v_density.to(),
+        v_momentum_x.to(),
+        v_momentum_y.to(),
+        v_momentum_z.to(),
+        v_energy.to(),
+        areas.to(),
+        elements_surrounding_elements.to(),
+        normals_x.to(),
+        normals_y.to(),
+        normals_z.to(),
+        ff_variable.density,
+        ff_variable.momentum.x,
+        ff_variable.momentum.y,
+        ff_variable.momentum.z,
+        ff_variable.energy,
+        ff_fc_density_energy.x,
+        ff_fc_density_energy.y,
+        ff_fc_density_energy.z,
+        ff_fc_momentum_x.x,
+        ff_fc_momentum_x.y,
+        ff_fc_momentum_x.z,
+        ff_fc_momentum_y.x,
+        ff_fc_momentum_y.y,
+        ff_fc_momentum_y.z,
+        ff_fc_momentum_z.x,
+        ff_fc_momentum_z.y,
+        ff_fc_momentum_z.z,
+    );
 
     Variables {
         density: AlignedSlice::from_slice(HerculesMutBox::from(density).as_slice()),
@@ -122,41 +118,37 @@ fn run_pre_euler(
     let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
     let mut runner = runner!(pre_euler);
-    let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async {
-        runner
-            .run(
-                nelr as u64,
-                iterations as u64,
-                v_density.to(),
-                v_momentum_x.to(),
-                v_momentum_y.to(),
-                v_momentum_z.to(),
-                v_energy.to(),
-                areas.to(),
-                elements_surrounding_elements.to(),
-                normals_x.to(),
-                normals_y.to(),
-                normals_z.to(),
-                ff_variable.density,
-                ff_variable.momentum.x,
-                ff_variable.momentum.y,
-                ff_variable.momentum.z,
-                ff_variable.energy,
-                ff_fc_density_energy.x,
-                ff_fc_density_energy.y,
-                ff_fc_density_energy.z,
-                ff_fc_momentum_x.x,
-                ff_fc_momentum_x.y,
-                ff_fc_momentum_x.z,
-                ff_fc_momentum_y.x,
-                ff_fc_momentum_y.y,
-                ff_fc_momentum_y.z,
-                ff_fc_momentum_z.x,
-                ff_fc_momentum_z.y,
-                ff_fc_momentum_z.z,
-            )
-            .await
-    });
+    let (density, momentum_x, momentum_y, momentum_z, energy) = runner.run(
+        nelr as u64,
+        iterations as u64,
+        v_density.to(),
+        v_momentum_x.to(),
+        v_momentum_y.to(),
+        v_momentum_z.to(),
+        v_energy.to(),
+        areas.to(),
+        elements_surrounding_elements.to(),
+        normals_x.to(),
+        normals_y.to(),
+        normals_z.to(),
+        ff_variable.density,
+        ff_variable.momentum.x,
+        ff_variable.momentum.y,
+        ff_variable.momentum.z,
+        ff_variable.energy,
+        ff_fc_density_energy.x,
+        ff_fc_density_energy.y,
+        ff_fc_density_energy.z,
+        ff_fc_momentum_x.x,
+        ff_fc_momentum_x.y,
+        ff_fc_momentum_x.z,
+        ff_fc_momentum_y.x,
+        ff_fc_momentum_y.y,
+        ff_fc_momentum_y.z,
+        ff_fc_momentum_z.x,
+        ff_fc_momentum_z.y,
+        ff_fc_momentum_z.z,
+    );
 
     Variables {
         density: AlignedSlice::from_slice(HerculesMutBox::from(density).as_slice()),
diff --git a/juno_samples/rodinia/srad/Cargo.toml b/juno_samples/rodinia/srad/Cargo.toml
index facf8c3bc7c92fe0b77dd85900c3e53307d358e5..46c2ae8e316011c9d563797ea1eedb88bd8d54d9 100644
--- a/juno_samples/rodinia/srad/Cargo.toml
+++ b/juno_samples/rodinia/srad/Cargo.toml
@@ -21,6 +21,7 @@ juno_build = { path = "../../../juno_build" }
 juno_build = { path = "../../../juno_build" }
 hercules_rt = { path = "../../../hercules_rt" }
 async-std = "*"
+rayon = "*"
 clap = { version = "*", features = ["derive"] }
 with_builtin_macros = "0.1.0"
 nom = "*"
diff --git a/juno_samples/rodinia/srad/benches/srad_bench.rs b/juno_samples/rodinia/srad/benches/srad_bench.rs
index 6af13aae5d9093bd59cf6299dfa64ac3e73209a6..2ea230ab25c6cf254d8a7914bcff629f9ae00ea7 100644
--- a/juno_samples/rodinia/srad/benches/srad_bench.rs
+++ b/juno_samples/rodinia/srad/benches/srad_bench.rs
@@ -26,17 +26,8 @@ fn srad_bench(c: &mut Criterion) {
     let mut image_h = HerculesMutBox::from(image.clone());
     group.bench_function("srad bench", |b| {
         b.iter(|| {
-            async_std::task::block_on(async {
-                r.run(
-                    nrows as u64,
-                    ncols as u64,
-                    niter as u64,
-                    image_h.to(),
-                    max,
-                    lambda,
-                )
-                .await
-            });
+            let to = image_h.to();
+            r.run(nrows as u64, ncols as u64, niter as u64, to, max, lambda);
         })
     });
 }
diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch
index 7b7a6c9e2203b8e280f323ed9e6589ab149537c0..186ef8d53f4a411cabc00f9c9294878ebf03a961 100644
--- a/juno_samples/rodinia/srad/src/cpu.sch
+++ b/juno_samples/rodinia/srad/src/cpu.sch
@@ -39,7 +39,6 @@ fork-tile[32, 0, false, false](loop2);
 let split = fork-split(loop2);
 let loop2_body = outline(split.srad_1.fj1);
 simpl!(loop2, loop2_body);
-
 inline(srad@loop2);
 delete-uncalled(*);
 
diff --git a/juno_samples/rodinia/srad/src/lib.rs b/juno_samples/rodinia/srad/src/lib.rs
index cb156d9db712155ad9e5b3f9775e3c6c29dbf22a..5d5576e48948ee5f4db858a854aa230109fc5019 100644
--- a/juno_samples/rodinia/srad/src/lib.rs
+++ b/juno_samples/rodinia/srad/src/lib.rs
@@ -27,66 +27,71 @@ pub struct SRADInputs {
 }
 
+/// Runs the `srad` runner on the input image and optionally checks the result.
+///
+/// The image named by `args.image` is read, resized to `nrows` x `ncols`, and
+/// passed to the runner together with `niter`, `max`, and `lambda`. When
+/// `output` is set, the result is written out with `write_graphics`.
+///
+/// # Panics
+///
+/// When `verify` is set, the result is compared against `rust_srad::srad` run
+/// on the same resized image; this panics if any pair of pixels, each cast to
+/// `i32`, differs by more than 2.
 pub fn srad_harness(args: SRADInputs) {
-    async_std::task::block_on(async {
-        let SRADInputs {
-            niter,
-            lambda,
-            nrows,
-            ncols,
-            image,
-            output,
-            verify,
-            output_verify,
-        } = args;
+    let SRADInputs {
+        niter,
+        lambda,
+        nrows,
+        ncols,
+        image,
+        output,
+        verify,
+        output_verify,
+    } = args;
 
-        let Image {
-            image: image_ori,
-            max,
-            rows: image_ori_rows,
-            cols: image_ori_cols,
-        } = read_graphics(image);
-        let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols);
-        let mut image_h = HerculesMutBox::from(image.clone());
+    let Image {
+        image: image_ori,
+        max,
+        rows: image_ori_rows,
+        cols: image_ori_cols,
+    } = read_graphics(image);
+    let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols);
+    let mut image_h = HerculesMutBox::from(image.clone());
 
-        let mut runner = runner!(srad);
-        let result: Vec<f32> = HerculesMutBox::from(
-            runner
-                .run(
-                    nrows as u64,
-                    ncols as u64,
-                    niter as u64,
-                    image_h.to(),
-                    max,
-                    lambda,
-                )
-                .await,
-        )
-        .as_slice()
-        .to_vec();
+    let mut runner = runner!(srad);
+    let result: Vec<f32> = HerculesMutBox::from(runner.run(
+        nrows as u64,
+        ncols as u64,
+        niter as u64,
+        image_h.to(),
+        max,
+        lambda,
+    ))
+    .as_slice()
+    .to_vec();
 
-        if let Some(output) = output {
-            write_graphics(output, &result, nrows, ncols, max);
-        }
-
-        if verify {
-            let mut rust_result = image;
-            rust_srad::srad(nrows, ncols, niter, &mut rust_result, max, lambda);
+    if let Some(output) = output {
+        write_graphics(output, &result, nrows, ncols, max);
+    }
 
-            if let Some(output) = output_verify {
-                write_graphics(output, &rust_result, nrows, ncols, max);
-            }
+    if verify {
+        let mut rust_result = image;
+        rust_srad::srad(nrows, ncols, niter, &mut rust_result, max, lambda);
 
-            let max_diff = result
-                .iter()
-                .zip(rust_result.iter())
-                .map(|(a, b)| (*a as i32 - *b as i32).abs())
-                .max()
-                .unwrap_or(0);
-            assert!(
-                max_diff <= 2,
-                "Verification failed: maximum pixel difference of {} exceeds threshold of 1",
-                max_diff
-            );
+        if let Some(output) = output_verify {
+            write_graphics(output, &rust_result, nrows, ncols, max);
         }
-    })
+
+        let max_diff = result
+            .iter()
+            .zip(rust_result.iter())
+            .map(|(a, b)| (*a as i32 - *b as i32).abs())
+            .max()
+            .unwrap_or(0);
+        assert!(
+            max_diff <= 2,
+            "Verification failed: maximum pixel difference of {} exceeds threshold of 2",
+            max_diff
+        );
+    }
 }