diff --git a/Cargo.lock b/Cargo.lock index 61cde7f161b7c4177cb781addeb2f484af3b7477..ccd42c296c953d24a081c13a4725f6a4449aa15a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1301,6 +1301,7 @@ dependencies = [ "hercules_rt", "juno_build", "nom 8.0.0", + "rayon", "with_builtin_macros", ] @@ -1497,6 +1498,7 @@ dependencies = [ "hercules_rt", "juno_build", "nom 8.0.0", + "rayon", "with_builtin_macros", ] diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index d94f0e19191a028dfaded785c460164513b712a4..5c5d35629a42bb2465b1558f5aed37fc20a1237a 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -160,7 +160,7 @@ impl<'a> RTContext<'a> { // Dump the function signature. write!( w, - "#[allow(unused_assignments,unused_variables,unused_mut,unused_parens,unused_unsafe,non_snake_case)]async unsafe fn {}_{}(", + "#[allow(unused_assignments,unused_variables,unused_mut,unused_parens,unused_unsafe,non_snake_case)]unsafe fn {}_{}(", self.module_name, func.name )?; @@ -198,7 +198,7 @@ impl<'a> RTContext<'a> { } write!(w, ") -> ")?; self.write_rust_return_type(w, &func.return_types)?; - write!(w, " {{")?; + write!(w, " {{use rayon::prelude::*;")?; // Dump signatures for called device functions. // For single-return functions we directly expose the device function @@ -407,9 +407,14 @@ impl<'a> RTContext<'a> { // Emit loops for the thread IDs. for (idx, factor) in factors.into_iter().enumerate() { - write!(prologue, "for tid_{}_{} in 0..", id.idx(), idx)?; + write!(prologue, "(0..")?; self.codegen_dynamic_constant(*factor, prologue)?; - write!(prologue, " {{")?; + write!( + prologue, + ").into_par_iter().for_each(|tid_{}_{}| {{", + id.idx(), + idx + )?; } // Emit clones of arcs used inside the fork-join. @@ -422,13 +427,6 @@ impl<'a> RTContext<'a> { } } - // Spawn an async closure and push its future to a Vec. - write!( - prologue, - "fork_{}.push(::async_std::task::spawn(async move {{", - id.idx() - )?; - // Open a new environment. self.codegen_open_environment(id, prologue)?; @@ -454,22 +452,15 @@ impl<'a> RTContext<'a> { // Close the branch inside the async closure. let epilogue = &mut blocks.get_mut(&id).unwrap().epilogue; - write!( - epilogue, - "::std::sync::atomic::fence(::std::sync::atomic::Ordering::Release);return;}}" - )?; + write!(epilogue, "return;}}")?; // Close the fork's environment. self.codegen_close_environment(epilogue)?; - // Close the async closure and push statement from the - // fork. - write!(epilogue, "}}));")?; - // Close the loops emitted by the fork node. let fork = self.join_fork_map[&id]; for _ in 0..func.nodes[fork.idx()].try_fork().unwrap().1.len() { - write!(epilogue, "}}")?; + write!(epilogue, "}});")?; } // Close the branch for the fork outside the async closure. @@ -479,14 +470,6 @@ impl<'a> RTContext<'a> { let succ = self.control_subgraph.succs(id).next().unwrap(); write!(epilogue, "{} => {{", id.idx())?; - // Await the empty futures for the fork-joins, waiting for them - // to complete. - write!( - epilogue, - "for fut in fork_{}.drain(..) {{ fut.await; }}; ::std::sync::atomic::fence(::std::sync::atomic::Ordering::Acquire);", - fork.idx(), - )?; - // Emit the assignments to the reduce variables in the // surrounding context. It's very unfortunate that we have to do // it while lowering the join node (rather than the reduce nodes @@ -669,9 +652,9 @@ impl<'a> RTContext<'a> { } }; let postfix = match (device, is_async) { - (Device::AsyncRust, false) => ".await", + (Device::AsyncRust, false) => "", (_, false) => "", - (Device::AsyncRust, true) => ".await}))", + (Device::AsyncRust, true) => "}))", (_, true) => "}))", }; write!( @@ -1264,11 +1247,6 @@ impl<'a> RTContext<'a> { } } - // Declare Vecs for storing futures of fork-joins. - for fork in self.fork_tree[&root].iter() { - write!(w, "let mut fork_{} = vec![];", fork.idx())?; - } - // The core executor is a Rust loop. We literally run a "control token" // as described in the original sea of nodes paper through the basic // blocks to drive execution. @@ -1358,7 +1336,7 @@ impl<'a> RTContext<'a> { }) .collect(); - write!(w, "async fn run<'runner:")?; + write!(w, "fn run<'runner:")?; for (ret_idx, origins) in returned_origins.iter().enumerate() { if origins.iter().any(|origin| !origin.is_parameter()) { write!(w, " 'r{} +", ret_idx)?; @@ -1482,7 +1460,7 @@ impl<'a> RTContext<'a> { for idx in 0..func.param_types.len() { write!(w, "p{}, ", idx)?; } - write!(w, ").await;")?; + write!(w, ");")?; // Return the result, appropriately wrapping pointers if num_returns == 1 { if self.module.types[func.return_types[0].idx()].is_primitive() { diff --git a/juno_samples/rodinia/cfd/Cargo.toml b/juno_samples/rodinia/cfd/Cargo.toml index 6720b5275381594a63f31571ccf6266ebe4e46f4..e39d16ab522a208bab816ca7c70a8ef64f856499 100644 --- a/juno_samples/rodinia/cfd/Cargo.toml +++ b/juno_samples/rodinia/cfd/Cargo.toml @@ -21,6 +21,7 @@ juno_build = { path = "../../../juno_build" } juno_build = { path = "../../../juno_build" } hercules_rt = { path = "../../../hercules_rt" } async-std = "*" +rayon = "*" clap = { version = "*", features = ["derive"] } with_builtin_macros = "0.1.0" nom = "*" diff --git a/juno_samples/rodinia/cfd/benches/cfd_bench.rs b/juno_samples/rodinia/cfd/benches/cfd_bench.rs index 5fc73db9d369be5ef9695f37ad1f39122facf991..8be321d17e18e1c146082787c2ae873460a536ba 100644 --- a/juno_samples/rodinia/cfd/benches/cfd_bench.rs +++ b/juno_samples/rodinia/cfd/benches/cfd_bench.rs @@ -46,40 +46,37 @@ fn cfd_bench(c: &mut Criterion) { group.bench_function("cfd bench euler", |b| { b.iter(|| { - async_std::task::block_on(async { - r.run( - nelr as u64, - iterations as u64, - v_density.to(), - v_momentum_x.to(), - v_momentum_y.to(), - v_momentum_z.to(), - v_energy.to(), - areas.to(), - elements_surrounding_elements.to(), - normals_x.to(), - normals_y.to(), - normals_z.to(), - ff_variable.density, - ff_variable.momentum.x, - ff_variable.momentum.y, - ff_variable.momentum.z, - ff_variable.energy, - ff_fc_density_energy.x, - ff_fc_density_energy.y, - ff_fc_density_energy.z, - ff_fc_momentum_x.x, - ff_fc_momentum_x.y, - ff_fc_momentum_x.z, - ff_fc_momentum_y.x, - ff_fc_momentum_y.y, - ff_fc_momentum_y.z, - ff_fc_momentum_z.x, - ff_fc_momentum_z.y, - ff_fc_momentum_z.z, - ) - .await - }); + r.run( + nelr as u64, + iterations as u64, + v_density.to(), + v_momentum_x.to(), + v_momentum_y.to(), + v_momentum_z.to(), + v_energy.to(), + areas.to(), + elements_surrounding_elements.to(), + normals_x.to(), + normals_y.to(), + normals_z.to(), + ff_variable.density, + ff_variable.momentum.x, + ff_variable.momentum.y, + ff_variable.momentum.z, + ff_variable.energy, + ff_fc_density_energy.x, + ff_fc_density_energy.y, + ff_fc_density_energy.z, + ff_fc_momentum_x.x, + ff_fc_momentum_x.y, + ff_fc_momentum_x.z, + ff_fc_momentum_y.x, + ff_fc_momentum_y.y, + ff_fc_momentum_y.z, + ff_fc_momentum_z.x, + ff_fc_momentum_z.y, + ff_fc_momentum_z.z, + ); }) }); @@ -118,40 +115,37 @@ fn cfd_bench(c: &mut Criterion) { group.bench_function("cfd bench pre-euler", |b| { b.iter(|| { - async_std::task::block_on(async { - r.run( - nelr as u64, - iterations as u64, - v_density.to(), - v_momentum_x.to(), - v_momentum_y.to(), - v_momentum_z.to(), - v_energy.to(), - areas.to(), - elements_surrounding_elements.to(), - normals_x.to(), - normals_y.to(), - normals_z.to(), - ff_variable.density, - ff_variable.momentum.x, - ff_variable.momentum.y, - ff_variable.momentum.z, - ff_variable.energy, - ff_fc_density_energy.x, - ff_fc_density_energy.y, - ff_fc_density_energy.z, - ff_fc_momentum_x.x, - ff_fc_momentum_x.y, - ff_fc_momentum_x.z, - ff_fc_momentum_y.x, - ff_fc_momentum_y.y, - ff_fc_momentum_y.z, - ff_fc_momentum_z.x, - ff_fc_momentum_z.y, - ff_fc_momentum_z.z, - ) - .await - }); + r.run( + nelr as u64, + iterations as u64, + v_density.to(), + v_momentum_x.to(), + v_momentum_y.to(), + v_momentum_z.to(), + v_energy.to(), + areas.to(), + elements_surrounding_elements.to(), + normals_x.to(), + normals_y.to(), + normals_z.to(), + ff_variable.density, + ff_variable.momentum.x, + ff_variable.momentum.y, + ff_variable.momentum.z, + ff_variable.energy, + ff_fc_density_energy.x, + ff_fc_density_energy.y, + ff_fc_density_energy.z, + ff_fc_momentum_x.x, + ff_fc_momentum_x.y, + ff_fc_momentum_x.z, + ff_fc_momentum_y.x, + ff_fc_momentum_y.y, + ff_fc_momentum_y.z, + ff_fc_momentum_z.x, + ff_fc_momentum_z.y, + ff_fc_momentum_z.z, + ); }) }); } diff --git a/juno_samples/rodinia/cfd/src/lib.rs b/juno_samples/rodinia/cfd/src/lib.rs index d61df4c5b7bb0c7915d8605debd72e375a5482b0..bb8bf079c67f26c75ea0d83cbfc7037577754ee8 100644 --- a/juno_samples/rodinia/cfd/src/lib.rs +++ b/juno_samples/rodinia/cfd/src/lib.rs @@ -48,41 +48,37 @@ fn run_euler( let normals_z = HerculesImmBox::from(normals.z.as_slice()); let mut runner = runner!(euler); - let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async { - runner - .run( - nelr as u64, - iterations as u64, - v_density.to(), - v_momentum_x.to(), - v_momentum_y.to(), - v_momentum_z.to(), - v_energy.to(), - areas.to(), - elements_surrounding_elements.to(), - normals_x.to(), - normals_y.to(), - normals_z.to(), - ff_variable.density, - ff_variable.momentum.x, - ff_variable.momentum.y, - ff_variable.momentum.z, - ff_variable.energy, - ff_fc_density_energy.x, - ff_fc_density_energy.y, - ff_fc_density_energy.z, - ff_fc_momentum_x.x, - ff_fc_momentum_x.y, - ff_fc_momentum_x.z, - ff_fc_momentum_y.x, - ff_fc_momentum_y.y, - ff_fc_momentum_y.z, - ff_fc_momentum_z.x, - ff_fc_momentum_z.y, - ff_fc_momentum_z.z, - ) - .await - }); + let (density, momentum_x, momentum_y, momentum_z, energy) = runner.run( + nelr as u64, + iterations as u64, + v_density.to(), + v_momentum_x.to(), + v_momentum_y.to(), + v_momentum_z.to(), + v_energy.to(), + areas.to(), + elements_surrounding_elements.to(), + normals_x.to(), + normals_y.to(), + normals_z.to(), + ff_variable.density, + ff_variable.momentum.x, + ff_variable.momentum.y, + ff_variable.momentum.z, + ff_variable.energy, + ff_fc_density_energy.x, + ff_fc_density_energy.y, + ff_fc_density_energy.z, + ff_fc_momentum_x.x, + ff_fc_momentum_x.y, + ff_fc_momentum_x.z, + ff_fc_momentum_y.x, + ff_fc_momentum_y.y, + ff_fc_momentum_y.z, + ff_fc_momentum_z.x, + ff_fc_momentum_z.y, + ff_fc_momentum_z.z, + ); Variables { density: AlignedSlice::from_slice(HerculesMutBox::from(density).as_slice()), @@ -122,41 +118,37 @@ fn run_pre_euler( let normals_z = HerculesImmBox::from(normals.z.as_slice()); let mut runner = runner!(pre_euler); - let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async { - runner - .run( - nelr as u64, - iterations as u64, - v_density.to(), - v_momentum_x.to(), - v_momentum_y.to(), - v_momentum_z.to(), - v_energy.to(), - areas.to(), - elements_surrounding_elements.to(), - normals_x.to(), - normals_y.to(), - normals_z.to(), - ff_variable.density, - ff_variable.momentum.x, - ff_variable.momentum.y, - ff_variable.momentum.z, - ff_variable.energy, - ff_fc_density_energy.x, - ff_fc_density_energy.y, - ff_fc_density_energy.z, - ff_fc_momentum_x.x, - ff_fc_momentum_x.y, - ff_fc_momentum_x.z, - ff_fc_momentum_y.x, - ff_fc_momentum_y.y, - ff_fc_momentum_y.z, - ff_fc_momentum_z.x, - ff_fc_momentum_z.y, - ff_fc_momentum_z.z, - ) - .await - }); + let (density, momentum_x, momentum_y, momentum_z, energy) = runner.run( + nelr as u64, + iterations as u64, + v_density.to(), + v_momentum_x.to(), + v_momentum_y.to(), + v_momentum_z.to(), + v_energy.to(), + areas.to(), + elements_surrounding_elements.to(), + normals_x.to(), + normals_y.to(), + normals_z.to(), + ff_variable.density, + ff_variable.momentum.x, + ff_variable.momentum.y, + ff_variable.momentum.z, + ff_variable.energy, + ff_fc_density_energy.x, + ff_fc_density_energy.y, + ff_fc_density_energy.z, + ff_fc_momentum_x.x, + ff_fc_momentum_x.y, + ff_fc_momentum_x.z, + ff_fc_momentum_y.x, + ff_fc_momentum_y.y, + ff_fc_momentum_y.z, + ff_fc_momentum_z.x, + ff_fc_momentum_z.y, + ff_fc_momentum_z.z, + ); Variables { density: AlignedSlice::from_slice(HerculesMutBox::from(density).as_slice()), diff --git a/juno_samples/rodinia/srad/Cargo.toml b/juno_samples/rodinia/srad/Cargo.toml index facf8c3bc7c92fe0b77dd85900c3e53307d358e5..46c2ae8e316011c9d563797ea1eedb88bd8d54d9 100644 --- a/juno_samples/rodinia/srad/Cargo.toml +++ b/juno_samples/rodinia/srad/Cargo.toml @@ -21,6 +21,7 @@ juno_build = { path = "../../../juno_build" } juno_build = { path = "../../../juno_build" } hercules_rt = { path = "../../../hercules_rt" } async-std = "*" +rayon = "*" clap = { version = "*", features = ["derive"] } with_builtin_macros = "0.1.0" nom = "*" diff --git a/juno_samples/rodinia/srad/benches/srad_bench.rs b/juno_samples/rodinia/srad/benches/srad_bench.rs index 6af13aae5d9093bd59cf6299dfa64ac3e73209a6..2ea230ab25c6cf254d8a7914bcff629f9ae00ea7 100644 --- a/juno_samples/rodinia/srad/benches/srad_bench.rs +++ b/juno_samples/rodinia/srad/benches/srad_bench.rs @@ -26,17 +26,8 @@ fn srad_bench(c: &mut Criterion) { let mut image_h = HerculesMutBox::from(image.clone()); group.bench_function("srad bench", |b| { b.iter(|| { - async_std::task::block_on(async { - r.run( - nrows as u64, - ncols as u64, - niter as u64, - image_h.to(), - max, - lambda, - ) - .await - }); + let to = image_h.to(); + r.run(nrows as u64, ncols as u64, niter as u64, to, max, lambda); }) }); } diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch index 7b7a6c9e2203b8e280f323ed9e6589ab149537c0..186ef8d53f4a411cabc00f9c9294878ebf03a961 100644 --- a/juno_samples/rodinia/srad/src/cpu.sch +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -39,7 +39,6 @@ fork-tile[32, 0, false, false](loop2); let split = fork-split(loop2); let loop2_body = outline(split.srad_1.fj1); simpl!(loop2, loop2_body); - inline(srad@loop2); delete-uncalled(*); diff --git a/juno_samples/rodinia/srad/src/lib.rs b/juno_samples/rodinia/srad/src/lib.rs index cb156d9db712155ad9e5b3f9775e3c6c29dbf22a..5d5576e48948ee5f4db858a854aa230109fc5019 100644 --- a/juno_samples/rodinia/srad/src/lib.rs +++ b/juno_samples/rodinia/srad/src/lib.rs @@ -27,66 +27,60 @@ pub struct SRADInputs { } pub fn srad_harness(args: SRADInputs) { - async_std::task::block_on(async { - let SRADInputs { - niter, - lambda, - nrows, - ncols, - image, - output, - verify, - output_verify, - } = args; + let SRADInputs { + niter, + lambda, + nrows, + ncols, + image, + output, + verify, + output_verify, + } = args; - let Image { - image: image_ori, - max, - rows: image_ori_rows, - cols: image_ori_cols, - } = read_graphics(image); - let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols); - let mut image_h = HerculesMutBox::from(image.clone()); + let Image { + image: image_ori, + max, + rows: image_ori_rows, + cols: image_ori_cols, + } = read_graphics(image); + let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols); + let mut image_h = HerculesMutBox::from(image.clone()); - let mut runner = runner!(srad); - let result: Vec<f32> = HerculesMutBox::from( - runner - .run( - nrows as u64, - ncols as u64, - niter as u64, - image_h.to(), - max, - lambda, - ) - .await, - ) - .as_slice() - .to_vec(); + let mut runner = runner!(srad); + let result: Vec<f32> = HerculesMutBox::from(runner.run( + nrows as u64, + ncols as u64, + niter as u64, + image_h.to(), + max, + lambda, + )) + .as_slice() + .to_vec(); - if let Some(output) = output { - write_graphics(output, &result, nrows, ncols, max); - } - - if verify { - let mut rust_result = image; - rust_srad::srad(nrows, ncols, niter, &mut rust_result, max, lambda); + if let Some(output) = output { + write_graphics(output, &result, nrows, ncols, max); + } - if let Some(output) = output_verify { - write_graphics(output, &rust_result, nrows, ncols, max); - } + if verify { + let mut rust_result = image; + rust_srad::srad(nrows, ncols, niter, &mut rust_result, max, lambda); - let max_diff = result - .iter() - .zip(rust_result.iter()) - .map(|(a, b)| (*a as i32 - *b as i32).abs()) - .max() - .unwrap_or(0); - assert!( - max_diff <= 2, - "Verification failed: maximum pixel difference of {} exceeds threshold of 1", - max_diff - ); + if let Some(output) = output_verify { + write_graphics(output, &rust_result, nrows, ncols, max); } - }) + + let max_diff = result + .iter() + .zip(rust_result.iter()) + .map(|(a, b)| (*a as i32 - *b as i32).abs()) + .max() + .unwrap_or(0); + assert!( + max_diff <= 2, + "Verification failed: maximum pixel difference of {} exceeds threshold of 1", + max_diff + ); + } }