Skip to content
Snippets Groups Projects
Commit 2727439c authored by Russel Arbore's avatar Russel Arbore
Browse files

Set up matmul bench, initial GPU schedule

parent 1e9f5808
No related branches found
No related tags found
No related merge requests found
Pipeline #202676 passed
......@@ -1343,6 +1343,7 @@ name = "juno_matmul"
version = "0.1.0"
dependencies = [
"async-std",
"criterion",
"hercules_rt",
"juno_build",
"rand 0.9.0",
......
......@@ -8,6 +8,9 @@ edition = "2021"
name = "juno_matmul"
path = "src/main.rs"
[lib]
path = "src/lib.rs"
[features]
cuda = ["juno_build/cuda", "hercules_rt/cuda"]
......@@ -20,3 +23,10 @@ hercules_rt = { path = "../../hercules_rt" }
with_builtin_macros = "0.1.0"
async-std = "*"
rand = "*"
[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }
[[bench]]
name = "matmul_bench"
harness = false
#![feature(concat_idents)]
use criterion::{criterion_group, criterion_main, Criterion};
use rand::random;
use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo};
juno_build::juno!("matmul");
// We need this even though we don't use anything from the library because of
// Rust build scripts only linking static libraries into the library, and not
// into the benchmark binary. Yuck!
#[allow(unused_imports)]
use juno_matmul::*;
/// Criterion benchmark for the Hercules `matmul` kernel.
///
/// Builds random row-major input matrices, wraps them in Hercules immutable
/// boxes, and times the async kernel invocation under Criterion.
fn matmul_bench(c: &mut Criterion) {
    let mut group = c.benchmark_group("matmul bench");
    // Kernel runs are expensive; keep Criterion's sample count small.
    group.sample_size(10);
    let mut r = runner!(matmul);
    let mut run_case = |name, i: usize, j: usize, k: usize| {
        // `a` is an i x j matrix, `b` is j x k, both filled with random f32s.
        let a: Box<[f32]> = (0..i * j).map(|_| random::<f32>()).collect();
        let b: Box<[f32]> = (0..j * k).map(|_| random::<f32>()).collect();
        let a = HerculesImmBox::from(a.as_ref());
        let b = HerculesImmBox::from(b.as_ref());
        group.bench_function(name, |timer| {
            timer.iter(|| {
                // The generated kernel is async; block until it completes so
                // the measured time covers the whole computation.
                async_std::task::block_on(async {
                    r.run(i as u64, j as u64, k as u64, a.to(), b.to()).await
                });
            })
        });
    };
    run_case("matmul bench 512", 512, 512, 512);
}
// Register the benchmark group and generate the Criterion entry point
// (Cargo.toml sets `harness = false` for this bench target, so Criterion
// supplies main()).
criterion_group!(benches, matmul_bench);
criterion_main!(benches);
#![feature(concat_idents)]
use std::iter::zip;
use rand::random;
use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};
juno_build::juno!("matmul");
/// Correctness harness: runs the Hercules `matmul` kernel on random
/// `i x j` and `j x k` inputs and asserts the result matches a sequential
/// triple-loop reference to within an absolute tolerance of 1e-4.
/// Panics on the first mismatching element.
pub fn matmul_harness(i: usize, j: usize, k: usize) {
    async_std::task::block_on(async {
        // Random row-major inputs.
        let a: Box<[f32]> = (0..i * j).map(|_| random::<f32>()).collect();
        let b: Box<[f32]> = (0..j * k).map(|_| random::<f32>()).collect();
        // Sequential reference result, accumulated in the same order as the
        // kernel comparison expects (innermost loop over the shared dim j).
        let mut correct_c: Box<[f32]> = (0..i * k).map(|_| 0.0).collect();
        for row in 0..i {
            for col in 0..k {
                let mut acc = 0.0f32;
                for idx in 0..j {
                    acc += a[row * j + idx] * b[idx * k + col];
                }
                correct_c[row * k + col] = acc;
            }
        }
        let a = HerculesImmBox::from(a.as_ref());
        let b = HerculesImmBox::from(b.as_ref());
        let mut r = runner!(matmul);
        let result = r.run(i as u64, j as u64, k as u64, a.to(), b.to()).await;
        let mut c = HerculesMutBox::from(result);
        // Element-wise comparison against the reference.
        for (calc, correct) in zip(c.as_slice().iter().copied(), correct_c) {
            assert!((calc - correct).abs() < 0.0001, "{} != {}", calc, correct);
        }
    });
}
#![feature(concat_idents)]
use std::iter::zip;
use rand::random;
use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};
juno_build::juno!("matmul");
use juno_matmul::matmul_harness;
fn main() {
async_std::task::block_on(async {
const I: usize = 256;
const J: usize = 64;
const K: usize = 128;
let a: Box<[f32]> = (0..I * J).map(|_| random::<f32>()).collect();
let b: Box<[f32]> = (0..J * K).map(|_| random::<f32>()).collect();
let mut correct_c: Box<[f32]> = (0..I * K).map(|_| 0.0).collect();
for i in 0..I {
for k in 0..K {
for j in 0..J {
correct_c[i * K + k] += a[i * J + j] * b[j * K + k];
}
}
}
let a = HerculesImmBox::from(a.as_ref());
let b = HerculesImmBox::from(b.as_ref());
let mut r = runner!(matmul);
let mut c = HerculesMutBox::from(r.run(I as u64, J as u64, K as u64, a.to(), b.to()).await);
for (calc, correct) in zip(c.as_slice().into_iter().map(|x: &mut f32| *x), correct_c) {
assert!((calc - correct).abs() < 0.0001, "{} != {}", calc, correct);
}
});
matmul_harness(256, 64, 128);
}
#[test]
fn matmul_test() {
    // Run the shared harness directly; previously this called main() and then
    // the harness again with identical dimensions, duplicating the check.
    // A mismatch panics inside the harness and fails the test.
    matmul_harness(256, 64, 128);
}
#[entry]
fn matmul<n : usize, m : usize, l : usize>(a : f32[n, m], b : f32[m, l]) -> f32[n, l] {
let res : f32[n, l];
@res let res : f32[n, l];
@outer for i = 0 to n {
@middle for j = 0 to l {
......
// Scheduling macros for the Hercules/Juno pass manager.
// NOTE(review): both of the next two lines open a macro definition, but the
// body below is closed by a single brace — this looks like a diff-merge
// artifact (the macro was presumably renamed from optimize! to simpl!);
// confirm which name the committed schedule actually uses.
macro optimize!(X) {
macro simpl!(X) {
// General cleanup pipeline: ccp/gvn/dce-style simplification plus scalar
// replacement of aggregates (ip-sroa, sroa). Exact semantics of the
// acronym passes (crc, slf) should be confirmed against the pass docs.
ccp(X);
simplify-cfg(X);
lift-dc-math(X);
gvn(X);
phi-elim(X);
crc(X);
slf(X);
dce(X);
ip-sroa(X);
sroa(X);
// Re-run cleanup after SROA exposes new scalars.
dce(X);
gvn(X);
phi-elim(X);
dce(X);
}
// Final lowering sequence run right before code generation: one more round
// of cleanup, then global code motion (gcm) interleaved with
// float-collections — presumably collection placement; confirm semantics.
macro codegen-prep!(X) {
optimize!(X);
gcm(X);
float-collections(X);
dce(X);
gcm(X);
}
// Convert loops into fork-join form, iterating forkify + guard elimination
// to a fixpoint, then tag the result with inferred schedules.
macro forkify!(X) {
fixpoint {
forkify(X);
fork-guard-elim(X);
}
infer-schedules(X);
}
// Both macros wrap the fork-tile pass with factor n; they differ only in the
// final boolean flag (false vs true) — presumably selecting chunked vs tiled
// iteration order. NOTE(review): confirm the flag's meaning in the pass docs.
macro fork-chunk![n](X) {
fork-tile[n, 0, false, false](X);
}
macro fork-tile![n](X) {
fork-tile[n, 0, false, true](X);
}
// Mark forks and their reductions for parallel execution.
macro parallelize!(X) {
parallel-fork(X);
parallel-reduce(X);
}
// Lower fork-join constructs back into sequential loops.
macro unforkify!(X) {
fork-split(X);
unforkify(X);
}
// Top-level schedule applied to the whole module (*).
// NOTE(review): optimize!/forkify!/simpl! below are immediately followed by
// another forkify/fork-guard-elim/simpl! sequence — this looks like old and
// new pipelines fused by the diff view; confirm which sequence is intended.
optimize!(*);
forkify!(*);
simpl!(*);
forkify(*);
fork-guard-elim(*);
simpl!(*);
reduce-slf(*);
simpl!(*);
fork-coalesce(*);
simpl!(*);
reduce-slf(*);
simpl!(*);
// GPU schedule: simplify to a fixpoint, coalesce forks, outline the matmul
// loop nest, and hand the outlined function to the GPU backend.
if feature("cuda") {
fixpoint {
reduce-slf(*);
slf(*);
infer-schedules(*);
}
fork-coalesce(*);
infer-schedules(*);
dce(*);
//rewrite(*);
// NOTE(review): matmul@outer is outlined and passed to gpu() twice (here
// and again below as `func`) — presumably a diff-merge artifact; confirm
// that only one outline/gpu pair is intended.
let out = outline(matmul@outer);
gpu(out);
fixpoint {
simplify-cfg(*);
dce(*);
}
// codegen-prep! already begins with optimize!, so the explicit optimize!
// here may be redundant — harmless, but worth confirming.
optimize!(*);
codegen-prep!(*);
let func = outline(matmul@outer);
// no-memset on the @res buffer — presumably skips zero-initialization
// because every element is fully written; confirm against the pass docs.
no-memset(matmul@res);
gpu(func);
} else {
// CPU schedule: mark the outer loop's reduction associative, chunk the
// output loops for parallel execution, then tile the outlined body.
associative(matmul@outer);
// Parallelize by computing output array as 16 chunks
let par = matmul@outer \ matmul@inner;
fork-chunk![4](par);
let (outer, inner, _) = fork-reshape[[0, 2], [1], [3]](par);
parallelize!(outer \ inner);
let body = outline(inner);
cpu(body);
// Tile for cache, assuming 64B cache lines
fork-tile![16](body);
let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);
reduce-slf(inner);
unforkify!(body);
codegen-prep!(*);
// NOTE(review): unforkify! above already runs fork-split + unforkify; the
// bare unforkify/fork-split/unforkify lines below look like merged old
// diff lines — confirm against the committed schedule.
unforkify(*);
fork-split(*);
unforkify(*);
}
// Final global code motion over the whole module.
gcm(*);
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment