Skip to content
Snippets Groups Projects
Commit 2727439c authored by Russel Arbore's avatar Russel Arbore
Browse files

Set up matmul bench, initial GPU schedule

parent 1e9f5808
No related branches found
No related tags found
No related merge requests found
Pipeline #202676 passed
......@@ -1343,6 +1343,7 @@ name = "juno_matmul"
version = "0.1.0"
dependencies = [
"async-std",
"criterion",
"hercules_rt",
"juno_build",
"rand 0.9.0",
......
......@@ -8,6 +8,9 @@ edition = "2021"
name = "juno_matmul"
path = "src/main.rs"
[lib]
path = "src/lib.rs"
[features]
cuda = ["juno_build/cuda", "hercules_rt/cuda"]
......@@ -20,3 +23,10 @@ hercules_rt = { path = "../../hercules_rt" }
with_builtin_macros = "0.1.0"
async-std = "*"
rand = "*"
[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }
[[bench]]
name = "matmul_bench"
harness = false
#![feature(concat_idents)]
use criterion::{criterion_group, criterion_main, Criterion};
use rand::random;
use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo};
juno_build::juno!("matmul");
// We need this even though we don't use anything from the library because of
// Rust build scripts only linking static libraries into the library, and not
// into the benchmark binary. Yuck!
#[allow(unused_imports)]
use juno_matmul::*;
/// Criterion benchmark for the Hercules `matmul` kernel.
///
/// Builds random row-major input matrices, wraps them in Hercules immutable
/// boxes, and times the async kernel invocation under Criterion.
fn matmul_bench(c: &mut Criterion) {
    let mut group = c.benchmark_group("matmul bench");
    // Kernel runs are expensive; keep Criterion's sample count small.
    group.sample_size(10);
    let mut r = runner!(matmul);
    let mut run_case = |name, i: usize, j: usize, k: usize| {
        // `a` is an i x j matrix, `b` is j x k, both filled with random f32s.
        let a: Box<[f32]> = (0..i * j).map(|_| random::<f32>()).collect();
        let b: Box<[f32]> = (0..j * k).map(|_| random::<f32>()).collect();
        let a = HerculesImmBox::from(a.as_ref());
        let b = HerculesImmBox::from(b.as_ref());
        group.bench_function(name, |timer| {
            timer.iter(|| {
                // The generated kernel is async; block until it completes so
                // the measured time covers the whole computation.
                async_std::task::block_on(async {
                    r.run(i as u64, j as u64, k as u64, a.to(), b.to()).await
                });
            })
        });
    };
    run_case("matmul bench 512", 512, 512, 512);
}
// Register the benchmark group and generate the Criterion entry point
// (Cargo.toml sets `harness = false` for this bench target, so Criterion
// supplies main()).
criterion_group!(benches, matmul_bench);
criterion_main!(benches);
#![feature(concat_idents)]
use std::iter::zip;
use rand::random;
use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};
juno_build::juno!("matmul");
/// Correctness harness: runs the Hercules `matmul` kernel on random
/// `i x j` and `j x k` inputs and asserts the result matches a sequential
/// triple-loop reference to within an absolute tolerance of 1e-4.
/// Panics on the first mismatching element.
pub fn matmul_harness(i: usize, j: usize, k: usize) {
    async_std::task::block_on(async {
        // Random row-major inputs.
        let a: Box<[f32]> = (0..i * j).map(|_| random::<f32>()).collect();
        let b: Box<[f32]> = (0..j * k).map(|_| random::<f32>()).collect();
        // Sequential reference result, accumulated in the same order as the
        // kernel comparison expects (innermost loop over the shared dim j).
        let mut correct_c: Box<[f32]> = (0..i * k).map(|_| 0.0).collect();
        for row in 0..i {
            for col in 0..k {
                let mut acc = 0.0f32;
                for idx in 0..j {
                    acc += a[row * j + idx] * b[idx * k + col];
                }
                correct_c[row * k + col] = acc;
            }
        }
        let a = HerculesImmBox::from(a.as_ref());
        let b = HerculesImmBox::from(b.as_ref());
        let mut r = runner!(matmul);
        let result = r.run(i as u64, j as u64, k as u64, a.to(), b.to()).await;
        let mut c = HerculesMutBox::from(result);
        // Element-wise comparison against the reference.
        for (calc, correct) in zip(c.as_slice().iter().copied(), correct_c) {
            assert!((calc - correct).abs() < 0.0001, "{} != {}", calc, correct);
        }
    });
}
#![feature(concat_idents)]
use std::iter::zip;
use rand::random;
use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};
juno_build::juno!("matmul");
use juno_matmul::matmul_harness;
fn main() {
async_std::task::block_on(async {
const I: usize = 256;
const J: usize = 64;
const K: usize = 128;
let a: Box<[f32]> = (0..I * J).map(|_| random::<f32>()).collect();
let b: Box<[f32]> = (0..J * K).map(|_| random::<f32>()).collect();
let mut correct_c: Box<[f32]> = (0..I * K).map(|_| 0.0).collect();
for i in 0..I {
for k in 0..K {
for j in 0..J {
correct_c[i * K + k] += a[i * J + j] * b[j * K + k];
}
}
}
let a = HerculesImmBox::from(a.as_ref());
let b = HerculesImmBox::from(b.as_ref());
let mut r = runner!(matmul);
let mut c = HerculesMutBox::from(r.run(I as u64, J as u64, K as u64, a.to(), b.to()).await);
for (calc, correct) in zip(c.as_slice().into_iter().map(|x: &mut f32| *x), correct_c) {
assert!((calc - correct).abs() < 0.0001, "{} != {}", calc, correct);
}
});
matmul_harness(256, 64, 128);
}
#[test]
fn matmul_test() {
    // Run the shared harness directly; previously this called main() and then
    // the harness again with identical dimensions, duplicating the check.
    // A mismatch panics inside the harness and fails the test.
    matmul_harness(256, 64, 128);
}
#[entry]
fn matmul<n : usize, m : usize, l : usize>(a : f32[n, m], b : f32[m, l]) -> f32[n, l] {
let res : f32[n, l];
@res let res : f32[n, l];
@outer for i = 0 to n {
@middle for j = 0 to l {
......
// Scheduling macros for the Hercules/Juno pass manager.
// NOTE(review): both of the next two lines open a macro definition, but the
// body below is closed by a single brace — this looks like a diff-merge
// artifact (the macro was presumably renamed from optimize! to simpl!);
// confirm which name the committed schedule actually uses.
macro optimize!(X) {
macro simpl!(X) {
// General cleanup pipeline: ccp/gvn/dce-style simplification plus scalar
// replacement of aggregates (ip-sroa, sroa). Exact semantics of the
// acronym passes (crc, slf) should be confirmed against the pass docs.
ccp(X);
simplify-cfg(X);
lift-dc-math(X);
gvn(X);
phi-elim(X);
crc(X);
slf(X);
dce(X);
ip-sroa(X);
sroa(X);
// Re-run cleanup after SROA exposes new scalars.
dce(X);
gvn(X);
phi-elim(X);
dce(X);
}
// Final lowering sequence run right before code generation: one more round
// of cleanup, then global code motion (gcm) interleaved with
// float-collections — presumably collection placement; confirm semantics.
macro codegen-prep!(X) {
optimize!(X);
gcm(X);
float-collections(X);
dce(X);
gcm(X);
}
// Convert loops into fork-join form, iterating forkify + guard elimination
// to a fixpoint, then tag the result with inferred schedules.
macro forkify!(X) {
fixpoint {
forkify(X);
fork-guard-elim(X);
}
infer-schedules(X);
}
// Both macros wrap the fork-tile pass with factor n; they differ only in the
// final boolean flag (false vs true) — presumably selecting chunked vs tiled
// iteration order. NOTE(review): confirm the flag's meaning in the pass docs.
macro fork-chunk![n](X) {
fork-tile[n, 0, false, false](X);
}
macro fork-tile![n](X) {
fork-tile[n, 0, false, true](X);
}
// Mark forks and their reductions for parallel execution.
macro parallelize!(X) {
parallel-fork(X);
parallel-reduce(X);
}
// Lower fork-join constructs back into sequential loops.
macro unforkify!(X) {
fork-split(X);
unforkify(X);
}
// Top-level schedule applied to the whole module (*).
// NOTE(review): optimize!/forkify!/simpl! below are immediately followed by
// another forkify/fork-guard-elim/simpl! sequence — this looks like old and
// new pipelines fused by the diff view; confirm which sequence is intended.
optimize!(*);
forkify!(*);
simpl!(*);
forkify(*);
fork-guard-elim(*);
simpl!(*);
reduce-slf(*);
simpl!(*);
fork-coalesce(*);
simpl!(*);
reduce-slf(*);
simpl!(*);
// GPU schedule: simplify to a fixpoint, coalesce forks, outline the matmul
// loop nest, and hand the outlined function to the GPU backend.
if feature("cuda") {
fixpoint {
reduce-slf(*);
slf(*);
infer-schedules(*);
}
fork-coalesce(*);
infer-schedules(*);
dce(*);
//rewrite(*);
// NOTE(review): matmul@outer is outlined and passed to gpu() twice (here
// and again below as `func`) — presumably a diff-merge artifact; confirm
// that only one outline/gpu pair is intended.
let out = outline(matmul@outer);
gpu(out);
fixpoint {
simplify-cfg(*);
dce(*);
}
// codegen-prep! already begins with optimize!, so the explicit optimize!
// here may be redundant — harmless, but worth confirming.
optimize!(*);
codegen-prep!(*);
let func = outline(matmul@outer);
// no-memset on the @res buffer — presumably skips zero-initialization
// because every element is fully written; confirm against the pass docs.
no-memset(matmul@res);
gpu(func);
} else {
// CPU schedule: mark the outer loop's reduction associative, chunk the
// output loops for parallel execution, then tile the outlined body.
associative(matmul@outer);
// Parallelize by computing output array as 16 chunks
let par = matmul@outer \ matmul@inner;
fork-chunk![4](par);
let (outer, inner, _) = fork-reshape[[0, 2], [1], [3]](par);
parallelize!(outer \ inner);
let body = outline(inner);
cpu(body);
// Tile for cache, assuming 64B cache lines
fork-tile![16](body);
let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);
reduce-slf(inner);
unforkify!(body);
codegen-prep!(*);
// NOTE(review): unforkify! above already runs fork-split + unforkify; the
// bare unforkify/fork-split/unforkify lines below look like merged old
// diff lines — confirm against the committed schedule.
unforkify(*);
fork-split(*);
unforkify(*);
}
// Final global code motion over the whole module.
gcm(*);
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment