Compare revisions

86599177 · e7b1e790 · ffbb263d · 5710b3d2 · 00a73f42 · 59072d67
--- a/.gitattributes
+++ b/.gitattributes
+# Highlight juno source files like they're rust source files
+*.jn gitlab-language=rust
--- a/.gitignore
+++ b/.gitignore
-/target
+**/target
 *.dot
+!paper_resources/*.dot
 *.bc
 *.out
 *.ll
 *.c
+*.cu
 *.o
+*.a
+*.hrt
+*.png
+*.swp
+.vscode
+*_env
+*.txt
+*ncu-rep
\ No newline at end of file
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
+test-cpu:
+  stage: test
+  script:
+    - cargo test --features=opencv -vv -- --nocapture
+test-gpu:
+  stage: test
+  script:
+    - cargo test --features=cuda,opencv -vv -- --nocapture
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
 [workspace]
+resolver = "2"
 members = [
 	"hercules_cg",
 	"hercules_ir",
 	"hercules_opt",
 	"hercules_rt",
+	"hercules_samples/call",
-	"hercules_tools/hercules_dot",
+	"hercules_samples/ccp",
-	"hercules_tools/hercules_cpu",
+	"hercules_samples/dot",
+	"hercules_samples/fac",
-	"hercules_samples/matmul"
+	"hercules_samples/matmul",
+	"hercules_test/hercules_interpreter",
+	"hercules_test/hercules_tests",
+	"juno_build",
+	"juno_frontend",
+	"juno_samples/antideps",
+	"juno_samples/casts_and_intrinsics",
+	"juno_samples/cava",
+	"juno_samples/concat",
+	"juno_samples/control",
+	"juno_samples/dot",
+	"juno_samples/edge_detection",
+	"juno_samples/fork_join_tests",
+	"juno_samples/implicit_clone",
+	"juno_samples/matmul",
+	"juno_samples/median_window",
+	"juno_samples/multi_device",
+	"juno_samples/multi_return",
+	"juno_samples/patterns",
+	"juno_samples/products",
+	"juno_samples/rodinia/backprop",
+	"juno_samples/rodinia/bfs",
+	"juno_samples/rodinia/cfd",
+	"juno_samples/rodinia/srad",
+	"juno_samples/schedule_test",
+	"juno_samples/simple3",
+	"juno_scheduler",
+	"juno_utils",
 ]
--- a/DESIGN.md
+++ b/DESIGN.md
-# Hercules' Design
-Hercules is a compiler targeting heterogeneous devices. The key goals of Hercules are listed below:
- Generate optimized, memory efficient, and parallel code for devices containing CPUs, GPUs, and other processing elements.
- Explore language design for programming heterogenous systems in a performant, expressive, and safe manner.
- Expose detailed configuration of code generation and scheduling through a novel scheduling language.
- Design an intermediate representation that allows for fine-grained control of what code is executed on what device in a system.
- Develop a runtime system capable of dynamically scheduling generated code fragments on a heterogenous machine.
-The following sections contain information on how Hercules is designed to meet these goals.
-## Front-end Language Design
-TODO: @aaronjc4
-## Scheduling Language Design
-TODO: @aaronjc4
-## Compiler Design
-The Hercules compiler is split into the following components:
-### Hercules IR
-The IR of the Hercules compiler is similar to the sea of nodes IR presented in "A Simple Graph-Based Intermediate Representation", with a few differences.
- There are dynamic constants, which are constants provided dynamically to the conductor (this is the runtime system, [see the section describing the conductor](#the-conductor)) - these can be used to specify array type sizes, unlike normal runtime values.
- There is no single global store. The closest analog are individual values with an array type, which support dynamically indexed read and write operations.
- There is no I/O, or other side effects.
- There is no recursion.
- The implementation of Hercules IR does not follow the original object oriented design of sea-of-nodes.
-A key design consideration of Hercules IR is the absence of a concept of memory. A downside of this approach is that any language targeting Hercules IR must also be very restrictive regarding memory - in practice, this means tightly controlling or eliminating first-class references. The upside is that the compiler has complete freedom to lay out data however it likes in memory when performing code generation. This includes deciding which data resides in which address spaces, which is a necessary ability for a compiler striving to have fine-grained control over what operations are computed on what devices.
-In addition to not having a generalized memory, Hercules IR has no functionality for calling functions with side-effects, or doing IO. In other words, Hercules is a pure IR (it's not functional, as functions aren't first class values). This may be changed in the future - we could support effectful programs by giving call operators a control input and output edge. However, at least for now, we'd like to work with the simplest IR possible, so the IR is pure.
-The key idea behind the sea of nodes IR is that control flow and data flow are represented in the same graph. The entire program thus can be represented by one large flow graph. This has several nice properties, the primary of which being that instructions are unordered except by true dependencies. This alleviates most code motion concerns, and also makes peephole optimizations more practical. Additionally, loop invariant code is neither "inside" nor "outside" a loop in the sea of nodes. Thus, any optimizations benefitting from a particular assumption about the position of loop invariant code works without needing to do code motion. Deciding whether code lives inside a loop or not becomes a scheduling concern.
-We chose to use a sea of nodes based IR because we believe it will be easier to partition than a CFG + basic block style IR. A CFG + basic block IR is inherently two-level - there is the control flow level in the CFG, and the data flow in the basic blocks. Partitioning a function across these two levels is a challenging task. As shown by previous work (HPVM), introducing more graph levels into the IR makes partitioning harder, not easier. We want Hercules to have fine-grained control over which code executes where. This requires Hercules' compiler IR to have as few graph levels as reasonable.
-See [IR.md](IR.md) for a more specific description of Hercules IR.
-### Optimizations
-Hercules relies on other compiler infrastructures, such as LLVM, to do code generation for specific devices. Thus, Hercules itself doesn't perform particularly sophisticated optimizations. In general, the optimizations Hercules performs are meant to make partitioning easier. This includes things like GVN and peephole optimizations, which in general, make the IR "simpler".
-TODO: @rarbore2
-### Partitioning
-Partitioning is responsible for deciding which operations in the IR graph are executed on which devices. Additionally, operations are broken up into shards - every node in a shard executes on the same device, and the runtime system schedules execution at the shard level. Partitioning is conceptually very similar to instruction selection. Each shard can be thought of as a single instruction, and the device the shard is executed on can be thought of as the particular instruction being selected. In instruction selection, there is not only the choice of which instructions to use, but also how to partition the potentially many operations in the IR into a smaller number of target instructions. Similarly, the Hercules IR partitioning process must decide which operations are grouped together into the same shard, and for each shard, which device it should execute on. The set of operations each potential target device is capable of executing is crucial information when forming the shard boundaries, so this cannot be performed optimally as a sequential two step process.
-TODO: @rarbore2
-### Code Generation
-Hercules uses LLVM for generating CPU and GPU code. Memory is "introduced" into the program representation at this stage. Operations in a function are separated into basic blocks. The data layout of values is decided on, and memory is allocated on the stack or is designated as separately allocated and passed into functions as necessary. Code is generated corresponding to possibly several estimates of dynamic constants.
-TODO: @rarbore2
-## The Conductor
-The conductor is responsible for dynamically executing code generated by Hercules. It exposes a Rust API for executing Hercules code. It takes care of memory allocation, synchronization, and scheduling. It is what is called the "runtime" in other systems - we chose a different name as there are events that happen distinctly as "conductor time" (such as providing dynamic constants), rather than at "runtime" (where the generated code is actually executed).
-TODO: @rarbore2
--- a/IR.md
+++ b/IR.md
-# Hercules IR
-Hercules IR is structured as follows:
- One entire program lives in one "Module".
- Each module contains a set of functions, as well as interned types, constants, and dynamic constants. The most important element of a module is its resident functions.
- Each function consists of a name, a set of types for its parameters, a return type, a list of nodes, and the number of dynamic constants it takes as argument. Types are not needed for dynamic constants, since all dynamic constants have type u64. The most important element of a function is its node list.
- There are control and data types. The control type is parameterized by a list of thread replication factors. The primitive data types are boolean, signed integers, unsigned integers, and floating point numbers. The integer types can hold 8, 16, 32, or 64 bits. The floating point types can hold 32 or 64 bits. The compound types are product, summation, and arrays. A product type is a tuple, containing some number of children data types. A summation type is a union, containing exactly one of some number of children data types at runtime. An array is a dynamically indexable collection of elements, where each element is the same type. The size of the array is part of the type, and is represented with a dynamic constant.
- Dynamic constants are constants provided to the conductor when a Hercules IR program is started. Through this mechanism, Hercules IR can represent programs operating on a variable number of array elements, while forbidding runtime dynamic memory allocation (all dynamic memory allocation happens in the conductor).
- The nodes in a function are structured as a flow graph, with an explicit start node. Although control and data flow from definitions to uses, def-use edges are stored implicitly in the IR. Each node stores its predecessor nodes, so use-def edges are stored explicitly. To query the def-use edges in an IR graph, use the `def_use` function.
-Below, all of the nodes in Hercules IR are described.
-## Start
-The start node of the IR flow graph. This node is implicitly defined in the text format. It takes no inputs. Its output type is the empty control type (control with no thread replication factors).
-## Region
-Region nodes are the mechanism for merging multiple branches inside Hercules IR. A region node takes at least one input - each input must have a control type, and all of the inputs must have the same control type. The output type of the region node is the same control type as all of its inputs. The main purpose of a region node is to drive some number of [phi](#phi) nodes.
-## If
-The branch mechanism in Hercules IR. An if node takes two inputs - a control predecessor, and a condition. The control predecessor must have control type, and the condition must have boolean type. The output type is the same control type as the control input. Every if node must be followed directly by two [read](#read) nodes, each of which represents the opposite destinations of the branch. This is the mechanism by which the output edges from the if node (and also the [match](#match) node) are labelled, even though nodes only explicitly store their input edges.
-## Fork
-Fork (and [join](#join)) nodes are the mechanism for representing data-parallelism inside Hercules IR. A fork node takes one input - a control predecessor. A fork node also stores a thread replication factor (TRF), represented as a dynamic constant. The output type of a fork node is a control type, which is the same as the type of the control predecessor, with the TRF pushed to the end of the control type's factor list. Conceptually, for every thread that comes in to a fork node, TRF threads come out. A fork node can drive any number of children [thread\_id](#threadid) nodes. Each fork must have a single corresponding [join](#join) node - the fork must dominate the join node, and the join node must post-dominate the fork node (in the control flow subgraph).
-## Join
-Join (and [fork](#fork)) nodes are the mechanism for synchronizing data-parallel threads inside Hercules IR. A join nodes takes one input - a control predecessor. The output type of a join node is a control type, which is the same as the type of the control predecessor, with the last factor in the control type's list removed. Conceptually, after all threads created by the corresponding fork reach the join, then and only then does the join output a single thread. A join node can drive any number of children [collect](#collect) nodes. Each join must have a single corresponding [fork](#fork) node - the join must post-dominate the fork node, and the fork node must dominate the join node (in the control flow subgraph).
-## Phi
-Phi nodes merge potentially many data sources into one data output, driven by a corresponding region node. Phi nodes in Hercules IR perform the same function as phi nodes in other SSA-based IRs. Phi nodes take at least one input - a control predecessor, and some number of data inputs. The control predecessor of a phi node must be a region node. The data inputs must all have the same type. The output of the phi node has that data type. In the sea of nodes execution model, a phi node can be thought of as "latching" when its corresponding region node is reached. The phi node will latch to output the value of the input corresponding to the input that control traversed to reach the region node. After latching, the phi node's output won't change until the region node is reached again.
-## ThreadID
-The thread\_id node provides the thread ID as a datum to children nodes after a [fork](#fork) has been performed. A thread\_id node takes one input - a control predecessor. The control predecessor must be a [fork](#fork) node. The output type is a 64-bit unsigned integer. The output thread IDs generated by a thread\_id node range from 0 to TRF - 1, inclusive, where TRF is the thread replication factor of the input [fork](#fork) node.
-## Collect
-The collect node collects data from multiple executing threads, and puts them all into an array. A collect node takes two inputs - a control predecessor, and a data input. The control predecessor must be a [join](#join) node. The data input must have a non-control type. The output type will be an array, where the element type will be the type of the data input. The extent of the array will be equal to the thread replication factor of the [fork](#fork) node corresponding to the input [join](#join) node. For each datum input, the thread ID corresponding to that datum will be the index the datum is inserted into the array.
-## Return
-The return node returns some data from the current function. A return node has two inputs - a control predecessor, and a data input. The control predecessor must have a control type with an empty factor list - just as only one thread starts the execution of a function, only one thread can return from a function. The data input must have the same type as the function's return type. No node should use a return node as input (technically, the output type of a return node is an empty product type).
-## Parameter
-The parameter node represents a parameter of the function. A parameter node takes one input - the start node. A parameter node stores the parameter index of the function it corresponds to. Its value at runtime is the index-th argument to the function. Its output type is the type of the index-th parameter of the function.
-## Constant
-The constant node represents a constant value. A constant node takes one input - the start node. A constant node stores the constant ID of the constant it corresponds to. Its value at runtime is the constant it references. Its output type is the type of the constant it references.
-## DynamicConstant
-The dynamic\_constant node represents a dynamic constant, used as a runtime value. A dynamic\_constant node takes one input - the start node. A dynamic\_constant node stores the dynamic constant ID of the dynamic constant it corresponds to. Its value at runtime is the value of the dynamic constant it references, which is calculated at conductor time. Its output type is a 64-bit unsigned integer.
-## Unary
-The unary node represents a basic unary operation. A unary node takes one input - a data input. The data input must have a non-control type. A unary node additionally stores which unary operation it performs. The output type of the unary node is the same as its input type. The acceptable input data type depends on the unary operation.
-## Binary
-The binary node represents a basic binary operation. A binary node takes two inputs - a left data input, and a right data input. The left and right data inputs must be the same non-control type. A binary node additionally stores the binary operation it performs. The output type of the binary node is the same as its input type. The acceptable input data type depends on the binary operation.
-## Call
-The call node passes its inputs to a function, and outputs the result of the function call. A call node takes some number of data inputs. A call node also stores a reference to the function it calls. The number and types of the data inputs must match the referenced function. A call node also stores references to dynamic constants it uses as inputs to the function. The number of dynamic constants references must match the number of dynamic constant inputs of the referenced function. The output type of a call node is the return type of the referenced function. A call node notably does not take as input or output a control type. This is because all operations in Hercules IR are pure, including arbitrary function calls. Thus, the only things affecting a function call are the data inputs, and (conceptually) the function may be called an arbitrary amount of times.
-## Read
-The read node reads an element from a collection consisting of product, summation, and array types. It uses a series of indices, that index successive levels in a collection's type tree. There are 4 kinds of indices - fields (for products), variants (for summations), positions (for arrays), and controls (for succeeding [if](#if) and [match](#match) nodes). A read node takes one collect input, and potentially position inputs (there are node inputs only for positions). The output type of a read node is the indexed type in the collection's type tree.
-## Write
-The write node writes an element into a collection consisting of product, summation, and array types. It uses a series of indices, that index successive levels in a collection's type tree. There are 3 kinds of indices - fields (for products), variants (for summations), and positions (for arrays). A write node takes one collection input, a data input, and potentially position inputs (there are node inputs only for positions). The output type of a write node is the same as the collection input. The indexed type in the collection must be the same type as the data input.
-## Match
-The match node branches based on the variant of a sum typed value. A match node takes two inputs - a control predecessor, and a sum input. The control predecessor must have control type, and the sum input must have a sum type. The output type is a product of N control types, where N is the number of possible variants in the sum input's sum type. The control types in the product are the same as the control input's type. Every match node must be followed directly by N [read](#read) nodes, each of which reads differing elements of the match node's output product. This is the mechanism by which the output edges from the match node (and also the [if](#if) node) are labelled, even though nodes only explicitly store their input edges.
--- a/LICENSE
+++ b/LICENSE
+The Hercules Compiler is under the Apache License v2.0 with LLVM Exceptions:
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+---- LLVM Exceptions to the Apache 2.0 License ----
+As an exception, if, as a result of your compiling your source code, portions
+of this Software are embedded into an Object form of such source code, you
+may redistribute such embedded portions in such Object form without complying
+with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
+In addition, if you combine or link compiled forms of this Software with
+software that is licensed under the GPLv2 ("Combined Software") and if a
+court of competent jurisdiction determines that the patent provision (Section
+3), the indemnity provision (Section 9) or other Section of the License
+conflicts with the conditions of the GPLv2, you may retroactively and
+prospectively choose to deem waived or otherwise exclude such Section(s) of
+the License, but only in their entirety and only with respect to the Combined
+Software.
--- a/README.md
+++ b/README.md
 # Hercules
-See [DESIGN.md](DESIGN.md) for a discussion of Hercules' design.
+Hercules is a compiler for productively programming heterogeneous systems. The project is structured as follows:
+- `hercules_ir`: Core Hercules IR definition and analyses.
+- `hercules_opt`: Transformations on Hercules IR.
+- `hercules_cg`: Code generators that compile Hercules IR functions into device-specific code (currently LLVM, CUDA, or Async Rust).
+- `hercules_rt`: Assorted runtime utilities.
+- `juno_frontend`: Frontend for the Juno application language, compiles into Hercules IR.
+- `juno_scheduler`: Frontend for the Juno scheduling language, controls transformations on Hercules IR.
+- `juno_build`: Build utilities for incorporating Hercules code in Rust programs.
+- `juno_utils`: Assorted utilities for the Juno frontends.
+- `hercules_samples`: Samples / tests manually defining Hercules IR textually.
+- `juno_samples`: Samples / tests of Juno programs.
+- `hercules_test`: Assorted tests for the Hercules compiler.
+- `paper_resources`: Assorted figures for publications.
+## Setup
+Hercules is simple to set up. Just clone the repository:
+```
+git clone https://gitlab.engr.illinois.edu/llvm/hercules.git
+```
+And run all the tests:
+```
+cargo test
+```
+Or a single sample (see `Cargo.toml` for a full list):
+```
+cargo test -p juno_matmul
+```
+If you want to see the output LLVM / CUDA / Rust code from the Hercules compiler, run with the `-vv` flag.
+```
+cargo test -p juno_matmul -vv
+```
+The written samples are set up with a `cuda` feature - if this feature is provided, the Juno program will be targeted onto the GPU, rather than the CPU. This only works if you have the CUDA toolkit installed:
+```
+cargo test -p juno_matmul --features=cuda
+```
+Some samples also include benchmarks. These use `criterion` for measurement. For example (`edge_detection` requires the `opencv` feature to be enabled to build OpenCV before running):
+```
+cargo bench -p juno_edge_detection --features=opencv
+```
--- a/hercules_cg/Cargo.toml
+++ b/hercules_cg/Cargo.toml
@@ -2,8 +2,15 @@
 name = "hercules_cg"
 version = "0.1.0"
 authors = ["Russel Arbore <rarbore2@illinois.edu>"]
+edition = "2021"
+[features]
+cuda = []
 [dependencies]
+rand = "*"
+ordered-float = "*"
 bitvec = "*"
-ena = "*"
+serde = { version = "*", features = ["derive"] }
 hercules_ir = { path = "../hercules_ir" }
--- a/hercules_cg/src/antideps.rs
+++ b/hercules_cg/src/antideps.rs
-extern crate hercules_ir;
-use self::hercules_ir::def_use::*;
-use self::hercules_ir::ir::*;
-/*
- * Top level function to assemble anti-dependence edges. Returns a list of pairs
- * of nodes. The first item in the pair is the read node, and the second item is
- * the write node.
- *
- * For every ID yielded by `nodes`, the users of that ID are scanned for `Read`
- * and `Write` nodes whose `collect` field is that ID. If at least one such
- * write exists, every such read is paired with the first write found. Panics
- * if an ID has more than one such write.
- */
-pub fn antideps<I: Iterator<Item = NodeID>>(
-    function: &Function,
-    def_use: &ImmutableDefUseMap,
-    nodes: I,
-) -> Vec<(NodeID, NodeID)> {
-    // Anti-dependence edges are between a write node and a read node, where
-    // each node uses the same array value. The read must be scheduled before
-    // the write to avoid incorrect compilation.
-    let mut antideps = vec![]; // Accumulates (read, write) pairs across all IDs.
-    for id in nodes {
-        // Collect the reads and writes to / from this collection.
-        let users = def_use.get_users(id);
-        let reads = users.iter().filter(|user| { // Lazy; only consumed if a write exists.
-            if let Node::Read {
-                collect,
-                indices: _,
-            } = function.nodes[user.idx()]
-            {
-                collect == id // Ignore users that reference `id` in some other field.
-            } else {
-                false
-            }
-        });
-        let mut writes = users.iter().filter(|user| {
-            if let Node::Write {
-                collect,
-                data: _,
-                indices: _,
-            } = function.nodes[user.idx()]
-            {
-                collect == id // Only writes *into* this collection count.
-            } else {
-                false
-            }
-        });
-        // If there are any writes, compute the anti dependence edges.
-        if let Some(write) = writes.next() { // Takes the first matching write.
-            for read in reads {
-                antideps.push((*read, *write));
-            }
-        }
-        // TODO: Multiple write uses should clone the collection for N - 1 of the writes.
-        assert!(writes.next() == None, "Can't form anti-dependencies when there are two independent writes depending on a single collection value.");
-    }
-    antideps
-}
-/*
- * Sometimes, we are only interested in anti-dependence edges involving arrays.
- *
- * Calls `antideps` over every node index of `function` whose type satisfies
- * `is_array()`. The type of node `i` is looked up as `types[typing[i].idx()]`.
- */
-pub fn array_antideps(
-    function: &Function,
-    def_use: &ImmutableDefUseMap,
-    types: &Vec<Type>,
-    typing: &Vec<TypeID>,
-) -> Vec<(NodeID, NodeID)> {
-    antideps(
-        function,
-        def_use,
-        (0..function.nodes.len()) // Every node index in the function...
-            .map(NodeID::new)
-            .filter(|id| types[typing[id.idx()].idx()].is_array()), // ...kept only if array-typed.
-    )
-}
--- a/hercules_cg/src/cpu.rs
+++ b/hercules_cg/src/cpu.rs
--- a/hercules_cg/src/cpu_beta.rs
+++ b/hercules_cg/src/cpu_beta.rs
--- a/hercules_cg/src/fork_tree.rs
+++ b/hercules_cg/src/fork_tree.rs
+use std::collections::{HashMap, HashSet};
+use crate::*;
+/*
+ * Construct a map from fork node to all control nodes (including itself)
+ * satisfying:
+ * 1. Dominated by the fork.
+ * 2. Not dominated by the fork's join.
+ * 3. Not dominated by any other fork that's also dominated by the fork, where
+ *    we do count self-domination.
+ * We include the non-fork start node as the key for all control nodes outside
+ * any fork.
+ *
+ * Each key of `fork_join_nesting` is filed under the first entry of its fork
+ * list, or under `NodeID::new(0)` when that list is empty. The first entry is
+ * presumably the innermost enclosing fork - confirm against the analysis that
+ * produces `fork_join_nesting`.
+ */
+pub fn fork_control_map(
+    fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>,
+) -> HashMap<NodeID, HashSet<NodeID>> {
+    let mut fork_control_map = HashMap::new();
+    for (control, forks) in fork_join_nesting {
+        let fork = forks.first().copied().unwrap_or(NodeID::new(0)); // Empty list => node 0 is the key.
+        fork_control_map
+            .entry(fork)
+            .or_insert_with(HashSet::new) // Create the set on first sight of this key.
+            .insert(*control);
+    }
+    fork_control_map
+}
+/*
+ * Construct a map from fork node to all fork nodes (including itself)
+ * satisfying:
+ * 1. Dominated by the fork.
+ * 2. Not dominated by the fork's join.
+ * 3. Not dominated by any other fork that's also dominated by the fork, where
+ *    we do count self-domination.
+ * Note that the fork tree also includes the start node as the unique root node.
+ *
+ * Only keys of `fork_join_nesting` for which `is_fork()` holds contribute. Each
+ * such fork gets its own entry (possibly an empty set) and is added to the set
+ * of the second entry of its nesting list, or of `NodeID::new(0)` when there is
+ * no second entry. The first entry is presumably the fork itself - confirm
+ * against the analysis that produces `fork_join_nesting`. An entry for
+ * `NodeID::new(0)` always exists in the result.
+ */
+pub fn fork_tree(
+    function: &Function,
+    fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>,
+) -> HashMap<NodeID, HashSet<NodeID>> {
+    let mut fork_tree = HashMap::new();
+    for (control, forks) in fork_join_nesting {
+        if function.nodes[control.idx()].is_fork() {
+            fork_tree.entry(*control).or_insert_with(HashSet::new); // Forks with no children still get a key.
+            let nesting_fork = forks.get(1).copied().unwrap_or(NodeID::new(0)); // No second entry => parent is node 0.
+            fork_tree
+                .entry(nesting_fork)
+                .or_insert_with(HashSet::new)
+                .insert(*control);
+        }
+    }
+    fork_tree.entry(NodeID::new(0)).or_default(); // Guarantee the root key even if there are no forks.
+    fork_tree
+}
--- a/hercules_cg/src/gcm.rs
+++ b/hercules_cg/src/gcm.rs
-extern crate hercules_ir;
-use std::collections::HashMap;
-use self::hercules_ir::dataflow::*;
-use self::hercules_ir::def_use::*;
-use self::hercules_ir::dom::*;
-use self::hercules_ir::ir::*;
-use self::hercules_ir::loops::*;
-use self::hercules_ir::subgraph::*;
-/*
- * Top level global code motion function. Assigns each data node to one of its
- * immediate control use / user nodes, forming (unordered) basic blocks. Returns
- * the control node / basic block each node is in.
- *
- * The returned vector has one entry per node of `function`, indexed by node
- * index. Each (read, write) pair in `antideps` has its immediate control uses
- * and users merged before placement. Panics if the dominator chain computed
- * for a node is empty.
- */
-pub fn gcm(
-    function: &Function,
-    def_use: &ImmutableDefUseMap,
-    reverse_postorder: &Vec<NodeID>,
-    control_subgraph: &Subgraph,
-    dom: &DomTree,
-    fork_join_map: &HashMap<NodeID, NodeID>,
-    antideps: &Vec<(NodeID, NodeID)>,
-) -> Vec<NodeID> {
-    // Step 1: find the immediate control uses and immediate control users of
-    // each node.
-    let mut immediate_control_uses =
-        forward_dataflow(function, reverse_postorder, |inputs, node_id| {
-            immediate_control_flow(inputs, node_id, function)
-        });
-    let mut immediate_control_users =
-        backward_dataflow(function, def_use, reverse_postorder, |inputs, node_id| {
-            immediate_control_flow(inputs, node_id, function) // Same transfer function, opposite direction.
-        });
-    // Reads and writes forming anti dependencies must be put in the same block.
-    for (read, write) in antideps {
-        let meet = UnionNodeSet::meet(
-            &immediate_control_uses[read.idx()],
-            &immediate_control_uses[write.idx()],
-        );
-        immediate_control_uses[read.idx()] = meet.clone(); // Both ends of the pair share one set.
-        immediate_control_uses[write.idx()] = meet;
-        let meet = UnionNodeSet::meet(
-            &immediate_control_users[read.idx()],
-            &immediate_control_users[write.idx()],
-        );
-        immediate_control_users[read.idx()] = meet.clone();
-        immediate_control_users[write.idx()] = meet;
-    }
-    // Step 2: calculate loop tree of function.
-    let loops = loops(&control_subgraph, NodeID::new(0), &dom, fork_join_map);
-    // Step 3: find most control dependent, shallowest loop level node for every
-    // node.
-    let bbs = (0..function.nodes.len())
-        .map(|idx| {
-            let highest =
-                dom.lowest_amongst(immediate_control_uses[idx].nodes(function.nodes.len() as u32));
-            let lowest = dom
-                .common_ancestor(immediate_control_users[idx].nodes(function.nodes.len() as u32));
-            // Collect into vector to reverse, since we want to traverse down
-            // the dom tree, not up it.
-            let mut chain = dom
-                .chain(lowest, highest)
-                .collect::<Vec<_>>()
-                .into_iter()
-                .rev();
-            let mut location = chain.next().unwrap(); // Panics if the chain is empty.
-            while let Some(control_node) = chain.next() {
-                // Traverse down the dom tree until we find a loop.
-                if loops.contains(control_node) {
-                    break; // Keep the last location seen before the loop node.
-                } else {
-                    location = control_node;
-                }
-            }
-            location
-        })
-        .collect();
-    bbs
-}
-/*
- * Find fork/join nests that each control node is inside of. Result is a map
- * from each control node to a list of fork nodes. The fork nodes are listed in
- * ascending order of nesting.
- *
- * Only nodes for which `is_control()` holds appear as keys. Each list keeps the
- * order in which `dom.ascend` yields the forks. Indexing `fork_join_map` panics
- * if a fork found while ascending has no entry in it.
- */
-pub fn compute_fork_join_nesting(
-    function: &Function,
-    dom: &DomTree,
-    fork_join_map: &HashMap<NodeID, NodeID>,
-) -> HashMap<NodeID, Vec<NodeID>> {
-    // For each control node, ascend dominator tree, looking for fork nodes. For
-    // each fork node, make sure each control node isn't strictly dominated by
-    // the corresponding join node.
-    (0..function.nodes.len())
-        .map(NodeID::new)
-        .filter(|id| function.nodes[id.idx()].is_control())
-        .map(|id| {
-            (
-                id,
-                dom.ascend(id)
-                    .filter(|id| function.nodes[id.idx()].is_fork()) // Inner `id` shadows the control node.
-                    .filter(|fork_id| !dom.does_prop_dom(fork_join_map[&fork_id], id)) // Drop forks whose join already dominates `id`.
-                    .collect(),
-            )
-        })
-        .collect()
-}
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
--- a/hercules_cg/src/lib.rs
+++ b/hercules_cg/src/lib.rs
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
--- a/hercules_ir/Cargo.toml
+++ b/hercules_ir/Cargo.toml
 [package]
 name = "hercules_ir"
 version = "0.1.0"
-authors = ["Russel Arbore <rarbore2@illinois.edu>"]
+authors = ["Russel Arbore <rarbore2@illinois.edu>, Aaron Councilman <aaronjc4@illinois.edu>"]
+edition = "2021"
 [dependencies]
+rand = "*"
 nom = "*"
-ordered-float = "*"
+ordered-float = { version = "*", features = ["serde"] }
 bitvec = "*"
\ No newline at end of file
+serde = { version = "*", features = ["derive"] }
+either = "*"
--- a/hercules_ir/src/build.rs
+++ b/hercules_ir/src/build.rs
No results found