For my weekly GitHub scan for interesting projects, I stumbled upon tiny-gpu, an educational project for learning how to build simple GPU hardware (the key word here is simple).
Installation
First we need sv2v, since the project is written in SystemVerilog. Build it and add sv2v to the $PATH.
sudo apt install haskell-stack
git clone https://github.com/zachjs/sv2v.git
cd sv2v
make
Then clone tiny-gpu and install cocotb. Boom!
git clone https://github.com/adam-maj/tiny-gpu.git
cd tiny-gpu
virtualenv .venv
source .venv/bin/activate
pip install cocotb
Running the First Example
Running the first example is easy enough. README.md lists test_matadd as one of the two cocotb tests.
mkdir build
make test_matadd
The Makefile uses sv2v to generate Verilog from the SystemVerilog sources, then compiles the result with gpu as the top module for iverilog.
sv2v -w build/alu.v src/alu.sv
sv2v -I src/* -w build/gpu.v
echo "" >> build/gpu.v
cat build/alu.v >> build/gpu.v
echo '`timescale 1ns/1ns' > build/temp.v
cat build/gpu.v >> build/temp.v
mv build/temp.v build/gpu.v
iverilog -o build/sim.vvp -s gpu -g2012 build/gpu.v
Then it launches the cocotb run as follows:
MODULE=test.test_matadd vvp -M $(cocotb-config --prefix)/cocotb/libs -m libcocotbvpi_icarus build/sim.vvp
-.--ns INFO gpi ..mbed/gpi_embed.cpp:108 in set_program_name_in_venv Using Python virtual environment interpreter at /tiny-gpu/.venv/bin/python
-.--ns INFO gpi ../gpi/GpiCommon.cpp:101 in gpi_print_registered_impl VPI registered
0.00ns INFO cocotb Running on Icarus Verilog version 13.0 (devel)
0.00ns INFO cocotb Running tests with cocotb v1.9.2 from /tiny-gpu/.venv/lib/python3.12/site-packages/cocotb
0.00ns INFO cocotb Seeding Python random module with 1746816588
0.00ns INFO cocotb.regression pytest not found, install it to enable better AssertionError messages
0.00ns INFO cocotb.regression Found test test.test_matadd.test_matadd
0.00ns INFO cocotb.regression running test_matadd (1/1)
4475001.00ns INFO cocotb.regression test_matadd passed
4475001.00ns INFO cocotb.regression **************************************************************************************
** TEST STATUS SIM TIME (ns) REAL TIME (s) RATIO (ns/s) **
**************************************************************************************
** test.test_matadd.test_matadd PASS 4475001.00 0.51 8859917.91 **
**************************************************************************************
** TESTS=1 PASS=1 FAIL=0 SKIP=0 4475001.00 0.51 8690696.54 **
**************************************************************************************
test_matadd
Looking at test_matadd, it uses Memory to define the program and data memories.
import cocotb
from cocotb.triggers import RisingEdge
from .helpers.setup import setup
from .helpers.memory import Memory
from .helpers.format import format_cycle
from .helpers.logger import logger
@cocotb.test()
async def test_matadd(dut):
    # Program Memory
    program_memory = Memory(dut=dut, addr_bits=8, data_bits=16, channels=1, name="program")
    program = [
        0b0101000011011110, # MUL R0, %blockIdx, %blockDim
        0b0011000000001111, # ADD R0, R0, %threadIdx ; i = blockIdx * blockDim + threadIdx
        0b1001000100000000, # CONST R1, #0 ; baseA (matrix A base address)
        0b1001001000001000, # CONST R2, #8 ; baseB (matrix B base address)
        0b1001001100010000, # CONST R3, #16 ; baseC (matrix C base address)
        0b0011010000010000, # ADD R4, R1, R0 ; addr(A[i]) = baseA + i
        0b0111010001000000, # LDR R4, R4 ; load A[i] from global memory
        0b0011010100100000, # ADD R5, R2, R0 ; addr(B[i]) = baseB + i
        0b0111010101010000, # LDR R5, R5 ; load B[i] from global memory
        0b0011011001000101, # ADD R6, R4, R5 ; C[i] = A[i] + B[i]
        0b0011011100110000, # ADD R7, R3, R0 ; addr(C[i]) = baseC + i
        0b1000000001110110, # STR R7, R6 ; store C[i] in global memory
        0b1111000000000000, # RET ; end of kernel
    ]

    # Data Memory
    data_memory = Memory(dut=dut, addr_bits=8, data_bits=8, channels=4, name="data")
    data = [
        0, 1, 2, 3, 4, 5, 6, 7, # Matrix A (1 x 8)
        0, 1, 2, 3, 4, 5, 6, 7  # Matrix B (1 x 8)
    ]

    # Device Control
    threads = 8

    await setup(
        dut=dut,
        program_memory=program_memory,
        program=program,
        data_memory=data_memory,
        data=data,
        threads=threads
    )

    data_memory.display(24)

    cycles = 0
    while dut.done.value != 1:
        data_memory.run()
        program_memory.run()

        await cocotb.triggers.ReadOnly()
        format_cycle(dut, cycles)

        await RisingEdge(dut.clk)
        cycles += 1

    logger.info(f"Completed in {cycles} cycles")

    data_memory.display(24)

    expected_results = [a + b for a, b in zip(data[0:8], data[8:16])]
    for i, expected in enumerate(expected_results):
        result = data_memory.memory[i + 16]
        assert result == expected, f"Result mismatch at index {i}: expected {expected}, got {result}"
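Those hand-assembled binary instructions are easy to get wrong, so here is a quick throwaway decoder I wrote to sanity-check them. This is my own sketch, not part of the repo; the opcode/rd/rs/rt nibble layout is inferred from the comments above, where R13-R15 read back %blockIdx, %blockDim, and %threadIdx.

# Throwaway sanity check (not from the repo): split each 16-bit word into
# opcode / rd / rs / rt nibbles, matching the comments in the program above.
OPCODES = {0b0011: "ADD", 0b0101: "MUL", 0b0111: "LDR",
           0b1000: "STR", 0b1001: "CONST", 0b1111: "RET"}

def decode(word: int) -> str:
    op = OPCODES.get(word >> 12, "???")
    rd, rs, rt = (word >> 8) & 0xF, (word >> 4) & 0xF, word & 0xF
    if op == "CONST":
        return f"CONST R{rd}, #{word & 0xFF}"   # 8-bit immediate
    if op == "LDR":
        return f"LDR R{rd}, R{rs}"
    if op == "STR":
        return f"STR R{rs}, R{rt}"
    if op == "RET":
        return "RET"
    return f"{op} R{rd}, R{rs}, R{rt}"

# R13 = %blockIdx, R14 = %blockDim, R15 = %threadIdx
print(decode(0b0101000011011110))  # MUL R0, R13, R14
print(decode(0b1000000001110110))  # STR R7, R6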
setup seems to load the program and data memories by calling load.
from typing import List

import cocotb
from cocotb.clock import Clock
from cocotb.triggers import RisingEdge

from .memory import Memory

async def setup(
    dut,
    program_memory: Memory,
    program: List[int],
    data_memory: Memory,
    data: List[int],
    threads: int
):
    # Setup Clock
    clock = Clock(dut.clk, 25, units="us")
    cocotb.start_soon(clock.start())

    # Reset
    dut.reset.value = 1
    await RisingEdge(dut.clk)
    dut.reset.value = 0

    # Load Program Memory
    program_memory.load(program)

    # Load Data Memory
    data_memory.load(data)

    # Device Control Register
    dut.device_control_write_enable.value = 1
    dut.device_control_data.value = threads
    await RisingEdge(dut.clk)
    dut.device_control_write_enable.value = 0

    # Start
    dut.start.value = 1
I will circle back later.
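I haven't opened test/helpers/memory.py itself yet, but from how the test drives it, the Memory helper is roughly shaped like this. This is purely my guess at the interface: the signal-name prefixing is an assumption (though it matches the program_mem_*/data_mem_* ports on gpu), and the real helper handles multiple channels on packed buses.

# My guess at the shape of test/helpers/memory.py, inferred purely from how
# test_matadd uses it. The name-prefix trick matches the program_mem_* /
# data_mem_* ports on gpu, but this is a single-channel simplification.
class Memory:
    def __init__(self, dut, addr_bits, data_bits, channels, name):
        self.dut = dut
        self.channels = channels
        self.name = name                      # "program" or "data"
        self.memory = [0] * (2 ** addr_bits)  # backing array the test asserts on

    def load(self, rows):
        # setup() calls this to preload the kernel or the input matrices
        self.memory[: len(rows)] = rows

    def _signal(self, suffix):
        # e.g. _signal("read_valid") -> dut.data_mem_read_valid (assumed naming)
        return getattr(self.dut, f"{self.name}_mem_{suffix}")

    def run(self):
        # Called every cycle: answer a pending read request from the controller
        if self._signal("read_valid").value:
            addr = int(self._signal("read_address").value)
            self._signal("read_data").value = self.memory[addr]
            self._signal("read_ready").value = 1
        else:
            self._signal("read_ready").value = 0

    def display(self, rows):
        for addr in range(rows):
            print(f"{addr:3d}: {self.memory[addr]}")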
Walkthrough of the RTL
The top-level gpu contains the following components:
- Cores, NUM_CORES of them
- 2 memory controllers
- Dispatch
Core
Well, this is a GPU, so we need the basic compute unit, which is core.
generate
    for (i = 0; i < NUM_CORES; i = i + 1) begin : cores
        // Compute Core
        core #(
            // (core parameters elided)
        ) core_instance (
            .clk(clk),
            .reset(core_reset[i]),
            .start(core_start[i]),
            .done(core_done[i]),
            .block_id(core_block_id[i]),
            .thread_count(core_thread_count[i]),

            .program_mem_read_valid(fetcher_read_valid[i]),
            .program_mem_read_address(fetcher_read_address[i]),
            .program_mem_read_ready(fetcher_read_ready[i]),
            .program_mem_read_data(fetcher_read_data[i]),

            .data_mem_read_valid(core_lsu_read_valid),
            .data_mem_read_address(core_lsu_read_address),
            .data_mem_read_ready(core_lsu_read_ready),
            .data_mem_read_data(core_lsu_read_data),
            .data_mem_write_valid(core_lsu_write_valid),
            .data_mem_write_address(core_lsu_write_address),
            .data_mem_write_data(core_lsu_write_data),
            .data_mem_write_ready(core_lsu_write_ready)
        );
    end
endgenerate
Each core has its own copy of the following modules:
- decoder
- scheduler
- fetcher

and the core has THREADS_PER_BLOCK threads, each with its own ALU, PC, LSU, and registers.
fetcher #(
    .PROGRAM_MEM_ADDR_BITS(PROGRAM_MEM_ADDR_BITS),
    .PROGRAM_MEM_DATA_BITS(PROGRAM_MEM_DATA_BITS)
) fetcher_instance (
    .clk(clk),
    .reset(reset),
    .core_state(core_state),
    .current_pc(current_pc),
    .mem_read_valid(program_mem_read_valid),
    .mem_read_address(program_mem_read_address),
    .mem_read_ready(program_mem_read_ready),
    .mem_read_data(program_mem_read_data),
    .fetcher_state(fetcher_state),
    .instruction(instruction)
);
// Decoder
decoder decoder_instance (
    .clk(clk),
    .reset(reset),
    .core_state(core_state),
    .instruction(instruction),
    .decoded_rd_address(decoded_rd_address),
    .decoded_rs_address(decoded_rs_address),
    .decoded_rt_address(decoded_rt_address),
    .decoded_nzp(decoded_nzp),
    .decoded_immediate(decoded_immediate),
    .decoded_reg_write_enable(decoded_reg_write_enable),
    .decoded_mem_read_enable(decoded_mem_read_enable),
    .decoded_mem_write_enable(decoded_mem_write_enable),
    .decoded_nzp_write_enable(decoded_nzp_write_enable),
    .decoded_reg_input_mux(decoded_reg_input_mux),
    .decoded_alu_arithmetic_mux(decoded_alu_arithmetic_mux),
    .decoded_alu_output_mux(decoded_alu_output_mux),
    .decoded_pc_mux(decoded_pc_mux),
    .decoded_ret(decoded_ret)
);
// Scheduler
scheduler #(
    .THREADS_PER_BLOCK(THREADS_PER_BLOCK)
) scheduler_instance (
    .clk(clk),
    .reset(reset),
    .start(start),
    .fetcher_state(fetcher_state),
    .core_state(core_state),
    .decoded_mem_read_enable(decoded_mem_read_enable),
    .decoded_mem_write_enable(decoded_mem_write_enable),
    .decoded_ret(decoded_ret),
    .lsu_state(lsu_state),
    .current_pc(current_pc),
    .next_pc(next_pc),
    .done(done)
);
for (i = 0; i < THREADS_PER_BLOCK; i = i + 1) begin : threads
    // ALU
    alu alu_instance (
        .clk(clk),
        .reset(reset),
        .enable(i < thread_count),
        .core_state(core_state),
        .decoded_alu_arithmetic_mux(decoded_alu_arithmetic_mux),
        .decoded_alu_output_mux(decoded_alu_output_mux),
        .rs(rs[i]),
        .rt(rt[i]),
        .alu_out(alu_out[i])
    );
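To make the SIMT picture concrete, here is the matadd kernel's per-thread work in plain Python. This is my own illustration, not anything from the repo, and it assumes the default THREADS_PER_BLOCK of 4, so the test's 8 threads become 2 blocks.

# Pure-Python picture of what the hardware threads compute in matadd:
# every thread handles exactly one element, indexed by its block and lane.
THREADS_PER_BLOCK = 4
data = list(range(8)) + list(range(8))   # Matrix A then Matrix B, as in the test

mem = data + [0] * 8                     # global memory layout: A @0, B @8, C @16
for block_idx in range(2):               # 8 threads / 4 per block = 2 blocks
    for thread_idx in range(THREADS_PER_BLOCK):
        i = block_idx * THREADS_PER_BLOCK + thread_idx
        mem[16 + i] = mem[0 + i] + mem[8 + i]   # C[i] = A[i] + B[i]

assert mem[16:24] == [0, 2, 4, 6, 8, 10, 12, 14]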
Memory Controller
From the docs, the two memory controllers handle requests from the cores and relay data back from the slow external memory.
Global memory has fixed read/write bandwidth, but there may be far more incoming requests across all cores to access data from memory than the external memory is actually able to handle.
The memory controllers keep track of all the outgoing requests to memory from the compute cores, throttle requests based on actual external memory bandwidth, and relay responses from external memory back to the proper resources.
Each memory controller has a fixed number of channels based on the bandwidth of global memory.
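The interesting part is the arbitration: many consumers, few channels. Here is a toy Python model of the throttling idea, entirely my own (nowhere near the RTL's signal-level handshake, with made-up NUM_CONSUMERS/NUM_CHANNELS/latency numbers):

from collections import deque

# Toy model of the throttling idea (numbers made up): NUM_CONSUMERS pending
# reads funneled through NUM_CHANNELS channels, one outstanding request each.
NUM_CONSUMERS, NUM_CHANNELS, MEM_LATENCY = 8, 4, 3

pending = deque(range(NUM_CONSUMERS))   # consumers waiting to be served
channels = [None] * NUM_CHANNELS        # each holds (consumer, cycles_left)

cycle = 0
while pending or any(c is not None for c in channels):
    for i, busy in enumerate(channels):
        if busy is None and pending:
            # IDLE: pick up a pending consumer request with this channel
            channels[i] = (pending.popleft(), MEM_LATENCY)
        elif busy is not None:
            consumer, left = busy
            if left == 1:
                # READ_RELAYING: hand the data back and free the channel
                print(f"cycle {cycle}: channel {i} -> consumer {consumer}")
                channels[i] = None
            else:
                # READ_WAITING: external memory hasn't answered yet
                channels[i] = (consumer, left - 1)
    cycle += 1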
// Data Memory Controller
controller #(
    .ADDR_BITS(DATA_MEM_ADDR_BITS),
    .DATA_BITS(DATA_MEM_DATA_BITS),
    .NUM_CONSUMERS(NUM_LSUS),
    .NUM_CHANNELS(DATA_MEM_NUM_CHANNELS)
) data_memory_controller (
    .clk(clk),
    .reset(reset),
    .consumer_read_valid(lsu_read_valid),
    .consumer_read_address(lsu_read_address),
    .consumer_read_ready(lsu_read_ready),
    .consumer_read_data(lsu_read_data),
    .consumer_write_valid(lsu_write_valid),
    .consumer_write_address(lsu_write_address),
    .consumer_write_data(lsu_write_data),
    .consumer_write_ready(lsu_write_ready),
    .mem_read_valid(data_mem_read_valid),
    .mem_read_address(data_mem_read_address),
    .mem_read_ready(data_mem_read_ready),
    .mem_read_data(data_mem_read_data),
    .mem_write_valid(data_mem_write_valid),
    .mem_write_address(data_mem_write_address),
    .mem_write_data(data_mem_write_data),
    .mem_write_ready(data_mem_write_ready)
);
// Program Memory Controller
controller #(
    .ADDR_BITS(PROGRAM_MEM_ADDR_BITS),
    .DATA_BITS(PROGRAM_MEM_DATA_BITS),
    .NUM_CONSUMERS(NUM_FETCHERS),
    .NUM_CHANNELS(PROGRAM_MEM_NUM_CHANNELS),
    .WRITE_ENABLE(0)
) program_memory_controller (
    .clk(clk),
    .reset(reset),
    .consumer_read_valid(fetcher_read_valid),
    .consumer_read_address(fetcher_read_address),
    .consumer_read_ready(fetcher_read_ready),
    .consumer_read_data(fetcher_read_data),
    .mem_read_valid(program_mem_read_valid),
    .mem_read_address(program_mem_read_address),
    .mem_read_ready(program_mem_read_ready),
    .mem_read_data(program_mem_read_data)
);
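Note the WRITE_ENABLE(0) parameter and the absence of the consumer_write_*/mem_write_* ports here: fetchers only ever read instructions, so the program memory controller has no write path, unlike the data memory controller above.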
Skimming the code, the controller directs the read and write requests to memory and relays the responses back to the consumers. So each channel moves from IDLE to READ_WAITING to READ_RELAYING and back to IDLE.
IDLE: begin
    // While this channel is idle, cycle through consumers looking for one with a pending request
    for (int j = 0; j < NUM_CONSUMERS; j = j + 1) begin
        if (consumer_read_valid[j] && !channel_serving_consumer[j]) begin
            channel_serving_consumer[j] = 1;
            current_consumer[i] <= j;

            mem_read_valid[i] <= 1;
            mem_read_address[i] <= consumer_read_address[j];
            controller_state[i] <= READ_WAITING;

            // Once we find a pending request, pick it up with this channel and stop looking for requests
            break;
        end else if (consumer_write_valid[j] && !channel_serving_consumer[j]) begin
            ...
READ_WAITING: begin
    // Wait for response from memory for pending read request
    if (mem_read_ready[i]) begin
        mem_read_valid[i] <= 0;
        consumer_read_ready[current_consumer[i]] <= 1;
        consumer_read_data[current_consumer[i]] <= mem_read_data[i];
        controller_state[i] <= READ_RELAYING;
    end
end
...
// Wait until consumer acknowledges it received response, then reset
READ_RELAYING: begin
    if (!consumer_read_valid[current_consumer[i]]) begin
        channel_serving_consumer[current_consumer[i]] = 0;
        consumer_read_ready[current_consumer[i]] <= 0;
        controller_state[i] <= IDLE;
    end
end
DCR
A simple register with thread_count as its output. The test configures it by writing threads to device_control_data.
// Device Control Register
dcr dcr_instance (
    .clk(clk),
    .reset(reset),
    .device_control_write_enable(device_control_write_enable),
    .device_control_data(device_control_data),
    .thread_count(thread_count)
);
dut.device_control_write_enable.value = 1
dut.device_control_data.value = threads
await RisingEdge(dut.clk)
dut.device_control_write_enable.value = 0
Dispatch
From the docs, dispatch basically runs the thread blocks on the cores.
Once a kernel is launched, the dispatcher is the unit that actually manages the distribution of threads to different compute cores.
The dispatcher organizes threads into groups that can be executed in parallel on a single core called blocks and sends these blocks off to be processed by available cores.
Once all blocks have been processed, the dispatcher reports back that the kernel execution is done.
// Dispatcher
dispatch #(
    .NUM_CORES(NUM_CORES),
    .THREADS_PER_BLOCK(THREADS_PER_BLOCK)
) dispatch_instance (
    .clk(clk),
    .reset(reset),
    .start(start),
    .thread_count(thread_count),
    .core_done(core_done),
    .core_start(core_start),
    .core_reset(core_reset),
    .core_block_id(core_block_id),
    .core_thread_count(core_thread_count),
    .done(done)
);
It sets core_start based on total_blocks and reads core_done to know when a core has finished executing its block.
for (int i = 0; i < NUM_CORES; i++) begin
    if (core_reset[i]) begin
        core_reset[i] <= 0;

        // If this core was just reset, check if there are more blocks to be dispatched
        if (blocks_dispatched < total_blocks) begin
            core_start[i] <= 1;
            core_block_id[i] <= blocks_dispatched;
            core_thread_count[i] <= (blocks_dispatched == total_blocks - 1)
                ? thread_count - (blocks_dispatched * THREADS_PER_BLOCK)
                : THREADS_PER_BLOCK;

            blocks_dispatched = blocks_dispatched + 1;
        end
    end
end

for (int i = 0; i < NUM_CORES; i++) begin
    if (core_start[i] && core_done[i]) begin
        // If a core just finished executing its current block, reset it
        core_reset[i] <= 1;
        core_start[i] <= 0;
        blocks_done = blocks_done + 1;
    end
end
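Putting the two loops together, here is the dispatcher's bookkeeping paraphrased in Python. This is my sketch, not the RTL: total_blocks is, as far as I can tell from dispatch.sv, ceil(thread_count / THREADS_PER_BLOCK), and the default NUM_CORES/THREADS_PER_BLOCK of 2/4 are assumed.

import math

# Python paraphrase of the dispatcher bookkeeping, including the partial
# last block. NUM_CORES / THREADS_PER_BLOCK defaults of 2 / 4 assumed.
def dispatch(thread_count, num_cores=2, threads_per_block=4):
    total_blocks = math.ceil(thread_count / threads_per_block)
    blocks_dispatched = 0
    schedule = []                       # (core, block_id, thread_count) tuples
    while blocks_dispatched < total_blocks:
        for core in range(num_cores):
            if blocks_dispatched == total_blocks:
                break
            block_id = blocks_dispatched
            # Only the last block may hold fewer than THREADS_PER_BLOCK threads
            count = (thread_count - block_id * threads_per_block
                     if block_id == total_blocks - 1 else threads_per_block)
            schedule.append((core, block_id, count))
            blocks_dispatched += 1
    return schedule

print(dispatch(8))   # [(0, 0, 4), (1, 1, 4)]
print(dispatch(6))   # [(0, 0, 4), (1, 1, 2)]

The last block is the only one that can be partial, which is exactly what the ternary on core_thread_count in the RTL handles.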