Fix dispatchKernel arguments in the examples
junjihashimoto committed Sep 5, 2025
commit 2b1767df72d1ad502bc7e376f665836b29ddeb1b
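
This commit removes the std::promise/std::future plumbing around every dispatchKernel call in the examples: each dispatch-plus-wait pair collapses into a single dispatchKernel(ctx, kernel) call. Below is a minimal sketch of the updated pattern, modeled on the float16 example but using kf32/float for brevity; createContext(), the kGelu shader string used here, and the reading that dispatchKernel now synchronizes internally (inferred from the removed wait() calls) are assumptions not shown in this diff.

#include <array>
#include <cstdio>
#include "gpu.h" // single-header gpu.cpp API used throughout these examples

using namespace gpu;

// Stand-in GELU shader; the real examples define kGelu in their own sources.
static const char *kGelu = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<f32>;
@group(0) @binding(1) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  let i: u32 = gid.x;
  if (i < arrayLength(&inp)) {
    let x: f32 = inp[i];
    // tanh-based GELU approximation
    out[i] = 0.5 * x * (1.0 + tanh(0.7978845608 * (x + 0.044715 * x * x * x)));
  }
}
)";

int main() {
  static constexpr size_t N = 3072;
  std::array<float, N> inputArr, outputArr;
  for (size_t i = 0; i < N; ++i) {
    inputArr[i] = static_cast<float>(i) / 10.0f; // arbitrary test data
  }
  Context ctx = createContext(); // assumed setup call, not part of this diff
  Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data());
  Tensor output = createTensor(ctx, Shape{N}, kf32);
  Kernel op = createKernel(ctx, {kGelu, 256, kf32}, Bindings{input, output},
                           {cdiv(N, 256), 1, 1});
  dispatchKernel(ctx, op); // no promise argument; previously paired with wait()
  toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
  printf("output[0] = %f\n", outputArr[0]);
  return 0;
}
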
5 changes: 1 addition & 4 deletions examples/float16/run.cpp
@@ -46,12 +46,9 @@ int main(int argc, char **argv) {
   }
   Tensor input = createTensor(ctx, Shape{N}, kf16, inputArr.data());
   Tensor output = createTensor(ctx, Shape{N}, kf16);
-  std::promise<void> promise;
-  std::future<void> future = promise.get_future();
   Kernel op = createKernel(ctx, {kGelu, 256, kf16}, Bindings{input, output},
                            {cdiv(N, 256), 1, 1});
-  dispatchKernel(ctx, op, promise);
-  wait(ctx, future);
+  dispatchKernel(ctx, op);
   toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
 
   for (int i = 0; i < 12; ++i) {
5 changes: 1 addition & 4 deletions examples/gpu_puzzles/run.cpp
@@ -23,11 +23,8 @@ template <size_t N> std::array<float, N> makeData() {
 
 template <size_t N, size_t R = N, size_t C = 1> void showResult(Context &ctx, Kernel &op, Tensor &output) {
 
-  std::promise<void> promise;
-  std::future<void> future = promise.get_future();
-  dispatchKernel(ctx, op, promise);
+  dispatchKernel(ctx, op);
   std::array<float, R * C> outputArr;
-  wait(ctx, future);
   toCPU(ctx, output, outputArr.data(), sizeof(outputArr));
   printf("%s", show<float, R, C>(outputArr, "output").c_str());
 }
8 changes: 1 addition & 7 deletions examples/matmul/run.cpp
@@ -838,12 +838,9 @@ void runTest(int version, size_t M, size_t K, size_t N,
 
   // Initialize Kernel and bind GPU buffers
   // pre-allocate for async dispatch
-  std::array<std::promise<void>, nIter> promises;
-  std::array<std::future<void>, nIter> futures;
   std::array<Kernel, nIter> kernels;
   std::array<Tensor, nIter> outputs;
   for (int i = 0; i < nIter; i++) {
-    futures[i] = promises[i].get_future();
     outputs[i] = createTensor(ctx, Shape{M, N}, numtype);
     kernels[i] = selectMatmul(ctx, version, {input, weights, outputs[i]}, M, K, N, numtype);
   }
@@ -854,10 +851,7 @@ void runTest(int version, size_t M, size_t K, size_t N,
   // Dispatch kernel nIter times
   auto start = std::chrono::high_resolution_clock::now();
   for (int i = 0; i < nIter; i++) {
-    dispatchKernel(ctx, kernels[i], promises[i]);
-  }
-  for (int i = 0; i < nIter; i++) {
-    wait(ctx, futures[i]);
+    dispatchKernel(ctx, kernels[i]);
   }
   auto end = std::chrono::high_resolution_clock::now();
 
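
With the promise-based API, the benchmark above issued all nIter dispatches in one loop and then waited on the futures in a second loop; after this change a single loop both issues and (by the same assumption as above, given the removed wait() calls) completes each kernel, so start and end still bracket all nIter matmuls. A sketch of deriving throughput from that interval; the reporting code is not part of this diff, and the FLOP accounting is the standard 2*M*K*N per matmul:

// Each M x K by K x N matmul costs ~2*M*K*N FLOPs (K multiplies + K adds per output).
std::chrono::duration<double> elapsed = end - start;
double tflops = 2.0 * M * K * N * nIter / elapsed.count() / 1e12;
LOG(kDefLog, kInfo, "Elapsed: %.4f s, estimated %.2f TFLOPS", elapsed.count(), tflops);
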
5 changes: 1 addition & 4 deletions examples/physics/run.cpp
@@ -84,10 +84,7 @@ int main() {
   printf("\033[2J\033[H");
   while (true) {
     auto start = std::chrono::high_resolution_clock::now();
-    std::promise<void> promise;
-    std::future<void> future = promise.get_future();
-    dispatchKernel(ctx, update, promise);
-    wait(ctx, future);
+    dispatchKernel(ctx, update);
     toCPU(ctx, pos, posArr.data(), sizeof(posArr));
     auto end = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double> elapsed = end - start;
5 changes: 1 addition & 4 deletions examples/shadertui/run.cpp
@@ -126,10 +126,7 @@ int main() {
     params.time = getCurrentTimeInMilliseconds(start);
     toGPU(ctx, params, renderKernel);
     auto frameStart = std::chrono::high_resolution_clock::now();
-    std::promise<void> promise;
-    std::future<void> future = promise.get_future();
-    dispatchKernel(ctx, renderKernel, promise);
-    wait(ctx, future);
+    dispatchKernel(ctx, renderKernel);
     resetCommandBuffer(ctx.device, renderKernel);
     toCPU(ctx, screen, screenArr);
     rasterize<kRows, kCols>(screenArr, raster);
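
Both the shadertui loop above and the transpose loop below re-dispatch the same Kernel object each iteration, pairing every promise-free dispatch with resetCommandBuffer(ctx.device, kernel). The pairing is taken directly from the diffs; the reading that resetCommandBuffer re-records the command buffer so the kernel can be dispatched again is an assumption. A condensed sketch of that re-dispatch pattern, with ctx and kernel as in the surrounding examples:

for (int i = 0; i < nIter; i++) {
  dispatchKernel(ctx, kernel);            // promise-free dispatch
  resetCommandBuffer(ctx.device, kernel); // prepare the kernel for re-dispatch
}
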
11 changes: 1 addition & 10 deletions examples/transpose/run.cpp
@@ -162,20 +162,11 @@ void runTest(int version, size_t M, size_t N,
   LOG(kDefLog, kInfo, "Dispatching Kernel version %d, %d iterations ...",
       version, nIter);
 
-  // pre-allocate promises and futures for async dispatch
-  // TODO(avh): implement a pooling mechanism for promises/futures in gpu.h
-  std::array<std::promise<void>, nIter> promises;
-  std::array<std::future<void>, nIter> futures;
-  for (int i = 0; i < nIter; i++) {
-    futures[i] = promises[i].get_future();
-  }
-
   // Dispatch kernel nIter times
   auto start = std::chrono::high_resolution_clock::now();
   for (int i = 0; i < nIter; i++) {
     if (!isCPU) {
-      dispatchKernel(ctx, kernel, promises[i]);
-      wait(ctx, futures[i]);
+      dispatchKernel(ctx, kernel);
       resetCommandBuffer(ctx.device, kernel);
     } else {
       transpose(inputPtr.get(), outputPtr.get(), M, N);