|
1 | 1 | #include "kernel_launcher.h" |
2 | 2 |
|
| 3 | +// Namespace alias. |
| 4 | +namespace kl = kernel_launcher; |
3 | 5 |
|
4 | | -int main() { |
5 | | - // Namespace alias. |
6 | | - namespace kl = kernel_launcher; |
7 | | - |
8 | | - // Create a kernel builder |
| 6 | +kl::KernelBuilder build_kernel() { |
9 | 7 | kl::KernelBuilder builder("vector_add", "vector_add_kernel.cu"); |
10 | | - |
11 | | - // Define tunable parameters |
| 8 | + |
12 | 9 | auto threads_per_block = builder.tune("block_size", {32, 64, 128, 256, 512, 1024}); |
13 | 10 | auto elements_per_thread = builder.tune("elements_per_thread", {1, 2, 4, 8}); |
14 | | - |
15 | | - // Define expressions |
16 | 11 | auto elements_per_block = threads_per_block * elements_per_thread; |
17 | | - |
18 | | - // Define kernel properties |
| 12 | + |
19 | 13 | builder |
20 | 14 | .block_size(threads_per_block) |
21 | 15 | .grid_divisors(threads_per_block * elements_per_thread) |
22 | 16 | .template_args(kl::type_of<float>()) |
23 | 17 | .define("ELEMENTS_PER_THREAD", elements_per_thread); |
24 | 18 |
|
25 | | - // Define configuration |
26 | | - kl::Config config; |
27 | | - config.insert(threads_per_block, 32); |
28 | | - config.insert(elements_per_thread, 2); |
| 19 | + return builder; |
| 20 | +} |
| 21 | + |
| 22 | +int main() { |
| 23 | + kl::set_global_wisdom_directory("wisdom/"); |
| 24 | + kl::set_global_tuning_directory("tuning/"); |
| 25 | + |
| 26 | + // Define the kernel. "vector_add" is the tuning key. |
| 27 | + std::string tuning_key = "vector_add"; |
| 28 | + kl::KernelBuilder builder = build_kernel(); |
| 29 | + kl::WisdomKernel vector_add_kernel(tuning_key, builder); |
29 | 30 |
|
30 | | - // Compile kernel |
31 | | - kl::Kernel<int, int*, const int*, const int*> vector_add_kernel; |
32 | | - vector_add_kernel.compile(builder, config); |
33 | | - |
34 | 31 | // Initialize CUDA memory. This is outside the scope of kernel_launcher. |
35 | 32 | unsigned int n = 1000000; |
36 | 33 | float *dev_A, *dev_B, *dev_C; |
37 | 34 | /* cudaMalloc, cudaMemcpy, ... */ |
38 | | - |
| 35 | + |
39 | 36 | // Launch the kernel! |
40 | 37 | unsigned int problem_size = n; |
41 | | - vector_add_kernel |
42 | | - .instantiate(problem_size) |
43 | | - .launch(n, dev_C, dev_A, dev_B); |
| 38 | + vector_add_kernel(problem_size)(n, dev_C, dev_A, dev_B); |
44 | 39 | } |
0 commit comments