Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion cub/test/catch2_test_device_reduce_env.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,10 @@ struct block_size_check_t

__device__ int operator()(int a, int b)
{
*ptr = blockDim.x;
if (threadIdx.x == 0)
{
*ptr = blockDim.x;
}
return a + b;
}
};
Expand Down
63 changes: 63 additions & 0 deletions cub/test/catch2_test_device_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,66 @@ void compute_inclusive_scan_by_key_reference(
compute_inclusive_scan_by_key_reference(
host_values.cbegin(), host_keys.cbegin(), result_out_it, scan_op, equality_op, num_items);
}

struct block_size_recording_constant_iterator
{
using value_type = int;
using reference = int;
using pointer = int*;
using difference_type = ptrdiff_t;
using iterator_category = ::cuda::std::random_access_iterator_tag;

int value;
int* block_size_ptr;
difference_type offset;

__host__ __device__ block_size_recording_constant_iterator(int val, int* bs_ptr, difference_type off = 0)
: value(val)
, block_size_ptr(bs_ptr)
, offset(off)
{}

__device__ reference operator[](difference_type) const
{
if (threadIdx.x == 0)
{
*block_size_ptr = blockDim.x;
}
return value;
}

__device__ reference operator*() const
{
if (threadIdx.x == 0)
{
*block_size_ptr = blockDim.x;
}
return value;
}

__host__ __device__ block_size_recording_constant_iterator operator+(difference_type n) const
{
return {value, block_size_ptr, offset + n};
}

__host__ __device__ block_size_recording_constant_iterator& operator+=(difference_type n)
{
offset += n;
return *this;
}

__host__ __device__ difference_type operator-(const block_size_recording_constant_iterator& other) const
{
return offset - other.offset;
}

__host__ __device__ bool operator==(const block_size_recording_constant_iterator& other) const
{
return offset == other.offset;
}

__host__ __device__ bool operator!=(const block_size_recording_constant_iterator& other) const
{
return offset != other.offset;
}
};
99 changes: 35 additions & 64 deletions cub/test/catch2_test_device_scan_env.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ struct stream_registry_factory_t;
#include <cuda/__device/arch_id.h>
#include <cuda/iterator>

#include "catch2_test_device_scan.cuh"
#include "catch2_test_env_launch_helper.h"

DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::ExclusiveScan, device_scan_exclusive);
Expand All @@ -43,7 +44,10 @@ struct block_size_check_t

__device__ int operator()(int a, int b)
{
*ptr = blockDim.x;
if (threadIdx.x == 0)
{
*ptr = blockDim.x;
}
return a + b;
}
};
Expand All @@ -57,22 +61,20 @@ TEST_CASE("Device scan exclusive scan works with default environment", "[scan][d
using selector_t =
cub::detail::scan::policy_selector_from_types<value_t, value_t, value_t, offset_t, block_size_check_t>;

int current_device{};
REQUIRE(cudaSuccess == cudaGetDevice(&current_device));

cuda::arch_id arch_id;
REQUIRE(cudaSuccess == cub::detail::ptx_arch_id(arch_id));
const auto target_block_size = selector_t{}(arch_id).block_threads;

num_items_t num_items = 1;
num_items_t num_items = 2;
c2h::device_vector<int> d_block_size(1);
block_size_check_t block_size_check{thrust::raw_pointer_cast(d_block_size.data())};
auto d_in = cuda::constant_iterator(value_t{1});
auto d_out = thrust::device_vector<value_t>(1);
auto d_out = thrust::device_vector<value_t>(num_items);

auto init = value_t{0};
auto init = value_t{42};
REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveScan(d_in, d_out.begin(), block_size_check, init, num_items));
REQUIRE(d_out[0] == init);
REQUIRE(d_out[1] == (init + value_t{1}));

// Make sure we use default tuning
REQUIRE(d_block_size[0] == target_block_size);
Expand All @@ -82,21 +84,15 @@ TEST_CASE("Device scan exclusive sum works with default environment", "[sum][dev
{
using num_items_t = int;
using value_t = int;
using offset_t = cub::detail::choose_offset_t<num_items_t>;

int current_device{};
REQUIRE(cudaSuccess == cudaGetDevice(&current_device));

int ptx_version{};
REQUIRE(cudaSuccess == cub::PtxVersion(ptx_version, current_device));

num_items_t num_items = 1;
num_items_t num_items = 2;

auto d_in = cuda::constant_iterator(value_t{1});
auto d_out = thrust::device_vector<value_t>(1);
auto d_out = thrust::device_vector<value_t>(num_items);

REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveSum(d_in, d_out.begin(), num_items));
REQUIRE(d_out[0] == value_t{0});
REQUIRE(d_out[0] == value_t{});
REQUIRE(d_out[1] == value_t{} + d_in[0]);
}

template <int BlockThreads>
Expand Down Expand Up @@ -138,16 +134,14 @@ C2H_TEST("Device scan exclusive-scan can be tuned", "[scan][device]", block_size
auto num_items = 3;
auto d_in = cuda::constant_iterator(1);
auto d_out = thrust::device_vector<int>(num_items);
auto init = int{42};

// We are expecting that `unrelated_tuning` is ignored
auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, unrelated_tuning{});

REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveScan(d_in, d_out.begin(), block_size_check, 0, num_items, env));
REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveScan(d_in, d_out.begin(), block_size_check, init, num_items, env));

for (int i = 0; i < num_items; i++)
{
REQUIRE(d_out[i] == i);
}
REQUIRE(thrust::equal(d_out.begin(), d_out.end(), thrust::make_counting_iterator(init)));
REQUIRE(d_block_size[0] == target_block_size);
}

Expand All @@ -156,18 +150,19 @@ C2H_TEST("Device scan exclusive-sum can be tuned", "[scan][device]", block_sizes
constexpr int target_block_size = c2h::get<0, TestType>::value;

auto num_items = target_block_size;
auto d_in = cuda::constant_iterator(1);
auto d_out = thrust::device_vector<int>(num_items);
c2h::device_vector<int> d_block_size(1, 0);
// use block_size_recording_iterator to embed blockDim info in the input type and query after
// since ExclusiveSum can not take a custom scan_op
auto d_in = block_size_recording_constant_iterator(1, thrust::raw_pointer_cast(d_block_size.data()));
auto d_out = thrust::device_vector<int>(num_items);

// We are expecting that `unrelated_tuning` is ignored
auto env = cuda::execution::__tune(scan_tuning<target_block_size>{}, unrelated_tuning{});

REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveSum(d_in, d_out.begin(), num_items, env));

for (int i = 0; i < num_items; i++)
{
REQUIRE(d_out[i] == i);
}
REQUIRE(thrust::equal(d_out.begin(), d_out.end(), thrust::make_counting_iterator(0)));
REQUIRE(d_block_size[0] == target_block_size);
}

TEST_CASE("Device scan inclusive-scan works with default environment", "[scan][device]")
Expand All @@ -179,21 +174,18 @@ TEST_CASE("Device scan inclusive-scan works with default environment", "[scan][d
using selector_t =
cub::detail::scan::policy_selector_from_types<value_t, value_t, value_t, offset_t, block_size_check_t>;

int current_device{};
REQUIRE(cudaSuccess == cudaGetDevice(&current_device));

cuda::arch_id arch_id;
REQUIRE(cudaSuccess == cub::detail::ptx_arch_id(arch_id));
const auto target_block_size = selector_t{}(arch_id).block_threads;

num_items_t num_items = 1;
num_items_t num_items = 2;
c2h::device_vector<int> d_block_size(1);
block_size_check_t block_size_check{thrust::raw_pointer_cast(d_block_size.data())};
auto d_in = cuda::constant_iterator(value_t{1});
auto d_out = thrust::device_vector<value_t>(1);

auto d_out = thrust::device_vector<value_t>(num_items);
REQUIRE(cudaSuccess == cub::DeviceScan::InclusiveScan(d_in, d_out.begin(), block_size_check, num_items));
REQUIRE(d_out[0] == value_t{1});
REQUIRE(d_out[0] == d_in[0]);
REQUIRE(d_out[1] == d_in[0] + d_in[1]);

// Make sure we use default tuning
REQUIRE(d_block_size[0] == target_block_size);
Expand All @@ -214,10 +206,7 @@ C2H_TEST("Device scan inclusive-scan can be tuned", "[scan][device]", block_size

REQUIRE(cudaSuccess == cub::DeviceScan::InclusiveScan(d_in, d_out.begin(), block_size_check, num_items, env));

for (int i = 0; i < num_items; i++)
{
REQUIRE(d_out[i] == (i + 1));
}
REQUIRE(thrust::equal(d_out.begin(), d_out.end(), thrust::make_counting_iterator(1)));
REQUIRE(d_block_size[0] == target_block_size);
}

Expand All @@ -234,10 +223,7 @@ TEST_CASE("Device scan inclusive-scan-init works with default environment", "[sc

REQUIRE(cudaSuccess == cub::DeviceScan::InclusiveScanInit(d_in, d_out.begin(), cuda::std::plus{}, init, num_items));

for (int i = 0; i < num_items; i++)
{
REQUIRE(d_out[i] == (i + 1 + init));
}
REQUIRE(thrust::equal(d_out.begin(), d_out.end(), thrust::make_counting_iterator(init + 1)));
}

C2H_TEST("Device scan inclusive-scan-init can be tuned", "[scan][device]", block_sizes)
Expand All @@ -258,10 +244,7 @@ C2H_TEST("Device scan inclusive-scan-init can be tuned", "[scan][device]", block
REQUIRE(
cudaSuccess == cub::DeviceScan::InclusiveScanInit(d_in, d_out.begin(), block_size_check, init, num_items, env));

for (int i = 0; i < num_items; i++)
{
REQUIRE(d_out[i] == (i + 1 + init));
}
REQUIRE(thrust::equal(d_out.begin(), d_out.end(), thrust::make_counting_iterator(init + 1)));
REQUIRE(d_block_size[0] == target_block_size);
}

Expand All @@ -278,7 +261,7 @@ C2H_TEST("Device scan exclusive-scan uses environment", "[scan][device]")

using init_t = float;

init_t init{};
init_t init{42.0f};

size_t expected_bytes_allocated{};
REQUIRE(cudaSuccess
Expand All @@ -290,10 +273,7 @@ C2H_TEST("Device scan exclusive-scan uses environment", "[scan][device]")

device_scan_exclusive(d_in, d_out.begin(), scan_op_t{}, init, num_items, env);

for (int i = 0; i < num_items; i++)
{
REQUIRE(d_out[i] == i);
}
REQUIRE(thrust::equal(d_out.begin(), d_out.end(), thrust::make_counting_iterator(static_cast<int>(init))));
}

C2H_TEST("Device scan exclusive-sum uses environment", "[scan][device]")
Expand All @@ -314,10 +294,7 @@ C2H_TEST("Device scan exclusive-sum uses environment", "[scan][device]")

device_scan_exclusive_sum(d_in, d_out.begin(), num_items, env);

for (int i = 0; i < num_items; i++)
{
REQUIRE(d_out[i] == i);
}
REQUIRE(thrust::equal(d_out.begin(), d_out.end(), thrust::make_counting_iterator(0)));
}

C2H_TEST("Device scan inclusive-scan uses environment", "[scan][device]")
Expand All @@ -338,10 +315,7 @@ C2H_TEST("Device scan inclusive-scan uses environment", "[scan][device]")

device_scan_inclusive(d_in, d_out.begin(), scan_op_t{}, num_items, env);

for (int i = 0; i < num_items; i++)
{
REQUIRE(d_out[i] == (i + 1));
}
REQUIRE(thrust::equal(d_out.begin(), d_out.end(), thrust::make_counting_iterator(1)));
}

C2H_TEST("Device scan inclusive-scan-init uses environment", "[scan][device]")
Expand All @@ -366,8 +340,5 @@ C2H_TEST("Device scan inclusive-scan-init uses environment", "[scan][device]")

device_scan_inclusive_init(d_in, d_out.begin(), scan_op_t{}, init, num_items, env);

for (int i = 0; i < num_items; i++)
{
REQUIRE(d_out[i] == (i + 1 + 10.0f));
}
REQUIRE(thrust::equal(d_out.begin(), d_out.end(), thrust::make_counting_iterator(static_cast<int>(init + 1))));
}
Loading