Skip to content

Commit efffd3d

Browse files
committed
Add two new examples:
- thrust.example.cuda.global_device_vector, which demonstrates how to make global device_vectors work via a custom allocator that ignores shutdown failures. - thrust.example.scan_matrix_by_rows, which demonstrates how to scan the rows of a contiguous dense matrix in a single call to inclusive_scan_by_key.
1 parent dca6722 commit efffd3d

4 files changed

Lines changed: 117 additions & 0 deletions

File tree

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#include <thrust/device_vector.h>
2+
3+
// If you create a global `thrust::device_vector` with the default allocator,
4+
// you'll get an error during program termination when the memory of the vector
5+
// is freed, as the CUDA runtime cannot be used during program termination.
6+
//
7+
// To get around this, you can create your own allocator which ignores
8+
// deallocation failures that occur because the CUDA runtime is shut down.
9+
10+
extern "C" cudaError_t cudaFreeIgnoreShutdown(void* ptr) {
11+
cudaError_t const err = cudaFree(ptr);
12+
if (cudaSuccess == err || cudaErrorCudartUnloading == err)
13+
return cudaSuccess;
14+
return err;
15+
}
16+
17+
typedef thrust::system::cuda::detail::cuda_memory_resource<
18+
cudaMalloc,
19+
cudaFreeIgnoreShutdown,
20+
thrust::cuda::pointer<void>
21+
> device_ignore_shutdown_memory_resource;
22+
23+
#if __cplusplus >= 201103L
24+
template <typename T>
25+
using device_ignore_shutdown_allocator =
26+
thrust::mr::stateless_resource_allocator<
27+
T,
28+
thrust::device_ptr_memory_resource<device_ignore_shutdown_memory_resource>
29+
>;
30+
31+
thrust::device_vector<double, device_ignore_shutdown_allocator<double>> d;
32+
#else
33+
thrust::device_vector<
34+
double,
35+
thrust::mr::stateless_resource_allocator<
36+
double,
37+
thrust::device_ptr_memory_resource<device_ignore_shutdown_memory_resource>
38+
>
39+
> d;
40+
#endif
41+
42+
int main() {
43+
d.resize(25);
44+
}
45+

examples/scan_matrix_by_rows.cu

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#include <thrust/device_vector.h>
2+
#include <thrust/scan.h>
3+
#include <thrust/iterator/transform_iterator.h>
4+
#include <thrust/iterator/counting_iterator.h>
5+
6+
#include <assert.h>
7+
8+
// We have a matrix stored in a `thrust::device_vector`. We want to perform a
9+
// scan on each row of a matrix.
10+
11+
__host__
12+
void scan_matrix_by_rows0(thrust::device_vector<int>& u, int n, int m) {
13+
// Here, we launch a separate scan for each row in the matrix. This works,
14+
// but each kernel only does a small amount of work. It would be better if we
15+
// could launch one big kernel for the entire matrix.
16+
for (int i = 0; i < n; ++i)
17+
thrust::inclusive_scan(u.begin() + m * i, u.begin() + m * (i + 1),
18+
u.begin() + m * i);
19+
}
20+
21+
// We can batch the operation using `thrust::inclusive_scan_by_key`, which
22+
// scans each group of consecutive equal keys. All we need to do is generate
23+
// the right key sequence. We want the keys for elements on the same row to
24+
// be identical.
25+
26+
// So first, we define an unary function object which takes the index of an
27+
// element and returns the row that it belongs to.
28+
29+
struct which_row : thrust::unary_function<int, int> {
30+
int row_length;
31+
32+
__host__ __device__
33+
which_row(int row_length_) : row_length(row_length_) {}
34+
35+
__host__ __device__
36+
int operator()(int idx) const {
37+
return idx / row_length;
38+
}
39+
};
40+
41+
__host__
42+
void scan_matrix_by_rows1(thrust::device_vector<int>& u, int n, int m) {
43+
// This `thrust::counting_iterator` represents the index of the element.
44+
thrust::counting_iterator<int> c_first(0);
45+
46+
// We construct a `thrust::transform_iterator` which applies the `which_row`
47+
// function object to the index of each element.
48+
thrust::transform_iterator<which_row, thrust::counting_iterator<int> >
49+
t_first(c_first, which_row(m));
50+
51+
// Finally, we use our `thrust::transform_iterator` as the key sequence to
52+
// `thrust::inclusive_scan_by_key`.
53+
thrust::inclusive_scan_by_key(t_first, t_first + n * m, u.begin(), u.begin());
54+
}
55+
56+
int main() {
57+
int const n = 4;
58+
int const m = 5;
59+
60+
thrust::device_vector<int> u0(n * m);
61+
thrust::sequence(u0.begin(), u0.end());
62+
scan_matrix_by_rows0(u0, n, m);
63+
64+
thrust::device_vector<int> u1(n * m);
65+
thrust::sequence(u1.begin(), u1.end());
66+
scan_matrix_by_rows1(u1, n, m);
67+
68+
for (int i = 0; i < n; ++i)
69+
for (int j = 0; j < m; ++j)
70+
assert(u0[j + m * i] == u1[j + m * i]);
71+
}
72+

internal/test/thrust.example.cuda.global_device_vector.filecheck

Whitespace-only changes.

internal/test/thrust.example.scan_matrix_by_rows.filecheck

Whitespace-only changes.

0 commit comments

Comments
 (0)