After making a CUDA API call, always clear the global CUDA error state by calling
cudaGetLastError.

brycelelbach · brycelelbach · commit a424837bce6c · 2019-10-14T20:59:10.000-07:00

Otherwise, if the CUDA API call is followed directly by a kernel
launch, checking for a synchronous error during the kernel launch by calling
cudaGetLastError may return the error code left over from the earlier CUDA API call.
This type of error leakage is very subtle and difficult to trace.

Bug 2720132
diff --git a/testing/out_of_memory_recovery.cu b/testing/out_of_memory_recovery.cu
@@ -0,0 +1,24 @@
+// Regression test for NVBug 2720132.
+
+#include <unittest/unittest.h>
+#include <thrust/device_vector.h>
+#include <thrust/detail/cstdint.h>
+
+struct non_trivial
+{
+  __host__ __device__ non_trivial() {}
+  __host__ __device__ ~non_trivial() {}
+};
+
+void test_out_of_memory_recovery()
+{
+  try
+  {
+    thrust::device_vector<non_trivial> x(1);
+
+    for (thrust::detail::uint64_t n = 1 ;; n <<= 1)
+      thrust::device_vector<thrust::detail::uint32_t> y(n);
+  }
+  catch (...) { }
+}
+DECLARE_UNITTEST(test_out_of_memory_recovery);
diff --git a/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -177,13 +177,13 @@ public:
         res_desc.res.linear.desc        = channel_desc;
         res_desc.res.linear.sizeInBytes = bytes;
         tex_desc.readMode               = cudaReadModeElementType;
-        return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
+        return CubDebug(cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL));
     }
 
     /// Unbind this iterator from its texture reference
     cudaError_t UnbindTexture()
     {
-        return cudaDestroyTextureObject(tex_obj);
+        return CubDebug(cudaDestroyTextureObject(tex_obj));
     }
 
     /// Postfix increment
diff --git a/thrust/system/cuda/detail/cub/util_allocator.cuh b/thrust/system/cuda/detail/cub/util_allocator.cuh
@@ -406,7 +406,7 @@ struct CachingDeviceAllocator
                 // in use by the device, only consider cached blocks that are
                 // either (from the active stream) or (from an idle stream)
                 if ((active_stream == block_itr->associated_stream) ||
-                    (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
+                    (CubDebug(cudaEventQuery(block_itr->ready_event)) != cudaErrorNotReady))
                 {
                     // Reuse existing cache block.  Insert into live blocks.
                     found = true;
diff --git a/thrust/system/cuda/detail/cub/util_debug.cuh b/thrust/system/cuda/detail/cub/util_debug.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -72,6 +72,13 @@ __host__ __device__ __forceinline__ cudaError_t Debug(
 {
     (void)filename;
     (void)line;
+
+#ifdef CUB_RUNTIME_ENABLED
+    // Clear the global CUDA error state which may have been set by the last
+    // call. Otherwise, errors may "leak" to unrelated kernel launches.
+    cudaGetLastError();
+#endif
+
 #ifdef CUB_STDERR
     if (error)
     {
diff --git a/thrust/system/cuda/detail/cub/util_device.cuh b/thrust/system/cuda/detail/cub/util_device.cuh
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
  *     * Neither the name of the NVIDIA CORPORATION nor the
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -199,11 +199,11 @@ CUB_RUNTIME_FUNCTION __forceinline__
 static cudaError_t SyncStream(cudaStream_t stream)
 {
 #if (CUB_PTX_ARCH == 0)
-    return cudaStreamSynchronize(stream);
+    return CubDebug(cudaStreamSynchronize(stream));
 #else
     (void)stream;
     // Device can't yet sync on a specific stream
-    return cudaDeviceSynchronize();
+    return CubDebug(cudaDeviceSynchronize());
 #endif
 }
 
@@ -255,15 +255,12 @@ cudaError_t MaxSmOccupancy(
 
     // CUDA API calls not supported from this device
     return CubDebug(cudaErrorInvalidConfiguration);
-
 #else
-
-    return cudaOccupancyMaxActiveBlocksPerMultiprocessor (
+    return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &max_sm_occupancy,
         kernel_ptr,
         block_threads,
-        dynamic_smem_bytes);
-
+        dynamic_smem_bytes));
 #endif  // CUB_RUNTIME_ENABLED
 }
 
diff --git a/thrust/system/cuda/detail/malloc_and_free.h b/thrust/system/cuda/detail/malloc_and_free.h
@@ -62,9 +62,9 @@ void *malloc(execution_policy<DerivedPolicy> &, std::size_t n)
 
   if(status != cudaSuccess)
   {
-  //  cuda_cub::throw_on_error(status, "device malloc failed");
+    cudaGetLastError(); // Clear global CUDA error state.
     thrust::system::detail::bad_alloc(thrust::cuda_category().message(status).c_str());
-  } 
+  }
 #else
   result = thrust::raw_pointer_cast(thrust::malloc(thrust::seq, n));
 #endif
diff --git a/thrust/system/cuda/detail/par.h b/thrust/system/cuda/detail/par.h
@@ -29,6 +29,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
 
 #include <thrust/detail/allocator_aware_execution_policy.h>
 
@@ -40,36 +41,6 @@
 THRUST_BEGIN_NS
 namespace cuda_cub {
 
-inline __host__ __device__
-cudaStream_t
-default_stream()
-{
-  return cudaStreamLegacy;
-}
-
-template <class Derived>
-__host__ __device__
-cudaStream_t
-get_stream(execution_policy<Derived> &)
-{
-  return default_stream();
-}
-
-__thrust_exec_check_disable__
-template <class Derived>
-__host__ __device__
-cudaError_t
-synchronize_stream(execution_policy<Derived> &)
-{
-  #if __THRUST_HAS_CUDART__
-    cudaDeviceSynchronize();
-    return cudaGetLastError();
-  #else
-    return cudaSuccess;
-  #endif
-}
-
-
 template <class Derived>
 struct execute_on_stream_base : execution_policy<Derived>
 {
diff --git a/thrust/system/cuda/detail/util.h b/thrust/system/cuda/detail/util.h
diff --git a/thrust/system/cuda/experimental/pinned_allocator.h b/thrust/system/cuda/experimental/pinned_allocator.h
diff --git a/thrust/system/cuda/memory_resource.h b/thrust/system/cuda/memory_resource.h

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -406,7 +406,7 @@ struct CachingDeviceAllocator` |
| `406` | `406` | `// in use by the device, only consider cached blocks that are` |
| `407` | `407` | `// either (from the active stream) or (from an idle stream)` |
| `408` | `408` | `if ((active_stream == block_itr->associated_stream) \|\|` |
| `409` | | `- (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))` |
| | `409` | `+ (CubDebug(cudaEventQuery(block_itr->ready_event)) != cudaErrorNotReady))` |
| `410` | `410` | `{` |
| `411` | `411` | `// Reuse existing cache block. Insert into live blocks.` |
| `412` | `412` | `found = true;` |