Skip to content

Commit 4c4816d

Browse files
r-barnes authored and facebook-github-bot committed
Fix CUDA kernel index data type in deeplearning/projects/fairseq-py/fairseq/modules/cuda_utils.cu +10
Summary: CUDA kernel variables matching the type `(thread|block|grid).(Idx|Dim).(x|y|z)` [have the data type `uint`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#built-in-variables). Many programmers mistakenly use implicit casts to turn these data types into `int`. In fact, the [CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) itself is inconsistent and incorrect in its use of data types in programming examples. The result of these implicit casts is that our kernels may give unexpected results when exposed to large datasets, i.e., those exceeding ~2B items. While we now have linters in place to prevent simple mistakes (D71236150), our codebase has many problematic instances. This diff fixes some of them. Reviewed By: dtolnay Differential Revision: D71355350 fbshipit-source-id: a23a7b3ab08dd958db91bd55fe5cad47dd9741f0
1 parent 158f467 commit 4c4816d

4 files changed

Lines changed: 32 additions & 32 deletions

File tree

flashlight/lib/sequence/criterion/cuda/CriterionUtils.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ using namespace fl::lib::seq;
2020
*/
2121
__global__ void
2222
batchTargetSizeKernel(int L, int maxSize, const int* _target, int* targetSize) {
23-
int b = blockIdx.x;
23+
auto b = blockIdx.x;
2424
auto target = _target + b * L;
2525

2626
__shared__ int idx;
@@ -31,7 +31,7 @@ batchTargetSizeKernel(int L, int maxSize, const int* _target, int* targetSize) {
3131

3232
__syncthreads();
3333

34-
for (int i = L - 1 - threadIdx.x; i >= 0; i -= blockDim.x) {
34+
for (auto i = L - 1 - threadIdx.x; i >= 0; i -= blockDim.x) {
3535
if (target[i] >= 0) {
3636
atomicMax(&idx, i + 1);
3737
break;
@@ -57,7 +57,7 @@ __global__ void computeScaleKernel(
5757
CriterionScaleMode scaleMode,
5858
const int* targetSize,
5959
Float* scale) {
60-
for (int b = threadIdx.x; b < B; b += blockDim.x) {
60+
for (auto b = threadIdx.x; b < B; b += blockDim.x) {
6161
switch (scaleMode) {
6262
case CriterionScaleMode::NONE:
6363
scale[b] = 1.0;

flashlight/lib/sequence/criterion/cuda/ForceAlignmentCriterion.cu

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -57,15 +57,15 @@ __global__ void forwardKernel(
5757
const Float* trans,
5858
Float* _loss,
5959
WorkspacePtrs<Float> ws) {
60-
int b = blockIdx.x;
60+
auto b = blockIdx.x;
6161
auto* alpha = &ws.alpha[b * T * _L];
6262
auto* input = &_input[b * T * N];
6363
auto* target = &_target[b * _L];
6464
auto* transBuf1 = &ws.transBuf1[b * _L];
6565
auto* transBuf2 = &ws.transBuf2[b * _L];
6666
int L = targetSize[b];
6767

68-
for (int i = threadIdx.x; i < L; i += blockDim.x) {
68+
for (auto i = threadIdx.x; i < L; i += blockDim.x) {
6969
alpha[i] = i == 0 ? input[target[0]] : 0;
7070
transBuf1[i] = trans[target[i] * N + target[i]];
7171
transBuf2[i] = i > 0 ? trans[target[i] * N + target[i - 1]] : 0;
@@ -92,7 +92,7 @@ __global__ void forwardKernel(
9292
}
9393
}
9494

95-
for (int i = low + threadIdx.x; i < high; i += blockDim.x) {
95+
for (auto i = low + threadIdx.x; i < high; i += blockDim.x) {
9696
double s1 = alphaPrev[i] + transBuf1[i];
9797
double s2 = alphaPrev[i - 1] + transBuf2[i];
9898
// lse = logSumExp(s1, s2)
@@ -124,7 +124,7 @@ __global__ void backwardKernel(
124124
Float* _inputGrad,
125125
Float* transGrad,
126126
WorkspacePtrs<Float> ws) {
127-
int b = blockIdx.x;
127+
auto b = blockIdx.x;
128128
auto* alpha = &ws.alpha[b * T * _L];
129129
auto* alphaGrad = &ws.alphaGrad[b * T * _L];
130130
auto* inputGrad = &_inputGrad[b * T * N];
@@ -154,7 +154,7 @@ __global__ void backwardKernel(
154154

155155
__syncthreads();
156156

157-
for (int i = low1 + threadIdx.x; i < high1; i += blockDim.x) {
157+
for (auto i = low1 + threadIdx.x; i < high1; i += blockDim.x) {
158158
atomicAdd(&inputCurGrad[target[i]], alphaCurGrad[i]);
159159
}
160160

@@ -170,7 +170,7 @@ __global__ void backwardKernel(
170170
}
171171
}
172172

173-
for (int i = low + threadIdx.x; i < high; i += blockDim.x) {
173+
for (auto i = low + threadIdx.x; i < high; i += blockDim.x) {
174174
double s1 = alphaPrev[i] + transBuf1[i];
175175
double s2 = alphaPrev[i - 1] + transBuf2[i];
176176
// d1, d2 = dLogSumExp(s1, s2)
@@ -198,7 +198,7 @@ __global__ void backwardKernel(
198198
gradScale = grad[b] * ws.scale[b];
199199
}
200200

201-
for (int i = threadIdx.x; i < L; i += blockDim.x) {
201+
for (auto i = threadIdx.x; i < L; i += blockDim.x) {
202202
atomicAdd(&transBatchGrad[target[i] * N + target[i]], transBufGrad1[i]);
203203
if (i > 0) {
204204
atomicAdd(
@@ -208,11 +208,11 @@ __global__ void backwardKernel(
208208

209209
__syncthreads();
210210

211-
for (int i = threadIdx.x; i < T * N; i += blockDim.x) {
211+
for (auto i = threadIdx.x; i < T * N; i += blockDim.x) {
212212
inputGrad[i] *= gradScale;
213213
}
214214

215-
for (int i = threadIdx.x; i < N * N; i += blockDim.x) {
215+
for (auto i = threadIdx.x; i < N * N; i += blockDim.x) {
216216
atomicAdd(&transGrad[i], gradScale * transBatchGrad[i]);
217217
}
218218
}
@@ -228,19 +228,19 @@ __global__ void viterbiPathKernel(
228228
const Float* trans,
229229
int* bestPaths,
230230
WorkspacePtrs<Float> ws) {
231-
int b = blockIdx.x;
231+
auto b = blockIdx.x;
232232
auto* alpha = &ws.alpha[b * T * _L];
233233
auto* input = &_input[b * T * N];
234234
auto* target = &_target[b * _L];
235235
auto* transBuf1 = &ws.transBuf1[b * _L];
236236
auto* transBuf2 = &ws.transBuf2[b * _L];
237237
int L = targetSize[b];
238238

239-
for (int i = threadIdx.x; i < L * T; i += blockDim.x) {
239+
for (auto i = threadIdx.x; i < L * T; i += blockDim.x) {
240240
alpha[i] = i == 0 ? input[target[0]] : -CUDART_INF_F;
241241
}
242242

243-
for (int i = threadIdx.x; i < L; i += blockDim.x) {
243+
for (auto i = threadIdx.x; i < L; i += blockDim.x) {
244244
transBuf1[i] = trans[target[i] * N + target[i]];
245245
transBuf2[i] = i > 0 ? trans[target[i] * N + target[i - 1]] : 0;
246246
}
@@ -270,7 +270,7 @@ __global__ void viterbiPathKernel(
270270
}
271271
}
272272

273-
for (int i = low + threadIdx.x; i < high; i += blockDim.x) {
273+
for (auto i = low + threadIdx.x; i < high; i += blockDim.x) {
274274
double s1 = alphaPrev[i] + transBuf1[i];
275275
double s2 = alphaPrev[i - 1] + transBuf2[i];
276276
alphaCur[i] = inputCur[target[i]] + max(s1, s2);

flashlight/lib/sequence/criterion/cuda/FullConnectionCriterion.cu

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ struct WorkspacePtrs {
4646
template <class Float>
4747
__global__ void
4848
forwardInitial(int T, int N, const Float* input, WorkspacePtrs<Float> ws) {
49-
int b = blockIdx.x;
50-
for (int n = threadIdx.x; n < N; n += blockDim.x) {
49+
auto b = blockIdx.x;
50+
for (auto n = threadIdx.x; n < N; n += blockDim.x) {
5151
int k = b * T * N + n;
5252
ws.alpha[k] = input[k];
5353
}
@@ -84,7 +84,7 @@ __global__ void forwardStep(
8484
__shared__ double maxValue;
8585

8686
double threadMax = -INFINITY;
87-
for (int n = threadIdx.x; n < N; n += blockDim.x) {
87+
for (auto n = threadIdx.x; n < N; n += blockDim.x) {
8888
double val = transBuf[n] = alphaPrev[n] + (Final ? 0 : trans[m * N + n]);
8989
threadMax = val > threadMax ? val : threadMax;
9090
}
@@ -97,7 +97,7 @@ __global__ void forwardStep(
9797
__syncthreads();
9898

9999
double threadSum = 0;
100-
for (int n = threadIdx.x; n < N; n += blockDim.x) {
100+
for (auto n = threadIdx.x; n < N; n += blockDim.x) {
101101
threadSum += exp(transBuf[n] - maxValue);
102102
}
103103

@@ -142,7 +142,7 @@ __global__ void backwardStep1(
142142
__shared__ double sumValue;
143143

144144
double threadMax = -INFINITY;
145-
for (int n = threadIdx.x; n < N; n += blockDim.x) {
145+
for (auto n = threadIdx.x; n < N; n += blockDim.x) {
146146
double val = transBuf[n] = alphaPrev[n] + (Initial ? 0 : trans[m * N + n]);
147147
threadMax = val > threadMax ? val : threadMax;
148148
}
@@ -153,7 +153,7 @@ __global__ void backwardStep1(
153153
}
154154

155155
double threadSum = 0;
156-
for (int n = threadIdx.x; n < N; n += blockDim.x) {
156+
for (auto n = threadIdx.x; n < N; n += blockDim.x) {
157157
transBuf[n] = exp(transBuf[n] - maxValue);
158158
threadSum += transBuf[n];
159159
}
@@ -165,7 +165,7 @@ __global__ void backwardStep1(
165165

166166
__syncthreads();
167167

168-
for (int n = threadIdx.x; n < N; n += blockDim.x) {
168+
for (auto n = threadIdx.x; n < N; n += blockDim.x) {
169169
if (Initial) {
170170
alphaPrevGrad[n] = transBuf[n] / sumValue;
171171
} else {
@@ -181,16 +181,16 @@ __global__ void backwardStep1(
181181
*/
182182
template <class Float>
183183
__global__ void backwardStep2(int T, int N, int t, WorkspacePtrs<Float> ws) {
184-
int b = blockIdx.x / N;
185-
int m = blockIdx.x % N;
184+
auto b = blockIdx.x / N;
185+
auto m = blockIdx.x % N;
186186

187187
auto* alphaPrevGrad = &ws.alphaGrad[b * T * N + (t - 1) * N];
188188

189189
using BlockReduce = cub::BlockReduce<double, kBlockSize>;
190190
__shared__ typename BlockReduce::TempStorage tempStorage;
191191

192192
double threadSum = 0;
193-
for (int n = threadIdx.x; n < N; n += blockDim.x) {
193+
for (auto n = threadIdx.x; n < N; n += blockDim.x) {
194194
threadSum += ws.transBuf[b * N * N + n * N + m];
195195
}
196196

@@ -212,7 +212,7 @@ __global__ void backwardFinal(
212212
Float* _inputGrad,
213213
Float* transGrad,
214214
WorkspacePtrs<Float> ws) {
215-
int b = blockIdx.x;
215+
auto b = blockIdx.x;
216216

217217
auto* alphaGrad = &ws.alphaGrad[b * T * N];
218218
auto* inputGrad = &_inputGrad[b * T * N];
@@ -226,11 +226,11 @@ __global__ void backwardFinal(
226226

227227
__syncthreads();
228228

229-
for (int i = threadIdx.x; i < T * N; i += blockDim.x) {
229+
for (auto i = threadIdx.x; i < T * N; i += blockDim.x) {
230230
inputGrad[i] = gradScale * alphaGrad[i];
231231
}
232232

233-
for (int i = threadIdx.x; i < N * N; i += blockDim.x) {
233+
for (auto i = threadIdx.x; i < N * N; i += blockDim.x) {
234234
atomicAdd(&transGrad[i], gradScale * transBatchGrad[i]);
235235
}
236236
}

flashlight/lib/sequence/criterion/cuda/ViterbiPath.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ struct WorkspacePtrs {
3838
template <class Float>
3939
__global__ void
4040
computeInitial(int T, int N, const Float* input, WorkspacePtrs<Float> ws) {
41-
int b = blockIdx.x;
42-
for (int n = threadIdx.x; n < N; n += blockDim.x) {
41+
auto b = blockIdx.x;
42+
for (auto n = threadIdx.x; n < N; n += blockDim.x) {
4343
ws.alpha[b * 2 * N + n] = input[b * T * N + n];
4444
}
4545
}
@@ -76,7 +76,7 @@ __global__ void computeStep(
7676

7777
cub::KeyValuePair<int, Float> threadMax;
7878
threadMax.value = -INFINITY;
79-
for (int n = threadIdx.x; n < N; n += blockDim.x) {
79+
for (auto n = threadIdx.x; n < N; n += blockDim.x) {
8080
Float val = alphaPrev[n] + (Final ? 0 : trans[m * N + n]);
8181
if (val > threadMax.value) {
8282
threadMax.key = n;

0 commit comments

Comments
 (0)