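Fix Granite logits scaling: multiply by logitScale instead of dividing in InferenceCore.forwardGranite, apply logitsScale to the hb write in GraniteKernels, and use a consistent operand order (sum * logitsScale) for the output write.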

mikepapadim · mikepapadim · commit 49eb298c27a9 · 2025-12-18T09:59:56.000+02:00
diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceCore.java b/src/main/java/org/beehive/gpullama3/inference/InferenceCore.java
@@ -663,7 +663,7 @@ public static FloatTensor forwardGranite(Model model, State state, int token, in
         weights.wcls.matmul(state.x, state.logits, config.vocabularySize(), dim);
 
         // Apply Granite logit scaling (divide by the scaling factor)
-        state.logits.mapInPlace(v -> v / logitScale);
+        state.logits.mapInPlace(v -> v * logitScale);
 
         return state.logits;
     }
diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/kernels/GraniteKernels.java b/src/main/java/org/beehive/gpullama3/tornadovm/kernels/GraniteKernels.java
@@ -77,7 +77,7 @@ public static void matrixVectorGenericWithGraniteScale(
 
         // Thread 0 in each workgroup writes the final result
         if (localId == 0) {
-            hb.set(rowId, sum);
+            hb.set(rowId,  sum * logitsScale);
         }
     }
 
@@ -156,7 +156,6 @@ public static void processHeadsFlashAttentionWithGraniteScale(KernelContext cont
                     score += q_shared[d] * k_tile[score_idx_in_tile * headSize + d];
                 }
                 score *= attentionScale;
-//                score /= TornadoMath.sqrt(headSize);
                 s_tile[score_idx_in_tile] = score;
             }
 
@@ -339,7 +338,7 @@ public static void matrixVectorGenericQ8ByteWithGraniteScale(KernelContext conte
 
         // Thread 0 writes the result
         if (localId == 0) {
-            output.set(rowId, logitsScale * sum);
+            output.set(rowId, sum * logitsScale);
         }
     }
 

Original file line number	Diff line number	Diff line change
`@@ -663,7 +663,7 @@ public static FloatTensor forwardGranite(Model model, State state, int token, in`
`663`	`663`	`weights.wcls.matmul(state.x, state.logits, config.vocabularySize(), dim);`
`664`	`664`
`665`	`665`	`// Apply Granite logit scaling (divide by the scaling factor)`
`666`		`- state.logits.mapInPlace(v -> v / logitScale);`
	`666`	`+ state.logits.mapInPlace(v -> v * logitScale);`
`667`	`667`
`668`	`668`	`return state.logits;`
`669`	`669`	`}`
Original file line number	Diff line number	Diff line change
`@@ -77,7 +77,7 @@ public static void matrixVectorGenericWithGraniteScale(`
`77`	`77`
`78`	`78`	`// Thread 0 in each workgroup writes the final result`
`79`	`79`	`if (localId == 0) {`
`80`		`- hb.set(rowId, sum);`
	`80`	`+ hb.set(rowId, sum * logitsScale);`
`81`	`81`	`}`
`82`	`82`	`}`
`83`	`83`
`@@ -156,7 +156,6 @@ public static void processHeadsFlashAttentionWithGraniteScale(KernelContext cont`
`156`	`156`	`score += q_shared[d] * k_tile[score_idx_in_tile * headSize + d];`
`157`	`157`	`}`
`158`	`158`	`score *= attentionScale;`
`159`		`-// score /= TornadoMath.sqrt(headSize);`
`160`	`159`	`s_tile[score_idx_in_tile] = score;`
`161`	`160`	`}`
`162`	`161`
`@@ -339,7 +338,7 @@ public static void matrixVectorGenericQ8ByteWithGraniteScale(KernelContext conte`
`339`	`338`
`340`	`339`	`// Thread 0 writes the result`
`341`	`340`	`if (localId == 0) {`
`342`		`- output.set(rowId, logitsScale * sum);`
	`341`	`+ output.set(rowId, sum * logitsScale);`
`343`	`342`	`}`
`344`	`343`	`}`
`345`	`344`