11 | 11 | import org.beehive.gpullama3.inference.weights.tornado.TornadoWeights; |
12 | 12 | import org.beehive.gpullama3.model.Configuration; |
13 | 13 | import org.beehive.gpullama3.model.Model; |
| 14 | +import org.beehive.gpullama3.model.granite.GraniteConfiguration; |
14 | 15 | import org.beehive.gpullama3.model.phi3.Phi3Configuration; |
15 | 16 | import org.beehive.gpullama3.model.qwen2.Qwen2Configuration; |
16 | 17 | import org.beehive.gpullama3.model.qwen3.Qwen3Configuration; |
@@ -546,6 +547,127 @@ public static FloatTensor forwardJavaPhi3(Model model, Phi3State state, int toke |
546 | 547 | return state.logits; |
547 | 548 | } |
548 | 549 |
| 550 | + /** |
| 551 | + * Forward pass for Granite models with µP scaling factors applied. |
| 552 | + * <p> |
| 553 | + * Granite uses the same transformer architecture as Llama but with maximal update parameterization (µP) |
| 554 | + * scaling factors applied at specific points: |
| 555 | + * <ul> |
| 556 | + * <li>Embedding scaling: multiply embeddings after lookup</li> |
| 557 | + * <li>Attention scaling: use a custom multiplier instead of 1/sqrt(headSize)</li>
| 558 | + * <li>Residual scaling: multiply each branch output before adding it to the residual stream</li>
| 559 | + * <li>Logit scaling: divide logits by the scaling factor</li> |
| 560 | + * </ul> |
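| | + * <p>
| | + * For orientation: in Hugging Face's reference GraniteConfig these factors are called
| | + * embedding_multiplier, attention_multiplier, residual_multiplier and logits_scaling.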
| 561 | + */ |
| 562 | + public static FloatTensor forwardGranite(Model model, State state, int token, int position) { |
| 563 | + final GraniteConfiguration config = (GraniteConfiguration) model.configuration(); |
| 564 | + final StandardWeights weights = (StandardWeights) model.weights(); |
| 565 | + int dim = config.dim(); |
| 566 | + int headSize = config.headSize(); |
| 567 | + int kvDim = (config.dim() * config.numberOfKeyValueHeads()) / config.numberOfHeads(); |
| 568 | + int kvMul = config.numberOfHeads() / config.numberOfKeyValueHeads(); |
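| | + // µP scale factors; the values are model-specific and typically come from the
| | + // checkpoint's configuration metadata rather than being hard-coded here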
| 569 | + float attentionScale = config.attentionScale(); |
| 570 | + float residualScale = config.residualScale(); |
| 571 | + float embeddingScale = config.embeddingScale(); |
| 572 | + float logitScale = config.logitScale(); |
| 573 | + |
| 574 | + // copy the token embedding into x |
| 575 | + weights.token_embedding_table.copyTo(token * dim, state.x, 0, dim); |
| 576 | + // Apply Granite embedding scaling |
| 577 | + state.x.mapInPlace(v -> v * embeddingScale); |
| 578 | + |
| 579 | + // forward all the layers |
| 580 | + for (int l = 0; l < config.numberOfLayers(); l++) { |
| 581 | + // attention rmsnorm |
| 582 | + rmsnorm(state.xb, state.x, weights.rms_att_weight[l], 0, dim, config.rmsNormEps()); |
| 583 | + |
| 584 | + // qkv matmuls for this position |
| 585 | + weights.wq[l].matmul(state.xb, state.q, dim, dim); |
| 586 | + weights.wk[l].matmul(state.xb, state.k, kvDim, dim); |
| 587 | + weights.wv[l].matmul(state.xb, state.v, kvDim, dim); |
| 588 | + |
| 589 | + // RoPE relative positional encoding |
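| | + // each consecutive (even, odd) pair is rotated by a position-dependent angle
| | + // taken from the precomputed freq_cis_real/freq_cis_imag tables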
| 590 | + for (int i = 0; i < dim; i += 2) { |
| 591 | + int head_dim = i % headSize; |
| 592 | + float fcr = weights.freq_cis_real.getFloat(position * (headSize / 2) + (head_dim / 2)); |
| 593 | + float fci = weights.freq_cis_imag.getFloat(position * (headSize / 2) + (head_dim / 2)); |
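| | + // with grouped-query attention, k only has kvDim components,
| | + // so beyond that index only q is rotated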
| 594 | + int rotn = i < kvDim ? 2 : 1; |
| 595 | + for (int v = 0; v < rotn; v++) { |
| 596 | + FloatTensor vec = v == 0 ? state.q : state.k; |
| 597 | + float v0 = vec.getFloat(i); |
| 598 | + float v1 = vec.getFloat(i + 1); |
| 599 | + vec.setFloat(i, v0 * fcr - v1 * fci); |
| 600 | + vec.setFloat(i + 1, v0 * fci + v1 * fcr); |
| 601 | + } |
| 602 | + } |
| 603 | + |
| 604 | + // save key,value at this time step to kv cache |
| 605 | + state.k.copyTo(0, state.keyCache[l], position * kvDim, kvDim); |
| 606 | + state.v.copyTo(0, state.valueCache[l], position * kvDim, kvDim); |
| 607 | + |
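| | + // effectively-final copy of the loop variable for capture by the lambda below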
| 608 | + int curLayer = l; |
| 609 | + |
| 610 | + // multihead attention with Granite attention scaling |
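| | + // grouped-query attention: query head h shares the key/value head h / kvMul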
| 611 | + Parallel.parallelFor(0, config.numberOfHeads(), h -> { |
| 612 | + int qOffset = h * headSize; |
| 613 | + int attOffset = h * config.contextLength(); |
| 614 | + |
| 615 | + for (int t = 0; t <= position; t++) { |
| 616 | + int keyCacheOffset = t * kvDim + (h / kvMul) * headSize; |
| 617 | + float score = state.q.dot(qOffset, state.keyCache[curLayer], keyCacheOffset, headSize); |
| 618 | + // Granite uses custom attention multiplier instead of 1/sqrt(headSize) |
| 619 | + score *= attentionScale; |
| 620 | + state.att.setFloat(attOffset + t, score); |
| 621 | + } |
| 622 | + |
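| | + // normalize the causal attention scores for positions 0..position in place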
| 623 | + state.att.softmaxInPlace(attOffset, position + 1); |
| 624 | + |
| 625 | + int xbOffset = h * headSize; |
| 626 | + state.xb.fillInPlace(xbOffset, headSize, 0f); |
| 627 | + |
| 628 | + for (int t = 0; t <= position; t++) { |
| 629 | + int vOffset = t * kvDim + (h / kvMul) * headSize; |
| 630 | + float a = state.att.getFloat(attOffset + t); |
| 631 | + state.xb.saxpyInPlace(xbOffset, state.valueCache[curLayer], vOffset, headSize, a); |
| 632 | + } |
| 633 | + }); |
| 634 | + |
| 635 | + // final matmul to get the output of the attention |
| 636 | + weights.wo[l].matmul(state.xb, state.xb2, dim, dim); |
| 637 | + |
| 638 | + // residual connection with Granite scaling |
| 639 | + state.xb2.mapInPlace(v -> v * residualScale); |
| 640 | + state.x.addInPlace(state.xb2); |
| 641 | + |
| 642 | + // ffn rmsnorm |
| 643 | + rmsnorm(state.xb, state.x, weights.rms_ffn_weight[l], 0, dim, config.rmsNormEps()); |
| 644 | + |
| 645 | + // FFN: self.w2(F.silu(self.w1(x)) * self.w3(x)) |
| 646 | + weights.w1[l].matmul(state.xb, state.hb, config.hiddenDim(), dim); |
| 647 | + weights.w3[l].matmul(state.xb, state.hb2, config.hiddenDim(), dim); |
| 648 | + |
| 649 | + // SwiGLU non-linearity |
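| | + // silu(x) = x * sigmoid(x) = x / (1 + exp(-x)); hb = silu(w1·x) ⊙ (w3·x)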
| 650 | + state.hb.mapInPlace(value -> value / (float) (1.0 + Math.exp(-value))); |
| 651 | + state.hb.multiplyInPlace(state.hb2); |
| 652 | + |
| 653 | + // final matmul to get the output of the ffn |
| 654 | + weights.w2[l].matmul(state.hb, state.xb, dim, config.hiddenDim()); |
| 655 | + |
| 656 | + // residual connection with Granite scaling |
| 657 | + state.xb.mapInPlace(v -> v * residualScale); |
| 658 | + state.x.addInPlace(state.xb); |
| 659 | + } |
| 660 | + |
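| | + // final rmsnorm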
| 661 | + rmsnorm(state.x, state.x, weights.rms_final_weight, 0, dim, config.rmsNormEps()); |
| 662 | + |
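| | + // classifier into logits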
| 663 | + weights.wcls.matmul(state.x, state.logits, config.vocabularySize(), dim); |
| 664 | + |
| 665 | + // Apply Granite logit scaling: divide the logits by the scaling factor
| 666 | + state.logits.mapInPlace(v -> v / logitScale);
| 667 | + |
| 668 | + return state.logits; |
| 669 | + } |
| 670 | + |
549 | 671 | static void copyChunk(FloatTensor in, FloatTensor out, int dim1In, int dim1Out, int nChunks, int chunkNo) { |
550 | 672 | assert (dim1In == dim1Out * nChunks); |
551 | 673 | final int startOffsetInDim1 = chunkNo * dim1Out; |
|