88< meta charset ="utf-8 ">
99< meta name ="viewport " content ="width=device-width, initial-scale=1 ">
1010< title > Zerfoo — Machine Learning Framework for Go</ title >
11- < meta name ="description " content ="Train, run, and serve ML models in your Go application. 235 tok/s on Gemma 3 1B — 25 % faster than Ollama. Pure Go, zero CGo. ">
11+ < meta name ="description " content ="Train, run, and serve ML models in your Go application. 241 tok/s on Gemma 3 1B — 28 % faster than Ollama. Pure Go, zero CGo. ">
1212< meta name ="theme-color " content ="#8B5CF6 ">
1313< link rel ="icon " href ="zerfoo.svg " type ="image/svg+xml ">
1414< script > ( function ( ) { var t = localStorage . getItem ( 'theme' ) ; if ( t ) document . documentElement . classList . add ( t ) } ) ( ) </ script >
266266 < h1 > Machine learning for Go.< br > < span class ="grad "> Pure Go. Zero CGo.</ span > </ h1 >
267267 < p class ="sub "> Train, run, and serve ML models in your Go application. One import, GPU-accelerated at runtime, no C compiler needed.</ p >
268268 < div class ="stats ">
269- < div class ="stat "> < div class ="num "> 235 tok/s</ div > < div class ="label "> Gemma 3 1B Q4_K_M</ div > </ div >
269+ < div class ="stat "> < div class ="num "> 241 tok/s</ div > < div class ="label "> Gemma 3 1B Q4_K_M</ div > </ div >
270270 < div class ="stat "> < div class ="num "> +28%</ div > < div class ="label "> faster than Ollama</ div > </ div >
271271 < div class ="stat "> < div class ="num "> 99.5%</ div > < div class ="label "> CUDA graph coverage</ div > </ div >
272272 < div class ="stat "> < div class ="num "> 0</ div > < div class ="label "> CGo calls</ div > </ div >
@@ -445,8 +445,8 @@ <h2>Faster than Ollama</h2>
445445 < td class ="highlight "> Zerfoo</ td >
446446 < td >
447447 < div class ="bench-bar ">
448- < div class ="bar " style ="width:min(235px ,60vw) "> </ div >
449- < div class ="val "> 235 tok/s</ div >
448+ < div class ="bar " style ="width:min(241px ,60vw) "> </ div >
449+ < div class ="val "> 241 tok/s</ div >
450450 </ div >
451451 </ td >
452452 < td > Pure Go, zero CGo, CUDA graph capture, fused kernels</ td >
@@ -472,7 +472,7 @@ <h3 style="font-size:1rem;font-weight:600;margin-bottom:16px">Performance journe
472472 < tr > < th > Date</ th > < th > Milestone</ th > < th > Tok/s</ th > < th > Improvement</ th > </ tr >
473473 </ thead >
474474 < tbody >
475- < tr > < td > Mar 27</ td > < td class ="highlight "> Multi-model benchmark (3-run median)</ td > < td class ="highlight "> 235 </ td > < td > +25 % vs Ollama</ td > </ tr >
475+ < tr > < td > Mar 27</ td > < td class ="highlight "> Multi-model benchmark (3-run median)</ td > < td class ="highlight "> 241 </ td > < td > +28 % vs Ollama</ td > </ tr >
476476 < tr > < td > Mar 17</ td > < td > Q4_0 re-quant restored</ td > < td > 245</ td > < td > +32% vs regression</ td > </ tr >
477477 < tr > < td > Mar 14</ td > < td > CUDA graph capture</ td > < td > 234</ td > < td > +26% vs non-graph</ td > </ tr >
478478 < tr > < td > Mar 13</ td > < td > GPU-first pipeline</ td > < td > 103</ td > < td > D2H elimination</ td > </ tr >
@@ -642,7 +642,7 @@ <h2>From the blog</h2>
642642 < a href ="/docs/blog/how-we-beat-ollama-cuda-graph-capture/ " class ="blog-card ">
643643 < div class ="tag "> Performance</ div >
644644 < h3 > How We Beat Ollama: CUDA Graph Capture in Pure Go</ h3 >
645- < p > CUDA graph capture and fused kernels took Zerfoo from 186 tok/s to 235 tok/s. A deep dive into making the decode path GPU-only.</ p >
645+ < p > CUDA graph capture and fused kernels took Zerfoo from 186 tok/s to 241 tok/s. A deep dive into making the decode path GPU-only.</ p >
646646 </ a >
647647 < a href ="/docs/blog/zero-cgo-pure-go-ml-inference/ " class ="blog-card ">
648648 < div class ="tag "> Architecture</ div >
0 commit comments