---
title: Quick Start
weight: 2
bookToc: true
---

# Quick Start

Run your first LLM inference in under 5 minutes.

## Load a Model and Generate Text

Create a new Go project and add Zerfoo:

```bash
mkdir my-llm-app && cd my-llm-app
go mod init my-llm-app
go get github.com/zerfoo/zerfoo@latest
```

Write `main.go`:

```go
package main

import (
	"fmt"
	"log"

	"github.com/zerfoo/zerfoo"
)

func main() {
	m, err := zerfoo.Load("google/gemma-3-4b")
	if err != nil {
		log.Fatal(err)
	}
	defer m.Close()

	reply, err := m.Chat("Explain quicksort in one sentence.")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(reply)
}
```

Run it:

```bash
go run main.go
```

`zerfoo.Load` accepts a Hugging Face model ID (e.g. `"google/gemma-3-4b"`) or a local GGUF file path (e.g. `"./model.gguf"`). If the model is not cached locally, it is downloaded automatically. The default quantization is Q4_K_M.

To request a specific quantization, append it to the ID:

```
google/gemma-3-4b/Q8_0
```
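
Either form can be passed straight to `zerfoo.Load`. A minimal sketch (the local file path is a hypothetical example):

```go
package main

import (
	"fmt"
	"log"

	"github.com/zerfoo/zerfoo"
)

func main() {
	// Pin a quantization by appending it to the model ID...
	m, err := zerfoo.Load("google/gemma-3-4b/Q8_0")
	// ...or point at a GGUF file already on disk (hypothetical path):
	// m, err := zerfoo.Load("./models/gemma-3-4b.Q8_0.gguf")
	if err != nil {
		log.Fatal(err)
	}
	defer m.Close()

	reply, err := m.Chat("Say hello in one word.")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(reply)
}
```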

## Chat Completion

For multi-turn conversations, use the `inference` package's `Chat` method with structured messages:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/zerfoo/zerfoo/inference"
)

func main() {
	mdl, err := inference.Load("gemma-3-1b-q4")
	if err != nil {
		log.Fatal(err)
	}
	defer mdl.Close()

	resp, err := mdl.Chat(context.Background(),
		[]inference.Message{
			{Role: "system", Content: "You are a helpful assistant."},
			{Role: "user", Content: "What is the capital of France?"},
		},
		inference.WithTemperature(0.5),
		inference.WithMaxTokens(64),
	)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(resp.Content)
	fmt.Printf("Tokens used: %d (prompt: %d, completion: %d)\n",
		resp.TokensUsed, resp.PromptTokens, resp.CompletionTokens)
}
```

The `Chat` method formats messages using the model's built-in chat template and returns a `Response` with token usage statistics.
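
To continue the conversation, append the assistant's reply and the next user turn to the message history, then call `Chat` again. A minimal sketch building on the example above (the `assistant` role name follows the usual chat-template convention and is an assumption here):

```go
// Grow the history with the previous reply, then ask a follow-up.
history := []inference.Message{
	{Role: "system", Content: "You are a helpful assistant."},
	{Role: "user", Content: "What is the capital of France?"},
	{Role: "assistant", Content: resp.Content},
	{Role: "user", Content: "And roughly how many people live there?"},
}

followUp, err := mdl.Chat(context.Background(), history,
	inference.WithMaxTokens(64),
)
if err != nil {
	log.Fatal(err)
}
fmt.Println(followUp.Content)
```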

**CLI equivalent:**

```bash
zerfoo run gemma-3-1b-q4
```

This starts an interactive chat session:

```
Model loaded. Type your message (Ctrl-D to quit).

> What is the capital of France?
The capital of France is Paris.
>
```

## Stream Responses

Print tokens as they arrive:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/zerfoo/zerfoo"
)

func main() {
	m, err := zerfoo.Load("google/gemma-3-4b")
	if err != nil {
		log.Fatal(err)
	}
	defer m.Close()

	stream, err := m.ChatStream(context.Background(), "Write a haiku about Go.")
	if err != nil {
		log.Fatal(err)
	}
	for tok := range stream {
		if tok.Done {
			break
		}
		fmt.Print(tok.Text)
	}
	fmt.Println()
}
```

For lower-level control, use the `inference` package directly:

```go
err = mdl.GenerateStream(ctx, "Tell me a joke.",
	inference.TokenStreamFunc(func(token string, done bool) error {
		if !done {
			fmt.Print(token)
		}
		return nil
	}),
	inference.WithMaxTokens(128),
)
```

**CLI equivalent:**

```bash
zerfoo predict --model gemma-3-1b-q4 --prompt "Write a haiku about Go."
```

## Generate Embeddings

Use the OpenAI-compatible API server to generate text embeddings. Start the server:

```bash
zerfoo serve gemma-3-1b-q4 --port 8080
```

Then request embeddings:

```bash
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gemma-3-1b-q4",
    "input": "Zerfoo is an ML framework for Go."
  }'
```

Any OpenAI-compatible client library works -- just point it at your server:

```go
// Using the standard net/http package
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
)

func main() {
	body, _ := json.Marshal(map[string]any{
		"model": "gemma-3-1b-q4",
		"input": "Zerfoo is an ML framework for Go.",
	})

	resp, err := http.Post(
		"http://localhost:8080/v1/embeddings",
		"application/json",
		bytes.NewReader(body),
	)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var result map[string]any
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		log.Fatal(err)
	}
	fmt.Println(result)
}
```
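
If you prefer typed access to the vector, decode into structs that mirror the OpenAI embeddings response shape instead of a `map[string]any`. A sketch; the field names follow the OpenAI API format the server advertises compatibility with:

```go
// Structs matching an OpenAI-style embeddings response.
type embeddingData struct {
	Embedding []float64 `json:"embedding"`
	Index     int       `json:"index"`
}

type embeddingsResponse struct {
	Data  []embeddingData `json:"data"`
	Model string          `json:"model"`
}

// In place of the map decode above:
var out embeddingsResponse
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
	log.Fatal(err)
}
fmt.Printf("vector length: %d\n", len(out.Data[0].Embedding))
```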

## Structured JSON Output

Constrain generation with a strict prompt, deterministic sampling (temperature 0), and a token limit to get structured output:

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/zerfoo/zerfoo/inference"
)

func main() {
	mdl, err := inference.Load("gemma-3-1b-q4")
	if err != nil {
		log.Fatal(err)
	}
	defer mdl.Close()

	result, err := mdl.Generate(
		context.Background(),
		`Return a JSON object with the fields "name", "capital", and "population" for France. Output only valid JSON, no other text.`,
		inference.WithMaxTokens(128),
		inference.WithTemperature(0.0),
	)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(result)
}
```

**CLI equivalent:**

```bash
zerfoo predict \
  --model gemma-3-1b-q4 \
  --prompt 'Return a JSON object with the fields "name", "capital", and "population" for France. Output only valid JSON, no other text.' \
  --temperature 0 \
  --max-tokens 128
```
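
Since the library example prints the raw generated text, you can unmarshal it into a typed value once it comes back. A sketch, assuming `Generate` returns the generated string, as the `fmt.Println(result)` above suggests:

```go
// Parse the model's JSON reply into a typed value.
type country struct {
	Name       string `json:"name"`
	Capital    string `json:"capital"`
	Population int    `json:"population"`
}

var c country
if err := json.Unmarshal([]byte(result), &c); err != nil {
	log.Fatal(err) // the model did not return valid JSON
}
fmt.Printf("%s: capital %s, population %d\n", c.Name, c.Capital, c.Population)
```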

## Generation Options

Both the library API and CLI support these sampling parameters:

| Option | CLI Flag | Default | Description |
|--------|----------|---------|-------------|
| `WithTemperature` | `--temperature` | 1.0 | Sampling temperature |
| `WithTopP` | `--top-p` | 1.0 | Nucleus sampling |
| `WithTopK` | `--top-k` | disabled | Top-K sampling |
| `WithMaxTokens` | `--max-tokens` | 256 | Maximum tokens to generate |
| `WithRepetitionPenalty` | `--repetition-penalty` | 1.0 | Penalize repeated tokens |

Example with multiple options:

```go
result, err := m.Generate(context.Background(), "Tell me a joke.",
	zerfoo.WithGenTemperature(0.7),
	zerfoo.WithGenMaxTokens(128),
	zerfoo.WithGenTopP(0.9),
)
if err != nil {
	log.Fatal(err)
}
fmt.Println(result.Text)
fmt.Printf("Tokens: %d, Duration: %s\n", result.TokenCount, result.Duration)
```

## Next Steps

- [Installation](/docs/getting-started/installation) -- detailed installation and platform support
- [GPU Setup](/docs/getting-started/gpu-setup) -- configure CUDA, ROCm, or OpenCL for hardware-accelerated inference
- [API Server](/docs/deployment) -- serve models behind an OpenAI-compatible HTTP API
- [API Reference](/docs/api) -- full API documentation
- [Tutorials](/docs/tutorials) -- step-by-step guides for common tasks