---
title: Custom Sampling
weight: 5
bookToc: true
---

# Custom Sampling Parameters

Demonstrates how the temperature, top-K, top-P, and repetition-penalty sampling parameters affect text generation. The program generates a completion for the same prompt three times with different sampling configurations so you can compare the outputs.

## Usage

```bash
go run ./docs/cookbook/05-custom-sampling/ --model path/to/model.gguf
```
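
The program also accepts optional `--device` and `--prompt` flags (both defined in the full code below), so you can, for example, run the same comparison on a different prompt:

```bash
go run ./docs/cookbook/05-custom-sampling/ \
  --model path/to/model.gguf \
  --prompt "Write a haiku about garbage collection."
```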

## Full Code

```go
// Recipe 05: Custom Sampling Parameters
//
// Demonstrates how the temperature, top-K, top-P, and repetition-penalty
// sampling parameters affect text generation. The program generates a
// completion for the same prompt three times with different sampling
// configurations so you can compare the outputs.
//
// Usage:
//
//	go run ./docs/cookbook/05-custom-sampling/ --model path/to/model.gguf
package main

import (
	"context"
	"flag"
	"fmt"
	"os"

	"github.com/zerfoo/zerfoo/inference"
)

func main() {
	modelPath := flag.String("model", "", "path to GGUF model file")
	device := flag.String("device", "cpu", `compute device: "cpu", "cuda"`)
	prompt := flag.String("prompt", "Write a haiku about concurrency.", "generation prompt")
	flag.Parse()

	if *modelPath == "" {
		fmt.Fprintln(os.Stderr, "usage: custom-sampling --model <model.gguf>")
		os.Exit(1)
	}

	model, err := inference.LoadFile(*modelPath, inference.WithDevice(*device))
	if err != nil {
		fmt.Fprintf(os.Stderr, "load: %v\n", err)
		os.Exit(1)
	}
	defer model.Close()

	ctx := context.Background()

	// Configuration 1: Greedy decoding (temperature=0).
	// Deterministic output -- always picks the highest-probability token.
	fmt.Println("=== Greedy (temperature=0) ===")
	text, err := model.Generate(ctx, *prompt,
		inference.WithMaxTokens(64),
		inference.WithTemperature(0),
	)
	if err != nil {
		fmt.Fprintf(os.Stderr, "generate: %v\n", err)
		os.Exit(1)
	}
	fmt.Println(text)

	// Configuration 2: Creative (high temperature + top-P nucleus sampling).
	// Produces more varied, surprising output.
	fmt.Println("\n=== Creative (temp=1.2, top-P=0.9) ===")
	text, err = model.Generate(ctx, *prompt,
		inference.WithMaxTokens(64),
		inference.WithTemperature(1.2),
		inference.WithTopP(0.9),
	)
	if err != nil {
		fmt.Fprintf(os.Stderr, "generate: %v\n", err)
		os.Exit(1)
	}
	fmt.Println(text)

	// Configuration 3: Focused (low temperature + top-K).
	// Picks from a narrow set of likely tokens for coherent output.
	fmt.Println("\n=== Focused (temp=0.3, top-K=10) ===")
	text, err = model.Generate(ctx, *prompt,
		inference.WithMaxTokens(64),
		inference.WithTemperature(0.3),
		inference.WithTopK(10),
	)
	if err != nil {
		fmt.Fprintf(os.Stderr, "generate: %v\n", err)
		os.Exit(1)
	}
	fmt.Println(text)
}
```

## How It Works

The recipe runs three generation passes with the same prompt but different sampling strategies:

| Configuration | Settings | Behavior |
|---------------|----------|----------|
| **Greedy** | `temperature=0` | Deterministic -- always picks the highest-probability token. Produces the most predictable output. |
| **Creative** | `temperature=1.2, top-P=0.9` | High temperature flattens the probability distribution, making unlikely tokens more probable. Top-P (nucleus sampling) truncates the distribution to the smallest set of tokens whose cumulative probability exceeds 0.9. |
| **Focused** | `temperature=0.3, top-K=10` | Low temperature sharpens the distribution toward high-probability tokens. Top-K limits selection to the 10 most likely tokens. Produces coherent, on-topic output. |
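
The sketch below makes these effects concrete. It is a standalone toy, independent of the `inference` package: the token names and logit values are invented for illustration, and the filtering logic is the standard textbook formulation rather than zerfoo's actual implementation.

```go
package main

import (
	"fmt"
	"math"
)

type candidate struct {
	token string
	logit float64
}

// softmax converts logits to probabilities after dividing them by the
// temperature. Low temperature sharpens the distribution; high temperature
// flattens it. (Temperature 0 is handled separately as greedy argmax.)
func softmax(cands []candidate, temperature float64) []float64 {
	probs := make([]float64, len(cands))
	var sum float64
	for i, c := range cands {
		probs[i] = math.Exp(c.logit / temperature)
		sum += probs[i]
	}
	for i := range probs {
		probs[i] /= sum
	}
	return probs
}

func names(cands []candidate) []string {
	out := make([]string, len(cands))
	for i, c := range cands {
		out[i] = c.token
	}
	return out
}

func main() {
	// A toy next-token distribution, already sorted by logit (highest first).
	cands := []candidate{
		{"goroutine", 4.0}, {"channel", 3.5}, {"thread", 2.0},
		{"mutex", 1.0}, {"banana", -1.0},
	}

	// Temperature reshapes the distribution before any truncation.
	for _, temp := range []float64{0.3, 1.2} {
		probs := softmax(cands, temp)
		fmt.Printf("temperature=%.1f:", temp)
		for i, c := range cands {
			fmt.Printf(" %s=%.3f", c.token, probs[i])
		}
		fmt.Println()
	}

	// Top-K keeps only the K most likely candidates.
	k := 3
	fmt.Println("top-K=3 keeps:", names(cands[:k]))

	// Top-P (nucleus) keeps the smallest prefix whose cumulative probability
	// reaches p, computed on the temperature-scaled distribution.
	p := 0.9
	probs := softmax(cands, 1.2)
	cum, cut := 0.0, len(cands)
	for i, pr := range probs {
		cum += pr
		if cum >= p {
			cut = i + 1
			break
		}
	}
	fmt.Println("top-P=0.9 keeps:", names(cands[:cut]))
}
```

Repetition penalty, mentioned at the top of the recipe, is a further control that these three configurations leave at its default: a common formulation scales down the logits of tokens that already appear in the context by the penalty factor (values slightly above 1.0, such as 1.1, discourage verbatim repetition) before the steps above are applied.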

## See Also

- [Basic Text Generation](/docs/cookbooks/basic-text-generation) -- simple generation with default sampling
- [Structured JSON Output](/docs/cookbooks/structured-json-output) -- constrain output format with grammar-guided decoding