"""QLoRA + vLoRA: End-to-end pipeline for efficient multi-adapter serving.

This example shows the full workflow:
1. Load a QLoRA-quantized base model (4-bit NF4)
2. Load multiple LoRA adapters (produced by QLoRA fine-tuning)
3. Build a shared subspace with NF4 quantization
4. Serve with instant task switching via VLoRAModel

Requirements:
    pip install vlora-dev[hub] transformers bitsandbytes accelerate

The pipeline combines two orthogonal compression techniques:
- QLoRA: compresses the base model (FP16 -> NF4, ~4x savings)
- vLoRA: compresses the adapter space (N adapters -> shared subspace, ~122x)
Together they enable serving hundreds of task-specific adapters on a single GPU.
"""

from __future__ import annotations

import torch

# ── Step 0: Configuration ──────────────────────────────────────────────
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Small model for demo
ADAPTER_REPOS = [
    # Replace with your QLoRA adapter repos from HuggingFace Hub
    # "username/adapter-task-a",
    # "username/adapter-task-b",
]
NUM_COMPONENTS = 4  # Subspace dimension
USE_NF4_STORAGE = True  # Save subspace in packed NF4 format


def main():
    # ── Step 1: Load QLoRA base model ──────────────────────────────────
    # In production, load with 4-bit quantization:
    #
    # from transformers import AutoModelForCausalLM, BitsAndBytesConfig
    # bnb_config = BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     bnb_4bit_quant_type="nf4",
    #     bnb_4bit_compute_dtype=torch.bfloat16,
    # )
    # base_model = AutoModelForCausalLM.from_pretrained(
    #     BASE_MODEL, quantization_config=bnb_config
    # )
    #
    # For this demo, we simulate with synthetic data:
    print("=== QLoRA + vLoRA Pipeline Demo ===\n")

    # ── Step 2: Load adapters ──────────────────────────────────────────
    from vlora import LoRAWeights, SharedSubspace, VLoRAModel

    print("Creating synthetic adapters (replace with load_adapter_from_hub)...")
    layers = [
        "model.layers.0.self_attn.q_proj",
        "model.layers.0.self_attn.v_proj",
        "model.layers.1.self_attn.q_proj",
        "model.layers.1.self_attn.v_proj",
    ]
    rank = 8
    dim = 512
    n_adapters = 10

    # Create correlated adapters (simulates real LoRA adapters sharing structure)
    torch.manual_seed(42)
    shared_basis = {l: torch.randn(5, rank * dim) for l in layers}
    adapters = []
    task_ids = []
    for i in range(n_adapters):
        lora_a = {l: (torch.randn(5) @ shared_basis[l]).reshape(rank, dim) for l in layers}
        lora_b = {l: torch.randn(dim, rank) * 0.01 for l in layers}
        adapters.append(LoRAWeights(layer_names=layers, lora_a=lora_a, lora_b=lora_b, rank=rank))
        task_ids.append(f"task_{i}")
    print(f"  Loaded {n_adapters} adapters, rank={rank}, {len(layers)} layers\n")
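
    # In production, pull real QLoRA adapters from the Hub instead. A sketch
    # using the hub helper named in the print above (exact signature assumed,
    # not verified against the library):
    #
    # from vlora import load_adapter_from_hub
    # adapters = [load_adapter_from_hub(repo) for repo in ADAPTER_REPOS]
    # task_ids = list(ADAPTER_REPOS)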

    # ── Step 3: Build shared subspace ──────────────────────────────────
    print("Building shared subspace...")
    subspace = SharedSubspace.from_adapters(
        adapters,
        task_ids=task_ids,
        num_components=NUM_COMPONENTS,
    )

    stats = subspace.compression_stats()
    print(f"  Components: {subspace.num_components}")
    print(f"  Compression: {stats['compression_ratio']:.1f}x")
    print(f"  Original params: {stats['total_params_original']:,}")
    print(f"  Compressed params: {stats['total_params_compressed']:,}\n")
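
    # Sanity check on the fit (assumes reconstruct() returns a LoRAWeights,
    # matching how it is used in Step 6 below): measure how well the
    # 4-component subspace recovers one adapter's A matrix before quantization.
    recon0 = subspace.reconstruct("task_0")
    a_orig = adapters[0].lora_a[layers[0]]
    a_rec = recon0.lora_a[layers[0]]
    rel_err = torch.linalg.norm(a_rec - a_orig) / torch.linalg.norm(a_orig)
    print(f"  task_0 reconstruction error ({layers[0]}): {rel_err:.3f}\n")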

    # ── Step 4: Apply NF4 quantization to subspace ─────────────────────
    print("Quantizing subspace with NF4...")
    subspace.quantize(method="nf4", quantize_loadings=True)
    print("  Done (components + loadings quantized)\n")
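
    # What NF4 does, in miniature: scale each block by its absmax, then snap
    # every value to the nearest of 16 fixed levels and store the 4-bit code.
    # A standalone sketch for intuition only (levels rounded from the QLoRA
    # paper's NF4 codebook; the library's packed format is the real thing):
    nf4_levels = torch.tensor([
        -1.0000, -0.6962, -0.5251, -0.3949, -0.2844, -0.1848, -0.0911, 0.0,
        0.0796, 0.1609, 0.2461, 0.3379, 0.4407, 0.5626, 0.7230, 1.0000,
    ])
    x = torch.randn(8)
    scale = x.abs().max()                # per-block absmax scale
    codes = (x / scale).unsqueeze(-1).sub(nf4_levels).abs().argmin(dim=-1)
    x_hat = nf4_levels[codes] * scale    # dequantize: codebook lookup + rescale
    print(f"  NF4 sketch: max abs error {(x - x_hat).abs().max():.4f} on a random block\n")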

    # ── Step 5: Save with packed NF4 storage ───────────────────────────
    import tempfile
    from pathlib import Path

    save_dir = Path(tempfile.mkdtemp()) / "subspace"

    if USE_NF4_STORAGE:
        print("Saving with NF4-packed format...")
        subspace.save_quantized(save_dir)
    else:
        print("Saving with float32 format...")
        subspace.save(save_dir)

    # Measure the on-disk size (flip USE_NF4_STORAGE to compare formats)
    total_bytes = sum(f.stat().st_size for f in save_dir.rglob("*") if f.is_file())
    print(f"  Saved to: {save_dir}")
    print(f"  Total size: {total_bytes / 1024:.1f} KB\n")

    # ── Step 6: Load and serve ─────────────────────────────────────────
    print("Loading subspace (auto-detects format)...")
    loaded = SharedSubspace.load(save_dir)
    print(f"  {loaded!r}\n")

    # Full-stack compression stats (with hypothetical QLoRA base model)
    full_stats = loaded.full_stack_compression(
        base_model_params=1_100_000_000,  # TinyLlama 1.1B
        base_model_bits=16,
        quantized_bits=4,
    )
    if "total_compression_ratio" in full_stats:
        print("Full-stack compression (QLoRA base + vLoRA adapters):")
        print(f"  Base model: {full_stats['base_model']['compression_ratio']:.1f}x (FP16->NF4)")
        print(f"  Adapters: {stats['compression_ratio']:.1f}x ({n_adapters} adapters)")
        print(f"  Total: {full_stats['total_original_bytes'] / 1e9:.1f} GB -> "
              f"{full_stats['total_compressed_bytes'] / 1e9:.2f} GB")
        print(f"  Combined: {full_stats['total_compression_ratio']:.1f}x\n")
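
    # Back-of-envelope check on the base-model line: 1.1B params x 2 bytes
    # (FP16) is ~2.2 GB; the same weights at 4 bits are ~0.55 GB, i.e. the
    # ~4x QLoRA savings quoted in the module docstring (ignoring the small
    # overhead of quantization scale factors).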

    # In production with a real base model:
    #
    # model = VLoRAModel(base_model, loaded, compute_dtype=torch.bfloat16)
    # print(f"QLoRA info: {model.qlora_info}")
    #
    # # Instant task switching
    # model.set_task("task_0")
    # output = model(input_ids)
    #
    # model.set_task("task_5")  # microseconds to switch
    # output = model(input_ids)
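    #
    # Switching is cheap because only the small per-task loading vector is
    # swapped; the shared subspace components stay resident on the GPU.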

    # Demonstrate reconstruction
    print("Reconstructing adapters from subspace...")
    for tid in ["task_0", "task_5", "task_9"]:
        recon = loaded.reconstruct(tid)
        print(f"  {tid}: {recon!r}")

    print("\nDone!")


if __name__ == "__main__":
    main()