
Commit ef57eb6

Release v0.3.0: NF4 packed storage, QLoRA pipeline example
Features:
- NF4 packed storage format: save_quantized() packs components as uint8 (two 4-bit indices per byte) + per-block float32 scales, giving ~7x disk compression vs float32. load() auto-detects format.
- nf4_pack/nf4_unpack ops for direct 4-bit packing
- Double quantization support (quantize NF4 block scales to FP8)
- full_stack_compression() for combined base model + adapter stats
- QLoRA + vLoRA end-to-end example (examples/qlora_pipeline.py)

Version bump to 0.3.0. Tests: 196 passed, 4 skipped.
1 parent 56dc534 commit ef57eb6

7 files changed

Lines changed: 483 additions & 4 deletions
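Where the ~7x disk figure in the commit message comes from, as a back-of-the-envelope sketch (illustrative only; it uses the 64-element default block size from nf4_pack below and ignores file metadata):

# Rough storage math behind the ~7x packed-storage claim.
block_size = 64                          # default block size in nf4_pack
bits_per_element = 4 + 32 / block_size   # 4-bit index + shared float32 scale per block = 4.5 bits
print(f"{32 / bits_per_element:.1f}x vs float32")   # ~7.1x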


examples/qlora_pipeline.py

Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
"""QLoRA + vLoRA: End-to-end pipeline for efficient multi-adapter serving.

This example shows the full workflow:
1. Load a QLoRA-quantized base model (4-bit NF4)
2. Load multiple LoRA adapters (produced by QLoRA fine-tuning)
3. Build a shared subspace with NF4 quantization
4. Serve with instant task switching via VLoRAModel

Requirements:
    pip install vlora-dev[hub] transformers bitsandbytes accelerate

The pipeline combines two orthogonal compression techniques:
- QLoRA: compresses the base model (FP16 -> NF4, ~4x savings)
- vLoRA: compresses the adapter space (N adapters -> shared subspace, ~122x)
Together they enable serving hundreds of task-specific adapters on a single GPU.
"""

from __future__ import annotations

import torch

# ── Step 0: Configuration ──────────────────────────────────────────────
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Small model for demo
ADAPTER_REPOS = [
    # Replace with your QLoRA adapter repos from HuggingFace Hub
    # "username/adapter-task-a",
    # "username/adapter-task-b",
]
NUM_COMPONENTS = 4  # Subspace dimension
USE_NF4_STORAGE = True  # Save subspace in packed NF4 format


def main():
    # ── Step 1: Load QLoRA base model ──────────────────────────────────
    # In production, load with 4-bit quantization:
    #
    # from transformers import AutoModelForCausalLM, BitsAndBytesConfig
    # bnb_config = BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     bnb_4bit_quant_type="nf4",
    #     bnb_4bit_compute_dtype=torch.bfloat16,
    # )
    # base_model = AutoModelForCausalLM.from_pretrained(
    #     BASE_MODEL, quantization_config=bnb_config
    # )
    #
    # For this demo, we simulate with synthetic data:
    print("=== QLoRA + vLoRA Pipeline Demo ===\n")

    # ── Step 2: Load adapters ──────────────────────────────────────────
    from vlora import LoRAWeights, SharedSubspace, VLoRAModel

    print("Creating synthetic adapters (replace with load_adapter_from_hub)...")
    layers = [
        "model.layers.0.self_attn.q_proj",
        "model.layers.0.self_attn.v_proj",
        "model.layers.1.self_attn.q_proj",
        "model.layers.1.self_attn.v_proj",
    ]
    rank = 8
    dim = 512
    n_adapters = 10

    # Create correlated adapters (simulates real LoRA adapters sharing structure)
    torch.manual_seed(42)
    shared_basis = {l: torch.randn(5, rank * dim) for l in layers}
    adapters = []
    task_ids = []
    for i in range(n_adapters):
        lora_a = {l: (torch.randn(5) @ shared_basis[l]).reshape(rank, dim) for l in layers}
        lora_b = {l: torch.randn(dim, rank) * 0.01 for l in layers}
        adapters.append(LoRAWeights(layer_names=layers, lora_a=lora_a, lora_b=lora_b, rank=rank))
        task_ids.append(f"task_{i}")
    print(f" Loaded {n_adapters} adapters, rank={rank}, {len(layers)} layers\n")

    # ── Step 3: Build shared subspace ──────────────────────────────────
    print("Building shared subspace...")
    subspace = SharedSubspace.from_adapters(
        adapters,
        task_ids=task_ids,
        num_components=NUM_COMPONENTS,
    )

    stats = subspace.compression_stats()
    print(f" Components: {subspace.num_components}")
    print(f" Compression: {stats['compression_ratio']:.1f}x")
    print(f" Original params: {stats['total_params_original']:,}")
    print(f" Compressed params: {stats['total_params_compressed']:,}\n")

    # ── Step 4: Apply NF4 quantization to subspace ─────────────────────
    print("Quantizing subspace with NF4...")
    subspace.quantize(method="nf4", quantize_loadings=True)
    print(" Done (components + loadings quantized)\n")

    # ── Step 5: Save with packed NF4 storage ───────────────────────────
    import tempfile
    from pathlib import Path

    save_dir = Path(tempfile.mkdtemp()) / "subspace"

    if USE_NF4_STORAGE:
        print("Saving with NF4-packed format...")
        subspace.save_quantized(save_dir)
    else:
        print("Saving with float32 format...")
        subspace.save(save_dir)

    # Compare file sizes
    total_bytes = sum(f.stat().st_size for f in save_dir.rglob("*") if f.is_file())
    print(f" Saved to: {save_dir}")
    print(f" Total size: {total_bytes / 1024:.1f} KB\n")

    # ── Step 6: Load and serve ─────────────────────────────────────────
    print("Loading subspace (auto-detects format)...")
    loaded = SharedSubspace.load(save_dir)
    print(f" {loaded!r}\n")

    # Full-stack compression stats (with hypothetical QLoRA base model)
    full_stats = loaded.full_stack_compression(
        base_model_params=1_100_000_000,  # TinyLlama 1.1B
        base_model_bits=16,
        quantized_bits=4,
    )
    if "total_compression_ratio" in full_stats:
        print("Full-stack compression (QLoRA base + vLoRA adapters):")
        print(f" Base model: {full_stats['base_model']['compression_ratio']:.1f}x (FP16->NF4)")
        print(f" Adapters: {stats['compression_ratio']:.1f}x ({n_adapters} adapters)")
        print(f" Total: {full_stats['total_original_bytes']/1e9:.1f} GB -> "
              f"{full_stats['total_compressed_bytes']/1e9:.2f} GB")
        print(f" Combined: {full_stats['total_compression_ratio']:.1f}x\n")

    # In production with a real base model:
    #
    # model = VLoRAModel(base_model, loaded, compute_dtype=torch.bfloat16)
    # print(f"QLoRA info: {model.qlora_info}")
    #
    # # Instant task switching
    # model.set_task("task_0")
    # output = model(input_ids)
    #
    # model.set_task("task_5")  # microseconds to switch
    # output = model(input_ids)

    # Demonstrate reconstruction
    print("Reconstructing adapters from subspace...")
    for tid in ["task_0", "task_5", "task_9"]:
        recon = loaded.reconstruct(tid)
        print(f" {tid}: {recon!r}")

    print("\nDone!")


if __name__ == "__main__":
    main()

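For orientation, the base-model side of the numbers Step 6 in the example above aims to print works out roughly as follows (an illustrative sketch; the real full_stack_compression output also accounts for quantization-scale overhead and the measured adapter stats):

# Rough expectations for Step 6 (illustrative, ignores scale/metadata overhead).
base_params = 1_100_000_000              # TinyLlama 1.1B, as in the example
fp16_gb = base_params * 16 / 8 / 1e9     # ~2.2 GB at 16 bits
nf4_gb = base_params * 4 / 8 / 1e9       # ~0.55 GB at 4 bits -> ~4x on the base model
print(f"base: {fp16_gb:.1f} GB -> {nf4_gb:.2f} GB ({fp16_gb / nf4_gb:.0f}x)")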
pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "vlora-dev"
-version = "0.2.1"
+version = "0.3.0"
 description = "Various LoRA adapters. One shared basis. Up to 122x compression at scale."
 readme = "README.md"
 license = "Apache-2.0"

src/vlora/__init__.py

Lines changed: 5 additions & 1 deletion
@@ -5,15 +5,17 @@
 maintain one shared basis and per-task coefficient vectors.
 """
 
-__version__ = "0.2.1"
+__version__ = "0.3.0"
 
 from vlora.io import LoRAWeights, load_adapter, load_adapter_from_hub, save_adapter
 from vlora.ops import (
     NF4_QUANT_TABLE,
     compute_svd,
     explained_variance_ratio,
     gram_schmidt,
+    nf4_pack,
     nf4_quantize_dequantize,
+    nf4_unpack,
     project_onto_subspace,
     reconstruct_from_subspace,
     select_num_components,
@@ -56,6 +58,8 @@
     # NF4 quantization (QLoRA-style)
     "NF4_QUANT_TABLE",
     "nf4_quantize_dequantize",
+    "nf4_pack",
+    "nf4_unpack",
     # Analysis
     "compute_similarity_matrix",
     "find_clusters",

src/vlora/ops.py

Lines changed: 72 additions & 0 deletions
@@ -322,3 +322,75 @@ def nf4_quantize_dequantize(
 
     # Remove padding and restore shape
     return dequantized.flatten()[:numel].reshape(original_shape).to(original_dtype)
+
+
+def nf4_pack(tensor: Tensor, block_size: int = 64) -> tuple[Tensor, Tensor, int]:
+    """Pack a float tensor into 4-bit NF4 format for storage.
+
+    Returns compact representation: uint8 packed indices + float32
+    per-block scales. Two 4-bit indices are packed per byte, giving
+    ~7x compression vs float32.
+
+    Args:
+        tensor: Float tensor to pack.
+        block_size: Elements per quantization block.
+
+    Returns:
+        packed: uint8 tensor (numel/2 bytes) with pairs of NF4 indices.
+        scales: float32 tensor (num_blocks,) per-block absmax values.
+        numel: Original number of elements (for unpadding).
+    """
+    flat = tensor.detach().float().flatten()
+    numel = flat.numel()
+
+    # Pad to multiple of block_size (and even for packing)
+    pad_to = block_size if block_size % 2 == 0 else block_size * 2
+    remainder = numel % pad_to
+    if remainder:
+        flat = torch.cat([flat, torch.zeros(pad_to - remainder, device=flat.device)])
+
+    blocks = flat.reshape(-1, block_size)
+    scales = blocks.abs().amax(dim=1).clamp(min=1e-10)
+    normalized = blocks / scales.unsqueeze(1)
+
+    # Map to NF4 indices (0-15)
+    table = NF4_QUANT_TABLE.to(device=normalized.device, dtype=normalized.dtype)
+    midpoints = (table[:-1] + table[1:]) / 2
+    indices = torch.bucketize(normalized, midpoints).to(torch.uint8)
+
+    # Pack two 4-bit values into one uint8: high nibble + low nibble
+    indices_flat = indices.flatten()
+    high = indices_flat[0::2]
+    low = indices_flat[1::2]
+    packed = (high << 4) | low
+
+    return packed, scales, numel
+
+
+def nf4_unpack(packed: Tensor, scales: Tensor, numel: int, block_size: int = 64) -> Tensor:
+    """Unpack NF4 data back to float32.
+
+    Args:
+        packed: uint8 tensor from nf4_pack.
+        scales: float32 per-block scales from nf4_pack.
+        numel: Original element count.
+        block_size: Block size used during packing.
+
+    Returns:
+        Float32 tensor with numel elements.
+    """
+    # Unpack two 4-bit indices from each byte
+    high = (packed >> 4) & 0x0F
+    low = packed & 0x0F
+    indices = torch.stack([high, low], dim=1).flatten().long()
+
+    # Look up NF4 values
+    table = NF4_QUANT_TABLE.to(device=packed.device)
+    values = table[indices]
+
+    # Reshape into blocks and multiply by scales
+    n_padded = len(values)
+    blocks = values.reshape(-1, block_size)
+    dequantized = blocks * scales.unsqueeze(1)
+
+    return dequantized.flatten()[:numel]
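A minimal round-trip check for the new ops (not part of the commit; the tensor shape and block_size are illustrative, chosen so no padding is needed):

import torch
from vlora.ops import nf4_pack, nf4_unpack

x = torch.randn(4096, 16)                            # 65,536 elements, a multiple of 64
packed, scales, numel = nf4_pack(x, block_size=64)
recon = nf4_unpack(packed, scales, numel, block_size=64).reshape(x.shape)

packed_bytes = packed.numel() + scales.numel() * 4   # uint8 index pairs + float32 scales
ratio = x.numel() * 4 / packed_bytes                 # ~7x vs float32 storage
max_err = (x - recon).abs().max().item()             # NF4 is lossy; expect small per-element error
print(f"{ratio:.1f}x smaller, max abs error {max_err:.3f}")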