Skip to content

Commit 291ca1a

Browse files
committed
Fix correctness bugs and improve robustness

Fixes:
- Sync __version__ to 0.2.1 (was 0.1.0, pyproject.toml says 0.2.1)
- Wire up check_tensor_health after SVD to catch NaN/Inf in components
- Fix TIES merge normalization: use contributor_count divisor instead of
  over-scaling by total adapter count n
- Warn on task ID collision in absorb/absorb_incremental
- Sanitize task IDs for filesystem-safe filenames in save/load (stores
  tid→filename mapping in metadata, backwards-compatible)
- Add task_ids length validation to from_adapters_streaming

Performance:
- Cache module handles in VLoRAModel at init (avoids O(M) scan of all
  named_modules on every task switch)
- Replace NF4 distance broadcast with torch.bucketize (binary search),
  reducing memory from O(N*16) to O(N) for large weight matrices
1 parent aa4c46d commit 291ca1a

5 files changed

Lines changed: 59 additions & 21 deletions

File tree

src/vlora/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
maintain one shared basis and per-task coefficient vectors.
66
"""
77

8-
__version__ = "0.1.0"
8+
__version__ = "0.2.1"
99

1010
from vlora.io import LoRAWeights, load_adapter, load_adapter_from_hub, save_adapter
1111
from vlora.ops import (

src/vlora/merge.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -141,9 +141,9 @@ def ties_merge(
141141
weighted = stacked * w
142142
# Zero out values with wrong sign
143143
weighted = weighted * mask.float()
144-
# Sum and normalize by number of contributors (avoid division by zero)
144+
# Average over contributors that match elected sign
145145
contributor_count = mask.float().sum(dim=0).clamp(min=1)
146-
merged = weighted.sum(dim=0) * (n / contributor_count)
146+
merged = weighted.sum(dim=0) / contributor_count
147147

148148
out_dict[layer] = merged
149149

src/vlora/model.py

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,12 @@ def __init__(
8080
self._active_task: str | None = None
8181
self._cached_deltas: dict[str, Tensor] | None = None
8282
self._hooks: list[torch.utils.hooks.RemovableHook] = []
83+
# Cache module handles once to avoid O(M) scan on every task switch
84+
self._target_modules: dict[str, nn.Module] = {
85+
name: module
86+
for name, module in self.base_model.named_modules()
87+
if name in self.subspace.layer_names and _is_linear_layer(module)
88+
}
8389
self._qlora_info = self._detect_quantization()
8490

8591
def set_task(self, task_id: str) -> None:
@@ -113,8 +119,8 @@ def _apply_hooks(self) -> None:
113119
if self._cached_deltas is None:
114120
return
115121

116-
for name, module in self.base_model.named_modules():
117-
if name in self._cached_deltas and _is_linear_layer(module):
122+
for name, module in self._target_modules.items():
123+
if name in self._cached_deltas:
118124
delta = self._cached_deltas[name]
119125
hook = module.register_forward_hook(
120126
self._make_lora_hook(delta)
@@ -174,12 +180,12 @@ def _detect_quantization(self) -> dict:
174180
"quantized": False,
175181
"method": None,
176182
"num_quantized_layers": 0,
177-
"num_target_layers": 0,
183+
"num_target_layers": len(self._target_modules),
178184
}
179185
try:
180186
import bitsandbytes as bnb
181187

182-
for name, module in self.base_model.named_modules():
188+
for module in self._target_modules.values():
183189
if isinstance(module, bnb.nn.Linear4bit):
184190
info["quantized"] = True
185191
info["method"] = info["method"] or "nf4"
@@ -191,11 +197,6 @@ def _detect_quantization(self) -> dict:
191197
except ImportError:
192198
pass
193199

194-
# Count how many subspace layers match modules in the base model
195-
for name, module in self.base_model.named_modules():
196-
if name in self.subspace.layer_names and _is_linear_layer(module):
197-
info["num_target_layers"] += 1
198-
199200
return info
200201

201202
@property

src/vlora/ops.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -246,6 +246,9 @@ def nf4_quantize_dequantize(tensor: Tensor, block_size: int = 64) -> Tensor:
246246
The returned tensor is the same dtype as input but only contains
247247
values representable in NF4 format.
248248
249+
Uses ``torch.bucketize`` (binary search) for O(N log 16) lookup
250+
instead of broadcasting all 16 distances, keeping memory O(N).
251+
249252
Based on QLoRA (Dettmers et al., 2023, arXiv:2305.14314).
250253
251254
Args:
@@ -274,11 +277,12 @@ def nf4_quantize_dequantize(tensor: Tensor, block_size: int = 64) -> Tensor:
274277
absmax = blocks.abs().amax(dim=1, keepdim=True).clamp(min=1e-10)
275278
normalized = blocks / absmax
276279

277-
# Snap each value to nearest NF4 level
280+
# Snap to nearest NF4 level via binary search (memory-efficient).
281+
# Midpoints between adjacent NF4 levels serve as bucket boundaries.
278282
table = NF4_QUANT_TABLE.to(device=normalized.device, dtype=normalized.dtype)
279-
# (num_blocks, block_size, 1) vs (1, 1, 16) → distances (num_blocks, block_size, 16)
280-
distances = (normalized.unsqueeze(-1) - table).abs()
281-
indices = distances.argmin(dim=-1)
283+
midpoints = (table[:-1] + table[1:]) / 2 # 15 boundaries
284+
# bucketize returns the index of the bucket each value falls into
285+
indices = torch.bucketize(normalized, midpoints)
282286
quantized_normalized = table[indices]
283287

284288
# Dequantize: scale back by absmax

src/vlora/subspace.py

Lines changed: 38 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -149,6 +149,8 @@ def from_adapters(
149149
]:
150150
data = stacked[layer]
151151
comps, svals, mean = compute_svd(data, num_components=None, center=True)
152+
check_tensor_health(comps, f"{layer}.components_{side.lower()}")
153+
check_tensor_health(mean, f"{layer}.mean_{side.lower()}")
152154

153155
if adaptive_k:
154156
# Per-layer: each layer/side gets its own k
@@ -265,6 +267,12 @@ def absorb(self, new_adapter: LoRAWeights, new_task_id: str) -> None:
265267
reruns SVD to produce an updated basis.
266268
"""
267269
check_adapter_matches_subspace(new_adapter, self, "absorb")
270+
if new_task_id in self.tasks:
271+
import warnings
272+
warnings.warn(
273+
f"Task '{new_task_id}' already exists and will be overwritten by absorb.",
274+
stacklevel=2,
275+
)
268276
logger.info("Absorbing adapter '%s' (full SVD recompute, %d existing tasks)", new_task_id, len(self.tasks))
269277
# Reconstruct all existing tasks as full adapters
270278
all_adapters = []
@@ -308,6 +316,12 @@ def absorb_incremental(self, new_adapter: LoRAWeights, new_task_id: str) -> None
308316
approximation trade-off.
309317
"""
310318
check_adapter_matches_subspace(new_adapter, self, "absorb_incremental")
319+
if new_task_id in self.tasks:
320+
import warnings
321+
warnings.warn(
322+
f"Task '{new_task_id}' already exists and will be overwritten by absorb_incremental.",
323+
stacklevel=2,
324+
)
311325
logger.debug("Absorbing adapter '%s' incrementally", new_task_id)
312326
loadings_a: dict[str, Tensor] = {}
313327
loadings_b: dict[str, Tensor] = {}
@@ -390,6 +404,11 @@ def from_adapters_streaming(
390404
paths = [Path(p) for p in adapter_paths]
391405
if task_ids is None:
392406
task_ids = [p.name for p in paths]
407+
if len(task_ids) != len(paths):
408+
raise ValueError(
409+
f"task_ids length ({len(task_ids)}) must match "
410+
f"adapter_paths length ({len(paths)})"
411+
)
393412

394413
# Initialize from first adapter(s) — use first two if available
395414
# so SVD has enough samples to find >1 component
@@ -633,8 +652,16 @@ def get_trainable_params(
633652

634653
return params
635654

655+
@staticmethod
656+
def _safe_filename(task_id: str) -> str:
657+
"""Convert a task ID to a filesystem-safe filename component."""
658+
import re
659+
return re.sub(r'[^\w\-.]', '_', task_id)
660+
636661
def save(self, path: str | Path) -> None:
637662
"""Serialize the subspace to disk."""
663+
import json
664+
638665
path = Path(path)
639666
path.mkdir(parents=True, exist_ok=True)
640667

@@ -650,19 +677,22 @@ def save(self, path: str | Path) -> None:
650677

651678
save_file(tensors, str(path / "subspace.safetensors"))
652679

653-
# Save per-task loadings
680+
# Save per-task loadings (with sanitized filenames)
681+
tid_to_filename: dict[str, str] = {}
654682
for tid, proj in self.tasks.items():
683+
safe_name = self._safe_filename(tid)
684+
tid_to_filename[tid] = safe_name
655685
task_tensors = {}
656686
for layer in self.layer_names:
657687
task_tensors[f"{layer}.loadings_a"] = proj.loadings_a[layer].contiguous()
658688
task_tensors[f"{layer}.loadings_b"] = proj.loadings_b[layer].contiguous()
659-
save_file(task_tensors, str(path / f"task_{tid}.safetensors"))
689+
save_file(task_tensors, str(path / f"task_{safe_name}.safetensors"))
660690

661-
# Save metadata
662-
import json
691+
# Save metadata (includes filename mapping for safe round-trip)
663692
meta = {
664693
"layer_names": self.layer_names,
665694
"task_ids": list(self.tasks.keys()),
695+
"task_filenames": tid_to_filename,
666696
"rank": self.rank,
667697
"num_components": self.num_components,
668698
}
@@ -683,6 +713,8 @@ def load(cls, path: str | Path) -> SharedSubspace:
683713
task_ids = meta["task_ids"]
684714
rank = meta["rank"]
685715
num_components = meta["num_components"]
716+
# Support both old format (no mapping) and new format
717+
tid_to_filename = meta.get("task_filenames", {})
686718

687719
tensors = load_file(str(path / "subspace.safetensors"))
688720
components_a = {l: tensors[f"{l}.components_a"] for l in layer_names}
@@ -694,7 +726,8 @@ def load(cls, path: str | Path) -> SharedSubspace:
694726

695727
tasks = {}
696728
for tid in task_ids:
697-
task_tensors = load_file(str(path / f"task_{tid}.safetensors"))
729+
safe_name = tid_to_filename.get(tid, tid)
730+
task_tensors = load_file(str(path / f"task_{safe_name}.safetensors"))
698731
loadings_a = {l: task_tensors[f"{l}.loadings_a"] for l in layer_names}
699732
loadings_b = {l: task_tensors[f"{l}.loadings_b"] for l in layer_names}
700733
tasks[tid] = TaskProjection(

0 commit comments

Comments (0)