#!/usr/bin/env python3
"""Benchmark ONNX vs PyTorch performance for adaptive classifier."""

import time
import logging
import datasets
from adaptive_classifier import AdaptiveClassifier

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def benchmark_model(model_id: str, test_texts: list, use_onnx: bool, num_runs: int = 3):
    """Benchmark a model configuration."""
    mode = "ONNX (Quantized)" if use_onnx else "PyTorch"
    logger.info(f"\n{'='*60}")
    logger.info(f"Benchmarking: {mode}")
    logger.info(f"{'='*60}")

    # Load model
    logger.info(f"Loading model from {model_id}...")
    start = time.time()
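    # use_onnx=True loads the quantized ONNX export of the model; False uses the PyTorch weights.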
    classifier = AdaptiveClassifier.load(model_id, use_onnx=use_onnx)
    load_time = time.time() - start
    logger.info(f"Model loaded in {load_time:.2f}s")

    # Warm-up run (not timed)
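    # The first batch absorbs one-time setup costs (e.g. session/kernel initialization, caches),
    # so it runs on a small slice and is excluded from the timed runs.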
    logger.info("Warming up...")
    _ = classifier.predict_batch(test_texts[:5])

    # Benchmark runs
    times = []
    for run in range(num_runs):
        logger.info(f"Run {run + 1}/{num_runs}...")
        start = time.time()
        predictions = classifier.predict_batch(test_texts)
        elapsed = time.time() - start
        times.append(elapsed)
        logger.info(f"  Completed in {elapsed:.3f}s ({len(test_texts)/elapsed:.1f} samples/sec)")

    avg_time = sum(times) / len(times)
    throughput = len(test_texts) / avg_time

    logger.info(f"\nResults for {mode}:")
    logger.info(f"  Average time: {avg_time:.3f}s")
    logger.info(f"  Throughput: {throughput:.1f} samples/sec")
    logger.info(f"  Per-sample latency: {avg_time*1000/len(test_texts):.1f}ms")

    return {
        'mode': mode,
        'load_time': load_time,
        'avg_time': avg_time,
        'throughput': throughput,
        'times': times
    }

def main():
    # Configuration
    model_id = "adaptive-classifier/llm-router"
    num_samples = 100
    num_runs = 3

    logger.info("Benchmark Configuration:")
    logger.info(f"  Model: {model_id}")
    logger.info(f"  Samples: {num_samples}")
    logger.info(f"  Runs per config: {num_runs}")

    # Load test data
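    # Prompts come from the routellm/gpt4_dataset validation split; only the 'prompt' field is used.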
    logger.info("\nLoading test dataset...")
    dataset = datasets.load_dataset("routellm/gpt4_dataset", split="validation")
    test_data = dataset.select(range(min(num_samples, len(dataset))))
    test_texts = [item['prompt'] for item in test_data]
    logger.info(f"Loaded {len(test_texts)} test samples")

    # Benchmark PyTorch version
    pytorch_results = benchmark_model(model_id, test_texts, use_onnx=False, num_runs=num_runs)

    # Benchmark ONNX version
    onnx_results = benchmark_model(model_id, test_texts, use_onnx=True, num_runs=num_runs)

    # Compare results
    logger.info(f"\n{'='*60}")
    logger.info("COMPARISON SUMMARY")
    logger.info(f"{'='*60}")

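    # Note: over the same sample set, the throughput increase equals the time-based speedup;
    # latency reduction expresses the same comparison as a percentage.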
    speedup = pytorch_results['avg_time'] / onnx_results['avg_time']
    throughput_increase = onnx_results['throughput'] / pytorch_results['throughput']
    latency_reduction = (1 - onnx_results['avg_time'] / pytorch_results['avg_time']) * 100

    logger.info("\nPyTorch (Baseline):")
    logger.info(f"  Average time: {pytorch_results['avg_time']:.3f}s")
    logger.info(f"  Throughput: {pytorch_results['throughput']:.1f} samples/sec")

    logger.info("\nONNX Quantized:")
    logger.info(f"  Average time: {onnx_results['avg_time']:.3f}s")
    logger.info(f"  Throughput: {onnx_results['throughput']:.1f} samples/sec")

    logger.info("\nSpeedup:")
    logger.info(f"  🚀 {speedup:.2f}x faster")
    logger.info(f"  📈 {throughput_increase:.2f}x throughput increase")
    logger.info(f"  ⏱️ {latency_reduction:.1f}% latency reduction")

    logger.info("\nModel Size Comparison:")
    logger.info("  PyTorch: Uses full precision weights")
    logger.info("  ONNX Quantized: 65.6 MB (4x smaller than unquantized)")

    logger.info(f"\n{'='*60}")
    logger.info("BENCHMARK COMPLETE")
    logger.info(f"{'='*60}")

    return {
        'pytorch': pytorch_results,
        'onnx': onnx_results,
        'speedup': speedup,
        'throughput_increase': throughput_increase,
        'latency_reduction': latency_reduction
    }

if __name__ == "__main__":
    results = main()