@@ -635,6 +635,7 @@ def _from_pretrained(
635635 local_files_only : bool = False ,
636636 token : Optional [Union [str , bool ]] = None ,
637637 use_onnx : Optional [Union [bool , str ]] = "auto" ,
638+ prefer_quantized : bool = True ,
638639 ** kwargs
639640 ) -> "AdaptiveClassifier" :
640641 """Load a model from the HuggingFace Hub or local directory.
@@ -1012,6 +1013,10 @@ def export_onnx(
10121013 export = True
10131014 )
10141015
1016+ # Always save unquantized version first
1017+ ort_model .save_pretrained (save_directory )
1018+ logger .info (f"Saved unquantized ONNX model to { save_directory } " )
1019+
10151020 if quantize :
10161021 logger .info (f"Applying { quantization_config } INT8 quantization..." )
10171022
@@ -1026,15 +1031,13 @@ def export_onnx(
10261031 logger .warning (f"Unknown quantization config: { quantization_config } . Using arm64." )
10271032 qconfig = AutoQuantizationConfig .arm64 (is_static = False , per_channel = False )
10281033
1029- # Apply quantization
1034+ # Apply quantization (saves quantized version alongside unquantized)
10301035 quantizer = ORTQuantizer .from_pretrained (ort_model )
10311036 quantizer .quantize (
10321037 save_dir = save_directory ,
10331038 quantization_config = qconfig
10341039 )
1035- else :
1036- # Save without quantization
1037- ort_model .save_pretrained (save_directory )
1040+ logger .info (f"Saved quantized ONNX model to { save_directory } " )
10381041
10391042 logger .info (f"ONNX model exported to { save_directory } " )
10401043 return save_directory
0 commit comments