refactor: improve content extraction and encoding handling

MODSetter · MODSetter · commit 2f793e7a6930 · 2026-04-16T00:25:46.000-07:00
- Enhanced Azure Document Intelligence parser to raise an error for empty or whitespace-only content.
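- Updated LLMRouterService to also register each premium deployment's base model (or configured model name) as premium, and to log the premium model strings at router initialization and when computing premium tokens.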
- Added automatic encoding detection (BOM sniffing, then UTF-8, then Latin-1 as a last resort) when reading CSV and HTML files in the direct document converters.
- File processors now treat whitespace-only markdown content as a failed extraction and raise an error, the same as for empty content.
- Refactored the DocumentUploadTab file-select area from a `<button>` to a `<div role="button">` with Enter/Space keyboard handling, for better accessibility and user interaction.
diff --git a/surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py b/surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py
@@ -62,10 +62,13 @@ async def parse_with_azure_doc_intelligence(
                     f"after {len(attempt_errors)} failures"
                 )
 
-            if not result.content:
-                return ""
+            content = result.content or ""
+            if not content.strip():
+                raise RuntimeError(
+                    "Azure Document Intelligence returned empty/whitespace-only content"
+                )
 
-            return result.content
+            return content
 
         except ClientAuthenticationError:
             raise
diff --git a/surfsense_backend/app/services/llm_router_service.py b/surfsense_backend/app/services/llm_router_service.py
@@ -186,8 +186,12 @@ def initialize(
             if deployment:
                 model_list.append(deployment)
                 if config.get("billing_tier") == "premium":
-                    model_string = deployment["litellm_params"]["model"]
+                    params = deployment["litellm_params"]
+                    model_string = params["model"]
                     premium_models.add(model_string)
+                    base = params.get("base_model") or config.get("model_name", "")
+                    if base and base != model_string:
+                        premium_models.add(base)
 
         if not model_list:
             logger.warning("No valid LLM configs found for router initialization")
@@ -197,9 +201,9 @@ def initialize(
         instance._premium_model_strings = premium_models
         instance._router_settings = router_settings or {}
         logger.info(
-            "Router pool: %d deployments (%d premium)",
+            "Router pool: %d deployments, premium model strings: %s",
             len(model_list),
-            len(premium_models),
+            sorted(premium_models),
         )
 
         # Default router settings optimized for rate limit handling
@@ -258,9 +262,18 @@ def is_premium_model(cls, model_string: str) -> bool:
     def compute_premium_tokens(cls, calls: list) -> int:
         """Sum ``total_tokens`` for calls whose model is premium."""
         instance = cls.get_instance()
-        return sum(
+        total = sum(
             c.total_tokens for c in calls if c.model in instance._premium_model_strings
         )
+        if calls:
+            call_models = [c.model for c in calls]
+            logger.info(
+                "[premium_tokens] call models=%s, premium_set=%s, result=%d",
+                call_models,
+                sorted(instance._premium_model_strings),
+                total,
+            )
+        return total
 
     @classmethod
     def _build_context_fallback_groups(
diff --git a/surfsense_backend/app/tasks/document_processors/_direct_converters.py b/surfsense_backend/app/tasks/document_processors/_direct_converters.py
@@ -21,6 +21,42 @@
 # at import time so every csv.reader call in this module can handle large fields.
 csv.field_size_limit(2**31 - 1)
 
+_BOM_ENCODINGS: list[tuple[bytes, str]] = [
+    (b"\xff\xfe\x00\x00", "utf-32-le"),
+    (b"\x00\x00\xfe\xff", "utf-32-be"),
+    (b"\xff\xfe", "utf-16-le"),
+    (b"\xfe\xff", "utf-16-be"),
+    (b"\xef\xbb\xbf", "utf-8-sig"),
+]
+
+
+def _detect_encoding(file_path: str) -> str:
+    """Sniff the BOM to pick an encoding, falling back to utf-8."""
+    head = Path(file_path).read_bytes()[:4]
+    for bom, encoding in _BOM_ENCODINGS:
+        if head.startswith(bom):
+            return encoding
+    return "utf-8"
+
+
+def _read_text(file_path: str) -> str:
+    """Read a file with automatic encoding detection.
+
+    Tries BOM-based detection first, then utf-8, then latin-1 as a
+    last resort (latin-1 accepts every byte value).
+    """
+    encoding = _detect_encoding(file_path)
+    try:
+        return Path(file_path).read_text(encoding=encoding)
+    except (UnicodeDecodeError, UnicodeError):
+        pass
+    if encoding != "utf-8":
+        try:
+            return Path(file_path).read_text(encoding="utf-8")
+        except (UnicodeDecodeError, UnicodeError):
+            pass
+    return Path(file_path).read_text(encoding="latin-1")
+
 
 def _escape_pipe(cell: str) -> str:
     """Escape literal pipe characters inside a markdown table cell."""
@@ -33,9 +69,9 @@ def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str:
     The first row is treated as the header.  An empty file returns an
     empty string so the caller can decide how to handle it.
     """
-    with open(file_path, encoding="utf-8", newline="") as fh:
-        reader = csv.reader(fh, delimiter=delimiter)
-        rows = list(reader)
+    text = _read_text(file_path)
+    reader = csv.reader(text.splitlines(), delimiter=delimiter)
+    rows = list(reader)
 
     if not rows:
         return ""
@@ -64,7 +100,7 @@ def tsv_to_markdown(file_path: str) -> str:
 
 def html_to_markdown(file_path: str) -> str:
     """Convert an HTML file to markdown via ``markdownify``."""
-    html = Path(file_path).read_text(encoding="utf-8")
+    html = _read_text(file_path)
     return markdownify(html).strip()
 
 
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -436,7 +436,7 @@ async def _extract_file_content(
     with contextlib.suppress(Exception):
         os.unlink(file_path)
 
-    if not result.markdown_content:
+    if not result.markdown_content or not result.markdown_content.strip():
         raise RuntimeError(f"Failed to extract content from file: {filename}")
 
     return result.markdown_content, result.etl_service, billable_pages
diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx
@@ -546,29 +546,35 @@ export function DocumentUploadTab({
 						</button>
 					)
 				) : (
-					<button
-						type="button"
-						tabIndex={0}
-						className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent outline-none select-none"
-						onClick={() => {
+				<div
+					role="button"
+					tabIndex={0}
+					className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent outline-none select-none"
+					onClick={() => {
+						if (!isElectron) fileInputRef.current?.click();
+					}}
+					onKeyDown={(e) => {
+						if (e.key === "Enter" || e.key === " ") {
+							e.preventDefault();
 							if (!isElectron) fileInputRef.current?.click();
-						}}
+						}
+					}}
+				>
+					<Upload className="h-10 w-10 text-muted-foreground" />
+					<div className="text-center space-y-1.5">
+						<p className="text-base font-medium">
+							{isElectron ? t("select_files_or_folder") : t("tap_select_files_or_folder")}
+						</p>
+						<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
+					</div>
+					<fieldset
+						className="w-full mt-1 border-none p-0 m-0"
+						onClick={(e) => e.stopPropagation()}
+						onKeyDown={(e) => e.stopPropagation()}
 					>
-						<Upload className="h-10 w-10 text-muted-foreground" />
-						<div className="text-center space-y-1.5">
-							<p className="text-base font-medium">
-								{isElectron ? t("select_files_or_folder") : t("tap_select_files_or_folder")}
-							</p>
-							<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
-						</div>
-						<fieldset
-							className="w-full mt-1 border-none p-0 m-0"
-							onClick={(e) => e.stopPropagation()}
-							onKeyDown={(e) => e.stopPropagation()}
-						>
-							{renderBrowseButton({ fullWidth: true })}
-						</fieldset>
-					</button>
+						{renderBrowseButton({ fullWidth: true })}
+					</fieldset>
+				</div>
 				)}
 			</div>