Skip to content

Commit 2f793e7

Browse files
committed
refactor: improve content extraction and encoding handling
- Enhanced Azure Document Intelligence parser to raise an error for empty or whitespace-only content.
- Updated LLMRouterService to log premium model strings more clearly.
- Added automatic encoding detection for file reading in document processors.
- Improved error handling for empty markdown content extraction in file processors.
- Refactored DocumentUploadTab component for better accessibility and user interaction.
1 parent 4a51ccd commit 2f793e7

5 files changed

Lines changed: 91 additions & 33 deletions

File tree

surfsense_backend/app/etl_pipeline/parsers/azure_doc_intelligence.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,13 @@ async def parse_with_azure_doc_intelligence(
6262
f"after {len(attempt_errors)} failures"
6363
)
6464

65-
if not result.content:
66-
return ""
65+
content = result.content or ""
66+
if not content.strip():
67+
raise RuntimeError(
68+
"Azure Document Intelligence returned empty/whitespace-only content"
69+
)
6770

68-
return result.content
71+
return content
6972

7073
except ClientAuthenticationError:
7174
raise

surfsense_backend/app/services/llm_router_service.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -186,8 +186,12 @@ def initialize(
186186
if deployment:
187187
model_list.append(deployment)
188188
if config.get("billing_tier") == "premium":
189-
model_string = deployment["litellm_params"]["model"]
189+
params = deployment["litellm_params"]
190+
model_string = params["model"]
190191
premium_models.add(model_string)
192+
base = params.get("base_model") or config.get("model_name", "")
193+
if base and base != model_string:
194+
premium_models.add(base)
191195

192196
if not model_list:
193197
logger.warning("No valid LLM configs found for router initialization")
@@ -197,9 +201,9 @@ def initialize(
197201
instance._premium_model_strings = premium_models
198202
instance._router_settings = router_settings or {}
199203
logger.info(
200-
"Router pool: %d deployments (%d premium)",
204+
"Router pool: %d deployments, premium model strings: %s",
201205
len(model_list),
202-
len(premium_models),
206+
sorted(premium_models),
203207
)
204208

205209
# Default router settings optimized for rate limit handling
@@ -258,9 +262,18 @@ def is_premium_model(cls, model_string: str) -> bool:
258262
def compute_premium_tokens(cls, calls: list) -> int:
259263
"""Sum ``total_tokens`` for calls whose model is premium."""
260264
instance = cls.get_instance()
261-
return sum(
265+
total = sum(
262266
c.total_tokens for c in calls if c.model in instance._premium_model_strings
263267
)
268+
if calls:
269+
call_models = [c.model for c in calls]
270+
logger.info(
271+
"[premium_tokens] call models=%s, premium_set=%s, result=%d",
272+
call_models,
273+
sorted(instance._premium_model_strings),
274+
total,
275+
)
276+
return total
264277

265278
@classmethod
266279
def _build_context_fallback_groups(

surfsense_backend/app/tasks/document_processors/_direct_converters.py

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,42 @@
2121
# at import time so every csv.reader call in this module can handle large fields.
2222
csv.field_size_limit(2**31 - 1)
2323

24+
_BOM_ENCODINGS: list[tuple[bytes, str]] = [
25+
(b"\xff\xfe\x00\x00", "utf-32-le"),
26+
(b"\x00\x00\xfe\xff", "utf-32-be"),
27+
(b"\xff\xfe", "utf-16-le"),
28+
(b"\xfe\xff", "utf-16-be"),
29+
(b"\xef\xbb\xbf", "utf-8-sig"),
30+
]
31+
32+
33+
def _detect_encoding(file_path: str) -> str:
34+
"""Sniff the BOM to pick an encoding, falling back to utf-8."""
35+
head = Path(file_path).read_bytes()[:4]
36+
for bom, encoding in _BOM_ENCODINGS:
37+
if head.startswith(bom):
38+
return encoding
39+
return "utf-8"
40+
41+
42+
def _read_text(file_path: str) -> str:
43+
"""Read a file with automatic encoding detection.
44+
45+
Tries BOM-based detection first, then utf-8, then latin-1 as a
46+
last resort (latin-1 accepts every byte value).
47+
"""
48+
encoding = _detect_encoding(file_path)
49+
try:
50+
return Path(file_path).read_text(encoding=encoding)
51+
except (UnicodeDecodeError, UnicodeError):
52+
pass
53+
if encoding != "utf-8":
54+
try:
55+
return Path(file_path).read_text(encoding="utf-8")
56+
except (UnicodeDecodeError, UnicodeError):
57+
pass
58+
return Path(file_path).read_text(encoding="latin-1")
59+
2460

2561
def _escape_pipe(cell: str) -> str:
2662
"""Escape literal pipe characters inside a markdown table cell."""
@@ -33,9 +69,9 @@ def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str:
3369
The first row is treated as the header. An empty file returns an
3470
empty string so the caller can decide how to handle it.
3571
"""
36-
with open(file_path, encoding="utf-8", newline="") as fh:
37-
reader = csv.reader(fh, delimiter=delimiter)
38-
rows = list(reader)
72+
text = _read_text(file_path)
73+
reader = csv.reader(text.splitlines(), delimiter=delimiter)
74+
rows = list(reader)
3975

4076
if not rows:
4177
return ""
@@ -64,7 +100,7 @@ def tsv_to_markdown(file_path: str) -> str:
64100

65101
def html_to_markdown(file_path: str) -> str:
66102
"""Convert an HTML file to markdown via ``markdownify``."""
67-
html = Path(file_path).read_text(encoding="utf-8")
103+
html = _read_text(file_path)
68104
return markdownify(html).strip()
69105

70106

surfsense_backend/app/tasks/document_processors/file_processors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,7 @@ async def _extract_file_content(
436436
with contextlib.suppress(Exception):
437437
os.unlink(file_path)
438438

439-
if not result.markdown_content:
439+
if not result.markdown_content or not result.markdown_content.strip():
440440
raise RuntimeError(f"Failed to extract content from file: {filename}")
441441

442442
return result.markdown_content, result.etl_service, billable_pages

surfsense_web/components/sources/DocumentUploadTab.tsx

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -546,29 +546,35 @@ export function DocumentUploadTab({
546546
</button>
547547
)
548548
) : (
549-
<button
550-
type="button"
551-
tabIndex={0}
552-
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent outline-none select-none"
553-
onClick={() => {
549+
<div
550+
role="button"
551+
tabIndex={0}
552+
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent outline-none select-none"
553+
onClick={() => {
554+
if (!isElectron) fileInputRef.current?.click();
555+
}}
556+
onKeyDown={(e) => {
557+
if (e.key === "Enter" || e.key === " ") {
558+
e.preventDefault();
554559
if (!isElectron) fileInputRef.current?.click();
555-
}}
560+
}
561+
}}
562+
>
563+
<Upload className="h-10 w-10 text-muted-foreground" />
564+
<div className="text-center space-y-1.5">
565+
<p className="text-base font-medium">
566+
{isElectron ? t("select_files_or_folder") : t("tap_select_files_or_folder")}
567+
</p>
568+
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
569+
</div>
570+
<fieldset
571+
className="w-full mt-1 border-none p-0 m-0"
572+
onClick={(e) => e.stopPropagation()}
573+
onKeyDown={(e) => e.stopPropagation()}
556574
>
557-
<Upload className="h-10 w-10 text-muted-foreground" />
558-
<div className="text-center space-y-1.5">
559-
<p className="text-base font-medium">
560-
{isElectron ? t("select_files_or_folder") : t("tap_select_files_or_folder")}
561-
</p>
562-
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
563-
</div>
564-
<fieldset
565-
className="w-full mt-1 border-none p-0 m-0"
566-
onClick={(e) => e.stopPropagation()}
567-
onKeyDown={(e) => e.stopPropagation()}
568-
>
569-
{renderBrowseButton({ fullWidth: true })}
570-
</fieldset>
571-
</button>
575+
{renderBrowseButton({ fullWidth: true })}
576+
</fieldset>
577+
</div>
572578
)}
573579
</div>
574580

0 commit comments

Comments
 (0)