Skip to content

Commit 0e42850

Browse files
committed
fix: llamacloud v2 impl
1 parent 2f793e7 commit 0e42850

2 files changed

Lines changed: 42 additions & 25 deletions

File tree

surfsense_backend/app/config/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
env_file = BASE_DIR / ".env"
1414
load_dotenv(env_file)
1515

16+
os.environ.setdefault("OR_APP_NAME", "SurfSense")
17+
os.environ.setdefault("OR_SITE_URL", "https://surfsense.com")
18+
1619

1720
def is_ffmpeg_installed():
1821
"""

surfsense_backend/app/etl_pipeline/parsers/llamacloud.py

Lines changed: 39 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,37 @@
1616
calculate_upload_timeout,
1717
)
1818

19-
LLAMA_TIER_BY_MODE = {
20-
"basic": "cost_effective",
21-
"premium": "agentic_plus",
19+
LLAMA_PARSE_MODE_MAP = {
20+
"basic": "parse_page_with_llm",
21+
"premium": "parse_page_with_agent",
2222
}
2323

2424

25+
def _extract_content(result) -> str:
26+
"""Pull markdown text out of whatever object LlamaParse.aparse returns."""
27+
if hasattr(result, "get_markdown_documents"):
28+
markdown_docs = result.get_markdown_documents(split_by_page=False)
29+
if markdown_docs and hasattr(markdown_docs[0], "text"):
30+
return markdown_docs[0].text
31+
if hasattr(result, "pages") and result.pages:
32+
return "\n\n".join(p.md for p in result.pages if hasattr(p, "md") and p.md)
33+
34+
if isinstance(result, list):
35+
if result and hasattr(result[0], "text"):
36+
return result[0].text
37+
return "\n\n".join(
38+
doc.page_content if hasattr(doc, "page_content") else str(doc)
39+
for doc in result
40+
)
41+
42+
return str(result)
43+
44+
2545
async def parse_with_llamacloud(
2646
file_path: str, estimated_pages: int, processing_mode: str = "basic"
2747
) -> str:
2848
from llama_cloud_services import LlamaParse
49+
from llama_cloud_services.parse.base import JobFailedException
2950
from llama_cloud_services.parse.utils import ResultType
3051

3152
file_size_bytes = os.path.getsize(file_path)
@@ -41,12 +62,13 @@ async def parse_with_llamacloud(
4162
pool=120.0,
4263
)
4364

44-
tier = LLAMA_TIER_BY_MODE.get(processing_mode, "cost_effective")
65+
parse_mode = LLAMA_PARSE_MODE_MAP.get(processing_mode, "parse_page_with_llm")
4566

4667
logging.info(
4768
f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
4869
f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
49-
f"job_timeout={job_timeout:.0f}s, tier={tier} (mode={processing_mode})"
70+
f"job_timeout={job_timeout:.0f}s, parse_mode={parse_mode} "
71+
f"(mode={processing_mode})"
5072
)
5173

5274
last_exception = None
@@ -61,11 +83,12 @@ async def parse_with_llamacloud(
6183
verbose=True,
6284
language="en",
6385
result_type=ResultType.MD,
86+
parse_mode=parse_mode,
87+
ignore_errors=False,
6488
max_timeout=int(max(2000, job_timeout + upload_timeout)),
6589
job_timeout_in_seconds=job_timeout,
6690
job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
6791
custom_client=custom_client,
68-
tier=tier,
6992
)
7093
result = await parser.aparse(file_path)
7194

@@ -75,27 +98,18 @@ async def parse_with_llamacloud(
7598
f"{len(attempt_errors)} failures"
7699
)
77100

78-
if hasattr(result, "get_markdown_documents"):
79-
markdown_docs = result.get_markdown_documents(split_by_page=False)
80-
if markdown_docs and hasattr(markdown_docs[0], "text"):
81-
return markdown_docs[0].text
82-
if hasattr(result, "pages") and result.pages:
83-
return "\n\n".join(
84-
p.md for p in result.pages if hasattr(p, "md") and p.md
85-
)
86-
return str(result)
87-
88-
if isinstance(result, list):
89-
if result and hasattr(result[0], "text"):
90-
return result[0].text
91-
return "\n\n".join(
92-
doc.page_content if hasattr(doc, "page_content") else str(doc)
93-
for doc in result
101+
content = _extract_content(result)
102+
if not content or not content.strip():
103+
raise RuntimeError(
104+
"LlamaCloud returned empty/whitespace-only content"
94105
)
106+
return content
95107

96-
return str(result)
97-
98-
except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
108+
except (
109+
*LLAMACLOUD_RETRYABLE_EXCEPTIONS,
110+
RuntimeError,
111+
JobFailedException,
112+
) as e:
99113
last_exception = e
100114
error_type = type(e).__name__
101115
error_msg = str(e)[:200]

0 commit comments

Comments
 (0)