
Commit b1e45d3

Crawler API parity + snippet audit + cloud browser config
Crawler:
- Rename urls_crawled → urls_visited (scrape-engine is truth)
- Nested CrawlerState dataclass on CrawlerStatusResponse
- Crawl.wait() adds allow_cancelled opt-in flag
- Crawl.__repr__ includes URL via _config._params
- Webhook module full rewrite: 8 real events vs fictional 4 (CrawlerLifecycleWebhook + 4 event-specific dataclasses)
- Added /urls endpoint + Crawl.urls() convenience method
- Strict parsing (KeyError on missing required fields)

Cloud Browser:
- New scrapfly/browser_config.py with BrowserConfig dataclass
- Constants PROXY_POOL_DATACENTER / _RESIDENTIAL, OS_* literals
- Validated proxy_pool values only: datacenter / residential
- client.cloud_browser() URL builder + extension CRUD methods

Reporter:
- Renamed SentryReporter.py → sentry.py with lazy sentry_sdk import
- Kept SentryReporter.py as backwards-compat shim

Tests:
- test_configuration: subdomain fields, respect_robots_txt None default
- test_basic_workflow: visited/pending/failed URL filtering
- test_urls_endpoint: pagination + status filter
- test_crawler_webhooks: parse all 8 example JSON payloads
- test_compliance: end-to-end parity check against api.scrapfly.home
1 parent 754e84e commit b1e45d3

30 files changed

Lines changed: 2722 additions & 521 deletions
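Not part of the diff itself: a minimal sketch of the renamed crawler status fields in use, assuming an already-started `Crawl` object (how a crawl is started is not shown in this commit's hunks). All attribute and key names below appear in the example diffs that follow.

```python
from scrapfly import Crawl  # exported at package level, see scrapfly/__init__.py below


def report_progress(crawl: Crawl) -> None:
    """Print progress for an already-started crawl."""
    # wait() also gains an opt-in allow_cancelled flag per the commit message.
    crawl.wait(verbose=True, max_wait=300)

    status = crawl.status()
    # Counters now live on the nested CrawlerState dataclass:
    print(f"Visited {status.state.urls_visited}/{status.state.urls_extracted} URLs")
    print(f"Failed: {status.urls_failed}")

    stats = crawl.stats()
    print(f"Progress: {stats['progress_pct']:.1f}%")
```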

examples/crawler/README.md

Lines changed: 3 additions & 3 deletions
@@ -120,7 +120,7 @@ crawl.wait(verbose=True, max_wait=300)
 
 # Check status
 status = crawl.status()
-print(f"Crawled {status.urls_crawled} URLs")
+print(f"Crawled {status.state.urls_visited} URLs")
 
 # Get results
 artifact = crawl.warc()
@@ -140,8 +140,8 @@ if html:
 
 ```python
 stats = crawl.stats()
-print(f"URLs discovered: {stats['urls_discovered']}")
-print(f"URLs crawled: {stats['urls_crawled']}")
+print(f"URLs extracted: {stats['urls_extracted']}")
+print(f"URLs visited: {stats['urls_visited']}")
 print(f"Crawl rate: {stats['crawl_rate']:.1f}%")
 print(f"Total size: {stats['total_size_kb']:.2f} KB")
 ```

examples/crawler/demo_markdown.py

Lines changed: 3 additions & 3 deletions
@@ -93,7 +93,7 @@
 
 print("\n✅ Crawl completed!")
 final_status = crawl.status()
-print(f" Total pages crawled: {final_status.urls_crawled}")
+print(f" Total pages crawled: {final_status.state.urls_visited}")
 print(f" Failed: {final_status.urls_failed}")
 
 # Get all URLs from WARC to retrieve in batch
@@ -209,8 +209,8 @@
 # Show statistics
 stats = crawl.stats()
 print("\n📊 Crawl Statistics:")
-print(f" URLs discovered: {stats['urls_discovered']}")
-print(f" URLs crawled: {stats['urls_crawled']}")
+print(f" URLs extracted: {stats['urls_extracted']}")
+print(f" URLs visited: {stats['urls_visited']}")
 print(f" URLs failed: {stats['urls_failed']}")
 print(f" Progress: {stats['progress_pct']:.1f}%")

examples/crawler/sync_crawl.py

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@
 status = client.get_crawl_status(start_response.uuid)
 print(f" Status: {status.status}")
 print(f" Progress: {status.progress_pct:.1f}%")
-print(f" Crawled: {status.urls_crawled}/{status.urls_discovered} pages")
+print(f" Visited: {status.state.urls_visited}/{status.state.urls_extracted} pages")
 
 if status.is_complete:
     print("\n✓ Crawl completed!")
Lines changed: 106 additions & 72 deletions
@@ -1,106 +1,140 @@
 """
 Example Crawler Webhook Handler
 
-This example demonstrates how to receive and handle Crawler API webhooks.
+This example demonstrates how to receive and handle the 8 real crawler
+webhook events emitted by the Scrapfly crawler API.
+
+Event envelope
+==============
+
+Every crawler webhook follows the same envelope:
+
+    {
+        "event": "<event_name>",
+        "payload": { ... event-specific fields ... }
+    }
+
+The 8 event names are defined by ``CrawlerWebhookEvent`` and match the
+scrape-engine's ``WebhookEvents`` class exactly.
 """
 
 from scrapfly import (
     webhook_from_payload,
-    CrawlStartedWebhook,
-    CrawlUrlDiscoveredWebhook,
-    CrawlUrlFailedWebhook,
-    CrawlCompletedWebhook,
+    CrawlerWebhookEvent,
+    CrawlerLifecycleWebhook,
+    CrawlerUrlVisitedWebhook,
+    CrawlerUrlSkippedWebhook,
+    CrawlerUrlDiscoveredWebhook,
+    CrawlerUrlFailedWebhook,
 )
 
 
-# Example: Simple Flask webhook endpoint
+# ---------------------------------------------------------------------------
+# Example 1: Flask webhook endpoint
+# ---------------------------------------------------------------------------
+
+
 def example_flask_webhook():
     """Simple webhook handling with Flask"""
     from flask import Flask, request
 
     app = Flask(__name__)
-    SIGNING_SECRETS = ('your-secret-key-here',)
+    SIGNING_SECRETS = ('your-secret-hex-here',)
 
     @app.route('/webhook', methods=['POST'])
     def webhook():
-        # Parse and verify the webhook
-        webhook_obj = webhook_from_payload(
+        # Parse and verify the webhook. The envelope is always
+        # {"event": ..., "payload": ...}; webhook_from_payload dispatches
+        # on the event name and returns a typed dataclass.
+        wh = webhook_from_payload(
             request.json,
             signing_secrets=SIGNING_SECRETS,
-            signature=request.headers.get('X-Scrapfly-Webhook-Signature')
+            signature=request.headers.get('X-Scrapfly-Webhook-Signature'),
         )
 
-        # Handle different webhook types
-        if isinstance(webhook_obj, CrawlStartedWebhook):
-            print(f"Crawl {webhook_obj.uuid} started")
-
-        elif isinstance(webhook_obj, CrawlUrlDiscoveredWebhook):
-            print(f"Discovered: {webhook_obj.url} (depth {webhook_obj.depth})")
-
-        elif isinstance(webhook_obj, CrawlUrlFailedWebhook):
-            print(f"Failed: {webhook_obj.url} - {webhook_obj.error}")
-
-        elif isinstance(webhook_obj, CrawlCompletedWebhook):
-            print(f"Completed: {webhook_obj.urls_crawled}/{webhook_obj.urls_discovered} URLs")
+        # All webhooks carry the common base fields:
+        print(f"[{wh.event}] crawler={wh.crawler_uuid} project={wh.project} "
+              f"visited={wh.state.urls_visited}/{wh.state.urls_extracted}")
+
+        # Dispatch on the concrete type for event-specific fields.
+        if isinstance(wh, CrawlerLifecycleWebhook):
+            # Covers crawler_started / crawler_stopped / crawler_cancelled /
+            # crawler_finished. Use wh.event to distinguish which one.
+            if wh.event == CrawlerWebhookEvent.CRAWLER_FINISHED.value:
+                print(f" ✓ finished: {wh.state.urls_visited} URLs visited, "
+                      f"credits={wh.state.api_credit_used}, "
+                      f"stop_reason={wh.state.stop_reason}")
+            elif wh.event == CrawlerWebhookEvent.CRAWLER_STARTED.value:
+                print(f" ▶ started at seed_url={wh.seed_url}")
+            elif wh.event == CrawlerWebhookEvent.CRAWLER_CANCELLED.value:
+                print(f" ✗ cancelled by user")
+            elif wh.event == CrawlerWebhookEvent.CRAWLER_STOPPED.value:
+                print(f" ⚠ stopped (stop_reason={wh.state.stop_reason})")
+
+        elif isinstance(wh, CrawlerUrlVisitedWebhook):
+            print(f" ● visited {wh.url} [{wh.scrape.status_code}] "
+                  f"country={wh.scrape.country} log={wh.scrape.log_uuid}")
+
+        elif isinstance(wh, CrawlerUrlDiscoveredWebhook):
+            print(f" ◆ discovered {len(wh.discovered_urls)} URLs "
+                  f"via {wh.origin}")
+
+        elif isinstance(wh, CrawlerUrlSkippedWebhook):
+            for url, reason in wh.urls.items():
+                print(f" ○ skipped {url} ({reason})")
+
+        elif isinstance(wh, CrawlerUrlFailedWebhook):
+            print(f" ✗ failed {wh.url}: {wh.error}")
+            if wh.log_link:
+                print(f" log: {wh.log_link}")
 
         return '', 200
 
     app.run(port=5000)
 
 
-# Example: Using built-in webhook server
-def example_builtin_server():
-    """Using Scrapfly's built-in webhook server"""
-    from scrapfly.webhook import create_server, ResourceType
+# ---------------------------------------------------------------------------
+# Example 2: Sanity-check with a real fixture payload
+# ---------------------------------------------------------------------------
 
-    def callback(data, resource_type, request):
-        if resource_type == ResourceType.CRAWLER.value:
-            webhook_obj = webhook_from_payload(data)
-            print(f"Received {webhook_obj.event} for {webhook_obj.uuid}")
-
-    app = create_server(
-        signing_secrets=('your-secret-key-here',),
-        callback=callback
-    )
-    app.run(port=5000)
 
-
-# Test with example payloads
 if __name__ == '__main__':
-    EXAMPLE_PAYLOADS = {
-        'started': {
-            "event": "crawl.started",
-            "uuid": "test-uuid",
-            "status": "RUNNING",
-            "timestamp": "2025-01-16T10:30:00Z"
-        },
-        'url_discovered': {
-            "event": "crawl.url_discovered",
-            "uuid": "test-uuid",
-            "url": "https://example.com/page",
-            "depth": 1,
-            "timestamp": "2025-01-16T10:30:05Z"
-        },
-        'url_failed': {
-            "event": "crawl.url_failed",
-            "uuid": "test-uuid",
-            "url": "https://example.com/404",
-            "error": "HTTP 404 Not Found",
-            "status_code": 404,
-            "timestamp": "2025-01-16T10:30:10Z"
+    # A real crawler_finished payload — exactly as the scrape-engine emits it
+    # (see apps/scrapfly/web-app/src/Template/Docs/crawler-api/
+    # webhooks_example/crawler_finished.json for the canonical reference).
+    example_finished = {
+        "event": "crawler_finished",
+        "payload": {
+            "crawler_uuid": "b4867c50-318c-47cd-bfc9-bed67f24771a",
+            "project": "default",
+            "env": "LIVE",
+            "seed_url": "https://web-scraping.dev/products",
+            "action": "finished",
+            "state": {
+                "duration": 6.11,
+                "urls_visited": 5,
+                "urls_extracted": 49,
+                "urls_failed": 0,
+                "urls_skipped": 44,
+                "urls_to_crawl": 5,
+                "api_credit_used": 5,
+                "stop_reason": "page_limit",
+                "start_time": 1762940028,
+                "stop_time": 1762940034.1143808,
+            },
+            "links": {
+                "status": "https://api.scrapfly.io/crawl/b4867c50-318c-47cd-bfc9-bed67f24771a/status",
+            },
         },
-        'completed': {
-            "event": "crawl.completed",
-            "uuid": "test-uuid",
-            "status": "COMPLETED",
-            "urls_discovered": 100,
-            "urls_crawled": 95,
-            "urls_failed": 5,
-            "timestamp": "2025-01-16T10:35:00Z"
-        }
     }
 
-    print("Testing webhook parsing:\n")
-    for name, payload in EXAMPLE_PAYLOADS.items():
-        webhook = webhook_from_payload(payload)
-        print(f"{webhook.event}: {webhook.uuid} at {webhook.timestamp}")
+    wh = webhook_from_payload(example_finished)
+    assert isinstance(wh, CrawlerLifecycleWebhook)
+    assert wh.event == CrawlerWebhookEvent.CRAWLER_FINISHED.value
+    print(f"Parsed: {wh.event}")
+    print(f" crawler_uuid: {wh.crawler_uuid}")
+    print(f" seed_url: {wh.seed_url}")
+    print(f" state.urls_visited: {wh.state.urls_visited}")
+    print(f" state.urls_extracted:{wh.state.urls_extracted}")
+    print(f" state.stop_reason: {wh.state.stop_reason}")
+    print(f" status_link: {wh.status_link}")

examples/demos/llm-txt-generator/generate_llm_txt.py

Lines changed: 2 additions & 2 deletions
@@ -104,9 +104,9 @@ def generate_llm_txt(
 # Get final status
 status = crawl.status()
 print(f"\n✅ Crawl completed!")
-print(f" Pages crawled: {status.urls_crawled}")
+print(f" Pages crawled: {status.state.urls_visited}")
 print(f" Pages failed: {status.urls_failed}")
-print(f" Total discovered: {status.urls_discovered}")
+print(f" Total discovered: {status.state.urls_extracted}")
 
 # Get URLs from WARC artifact
 print("\n📥 Retrieving crawled URLs from WARC...")

scrapfly/__init__.py

Lines changed: 36 additions & 11 deletions
@@ -1,4 +1,4 @@
-__version__ = '0.8.27'
+__version__ = '0.8.28'
 
 from typing import Tuple
 from .errors import ScrapflyError
@@ -24,13 +24,23 @@
 from .api_response import ScrapeApiResponse, ScreenshotApiResponse, ExtractionApiResponse, ResponseBodyHandler
 from .client import ScrapflyClient, ScraperAPI, MonitoringTargetPeriod, MonitoringAggregation
 from .scrape_config import ScrapeConfig
-from .screenshot_config import ScreenshotConfig
+from .screenshot_config import ScreenshotConfig, VisionDeficiency
+from .browser_config import (
+    BrowserConfig,
+    PROXY_POOL_DATACENTER,
+    PROXY_POOL_RESIDENTIAL,
+    OS_LINUX,
+    OS_WINDOWS,
+    OS_MAC,
+)
 from .extraction_config import ExtractionConfig
 from .crawler import (
     CrawlerConfig,
     CrawlerStartResponse,
     CrawlerStatusResponse,
     CrawlerArtifactResponse,
+    CrawlerUrlsResponse,
+    CrawlerUrlEntry,
     WarcParser,
     WarcRecord,
     parse_warc,
@@ -39,14 +49,17 @@
     Crawl,
     ContentFormat,
     CrawlContent,
+    CrawlerState,
     CrawlerWebhookEvent,
     CrawlerWebhookBase,
-    CrawlStartedWebhook,
-    CrawlUrlDiscoveredWebhook,
-    CrawlUrlFailedWebhook,
-    CrawlCompletedWebhook,
+    CrawlerLifecycleWebhook,
+    CrawlerUrlVisitedWebhook,
+    CrawlerUrlSkippedWebhook,
+    CrawlerUrlDiscoveredWebhook,
+    CrawlerUrlFailedWebhook,
+    CrawlerScrapeResult,
     CrawlerWebhook,
-    webhook_from_payload
+    webhook_from_payload,
 )
 
 
@@ -74,6 +87,13 @@
     'ResponseBodyHandler',
     'ScrapeConfig',
     'ScreenshotConfig',
+    'VisionDeficiency',
+    'BrowserConfig',
+    'PROXY_POOL_DATACENTER',
+    'PROXY_POOL_RESIDENTIAL',
+    'OS_LINUX',
+    'OS_WINDOWS',
+    'OS_MAC',
     'ExtractionConfig',
     'ScreenshotAPIError',
     'ExtractionAPIError',
@@ -85,7 +105,10 @@
     'CrawlerConfig',
     'CrawlerStartResponse',
     'CrawlerStatusResponse',
+    'CrawlerState',
     'CrawlerArtifactResponse',
+    'CrawlerUrlsResponse',
+    'CrawlerUrlEntry',
     'WarcParser',
     'WarcRecord',
     'parse_warc',
@@ -96,10 +119,12 @@
     'CrawlContent',
     'CrawlerWebhookEvent',
     'CrawlerWebhookBase',
-    'CrawlStartedWebhook',
-    'CrawlUrlDiscoveredWebhook',
-    'CrawlUrlFailedWebhook',
-    'CrawlCompletedWebhook',
+    'CrawlerLifecycleWebhook',
+    'CrawlerUrlVisitedWebhook',
+    'CrawlerUrlSkippedWebhook',
+    'CrawlerUrlDiscoveredWebhook',
+    'CrawlerUrlFailedWebhook',
+    'CrawlerScrapeResult',
     'CrawlerWebhook',
     'webhook_from_payload',
 )
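Not part of the diff: a rough usage sketch for the newly exported cloud-browser names. The imports match the `__init__.py` hunks above, but the `BrowserConfig(proxy_pool=...)` keyword and the exact `client.cloud_browser()` call are assumptions (neither signature appears in this commit), so treat this as illustrative only.

```python
from scrapfly import (
    ScrapflyClient,
    BrowserConfig,
    PROXY_POOL_RESIDENTIAL,  # per the commit message, only datacenter/residential pass validation
)

client = ScrapflyClient(key='scp-live-xxxxxxxx')

# Hypothetical field name: BrowserConfig's dataclass fields are not shown in this diff.
config = BrowserConfig(proxy_pool=PROXY_POOL_RESIDENTIAL)

# The commit message describes client.cloud_browser() as a URL builder;
# the argument and return shape here are assumptions.
connect_url = client.cloud_browser(config)
print(connect_url)
```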
