 """
 Example Crawler Webhook Handler
 
-This example demonstrates how to receive and handle Crawler API webhooks.
+This example demonstrates how to receive and handle the 8 real crawler
+webhook events emitted by the Scrapfly crawler API.
+
+Event envelope
+==============
+
+Every crawler webhook follows the same envelope:
+
+    {
+        "event": "<event_name>",
+        "payload": { ... event-specific fields ... }
+    }
+
+The 8 event names are defined by ``CrawlerWebhookEvent`` and match the
+scrape-engine's ``WebhookEvents`` class exactly.
 """
 
 from scrapfly import (
     webhook_from_payload,
-    CrawlStartedWebhook,
-    CrawlUrlDiscoveredWebhook,
-    CrawlUrlFailedWebhook,
-    CrawlCompletedWebhook,
+    CrawlerWebhookEvent,
+    CrawlerLifecycleWebhook,
+    CrawlerUrlVisitedWebhook,
+    CrawlerUrlSkippedWebhook,
+    CrawlerUrlDiscoveredWebhook,
+    CrawlerUrlFailedWebhook,
 )
 
 
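+# For illustration: the envelope described in the module docstring can be
+# unpacked by hand before (or instead of) calling webhook_from_payload.
+# This is a minimal sketch; ``split_envelope`` is a hypothetical helper
+# invented for this example, not part of the SDK.
+def split_envelope(raw: dict) -> tuple:
+    """Return (event_name, payload) from a crawler webhook envelope."""
+    return raw["event"], raw["payload"]
+
+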
-# Example: Simple Flask webhook endpoint
+# ---------------------------------------------------------------------------
+# Example 1: Flask webhook endpoint
+# ---------------------------------------------------------------------------
+
+
 def example_flask_webhook():
     """Simple webhook handling with Flask"""
     from flask import Flask, request
 
     app = Flask(__name__)
-    SIGNING_SECRETS = ('your-secret-key-here',)
+    SIGNING_SECRETS = ('your-secret-hex-here',)
 
     @app.route('/webhook', methods=['POST'])
     def webhook():
-        # Parse and verify the webhook
-        webhook_obj = webhook_from_payload(
+        # Parse and verify the webhook. The envelope is always
+        # {"event": ..., "payload": ...}; webhook_from_payload dispatches
+        # on the event name and returns a typed dataclass.
+        wh = webhook_from_payload(
             request.json,
             signing_secrets=SIGNING_SECRETS,
-            signature=request.headers.get('X-Scrapfly-Webhook-Signature')
+            signature=request.headers.get('X-Scrapfly-Webhook-Signature'),
         )
 
-        # Handle different webhook types
-        if isinstance(webhook_obj, CrawlStartedWebhook):
-            print(f"Crawl {webhook_obj.uuid} started")
-
-        elif isinstance(webhook_obj, CrawlUrlDiscoveredWebhook):
-            print(f"Discovered: {webhook_obj.url} (depth {webhook_obj.depth})")
-
-        elif isinstance(webhook_obj, CrawlUrlFailedWebhook):
-            print(f"Failed: {webhook_obj.url} - {webhook_obj.error}")
-
-        elif isinstance(webhook_obj, CrawlCompletedWebhook):
-            print(f"Completed: {webhook_obj.urls_crawled}/{webhook_obj.urls_discovered} URLs")
+        # All webhooks carry the common base fields:
+        print(f"[{wh.event}] crawler={wh.crawler_uuid} project={wh.project} "
+              f"visited={wh.state.urls_visited} extracted={wh.state.urls_extracted}")
+
+        # Dispatch on the concrete type for event-specific fields.
+        if isinstance(wh, CrawlerLifecycleWebhook):
+            # Covers crawler_started / crawler_stopped / crawler_cancelled /
+            # crawler_finished. Use wh.event to distinguish which one.
+            if wh.event == CrawlerWebhookEvent.CRAWLER_FINISHED.value:
+                print(f"  ✓ finished: {wh.state.urls_visited} URLs visited, "
+                      f"credits={wh.state.api_credit_used}, "
+                      f"stop_reason={wh.state.stop_reason}")
+            elif wh.event == CrawlerWebhookEvent.CRAWLER_STARTED.value:
+                print(f"  ▶ started at seed_url={wh.seed_url}")
+            elif wh.event == CrawlerWebhookEvent.CRAWLER_CANCELLED.value:
+                print("  ✗ cancelled by user")
+            elif wh.event == CrawlerWebhookEvent.CRAWLER_STOPPED.value:
+                print(f"  ⚠ stopped (stop_reason={wh.state.stop_reason})")
+
+        elif isinstance(wh, CrawlerUrlVisitedWebhook):
+            print(f"  ● visited {wh.url} [{wh.scrape.status_code}] "
+                  f"country={wh.scrape.country} log={wh.scrape.log_uuid}")
+
+        elif isinstance(wh, CrawlerUrlDiscoveredWebhook):
+            print(f"  ◆ discovered {len(wh.discovered_urls)} URLs "
+                  f"via {wh.origin}")
+
+        elif isinstance(wh, CrawlerUrlSkippedWebhook):
+            for url, reason in wh.urls.items():
+                print(f"  ○ skipped {url} ({reason})")
+
+        elif isinstance(wh, CrawlerUrlFailedWebhook):
+            print(f"  ✗ failed {wh.url}: {wh.error}")
+            if wh.log_link:
+                print(f"    log: {wh.log_link}")
 
         return '', 200
 
     app.run(port=5000)
 
 
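+# ---------------------------------------------------------------------------
+# Illustration: what a manual signature check could look like
+# ---------------------------------------------------------------------------
+# webhook_from_payload already verifies the signature when it is given
+# signing_secrets and the X-Scrapfly-Webhook-Signature header value, so a
+# handler normally never does this itself. The sketch below assumes an
+# HMAC-SHA256 hex digest over the raw request body with a hex-encoded secret;
+# that scheme is an assumption for illustration, not Scrapfly's documented
+# algorithm, and ``naive_signature_check`` is a hypothetical name.
+def naive_signature_check(body: bytes, signature: str, secret_hex: str) -> bool:
+    """Hypothetical check: compare an HMAC-SHA256 hex digest to the header."""
+    import hashlib
+    import hmac
+    expected = hmac.new(bytes.fromhex(secret_hex), body, hashlib.sha256).hexdigest()
+    return hmac.compare_digest(expected, signature)
+
+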
-# Example: Using built-in webhook server
-def example_builtin_server():
-    """Using Scrapfly's built-in webhook server"""
-    from scrapfly.webhook import create_server, ResourceType
+# ---------------------------------------------------------------------------
+# Example 2: Sanity-check with a real fixture payload
+# ---------------------------------------------------------------------------
 
-    def callback(data, resource_type, request):
-        if resource_type == ResourceType.CRAWLER.value:
-            webhook_obj = webhook_from_payload(data)
-            print(f"Received {webhook_obj.event} for {webhook_obj.uuid}")
-
-    app = create_server(
-        signing_secrets=('your-secret-key-here',),
-        callback=callback
-    )
-    app.run(port=5000)
 
-
-# Test with example payloads
 if __name__ == '__main__':
-    EXAMPLE_PAYLOADS = {
-        'started': {
-            "event": "crawl.started",
-            "uuid": "test-uuid",
-            "status": "RUNNING",
-            "timestamp": "2025-01-16T10:30:00Z"
-        },
-        'url_discovered': {
-            "event": "crawl.url_discovered",
-            "uuid": "test-uuid",
-            "url": "https://example.com/page",
-            "depth": 1,
-            "timestamp": "2025-01-16T10:30:05Z"
-        },
-        'url_failed': {
-            "event": "crawl.url_failed",
-            "uuid": "test-uuid",
-            "url": "https://example.com/404",
-            "error": "HTTP 404 Not Found",
-            "status_code": 404,
-            "timestamp": "2025-01-16T10:30:10Z"
+    # A real crawler_finished payload, exactly as the scrape-engine emits it
+    # (see apps/scrapfly/web-app/src/Template/Docs/crawler-api/
+    # webhooks_example/crawler_finished.json for the canonical reference).
+    example_finished = {
+        "event": "crawler_finished",
+        "payload": {
+            "crawler_uuid": "b4867c50-318c-47cd-bfc9-bed67f24771a",
+            "project": "default",
+            "env": "LIVE",
+            "seed_url": "https://web-scraping.dev/products",
+            "action": "finished",
+            "state": {
+                "duration": 6.11,
+                "urls_visited": 5,
+                "urls_extracted": 49,
+                "urls_failed": 0,
+                "urls_skipped": 44,
+                "urls_to_crawl": 5,
+                "api_credit_used": 5,
+                "stop_reason": "page_limit",
+                "start_time": 1762940028,
+                "stop_time": 1762940034.1143808,
+            },
+            "links": {
+                "status": "https://api.scrapfly.io/crawl/b4867c50-318c-47cd-bfc9-bed67f24771a/status",
+            },
         },
-        'completed': {
-            "event": "crawl.completed",
-            "uuid": "test-uuid",
-            "status": "COMPLETED",
-            "urls_discovered": 100,
-            "urls_crawled": 95,
-            "urls_failed": 5,
-            "timestamp": "2025-01-16T10:35:00Z"
-        }
     }
 
-    print("Testing webhook parsing:\n")
-    for name, payload in EXAMPLE_PAYLOADS.items():
-        webhook = webhook_from_payload(payload)
-        print(f"{webhook.event}: {webhook.uuid} at {webhook.timestamp}")
+    wh = webhook_from_payload(example_finished)
+    assert isinstance(wh, CrawlerLifecycleWebhook)
+    assert wh.event == CrawlerWebhookEvent.CRAWLER_FINISHED.value
+    print(f"Parsed: {wh.event}")
+    print(f"  crawler_uuid: {wh.crawler_uuid}")
+    print(f"  seed_url: {wh.seed_url}")
+    print(f"  state.urls_visited: {wh.state.urls_visited}")
+    print(f"  state.urls_extracted: {wh.state.urls_extracted}")
+    print(f"  state.stop_reason: {wh.state.stop_reason}")
+    print(f"  status_link: {wh.status_link}")
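+
+    # Bonus sketch: list all 8 known event names. This assumes
+    # CrawlerWebhookEvent is a standard Enum whose values are the wire-format
+    # event names, as the .value comparisons above suggest.
+    print("Known crawler webhook events:")
+    for evt in CrawlerWebhookEvent:
+        print(f"  - {evt.value}")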