
Commit b1e45d3

Crawler API parity + snippet audit + cloud browser config
Crawler:
- Rename urls_crawled → urls_visited (scrape-engine is truth)
- Nested CrawlerState dataclass on CrawlerStatusResponse
- Crawl.wait() adds allow_cancelled opt-in flag
- Crawl.__repr__ includes URL via _config._params
- Webhook module full rewrite: 8 real events vs fictional 4 (CrawlerLifecycleWebhook + 4 event-specific dataclasses)
- Added /urls endpoint + Crawl.urls() convenience method
- Strict parsing (KeyError on missing required fields)

Cloud Browser:
- New scrapfly/browser_config.py with BrowserConfig dataclass
- Constants PROXY_POOL_DATACENTER / _RESIDENTIAL, OS_* literals
- Validated proxy_pool values only: datacenter / residential
- client.cloud_browser() URL builder + extension CRUD methods

Reporter:
- Renamed SentryReporter.py → sentry.py with lazy sentry_sdk import
- Kept SentryReporter.py as backwards-compat shim

Tests:
- test_configuration: subdomain fields, respect_robots_txt None default
- test_basic_workflow: visited/pending/failed URL filtering
- test_urls_endpoint: pagination + status filter
- test_crawler_webhooks: parse all 8 example JSON payloads
- test_compliance: end-to-end parity check against api.scrapfly.home
1 parent 754e84e commit b1e45d3

30 files changed

Lines changed: 2722 additions & 521 deletions
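Not part of the diff itself: a minimal sketch of the renamed crawler status fields in use, assuming an already-started `Crawl` object (how a crawl is started is not shown in this commit's hunks). All attribute and key names below appear in the example diffs that follow.

```python
from scrapfly import Crawl  # exported at package level, see scrapfly/__init__.py below


def report_progress(crawl: Crawl) -> None:
    """Print progress for an already-started crawl."""
    # wait() also gains an opt-in allow_cancelled flag per the commit message.
    crawl.wait(verbose=True, max_wait=300)

    status = crawl.status()
    # Counters now live on the nested CrawlerState dataclass:
    print(f"Visited {status.state.urls_visited}/{status.state.urls_extracted} URLs")
    print(f"Failed: {status.urls_failed}")

    stats = crawl.stats()
    print(f"Progress: {stats['progress_pct']:.1f}%")
```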

examples/crawler/README.md

Lines changed: 3 additions & 3 deletions
@@ -120,7 +120,7 @@ crawl.wait(verbose=True, max_wait=300)
 
 # Check status
 status = crawl.status()
-print(f"Crawled {status.urls_crawled} URLs")
+print(f"Crawled {status.state.urls_visited} URLs")
 
 # Get results
 artifact = crawl.warc()
@@ -140,8 +140,8 @@ if html:
 
 ```python
 stats = crawl.stats()
-print(f"URLs discovered: {stats['urls_discovered']}")
-print(f"URLs crawled: {stats['urls_crawled']}")
+print(f"URLs extracted: {stats['urls_extracted']}")
+print(f"URLs visited: {stats['urls_visited']}")
 print(f"Crawl rate: {stats['crawl_rate']:.1f}%")
 print(f"Total size: {stats['total_size_kb']:.2f} KB")
 ```

examples/crawler/demo_markdown.py

Lines changed: 3 additions & 3 deletions
@@ -93,7 +93,7 @@
 
 print("\n✅ Crawl completed!")
 final_status = crawl.status()
-print(f" Total pages crawled: {final_status.urls_crawled}")
+print(f" Total pages crawled: {final_status.state.urls_visited}")
 print(f" Failed: {final_status.urls_failed}")
 
 # Get all URLs from WARC to retrieve in batch
@@ -209,8 +209,8 @@
 # Show statistics
 stats = crawl.stats()
 print("\n📊 Crawl Statistics:")
-print(f" URLs discovered: {stats['urls_discovered']}")
-print(f" URLs crawled: {stats['urls_crawled']}")
+print(f" URLs extracted: {stats['urls_extracted']}")
+print(f" URLs visited: {stats['urls_visited']}")
 print(f" URLs failed: {stats['urls_failed']}")
 print(f" Progress: {stats['progress_pct']:.1f}%")

examples/crawler/sync_crawl.py

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@
 status = client.get_crawl_status(start_response.uuid)
 print(f" Status: {status.status}")
 print(f" Progress: {status.progress_pct:.1f}%")
-print(f" Crawled: {status.urls_crawled}/{status.urls_discovered} pages")
+print(f" Visited: {status.state.urls_visited}/{status.state.urls_extracted} pages")
 
 if status.is_complete:
     print("\n✓ Crawl completed!")
Lines changed: 106 additions & 72 deletions
@@ -1,106 +1,140 @@
 """
 Example Crawler Webhook Handler
 
-This example demonstrates how to receive and handle Crawler API webhooks.
+This example demonstrates how to receive and handle the 8 real crawler
+webhook events emitted by the Scrapfly crawler API.
+
+Event envelope
+==============
+
+Every crawler webhook follows the same envelope:
+
+    {
+        "event": "<event_name>",
+        "payload": { ... event-specific fields ... }
+    }
+
+The 8 event names are defined by ``CrawlerWebhookEvent`` and match the
+scrape-engine's ``WebhookEvents`` class exactly.
 """
 
 from scrapfly import (
     webhook_from_payload,
-    CrawlStartedWebhook,
-    CrawlUrlDiscoveredWebhook,
-    CrawlUrlFailedWebhook,
-    CrawlCompletedWebhook,
+    CrawlerWebhookEvent,
+    CrawlerLifecycleWebhook,
+    CrawlerUrlVisitedWebhook,
+    CrawlerUrlSkippedWebhook,
+    CrawlerUrlDiscoveredWebhook,
+    CrawlerUrlFailedWebhook,
 )
 
 
-# Example: Simple Flask webhook endpoint
+# ---------------------------------------------------------------------------
+# Example 1: Flask webhook endpoint
+# ---------------------------------------------------------------------------
+
+
 def example_flask_webhook():
     """Simple webhook handling with Flask"""
     from flask import Flask, request
 
     app = Flask(__name__)
-    SIGNING_SECRETS = ('your-secret-key-here',)
+    SIGNING_SECRETS = ('your-secret-hex-here',)
 
     @app.route('/webhook', methods=['POST'])
     def webhook():
-        # Parse and verify the webhook
-        webhook_obj = webhook_from_payload(
+        # Parse and verify the webhook. The envelope is always
+        # {"event": ..., "payload": ...}; webhook_from_payload dispatches
+        # on the event name and returns a typed dataclass.
+        wh = webhook_from_payload(
             request.json,
             signing_secrets=SIGNING_SECRETS,
-            signature=request.headers.get('X-Scrapfly-Webhook-Signature')
+            signature=request.headers.get('X-Scrapfly-Webhook-Signature'),
         )
 
-        # Handle different webhook types
-        if isinstance(webhook_obj, CrawlStartedWebhook):
-            print(f"Crawl {webhook_obj.uuid} started")
-
-        elif isinstance(webhook_obj, CrawlUrlDiscoveredWebhook):
-            print(f"Discovered: {webhook_obj.url} (depth {webhook_obj.depth})")
-
-        elif isinstance(webhook_obj, CrawlUrlFailedWebhook):
-            print(f"Failed: {webhook_obj.url} - {webhook_obj.error}")
-
-        elif isinstance(webhook_obj, CrawlCompletedWebhook):
-            print(f"Completed: {webhook_obj.urls_crawled}/{webhook_obj.urls_discovered} URLs")
+        # All webhooks carry the common base fields:
+        print(f"[{wh.event}] crawler={wh.crawler_uuid} project={wh.project} "
+              f"visited={wh.state.urls_visited}/{wh.state.urls_extracted}")
+
+        # Dispatch on the concrete type for event-specific fields.
+        if isinstance(wh, CrawlerLifecycleWebhook):
+            # Covers crawler_started / crawler_stopped / crawler_cancelled /
+            # crawler_finished. Use wh.event to distinguish which one.
+            if wh.event == CrawlerWebhookEvent.CRAWLER_FINISHED.value:
+                print(f" ✓ finished: {wh.state.urls_visited} URLs visited, "
+                      f"credits={wh.state.api_credit_used}, "
+                      f"stop_reason={wh.state.stop_reason}")
+            elif wh.event == CrawlerWebhookEvent.CRAWLER_STARTED.value:
+                print(f" ▶ started at seed_url={wh.seed_url}")
+            elif wh.event == CrawlerWebhookEvent.CRAWLER_CANCELLED.value:
+                print(f" ✗ cancelled by user")
+            elif wh.event == CrawlerWebhookEvent.CRAWLER_STOPPED.value:
+                print(f" ⚠ stopped (stop_reason={wh.state.stop_reason})")
+
+        elif isinstance(wh, CrawlerUrlVisitedWebhook):
+            print(f" ● visited {wh.url} [{wh.scrape.status_code}] "
+                  f"country={wh.scrape.country} log={wh.scrape.log_uuid}")
+
+        elif isinstance(wh, CrawlerUrlDiscoveredWebhook):
+            print(f" ◆ discovered {len(wh.discovered_urls)} URLs "
+                  f"via {wh.origin}")
+
+        elif isinstance(wh, CrawlerUrlSkippedWebhook):
+            for url, reason in wh.urls.items():
+                print(f" ○ skipped {url} ({reason})")
+
+        elif isinstance(wh, CrawlerUrlFailedWebhook):
+            print(f" ✗ failed {wh.url}: {wh.error}")
+            if wh.log_link:
+                print(f" log: {wh.log_link}")
 
         return '', 200
 
     app.run(port=5000)
 
 
-# Example: Using built-in webhook server
-def example_builtin_server():
-    """Using Scrapfly's built-in webhook server"""
-    from scrapfly.webhook import create_server, ResourceType
+# ---------------------------------------------------------------------------
+# Example 2: Sanity-check with a real fixture payload
+# ---------------------------------------------------------------------------
 
-    def callback(data, resource_type, request):
-        if resource_type == ResourceType.CRAWLER.value:
-            webhook_obj = webhook_from_payload(data)
-            print(f"Received {webhook_obj.event} for {webhook_obj.uuid}")
-
-    app = create_server(
-        signing_secrets=('your-secret-key-here',),
-        callback=callback
-    )
-    app.run(port=5000)
 
-
-# Test with example payloads
 if __name__ == '__main__':
-    EXAMPLE_PAYLOADS = {
-        'started': {
-            "event": "crawl.started",
-            "uuid": "test-uuid",
-            "status": "RUNNING",
-            "timestamp": "2025-01-16T10:30:00Z"
-        },
-        'url_discovered': {
-            "event": "crawl.url_discovered",
-            "uuid": "test-uuid",
-            "url": "https://example.com/page",
-            "depth": 1,
-            "timestamp": "2025-01-16T10:30:05Z"
-        },
-        'url_failed': {
-            "event": "crawl.url_failed",
-            "uuid": "test-uuid",
-            "url": "https://example.com/404",
-            "error": "HTTP 404 Not Found",
-            "status_code": 404,
-            "timestamp": "2025-01-16T10:30:10Z"
+    # A real crawler_finished payload — exactly as the scrape-engine emits it
+    # (see apps/scrapfly/web-app/src/Template/Docs/crawler-api/
+    # webhooks_example/crawler_finished.json for the canonical reference).
+    example_finished = {
+        "event": "crawler_finished",
+        "payload": {
+            "crawler_uuid": "b4867c50-318c-47cd-bfc9-bed67f24771a",
+            "project": "default",
+            "env": "LIVE",
+            "seed_url": "https://web-scraping.dev/products",
+            "action": "finished",
+            "state": {
+                "duration": 6.11,
+                "urls_visited": 5,
+                "urls_extracted": 49,
+                "urls_failed": 0,
+                "urls_skipped": 44,
+                "urls_to_crawl": 5,
+                "api_credit_used": 5,
+                "stop_reason": "page_limit",
+                "start_time": 1762940028,
+                "stop_time": 1762940034.1143808,
+            },
+            "links": {
+                "status": "https://api.scrapfly.io/crawl/b4867c50-318c-47cd-bfc9-bed67f24771a/status",
+            },
         },
-        'completed': {
-            "event": "crawl.completed",
-            "uuid": "test-uuid",
-            "status": "COMPLETED",
-            "urls_discovered": 100,
-            "urls_crawled": 95,
-            "urls_failed": 5,
-            "timestamp": "2025-01-16T10:35:00Z"
-        }
     }
 
-    print("Testing webhook parsing:\n")
-    for name, payload in EXAMPLE_PAYLOADS.items():
-        webhook = webhook_from_payload(payload)
-        print(f"{webhook.event}: {webhook.uuid} at {webhook.timestamp}")
+    wh = webhook_from_payload(example_finished)
+    assert isinstance(wh, CrawlerLifecycleWebhook)
+    assert wh.event == CrawlerWebhookEvent.CRAWLER_FINISHED.value
+    print(f"Parsed: {wh.event}")
+    print(f" crawler_uuid: {wh.crawler_uuid}")
+    print(f" seed_url: {wh.seed_url}")
+    print(f" state.urls_visited: {wh.state.urls_visited}")
+    print(f" state.urls_extracted:{wh.state.urls_extracted}")
+    print(f" state.stop_reason: {wh.state.stop_reason}")
+    print(f" status_link: {wh.status_link}")

examples/demos/llm-txt-generator/generate_llm_txt.py

Lines changed: 2 additions & 2 deletions
@@ -104,9 +104,9 @@ def generate_llm_txt(
 # Get final status
 status = crawl.status()
 print(f"\n✅ Crawl completed!")
-print(f" Pages crawled: {status.urls_crawled}")
+print(f" Pages crawled: {status.state.urls_visited}")
 print(f" Pages failed: {status.urls_failed}")
-print(f" Total discovered: {status.urls_discovered}")
+print(f" Total discovered: {status.state.urls_extracted}")
 
 # Get URLs from WARC artifact
 print("\n📥 Retrieving crawled URLs from WARC...")

scrapfly/__init__.py

Lines changed: 36 additions & 11 deletions
@@ -1,4 +1,4 @@
-__version__ = '0.8.27'
+__version__ = '0.8.28'
 
 from typing import Tuple
 from .errors import ScrapflyError
@@ -24,13 +24,23 @@
 from .api_response import ScrapeApiResponse, ScreenshotApiResponse, ExtractionApiResponse, ResponseBodyHandler
 from .client import ScrapflyClient, ScraperAPI, MonitoringTargetPeriod, MonitoringAggregation
 from .scrape_config import ScrapeConfig
-from .screenshot_config import ScreenshotConfig
+from .screenshot_config import ScreenshotConfig, VisionDeficiency
+from .browser_config import (
+    BrowserConfig,
+    PROXY_POOL_DATACENTER,
+    PROXY_POOL_RESIDENTIAL,
+    OS_LINUX,
+    OS_WINDOWS,
+    OS_MAC,
+)
 from .extraction_config import ExtractionConfig
 from .crawler import (
     CrawlerConfig,
     CrawlerStartResponse,
     CrawlerStatusResponse,
     CrawlerArtifactResponse,
+    CrawlerUrlsResponse,
+    CrawlerUrlEntry,
     WarcParser,
     WarcRecord,
     parse_warc,
@@ -39,14 +49,17 @@
     Crawl,
     ContentFormat,
     CrawlContent,
+    CrawlerState,
     CrawlerWebhookEvent,
     CrawlerWebhookBase,
-    CrawlStartedWebhook,
-    CrawlUrlDiscoveredWebhook,
-    CrawlUrlFailedWebhook,
-    CrawlCompletedWebhook,
+    CrawlerLifecycleWebhook,
+    CrawlerUrlVisitedWebhook,
+    CrawlerUrlSkippedWebhook,
+    CrawlerUrlDiscoveredWebhook,
+    CrawlerUrlFailedWebhook,
+    CrawlerScrapeResult,
     CrawlerWebhook,
-    webhook_from_payload
+    webhook_from_payload,
 )
 
 
@@ -74,6 +87,13 @@
     'ResponseBodyHandler',
     'ScrapeConfig',
     'ScreenshotConfig',
+    'VisionDeficiency',
+    'BrowserConfig',
+    'PROXY_POOL_DATACENTER',
+    'PROXY_POOL_RESIDENTIAL',
+    'OS_LINUX',
+    'OS_WINDOWS',
+    'OS_MAC',
     'ExtractionConfig',
     'ScreenshotAPIError',
     'ExtractionAPIError',
@@ -85,7 +105,10 @@
     'CrawlerConfig',
     'CrawlerStartResponse',
     'CrawlerStatusResponse',
+    'CrawlerState',
     'CrawlerArtifactResponse',
+    'CrawlerUrlsResponse',
+    'CrawlerUrlEntry',
     'WarcParser',
     'WarcRecord',
     'parse_warc',
@@ -96,10 +119,12 @@
     'CrawlContent',
     'CrawlerWebhookEvent',
     'CrawlerWebhookBase',
-    'CrawlStartedWebhook',
-    'CrawlUrlDiscoveredWebhook',
-    'CrawlUrlFailedWebhook',
-    'CrawlCompletedWebhook',
+    'CrawlerLifecycleWebhook',
+    'CrawlerUrlVisitedWebhook',
+    'CrawlerUrlSkippedWebhook',
+    'CrawlerUrlDiscoveredWebhook',
+    'CrawlerUrlFailedWebhook',
+    'CrawlerScrapeResult',
     'CrawlerWebhook',
     'webhook_from_payload',
 )
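Not part of the diff: a rough usage sketch for the newly exported cloud-browser names. The imports match the `__init__.py` hunks above, but the `BrowserConfig(proxy_pool=...)` keyword and the exact `client.cloud_browser()` call are assumptions (neither signature appears in this commit), so treat this as illustrative only.

```python
from scrapfly import (
    ScrapflyClient,
    BrowserConfig,
    PROXY_POOL_RESIDENTIAL,  # per the commit message, only datacenter/residential pass validation
)

client = ScrapflyClient(key='scp-live-xxxxxxxx')

# Hypothetical field name: BrowserConfig's dataclass fields are not shown in this diff.
config = BrowserConfig(proxy_pool=PROXY_POOL_RESIDENTIAL)

# The commit message describes client.cloud_browser() as a URL builder;
# the argument and return shape here are assumptions.
connect_url = client.cloud_browser(config)
print(connect_url)
```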
