Commit 7a82d0c

feat: Add keep_alive flag to crawler.__init__ (#921)
### Description

Add a `keep_alive` flag to `crawler.__init__`. If True, this flag keeps the crawler alive even when there are no more requests in the queue. The crawler then waits for more requests to be added, or to be explicitly stopped by `crawler.stop()`. Also adds a test and a code example in the docs.

### Issues

- Closes: #891
1 parent 2a26f37 commit 7a82d0c

4 files changed

Lines changed: 122 additions & 6 deletions
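
In practice the flag is used roughly like this — a minimal sketch based on the commit's own docs example; the 10-second timeout guard and the empty starting queue are illustrative only, not part of the commit:

    import asyncio

    from crawlee._types import BasicCrawlingContext
    from crawlee.crawlers import BeautifulSoupCrawler


    async def main() -> None:
        # keep_alive=True: the crawler keeps polling the request queue instead of
        # finishing as soon as the queue is empty.
        crawler = BeautifulSoupCrawler(keep_alive=True)

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')

        async def stop_later() -> None:
            # Illustrative guard condition: stop the keep-alive crawler after 10 seconds.
            await asyncio.sleep(10)
            crawler.stop()

        stop_task = asyncio.create_task(stop_later())

        # Starts with an empty queue; external code may call crawler.add_requests([...]) later.
        await crawler.run()
        await stop_task


    if __name__ == '__main__':
        asyncio.run(main())

The full docs example below uses a URL-based guard condition instead of a timeout and adds requests from background tasks.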


Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
import asyncio

from crawlee._types import BasicCrawlingContext
from crawlee.crawlers import BeautifulSoupCrawler


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Keep the crawler alive even when there are no requests to be processed at the moment.
        keep_alive=True,
    )

    def stop_crawler_if_url_visited(context: BasicCrawlingContext) -> None:
        """Stop the crawler once a specific URL is visited. Just an example of a guard condition to stop the crawler."""
        if context.request.url == 'https://crawlee.dev/docs/examples':
            crawler.stop('Stop crawler that was in keep_alive state after specific url was visited')
        else:
            context.log.info('keep_alive=True, waiting for more requests to come.')

    async def add_request_later(url: str, after_s: int) -> None:
        """Add requests to the queue after some time. This can be done by external code."""
        # Just an example of a request being added to the crawler later, while it is waiting due to `keep_alive=True`.
        await asyncio.sleep(after_s)
        await crawler.add_requests([url])

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BasicCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Stop the crawler if some guard condition has been met.
        stop_crawler_if_url_visited(context)

    # Start some tasks that will add requests later, to simulate a real situation
    # where requests are added later by external code.
    add_request_later_task1 = asyncio.create_task(add_request_later(url='https://crawlee.dev', after_s=1))
    add_request_later_task2 = asyncio.create_task(add_request_later(url='https://crawlee.dev/docs/examples', after_s=5))

    # Run the crawler without an initial list of requests.
    # It will wait for more requests to be added to the queue later due to `keep_alive=True`.
    await crawler.run()

    await asyncio.gather(add_request_later_task1, add_request_later_task2)


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
---
id: crawler-keep-alive
title: Keep a Crawler alive waiting for more requests
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import BeautifulSoupExample from '!!raw-loader!./code/beautifulsoup_crawler_keep_alive.py';

This example demonstrates how to keep a crawler alive even when there are no requests at the moment, by using the `keep_alive=True` argument of <ApiLink to="class/BasicCrawler#__init__">`BasicCrawler.__init__`</ApiLink>. This is available to all crawlers that inherit from <ApiLink to="class/BasicCrawler">`BasicCrawler`</ApiLink>; the example below shows it on <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>. To stop a crawler that was started with `keep_alive=True`, call `crawler.stop()`.

<CodeBlock className="language-python">
    {BeautifulSoupExample}
</CodeBlock>

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 14 additions & 5 deletions
@@ -134,6 +134,9 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     configure_logging: NotRequired[bool]
     """If True, the crawler will set up logging infrastructure automatically."""
 
+    keep_alive: NotRequired[bool]
+    """Flag that can keep crawler running even when there are no requests in queue."""
+
     _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]
     """Enables extending the request lifecycle and modifying the crawling context. Intended for use by
     subclasses rather than direct instantiation of `BasicCrawler`."""
@@ -195,6 +198,7 @@ def __init__(
         request_handler_timeout: timedelta = timedelta(minutes=1),
         statistics: Statistics | None = None,
         abort_on_error: bool = False,
+        keep_alive: bool = False,
         configure_logging: bool = True,
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
         _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
@@ -215,7 +219,8 @@ def __init__(
             max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
                 this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
                 no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed
-                this value.
+                this value. If used together with `keep_alive`, then the crawler will be kept alive only until
+                `max_requests_per_crawl` is achieved.
             max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                 if a proxy error occurs or if the website blocks the request.
             max_crawl_depth: Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond
@@ -228,6 +233,8 @@ def __init__(
             request_handler_timeout: Maximum duration allowed for a single request handler to run.
             statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
             abort_on_error: If True, the crawler stops immediately when any request handler error occurs.
+            keep_alive: If True, it will keep crawler alive even if there are no requests in queue.
+                Use `crawler.stop()` to exit the crawler.
             configure_logging: If True, the crawler will set up logging infrastructure automatically.
             _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                 Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
@@ -314,6 +321,7 @@ def __init__(
         )
 
         # State flags
+        self._keep_alive = keep_alive
         self._running = False
         self._has_finished_before = False
 
@@ -943,14 +951,15 @@ async def __is_finished_function(self) -> bool:
             self._logger.info('The crawler will finish any remaining ongoing requests and shut down.')
             return True
 
-        request_manager = await self.get_request_manager()
-        is_finished = await request_manager.is_finished()
-
         if self._abort_on_error and self._failed:
             self._failed = False
             return True
 
-        return is_finished
+        if self._keep_alive:
+            return False
+
+        request_manager = await self.get_request_manager()
+        return await request_manager.is_finished()
 
     async def __is_task_ready_function(self) -> bool:
         self._stop_if_max_requests_count_exceeded()
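
The docstring change above also pins down how `keep_alive` interacts with `max_requests_per_crawl`: the crawler is kept alive only until the request limit is reached. A hedged sketch of that interaction, using the docs example's `BeautifulSoupCrawler`; the URLs, the limit of 2, and the one-second delay are illustrative only:

    import asyncio

    from crawlee._types import BasicCrawlingContext
    from crawlee.crawlers import BeautifulSoupCrawler


    async def main() -> None:
        crawler = BeautifulSoupCrawler(
            keep_alive=True,
            # Per the updated docstring, keep_alive only holds until this limit is reached.
            max_requests_per_crawl=2,
        )

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')

        async def feed_requests() -> None:
            # Illustrative requests added by "external" code while the crawler idles in keep-alive mode.
            await asyncio.sleep(1)
            await crawler.add_requests(['https://crawlee.dev', 'https://crawlee.dev/docs'])

        feed_task = asyncio.create_task(feed_requests())

        # run() returns once roughly two requests have been handled (the limit may be slightly
        # exceeded under concurrency, per the docstring), even though keep_alive=True and
        # crawler.stop() was never called.
        await crawler.run()
        await feed_task


    if __name__ == '__main__':
        asyncio.run(main())

The unit test added below exercises the same interaction with a mocked handler and `max_concurrency=1` to keep the request count deterministic.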

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 46 additions & 1 deletion
@@ -12,7 +12,7 @@
 from datetime import timedelta
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, cast
-from unittest.mock import AsyncMock, Mock
+from unittest.mock import AsyncMock, Mock, call
 
 import httpx
 import pytest
@@ -1124,3 +1124,48 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert crawler.statistics.state.requests_finished == 1
     assert mocked_handler_before_sleep.call_count == max_request_retries
     assert mocked_handler_after_sleep.call_count == 1
+
+
+@pytest.mark.parametrize(
+    ('keep_alive', 'max_requests_per_crawl', 'expected_handled_requests_count'),
+    [
+        pytest.param(True, 2, 2, id='keep_alive, 2 requests'),
+        pytest.param(True, 1, 1, id='keep_alive, but max_requests_per_crawl achieved after 1 request'),
+        pytest.param(False, 2, 0, id='Crawler without keep_alive (default), crawler finished before adding requests'),
+    ],
+)
+async def test_keep_alive(
+    *, keep_alive: bool, max_requests_per_crawl: int, expected_handled_requests_count: int
+) -> None:
+    """Test that crawler can be kept alive without any requests and stopped with `crawler.stop()`.
+
+    Crawler should stop if `max_requests_per_crawl` is reached regardless of the `keep_alive` flag."""
+    additional_urls = ['http://a.com/', 'http://b.com/']
+    expected_handler_calls = [call(url) for url in additional_urls[:expected_handled_requests_count]]
+
+    crawler = BasicCrawler(
+        keep_alive=keep_alive,
+        max_requests_per_crawl=max_requests_per_crawl,
+        # If more request can run in parallel, then max_requests_per_crawl is not deterministic.
+        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+    )
+    mocked_handler = Mock()
+
+    @crawler.router.default_handler
+    async def handler(context: BasicCrawlingContext) -> None:
+        mocked_handler(context.request.url)
+        if context.request == additional_urls[-1]:
+            crawler.stop()
+
+    crawler_run_task = asyncio.create_task(crawler.run())
+
+    # Give some time to crawler to finish(or be in keep_alive state) and add new request.
+    # TODO: Replace sleep time by waiting for specific crawler state.
+    # https://github.com/apify/crawlee-python/issues/925
+    await asyncio.sleep(1)
+    assert crawler_run_task.done() != keep_alive
+    add_request_task = asyncio.create_task(crawler.add_requests(additional_urls))
+
+    await asyncio.gather(crawler_run_task, add_request_task)
+
+    mocked_handler.assert_has_calls(expected_handler_calls)
