Commit 7a82d0c

feat: Add keep_alive flag to crawler.__init__ (#921)
### Description

Add a `keep_alive` flag to `crawler.__init__`. If True, this flag keeps the crawler alive even when there are no more requests in the queue. The crawler then waits for more requests to be added, or to be explicitly stopped by `crawler.stop()`. Also adds a test and a code example in the docs.

### Issues

- Closes: #891
1 parent 2a26f37 commit 7a82d0c

4 files changed

Lines changed: 122 additions & 6 deletions
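
In practice the flag is used roughly like this — a minimal sketch based on the commit's own docs example; the 10-second timeout guard and the empty starting queue are illustrative only, not part of the commit:

    import asyncio

    from crawlee._types import BasicCrawlingContext
    from crawlee.crawlers import BeautifulSoupCrawler


    async def main() -> None:
        # keep_alive=True: the crawler keeps polling the request queue instead of
        # finishing as soon as the queue is empty.
        crawler = BeautifulSoupCrawler(keep_alive=True)

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')

        async def stop_later() -> None:
            # Illustrative guard condition: stop the keep-alive crawler after 10 seconds.
            await asyncio.sleep(10)
            crawler.stop()

        stop_task = asyncio.create_task(stop_later())

        # Starts with an empty queue; external code may call crawler.add_requests([...]) later.
        await crawler.run()
        await stop_task


    if __name__ == '__main__':
        asyncio.run(main())

The full docs example below uses a URL-based guard condition instead of a timeout and adds requests from background tasks.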


Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
import asyncio

from crawlee._types import BasicCrawlingContext
from crawlee.crawlers import BeautifulSoupCrawler


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Keep the crawler alive even when there are no requests to be processed at the moment.
        keep_alive=True,
    )

    def stop_crawler_if_url_visited(context: BasicCrawlingContext) -> None:
        """Stop the crawler once a specific URL is visited. Just an example of a guard condition to stop the crawler."""
        if context.request.url == 'https://crawlee.dev/docs/examples':
            crawler.stop('Stop crawler that was in keep_alive state after specific url was visited')
        else:
            context.log.info('keep_alive=True, waiting for more requests to come.')

    async def add_request_later(url: str, after_s: int) -> None:
        """Add requests to the queue after some time. This can be done by external code."""
        # Just an example of a request being added to the crawler later, while it is waiting due to `keep_alive=True`.
        await asyncio.sleep(after_s)
        await crawler.add_requests([url])

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BasicCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Stop the crawler if some guard condition has been met.
        stop_crawler_if_url_visited(context)

    # Start some tasks that will add requests later, to simulate a real situation
    # where requests are added later by external code.
    add_request_later_task1 = asyncio.create_task(add_request_later(url='https://crawlee.dev', after_s=1))
    add_request_later_task2 = asyncio.create_task(add_request_later(url='https://crawlee.dev/docs/examples', after_s=5))

    # Run the crawler without an initial list of requests.
    # It will wait for more requests to be added to the queue later due to `keep_alive=True`.
    await crawler.run()

    await asyncio.gather(add_request_later_task1, add_request_later_task2)


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
---
id: crawler-keep-alive
title: Keep a Crawler alive waiting for more requests
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import BeautifulSoupExample from '!!raw-loader!./code/beautifulsoup_crawler_keep_alive.py';

This example demonstrates how to keep a crawler alive even when there are no requests at the moment, by using the `keep_alive=True` argument of <ApiLink to="class/BasicCrawler#__init__">`BasicCrawler.__init__`</ApiLink>. This is available to all crawlers that inherit from <ApiLink to="class/BasicCrawler">`BasicCrawler`</ApiLink>; the example below shows it on <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>. To stop a crawler that was started with `keep_alive=True`, call `crawler.stop()`.

<CodeBlock className="language-python">
    {BeautifulSoupExample}
</CodeBlock>

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 14 additions & 5 deletions
@@ -134,6 +134,9 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     configure_logging: NotRequired[bool]
     """If True, the crawler will set up logging infrastructure automatically."""
 
+    keep_alive: NotRequired[bool]
+    """Flag that can keep crawler running even when there are no requests in queue."""
+
     _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]
     """Enables extending the request lifecycle and modifying the crawling context. Intended for use by
     subclasses rather than direct instantiation of `BasicCrawler`."""
@@ -195,6 +198,7 @@ def __init__(
         request_handler_timeout: timedelta = timedelta(minutes=1),
         statistics: Statistics | None = None,
         abort_on_error: bool = False,
+        keep_alive: bool = False,
         configure_logging: bool = True,
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
         _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
@@ -215,7 +219,8 @@ def __init__(
             max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
                 this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
                 no limit. Due to concurrency settings, the actual number of pages visited may slightly exceed
-                this value.
+                this value. If used together with `keep_alive`, then the crawler will be kept alive only until
+                `max_requests_per_crawl` is achieved.
             max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                 if a proxy error occurs or if the website blocks the request.
             max_crawl_depth: Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond
@@ -228,6 +233,8 @@ def __init__(
             request_handler_timeout: Maximum duration allowed for a single request handler to run.
             statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
             abort_on_error: If True, the crawler stops immediately when any request handler error occurs.
+            keep_alive: If True, it will keep crawler alive even if there are no requests in queue.
+                Use `crawler.stop()` to exit the crawler.
             configure_logging: If True, the crawler will set up logging infrastructure automatically.
             _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                 Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
@@ -314,6 +321,7 @@ def __init__(
         )
 
         # State flags
+        self._keep_alive = keep_alive
         self._running = False
         self._has_finished_before = False
 
@@ -943,14 +951,15 @@ async def __is_finished_function(self) -> bool:
             self._logger.info('The crawler will finish any remaining ongoing requests and shut down.')
             return True
 
-        request_manager = await self.get_request_manager()
-        is_finished = await request_manager.is_finished()
-
         if self._abort_on_error and self._failed:
             self._failed = False
             return True
 
-        return is_finished
+        if self._keep_alive:
+            return False
+
+        request_manager = await self.get_request_manager()
+        return await request_manager.is_finished()
 
     async def __is_task_ready_function(self) -> bool:
         self._stop_if_max_requests_count_exceeded()
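
The docstring change above also pins down how `keep_alive` interacts with `max_requests_per_crawl`: the crawler is kept alive only until the request limit is reached. A hedged sketch of that interaction, using the docs example's `BeautifulSoupCrawler`; the URLs, the limit of 2, and the one-second delay are illustrative only:

    import asyncio

    from crawlee._types import BasicCrawlingContext
    from crawlee.crawlers import BeautifulSoupCrawler


    async def main() -> None:
        crawler = BeautifulSoupCrawler(
            keep_alive=True,
            # Per the updated docstring, keep_alive only holds until this limit is reached.
            max_requests_per_crawl=2,
        )

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url} ...')

        async def feed_requests() -> None:
            # Illustrative requests added by "external" code while the crawler idles in keep-alive mode.
            await asyncio.sleep(1)
            await crawler.add_requests(['https://crawlee.dev', 'https://crawlee.dev/docs'])

        feed_task = asyncio.create_task(feed_requests())

        # run() returns once roughly two requests have been handled (the limit may be slightly
        # exceeded under concurrency, per the docstring), even though keep_alive=True and
        # crawler.stop() was never called.
        await crawler.run()
        await feed_task


    if __name__ == '__main__':
        asyncio.run(main())

The unit test added below exercises the same interaction with a mocked handler and `max_concurrency=1` to keep the request count deterministic.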

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 46 additions & 1 deletion
@@ -12,7 +12,7 @@
 from datetime import timedelta
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, cast
-from unittest.mock import AsyncMock, Mock
+from unittest.mock import AsyncMock, Mock, call
 
 import httpx
 import pytest
@@ -1124,3 +1124,48 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert crawler.statistics.state.requests_finished == 1
     assert mocked_handler_before_sleep.call_count == max_request_retries
     assert mocked_handler_after_sleep.call_count == 1
+
+
+@pytest.mark.parametrize(
+    ('keep_alive', 'max_requests_per_crawl', 'expected_handled_requests_count'),
+    [
+        pytest.param(True, 2, 2, id='keep_alive, 2 requests'),
+        pytest.param(True, 1, 1, id='keep_alive, but max_requests_per_crawl achieved after 1 request'),
+        pytest.param(False, 2, 0, id='Crawler without keep_alive (default), crawler finished before adding requests'),
+    ],
+)
+async def test_keep_alive(
+    *, keep_alive: bool, max_requests_per_crawl: int, expected_handled_requests_count: int
+) -> None:
+    """Test that crawler can be kept alive without any requests and stopped with `crawler.stop()`.
+
+    Crawler should stop if `max_requests_per_crawl` is reached regardless of the `keep_alive` flag."""
+    additional_urls = ['http://a.com/', 'http://b.com/']
+    expected_handler_calls = [call(url) for url in additional_urls[:expected_handled_requests_count]]
+
+    crawler = BasicCrawler(
+        keep_alive=keep_alive,
+        max_requests_per_crawl=max_requests_per_crawl,
+        # If more request can run in parallel, then max_requests_per_crawl is not deterministic.
+        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+    )
+    mocked_handler = Mock()
+
+    @crawler.router.default_handler
+    async def handler(context: BasicCrawlingContext) -> None:
+        mocked_handler(context.request.url)
+        if context.request == additional_urls[-1]:
+            crawler.stop()
+
+    crawler_run_task = asyncio.create_task(crawler.run())
+
+    # Give some time to crawler to finish(or be in keep_alive state) and add new request.
+    # TODO: Replace sleep time by waiting for specific crawler state.
+    # https://github.com/apify/crawlee-python/issues/925
+    await asyncio.sleep(1)
+    assert crawler_run_task.done() != keep_alive
+    add_request_task = asyncio.create_task(crawler.add_requests(additional_urls))
+
+    await asyncio.gather(crawler_run_task, add_request_task)
+
+    mocked_handler.assert_has_calls(expected_handler_calls)
