Skip to content

Commit 5d689f6

Browse files
authored
docs: Add Storages & Request loaders guides (#896)
- Consolidate "Request storage" & "Result storage" into a single "Storages" guide and further improves it. - Add a new "Request loaders" guide. - This can be transferred to the Crawlee JS once the request loaders are implemented there as well. - Add Mermaid support to Docusaurus. - Closes: #833
1 parent 4f12dda commit 5d689f6

25 files changed

Lines changed: 2059 additions & 1075 deletions

docs/guides/code/request_storage/rl_basic_example.py renamed to docs/guides/code/request_loaders/rl_basic_example.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@ async def main() -> None:
88
# Leave name empty to use the default request list.
99
request_list = RequestList(
1010
name='my-request-list',
11-
requests=['https://apify.com/', 'https://crawlee.dev/', 'https://crawlee.dev/python/'],
11+
requests=[
12+
'https://apify.com/',
13+
'https://crawlee.dev/',
14+
'https://crawlee.dev/python/',
15+
],
1216
)
1317

1418
# Fetch and process requests from the queue.
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import asyncio
2+
3+
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
4+
from crawlee.request_loaders import RequestList
5+
6+
7+
async def main() -> None:
8+
# Create a static request list.
9+
request_list = RequestList(['https://crawlee.dev', 'https://apify.com'])
10+
11+
# Convert the request list to a request manager using the to_tandem method.
12+
# It is a tandem with the default request queue.
13+
request_manager = await request_list.to_tandem()
14+
15+
# Create a crawler and pass the request manager to it.
16+
crawler = ParselCrawler(request_manager=request_manager)
17+
18+
@crawler.router.default_handler
19+
async def handler(context: ParselCrawlingContext) -> None:
20+
# New links will be enqueued directly to the queue.
21+
await context.enqueue_links()
22+
23+
await crawler.run()
24+
25+
26+
asyncio.run(main())

docs/guides/code/request_storage/tandem_example_explicit.py renamed to docs/guides/code/request_loaders/tandem_example_explicit.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,22 @@
66

77

88
async def main() -> None:
9-
# Create a static request list
9+
# Create a static request list.
1010
request_list = RequestList(['https://crawlee.dev', 'https://apify.com'])
1111

12-
# Open the default request queue
12+
# Open the default request queue.
1313
request_queue = await RequestQueue.open()
1414

15-
crawler = ParselCrawler(
16-
# Requests from the list will be processed first, but they will be enqueued in the default request queue first
17-
request_manager=RequestManagerTandem(request_list, request_queue),
18-
)
15+
# And combine them into a single request manager.
16+
request_manager = RequestManagerTandem(request_list, request_queue)
17+
18+
# Create a crawler and pass the request manager to it.
19+
crawler = ParselCrawler(request_manager=request_manager)
1920

2021
@crawler.router.default_handler
2122
async def handler(context: ParselCrawlingContext) -> None:
22-
await context.enqueue_links() # New links will be enqueued directly to the queue
23+
# New links will be enqueued directly to the queue.
24+
await context.enqueue_links()
2325

2426
await crawler.run()
2527

docs/guides/code/request_storage/rl_with_crawler_example.py

Lines changed: 0 additions & 37 deletions
This file was deleted.

docs/guides/code/request_storage/tandem_example.py

Lines changed: 0 additions & 23 deletions
This file was deleted.

docs/guides/code/request_storage/do_not_purge_example.py renamed to docs/guides/code/storages/cleaning_do_not_purge_example.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@
55

66

77
async def main() -> None:
8+
# Set the purge_on_start field to False to avoid purging the storage on start.
89
# highlight-next-line
9-
config = Configuration(purge_on_start=False)
10-
crawler = HttpCrawler(configuration=config)
10+
configuration = Configuration(purge_on_start=False)
11+
12+
# Pass the configuration to the crawler.
13+
crawler = HttpCrawler(configuration=configuration)
1114

1215
@crawler.router.default_handler
1316
async def request_handler(context: HttpCrawlingContext) -> None:

docs/guides/code/request_storage/purge_explicitly_example.py renamed to docs/guides/code/storages/cleaning_purge_explicitly_example.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
11
import asyncio
22

3+
from crawlee.crawlers import HttpCrawler
34
from crawlee.storage_clients import MemoryStorageClient
45

56

67
async def main() -> None:
78
storage_client = MemoryStorageClient.from_config()
9+
10+
# Call the purge_on_start method to explicitly purge the storage.
811
# highlight-next-line
912
await storage_client.purge_on_start()
1013

14+
# Pass the storage client to the crawler.
15+
crawler = HttpCrawler(storage_client=storage_client)
16+
17+
# ...
18+
1119

1220
if __name__ == '__main__':
1321
asyncio.run(main())
File renamed without changes.

docs/guides/code/result_storage/dataset_with_crawler_example.py renamed to docs/guides/code/storages/dataset_with_crawler_example.py

File renamed without changes.

docs/guides/code/result_storage/dataset_with_crawler_explicit_example.py renamed to docs/guides/code/storages/dataset_with_crawler_explicit_example.py

File renamed without changes.

0 commit comments

Comments
 (0)