
Commit dee5f2f

Merge remote-tracking branch 'origin/master' into rust-filesystem-storage

2 parents: a9e5915 + 2eda280

22 files changed: 356 additions & 1489 deletions


.github/workflows/manual_release_stable.yaml

Lines changed: 3 additions & 2 deletions

@@ -157,12 +157,13 @@ jobs:
           rm -rf "versioned_docs/version-${MAJOR_MINOR}"
           rm -rf "versioned_sidebars/version-${MAJOR_MINOR}-sidebars.json"
           jq 'map(select(. != env.MAJOR_MINOR))' versions.json > tmp.json && mv tmp.json versions.json
-          # Copy changelog
-          cp ../CHANGELOG.md ../docs/changelog.md
           # Build API reference and create version snapshots
           bash build_api_reference.sh
           npx docusaurus docs:version "$MAJOR_MINOR"
           npx docusaurus api:version "$MAJOR_MINOR"
+          # Changelog is not versioned - it is copied from root at build time
+          rm -f "versioned_docs/version-${MAJOR_MINOR}/changelog.md"
+          echo "changelog.md" > "versioned_docs/version-${MAJOR_MINOR}/.gitignore"

       - name: Commit and push versioned docs
         id: commit_versioned_docs

CHANGELOG.md

Lines changed: 4 additions & 3 deletions

@@ -2,15 +2,16 @@

 All notable changes to this project will be documented in this file.

-<!-- git-cliff-unreleased-start -->
-## 1.6.2 - **not yet released**
+## [1.6.2](https://github.com/apify/crawlee-python/releases/tag/v1.6.2) (2026-04-08)

 ### 🐛 Bug Fixes

 - **file-system:** Reclaim orphaned in-progress requests on RQ recovery ([#1825](https://github.com/apify/crawlee-python/pull/1825)) ([e86794a](https://github.com/apify/crawlee-python/commit/e86794a6e5605432c9331c7cd99edf885527a3eb)) by [@vdusek](https://github.com/vdusek)
+- Prevent premature `EventManager` shutdown when multiple crawlers share it ([#1810](https://github.com/apify/crawlee-python/pull/1810)) ([2efb668](https://github.com/apify/crawlee-python/commit/2efb668ad54fb3e8d740066446563d1e8a39d2e8)) by [@Mantisus](https://github.com/Mantisus), closes [#1805](https://github.com/apify/crawlee-python/issues/1805), [#1808](https://github.com/apify/crawlee-python/issues/1808)
+- Apply SQLite optimizations to the custom `connection_string` in `SqlStorageClient` ([#1837](https://github.com/apify/crawlee-python/pull/1837)) ([8b53e27](https://github.com/apify/crawlee-python/commit/8b53e273067e27b4ef4b2b4bb40277b15ef6b058)) by [@Mantisus](https://github.com/Mantisus)
+- Apply `SharedTimeout` to post-navigation hooks ([#1839](https://github.com/apify/crawlee-python/pull/1839)) ([88bd05a](https://github.com/apify/crawlee-python/commit/88bd05a2127ebfe3cd4eb78c514a63fc9e2cd079)) by [@vdusek](https://github.com/vdusek)


-<!-- git-cliff-unreleased-end -->
 ## [1.6.1](https://github.com/apify/crawlee-python/releases/tag/v1.6.1) (2026-03-30)

 ### 🐛 Bug Fixes

pyproject.toml

Lines changed: 4 additions & 0 deletions

@@ -273,6 +273,10 @@ exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:", "assert_never()"]
 [tool.ipdb]
 context = 7

+[tool.uv]
+# Minimal defense against supply-chain attacks.
+exclude-newer = "24 hours"
+
 # Run tasks with: uv run poe <task>
 [tool.poe.tasks]
 clean = "rm -rf .coverage .pytest_cache .ruff_cache .ty_cache .uv-cache build coverage-unit.xml dist htmlcov website/.docusaurus website/.yarn website/module_shortcuts.json website/node_modules "

renovate.json

Lines changed: 1 addition & 0 deletions

@@ -21,6 +21,7 @@
       "automergeType": "branch"
     }
   ],
+  "minimumReleaseAge": "1 day",
   "schedule": ["before 7am every weekday"],
   "ignoreDeps": ["crawlee", "docusaurus-plugin-typedoc-api"]
 }

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 22 additions & 10 deletions

@@ -119,6 +119,7 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC
         """Create static content crawler context pipeline with expected pipeline steps."""
         return (
             ContextPipeline()
+            .compose(self._manage_shared_navigation_timeout)
             .compose(self._execute_pre_navigation_hooks)
             .compose(self._make_http_request)
             .compose(self._execute_post_navigation_hooks)

@@ -127,26 +128,37 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC
             .compose(self._handle_blocked_request_by_content)
         )

-    async def _execute_pre_navigation_hooks(
+    async def _manage_shared_navigation_timeout(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-        context_id = id(context)
-        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+        """Initialize and clean up the shared navigation timeout for the current request."""
+        request_id = id(context.request)
+        self._shared_navigation_timeouts[request_id] = SharedTimeout(self._navigation_timeout)

         try:
-            for hook in self._pre_navigation_hooks:
-                async with self._shared_navigation_timeouts[context_id]:
-                    await hook(context)
-
             yield context
         finally:
-            self._shared_navigation_timeouts.pop(context_id, None)
+            self._shared_navigation_timeouts.pop(request_id, None)
+
+    async def _execute_pre_navigation_hooks(
+        self, context: BasicCrawlingContext
+    ) -> AsyncGenerator[BasicCrawlingContext, None]:
+        request_id = id(context.request)
+
+        for hook in self._pre_navigation_hooks:
+            async with self._shared_navigation_timeouts[request_id]:
+                await hook(context)
+
+        yield context

     async def _execute_post_navigation_hooks(
         self, context: HttpCrawlingContext
     ) -> AsyncGenerator[HttpCrawlingContext, None]:
+        request_id = id(context.request)
+
         for hook in self._post_navigation_hooks:
-            await hook(context)
+            async with self._shared_navigation_timeouts[request_id]:
+                await hook(context)

         yield context

@@ -262,7 +274,7 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+        async with self._shared_navigation_timeouts[id(context.request)] as remaining_timeout:
             result = await self._http_client.crawl(
                 request=context.request,
                 session=context.session,
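
Per the changelog entry for [#1839](https://github.com/apify/crawlee-python/pull/1839), these hunks make the pre-navigation hooks, the HTTP request, and now also the post-navigation hooks enter the same per-request `SharedTimeout`, so every navigation step draws from a single time budget rather than each getting a fresh timeout. A minimal sketch of that budget-sharing idea; `BudgetTimeout` is a hypothetical stand-in for illustration, not crawlee's actual `SharedTimeout`:

import asyncio
from datetime import datetime, timedelta


class BudgetTimeout:
    """Async context manager that charges every entry against one shared budget.

    Hypothetical stand-in for crawlee's SharedTimeout; names and details
    here are assumptions for illustration only.
    """

    def __init__(self, budget: timedelta) -> None:
        self._remaining = budget

    async def __aenter__(self) -> timedelta:
        if self._remaining <= timedelta(0):
            raise TimeoutError('Shared navigation budget exhausted')
        self._entered_at = datetime.now()
        return self._remaining  # e.g. what a navigation step sees as remaining_timeout

    async def __aexit__(self, *exc_info: object) -> None:
        # Whatever this step consumed is gone for every later step.
        self._remaining -= datetime.now() - self._entered_at


async def main() -> None:
    timeout = BudgetTimeout(timedelta(seconds=10))

    async with timeout:  # a pre-navigation hook spends part of the budget
        await asyncio.sleep(1)

    async with timeout as remaining:  # the navigation itself gets the remainder
        print(f'navigation has ~{remaining.total_seconds():.0f}s left')  # ~9s


asyncio.run(main())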

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 24 additions & 15 deletions

@@ -768,27 +768,36 @@ def sigint_handler() -> None:
         return final_statistics

     async def _run_crawler(self) -> None:
-        event_manager = self._service_locator.get_event_manager()
+        local_event_manager = self._service_locator.get_event_manager()
+        global_event_manager = service_locator.get_event_manager()
+        if local_event_manager is global_event_manager:
+            local_event_manager = None  # Avoid entering the same event manager context twice
+
+        # The event managers are always entered.
+        contexts_to_enter: list[Any] = (
+            [global_event_manager, local_event_manager] if local_event_manager else [global_event_manager]
+        )

         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
-        contexts_to_enter = [
-            cm
-            for cm in (
-                event_manager,
-                self._snapshotter,
-                self._statistics,
-                self._session_pool if self._use_session_pool else None,
-                self._http_client,
-                self._crawler_state_rec_task,
-                *self._additional_context_managers,
-            )
-            if cm and getattr(cm, 'active', False) is False
-        ]
+        contexts_to_enter.extend(
+            [
+                cm
+                for cm in (
+                    self._snapshotter,
+                    self._statistics,
+                    self._session_pool if self._use_session_pool else None,
+                    self._http_client,
+                    self._crawler_state_rec_task,
+                    *self._additional_context_managers,
+                )
+                if cm and getattr(cm, 'active', False) is False
+            ]
+        )

         async with AsyncExitStack() as exit_stack:
             for context in contexts_to_enter:
-                await exit_stack.enter_async_context(context)  # ty: ignore[invalid-argument-type]
+                await exit_stack.enter_async_context(context)

             await self._autoscaled_pool.run()
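
The reworked `_run_crawler` now always enters the global event manager (plus the crawler-local one when it differs), and only then appends the remaining context managers that are not already active. A minimal sketch of the underlying enter-many-contexts pattern with `AsyncExitStack`, using hypothetical resources in place of the snapshotter, statistics, and so on:

import asyncio
from contextlib import AsyncExitStack, asynccontextmanager


@asynccontextmanager
async def resource(name: str):  # hypothetical stand-in for snapshotter, statistics, ...
    print(f'entering {name}')
    try:
        yield name
    finally:
        print(f'exiting {name}')


async def run_crawler() -> None:
    contexts_to_enter = [resource('event_manager'), resource('snapshotter'), resource('statistics')]

    async with AsyncExitStack() as exit_stack:
        for context in contexts_to_enter:
            await exit_stack.enter_async_context(context)

        print('crawling...')  # every resource is active here
    # The stack exits the contexts in reverse order, even if one of them raises.


asyncio.run(run_crawler())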

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 29 additions & 16 deletions

@@ -193,6 +193,7 @@ def __init__(
         # Compose the context pipeline with the Playwright-specific context enhancer.
         kwargs['_context_pipeline'] = (
             ContextPipeline()
+            .compose(self._manage_shared_navigation_timeout)
             .compose(self._open_page)
             .compose(self._navigate)
             .compose(self._execute_post_navigation_hooks)

@@ -216,6 +217,18 @@ def __init__(

         super().__init__(**kwargs)

+    async def _manage_shared_navigation_timeout(
+        self, context: BasicCrawlingContext
+    ) -> AsyncGenerator[BasicCrawlingContext, None]:
+        """Initialize and clean up the shared navigation timeout for the current request."""
+        request_id = id(context.request)
+        self._shared_navigation_timeouts[request_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(request_id, None)
+
     async def _open_page(
         self,
         context: BasicCrawlingContext,

@@ -242,21 +255,17 @@ async def _open_page(
             goto_options=GotoOptions(**self._goto_options),
         )

-        context_id = id(pre_navigation_context)
-        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+        request_id = id(pre_navigation_context.request)

-        try:
-            # Only use the page context manager here — it sets the current page in a context variable,
-            # making it accessible to PlaywrightHttpClient in subsequent pipeline steps.
-            async with browser_page_context(crawlee_page.page):
-                for hook in self._pre_navigation_hooks:
-                    async with self._shared_navigation_timeouts[context_id]:
-                        await hook(pre_navigation_context)
-
-                # Yield should be inside the browser_page_context.
-                yield pre_navigation_context
-        finally:
-            self._shared_navigation_timeouts.pop(context_id, None)
+        # Only use the page context manager here — it sets the current page in a context variable,
+        # making it accessible to PlaywrightHttpClient in subsequent pipeline steps.
+        async with browser_page_context(crawlee_page.page):
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[request_id]:
+                    await hook(pre_navigation_context)
+
+            # Yield should be inside the browser_page_context.
+            yield pre_navigation_context

@@ -329,7 +338,7 @@ async def _navigate(
         await context.page.route(context.request.url, route_handler)

         try:
-            async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            async with self._shared_navigation_timeouts[id(context.request)] as remaining_timeout:
                 response = await context.page.goto(
                     context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
                 )

@@ -496,8 +505,12 @@ async def _handle_blocked_request_by_content(
     async def _execute_post_navigation_hooks(
         self, context: PlaywrightPostNavCrawlingContext
    ) -> AsyncGenerator[PlaywrightPostNavCrawlingContext, None]:
+        request_id = id(context.request)
+
         for hook in self._post_navigation_hooks:
-            await hook(context)
+            async with self._shared_navigation_timeouts[request_id]:
+                await hook(context)
+
         yield context

     async def _create_crawling_context(
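
The other recurring change here (and in the abstract HTTP crawler above) is keying `_shared_navigation_timeouts` by `id(context.request)` rather than `id(context)`. The pipeline wraps the context into a new object at each step, so `id(context)` is not a stable key across steps, while the request object is carried through unchanged. A minimal illustration of that distinction, using hypothetical context classes:

from dataclasses import dataclass


@dataclass
class Request:
    url: str


@dataclass
class PreNavContext:  # hypothetical stand-in for an early pipeline-step context
    request: Request


@dataclass
class PostNavContext:  # a later step wraps the same request in a new context object
    request: Request
    status_code: int


request = Request(url='https://example.com')
pre = PreNavContext(request=request)
post = PostNavContext(request=request, status_code=200)

# Different wrapper objects per step, so id(context) is a different key at each step.
assert id(pre) != id(post)

# The request object travels through the whole pipeline, so id(context.request) is stable.
assert id(pre.request) == id(post.request)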

src/crawlee/events/_event_manager.py

Lines changed: 16 additions & 13 deletions

@@ -93,24 +93,20 @@ def __init__(
             delay=self._persist_state_interval,
         )

-        # Flag to indicate the context state.
-        self._active = False
+        # Reference count for active contexts.
+        self._active_ref_count = 0

     @property
     def active(self) -> bool:
         """Indicate whether the context is active."""
-        return self._active
+        return self._active_ref_count > 0

     async def __aenter__(self) -> EventManager:
-        """Initialize the event manager upon entering the async context.
+        """Initialize the event manager upon entering the async context."""
+        self._active_ref_count += 1
+        if self._active_ref_count > 1:
+            return self

-        Raises:
-            RuntimeError: If the context manager is already active.
-        """
-        if self._active:
-            raise RuntimeError(f'The {self.__class__.__name__} is already active.')
-
-        self._active = True
         self._emit_persist_state_event_rec_task.start()
         return self

@@ -127,17 +123,24 @@ async def __aexit__(
         Raises:
             RuntimeError: If the context manager is not active.
         """
-        if not self._active:
+        if not self.active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')

+        if self._active_ref_count > 1:
+            # Emit persist state event to ensure the latest state is saved before closing the context.
+            await self._emit_persist_state_event()
+            await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
+            self._active_ref_count -= 1
+            return
+
         # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
         await self._emit_persist_state_event_rec_task.stop()
         await self._emit_persist_state_event()
         await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
         self._event_emitter.remove_all_listeners()
         self._listener_tasks.clear()
         self._listeners_to_wrappers.clear()
-        self._active = False
+        self._active_ref_count -= 1

     @overload
     def on(self, *, event: Literal[Event.PERSIST_STATE], listener: EventListener[EventPersistStateData]) -> None: ...
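
This is the core of the fix for [#1810](https://github.com/apify/crawlee-python/pull/1810): instead of a boolean flag that made a second `async with` raise, the manager counts active entries, starts its background work only on the first entry, and tears down only on the last exit. A standalone sketch of the reference-counting pattern, using a hypothetical class rather than crawlee's actual `EventManager`:

import asyncio


class RefCountedManager:
    """Async context manager that several owners can safely enter and exit."""

    def __init__(self) -> None:
        self._active_ref_count = 0

    @property
    def active(self) -> bool:
        return self._active_ref_count > 0

    async def __aenter__(self) -> 'RefCountedManager':
        self._active_ref_count += 1
        if self._active_ref_count == 1:
            print('first entry: starting background work')
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        self._active_ref_count -= 1
        if self._active_ref_count == 0:
            print('last exit: tearing down')


async def main() -> None:
    manager = RefCountedManager()

    async with manager:      # e.g. entered by crawler A
        async with manager:  # e.g. entered by crawler B sharing the manager
            pass             # B's exit only decrements the count
    # Teardown runs here, once A (the last holder) has exited.


asyncio.run(main())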

src/crawlee/events/_local_event_manager.py

Lines changed: 7 additions & 2 deletions

@@ -75,7 +75,10 @@ async def __aenter__(self) -> LocalEventManager:
         It starts emitting system info events at regular intervals.
         """
         await super().__aenter__()
-        self._emit_system_info_event_rec_task.start()
+
+        if self._active_ref_count == 1:
+            self._emit_system_info_event_rec_task.start()
+
         return self

     async def __aexit__(

@@ -88,7 +91,9 @@ async def __aexit__(

         It stops emitting system info events and closes the event manager.
         """
-        await self._emit_system_info_event_rec_task.stop()
+        if self._active_ref_count == 1:
+            await self._emit_system_info_event_rec_task.stop()
+
         await super().__aexit__(exc_type, exc_value, exc_traceback)

     async def _emit_system_info_event(self) -> None:
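
`LocalEventManager` reuses the base class's reference count: `super().__aenter__()` increments it first, so a count of exactly 1 marks the outermost entry, and symmetrically in `__aexit__` it marks the last exit. A self-contained sketch of a subclass keying its own start/stop off the parent's counter; both classes are hypothetical, mirroring the pattern above:

import asyncio


class BaseManager:
    """Minimal ref-counted base, mirroring EventManager's counting."""

    def __init__(self) -> None:
        self._active_ref_count = 0

    async def __aenter__(self) -> 'BaseManager':
        self._active_ref_count += 1
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        self._active_ref_count -= 1


class LocalManager(BaseManager):
    """Starts its extra task only on the outermost enter, stops it on the last exit."""

    async def __aenter__(self) -> 'LocalManager':
        await super().__aenter__()  # the count is incremented before the check
        if self._active_ref_count == 1:
            print('outermost entry: starting system-info task')
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        if self._active_ref_count == 1:  # about to drop to zero
            print('last exit: stopping system-info task')
        await super().__aexit__(*exc_info)


async def main() -> None:
    manager = LocalManager()
    async with manager:
        async with manager:  # nested entry: no duplicate task start or stop
            pass


asyncio.run(main())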
