Skip to content

Commit 0a99d7f

Browse files
authored
fix: Fix unhandled ValueError in request handler result processing (#666)
Previously, the size limit was only checked when committing the result, i.e. after the request handler had already been considered successful. I added a check that raises the error while the request handler is still running, so an oversized data item causes the request to fail instead of being committed.
1 parent c1d8c0b commit 0a99d7f

3 files changed

Lines changed: 22 additions & 5 deletions

File tree

src/crawlee/_types.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,10 @@ async def push_data(
417417
**kwargs: Unpack[PushDataKwargs],
418418
) -> None:
419419
"""Track a call to the `push_data` context helper."""
420+
from crawlee.storages._dataset import Dataset
421+
422+
await Dataset.check_and_serialize(data)
423+
420424
self.push_data_calls.append(
421425
PushDataFunctionCall(
422426
data=data,

src/crawlee/storages/_dataset.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -257,11 +257,11 @@ async def push_data(self, data: JsonSerializable, **kwargs: Unpack[PushDataKwarg
257257
"""
258258
# Handle singular items
259259
if not isinstance(data, list):
260-
items = await self._check_and_serialize(data)
260+
items = await self.check_and_serialize(data)
261261
return await self._resource_client.push_items(items, **kwargs)
262262

263263
# Handle lists
264-
payloads_generator = (await self._check_and_serialize(item, index) for index, item in enumerate(data))
264+
payloads_generator = (await self.check_and_serialize(item, index) for index, item in enumerate(data))
265265

266266
# Invoke client in series to preserve the order of data
267267
async for items in self._chunk_by_size(payloads_generator):
@@ -415,7 +415,8 @@ async def iterate_items(
415415
):
416416
yield item
417417

418-
async def _check_and_serialize(self, item: JsonSerializable, index: int | None = None) -> str:
418+
@classmethod
419+
async def check_and_serialize(cls, item: JsonSerializable, index: int | None = None) -> str:
419420
"""Serializes a given item to JSON, checks its serializability and size against a limit.
420421
421422
Args:
@@ -436,8 +437,8 @@ async def _check_and_serialize(self, item: JsonSerializable, index: int | None =
436437
raise ValueError(f'Data item{s}is not serializable to JSON.') from exc
437438

438439
payload_size = ByteSize(len(payload.encode('utf-8')))
439-
if payload_size > self._EFFECTIVE_LIMIT_SIZE:
440-
raise ValueError(f'Data item{s}is too large (size: {payload_size}, limit: {self._EFFECTIVE_LIMIT_SIZE})')
440+
if payload_size > cls._EFFECTIVE_LIMIT_SIZE:
441+
raise ValueError(f'Data item{s}is too large (size: {payload_size}, limit: {cls._EFFECTIVE_LIMIT_SIZE})')
441442

442443
return payload
443444

tests/unit/basic_crawler/test_basic_crawler.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,18 @@ async def handler(context: BasicCrawlingContext) -> None:
645645
assert exported_json_str == expected_json_str
646646

647647

648+
async def test_crawler_push_data_over_limit() -> None:
649+
crawler = BasicCrawler()
650+
651+
@crawler.router.default_handler
652+
async def handler(context: BasicCrawlingContext) -> None:
653+
# Push a roughly 15MB payload - this should be enough to break the 9MB limit
654+
await context.push_data({'hello': 'world' * 3 * 1024 * 1024})
655+
656+
stats = await crawler.run(['http://example.tld/1'])
657+
assert stats.requests_failed == 1
658+
659+
648660
async def test_context_update_kv_store() -> None:
649661
crawler = BasicCrawler()
650662

0 commit comments

Comments
 (0)