2121 from crawlee .http_crawler ._http_crawling_context import HttpCrawlingContext
2222
2323
# Payload, e.g. data for a form submission. Shared by the POST tests below,
# which verify it round-trips through httpbin.org/post unchanged.
PAYLOAD = dict(
    custname='John Doe',
    custtel='1234567890',
    custemail='johndoe@example.com',
    size='large',
    topping='["bacon", "cheese", "mushroom"]',
    delivery='13:00',
    comments='Please ring the doorbell upon arrival.',
)
34+
35+
@pytest.fixture
async def mock_request_handler() -> Callable[[HttpCrawlingContext], Awaitable[None]] | AsyncMock:
    """Return a fresh `AsyncMock` to stand in for a crawler request handler."""
    return AsyncMock()
@@ -214,21 +226,9 @@ async def test_http_status_statistics(crawler: HttpCrawler, server: respx.MockRo
214226 [CurlImpersonateHttpClient , HttpxHttpClient ],
215227 ids = ['curl' , 'httpx' ],
216228)
async def test_sending_payload_as_raw_data(http_client_class: type[BaseHttpClient]) -> None:
    """Send a URL-encoded payload as raw body data and verify httpbin echoes it back verbatim."""
    http_client = http_client_class()
    crawler = HttpCrawler(http_client=http_client)
    responses = []

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        response = json.loads(context.http_response.read())
        # The httpbin.org/post endpoint returns the provided payload in the response.
        responses.append(response)

    encoded_payload = urlencode(PAYLOAD).encode()
    request = Request.from_url(
        url='https://httpbin.org/post',
        method='POST',
        payload=encoded_payload,
    )

    await crawler.run([request])

    assert len(responses) == 1, 'Request handler should be called exactly once.'
    assert responses[0]['data'].encode() == encoded_payload, 'Response payload data does not match the sent payload.'

    # The reconstructed payload data should match the original payload. We have to flatten the values, because
    # parse_qs returns a list of values for each key.
    response_data = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(responses[0]['data']).items()}
    assert response_data == PAYLOAD, 'The reconstructed payload data does not match the sent payload.'

    # Without a form content-type header, httpbin must not interpret the body as JSON or form fields.
    assert responses[0]['json'] is None, 'Response JSON data should be empty when only raw data is sent.'
    assert responses[0]['form'] == {}, 'Response form data should be empty when only raw data is sent.'
258259
259260
@pytest.mark.parametrize(
    'http_client_class',
    [CurlImpersonateHttpClient, HttpxHttpClient],
    ids=['curl', 'httpx'],
)
async def test_sending_payload_as_form_data(http_client_class: type[BaseHttpClient]) -> None:
    """Send a URL-encoded payload with a form content-type and verify httpbin parses it as form fields."""
    http_client = http_client_class()
    crawler = HttpCrawler(http_client=http_client)
    responses = []

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        response = json.loads(context.http_response.read())
        # The httpbin.org/post endpoint returns the provided payload in the response.
        responses.append(response)

    request = Request.from_url(
        url='https://httpbin.org/post',
        method='POST',
        headers={'content-type': 'application/x-www-form-urlencoded'},
        payload=urlencode(PAYLOAD).encode(),
    )

    await crawler.run([request])

    assert len(responses) == 1, 'Request handler should be called exactly once.'
    assert responses[0]['form'] == PAYLOAD, 'Form data in response does not match the sent payload.'

    # When the body is parsed as form fields, it must not also surface as JSON or raw data.
    assert responses[0]['json'] is None, 'Response JSON data should be empty when only form data is sent.'
    assert responses[0]['data'] == '', 'Response raw data should be empty when only form data is sent.'
291+
292+
@pytest.mark.parametrize(
    'http_client_class',
    [CurlImpersonateHttpClient, HttpxHttpClient],
    ids=['curl', 'httpx'],
)
async def test_sending_payload_as_json(http_client_class: type[BaseHttpClient]) -> None:
    """Send a JSON-encoded payload with a JSON content-type and verify httpbin decodes it."""
    http_client = http_client_class()
    crawler = HttpCrawler(http_client=http_client)
    responses = []

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        response = json.loads(context.http_response.read())
        # The httpbin.org/post endpoint returns the provided payload in the response.
        responses.append(response)

    json_payload = json.dumps(PAYLOAD).encode()
    request = Request.from_url(
        url='https://httpbin.org/post',
        method='POST',
        payload=json_payload,
        headers={'content-type': 'application/json'},
    )

    await crawler.run([request])

    assert len(responses) == 1, 'Request handler should be called exactly once.'
    assert responses[0]['data'].encode() == json_payload, 'Response raw JSON data does not match the sent payload.'
    assert responses[0]['json'] == PAYLOAD, 'Response JSON data does not match the sent payload.'

    # A JSON body must not be interpreted as URL-encoded form fields.
    assert responses[0]['form'] == {}, 'Response form data should be empty when only JSON data is sent.'
324+
325+
326+ @pytest .mark .parametrize (
327+ 'http_client_class' ,
328+ [CurlImpersonateHttpClient , HttpxHttpClient ],
329+ ids = ['curl' , 'httpx' ],
330+ )
331+ async def test_sending_url_query_params (http_client_class : type [BaseHttpClient ]) -> None :
332+ http_client = http_client_class ()
333+ crawler = HttpCrawler (http_client = http_client )
269334 responses = []
270335
271336 @crawler .router .default_handler
@@ -280,11 +345,7 @@ async def request_handler(context: HttpCrawlingContext) -> None:
280345
281346 await crawler .run ([request ])
282347
283- # The request handler should be called once.
284- assert len (responses ) == 1 , 'The request handler should be called once.'
348+ assert len (responses ) == 1 , 'Request handler should be called exactly once.'
285349
286- # Validate the response query parameters.
287350 response_args = responses [0 ]['args' ]
288- assert (
289- response_args == query_params
290- ), 'The reconstructed query parameters should match the original query parameters.'
351+ assert response_args == query_params , 'Reconstructed query params must match the original query params.'
0 commit comments