|
3 | 3 | from collections.abc import Iterator, MutableMapping |
4 | 4 | from datetime import datetime |
5 | 5 | from enum import IntEnum |
6 | | -from typing import TYPE_CHECKING, Annotated, Any, cast |
| 6 | +from typing import TYPE_CHECKING, Annotated, Any, TypedDict, cast |
7 | 7 |
|
8 | 8 | from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PlainSerializer, PlainValidator, TypeAdapter |
9 | 9 | from yarl import URL |
|
15 | 15 | from crawlee._utils.urls import validate_http_url |
16 | 16 |
|
17 | 17 | if TYPE_CHECKING: |
18 | | - from typing_extensions import Self |
| 18 | + from typing_extensions import NotRequired, Required, Self |
19 | 19 |
|
20 | 20 |
|
21 | 21 | class RequestState(IntEnum): |
@@ -108,27 +108,57 @@ def __eq__(self, other: object) -> bool: |
108 | 108 | user_data_adapter = TypeAdapter(UserData) |
109 | 109 |
|
110 | 110 |
|
111 | | -class BaseRequestData(BaseModel): |
112 | | - """Data needed to create a new crawling request.""" |
| 111 | +class RequestOptions(TypedDict): |
| 112 | + """Options that can be used to customize request creation. |
113 | 113 |
|
114 | | - model_config = ConfigDict(populate_by_name=True) |
| 114 | + This type exactly matches the parameters of the `Request.from_url` method. |
| 115 | + """ |
115 | 116 |
|
116 | | - url: Annotated[str, BeforeValidator(validate_http_url), Field()] |
117 | | - """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters |
118 | | - and fragments.""" |
| 117 | + url: Required[str] |
| 118 | + method: NotRequired[HttpMethod] |
| 119 | + headers: NotRequired[HttpHeaders | dict[str, str] | None] |
| 120 | + payload: NotRequired[HttpPayload | str | None] |
| 121 | + label: NotRequired[str | None] |
| 122 | + unique_key: NotRequired[str | None] |
| 123 | + id: NotRequired[str | None] |
| 124 | + keep_url_fragment: NotRequired[bool] |
| 125 | + use_extended_unique_key: NotRequired[bool] |
| 126 | + always_enqueue: NotRequired[bool] |
| 127 | + user_data: NotRequired[dict[str, JsonSerializable]] |
| 128 | + no_retry: NotRequired[bool] |
119 | 129 |
|
120 | | - unique_key: Annotated[str, Field(alias='uniqueKey')] |
121 | | - """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing |
122 | | - to the same URL. |
123 | 130 |
|
124 | | - If `unique_key` is not provided, then it is automatically generated by normalizing the URL. |
125 | | - For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key` |
126 | | - of `http://www.example.com/something`. |
| 131 | +@docs_group('Data structures') |
| 132 | +class Request(BaseModel): |
| 133 | + """Represents a request in the Crawlee framework, containing the necessary information for crawling operations. |
127 | 134 |
|
128 | | - Pass an arbitrary non-empty text value to the `unique_key` property |
129 | | - to override the default behavior and specify which URLs shall be considered equal. |
| 135 | + The `Request` class is one of the core components in Crawlee, utilized by various components such as request |
| 136 | + providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests, |
| 137 | + including the URL, HTTP method, headers, payload, and user data. The user data allows custom information |
| 138 | + to be stored and persisted throughout the request lifecycle, including its retries. |
| 139 | +
|
| 140 | + Key functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used |
| 141 | + for request deduplication, controlling retries, handling state management, and enabling configuration for session |
| 142 | + rotation and proxy handling. |
| 143 | +
|
| 144 | + The recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically |
| 145 | + generates a unique key and identifier based on the URL and request parameters. |
| 146 | +
|
| 147 | + ### Usage |
| 148 | +
|
| 149 | + ```python |
| 150 | + from crawlee import Request |
| 151 | +
|
| 152 | + request = Request.from_url('https://crawlee.dev') |
| 153 | + ``` |
130 | 154 | """ |
131 | 155 |
|
| 156 | + model_config = ConfigDict(populate_by_name=True) |
| 157 | + |
| 158 | + url: Annotated[str, BeforeValidator(validate_http_url), Field()] |
| 159 | + """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters |
| 160 | + and fragments.""" |
| 161 | + |
132 | 162 | method: HttpMethod = 'GET' |
133 | 163 | """HTTP request method.""" |
134 | 164 |
|
@@ -172,79 +202,16 @@ class BaseRequestData(BaseModel): |
172 | 202 | handled_at: Annotated[datetime | None, Field(alias='handledAt')] = None |
173 | 203 | """Timestamp when the request was handled.""" |
174 | 204 |
|
175 | | - @classmethod |
176 | | - def from_url( |
177 | | - cls, |
178 | | - url: str, |
179 | | - *, |
180 | | - method: HttpMethod = 'GET', |
181 | | - headers: HttpHeaders | dict[str, str] | None = None, |
182 | | - payload: HttpPayload | str | None = None, |
183 | | - label: str | None = None, |
184 | | - unique_key: str | None = None, |
185 | | - keep_url_fragment: bool = False, |
186 | | - use_extended_unique_key: bool = False, |
187 | | - **kwargs: Any, |
188 | | - ) -> Self: |
189 | | - """Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details.""" |
190 | | - if isinstance(headers, dict) or headers is None: |
191 | | - headers = HttpHeaders(headers or {}) |
192 | | - |
193 | | - if isinstance(payload, str): |
194 | | - payload = payload.encode() |
195 | | - |
196 | | - unique_key = unique_key or compute_unique_key( |
197 | | - url, |
198 | | - method=method, |
199 | | - headers=headers, |
200 | | - payload=payload, |
201 | | - keep_url_fragment=keep_url_fragment, |
202 | | - use_extended_unique_key=use_extended_unique_key, |
203 | | - ) |
204 | | - |
205 | | - request = cls( |
206 | | - url=url, |
207 | | - unique_key=unique_key, |
208 | | - method=method, |
209 | | - headers=headers, |
210 | | - payload=payload, |
211 | | - **kwargs, |
212 | | - ) |
213 | | - |
214 | | - if label is not None: |
215 | | - request.user_data['label'] = label |
216 | | - |
217 | | - return request |
218 | | - |
219 | | - def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None: |
220 | | - """Get the value of a specific query parameter from the URL.""" |
221 | | - query_params = URL(self.url).query |
222 | | - return query_params.get(param, default) |
223 | | - |
224 | | - |
225 | | -@docs_group('Data structures') |
226 | | -class Request(BaseRequestData): |
227 | | - """Represents a request in the Crawlee framework, containing the necessary information for crawling operations. |
228 | | -
|
229 | | - The `Request` class is one of the core components in Crawlee, utilized by various components such as request |
230 | | - providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests, |
231 | | - including the URL, HTTP method, headers, payload, and user data. The user data allows custom information |
232 | | - to be stored and persisted throughout the request lifecycle, including its retries. |
233 | | -
|
234 | | - Key functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used |
235 | | - for request deduplication, controlling retries, handling state management, and enabling configuration for session |
236 | | - rotation and proxy handling. |
237 | | -
|
238 | | - The recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically |
239 | | - generates a unique key and identifier based on the URL and request parameters. |
240 | | -
|
241 | | - ### Usage |
| 205 | + unique_key: Annotated[str, Field(alias='uniqueKey')] |
| 206 | + """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing |
| 207 | + to the same URL. |
242 | 208 |
|
243 | | - ```python |
244 | | - from crawlee import Request |
| 209 | + If `unique_key` is not provided, then it is automatically generated by normalizing the URL. |
| 210 | + For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key` |
| 211 | + of `http://www.example.com/something`. |
245 | 212 |
|
246 | | - request = Request.from_url('https://crawlee.dev') |
247 | | - ``` |
| 213 | + Pass an arbitrary non-empty text value to the `unique_key` property |
| 214 | + to override the default behavior and specify which URLs shall be considered equal. |
248 | 215 | """ |
249 | 216 |
|
250 | 217 | id: str |
@@ -331,12 +298,10 @@ def from_url( |
331 | 298 |
|
332 | 299 | return request |
333 | 300 |
|
334 | | - @classmethod |
335 | | - def from_base_request_data(cls, base_request_data: BaseRequestData, *, id: str | None = None) -> Self: |
336 | | - """Create a complete Request object based on a BaseRequestData instance.""" |
337 | | - kwargs = base_request_data.model_dump() |
338 | | - kwargs['id'] = id or unique_key_to_request_id(base_request_data.unique_key) |
339 | | - return cls(**kwargs) |
| 301 | + def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None: |
| 302 | + """Get the value of a specific query parameter from the URL.""" |
| 303 | + query_params = URL(self.url).query |
| 304 | + return query_params.get(param, default) |
340 | 305 |
|
341 | 306 | @property |
342 | 307 | def label(self) -> str | None: |
|
0 commit comments