Skip to content

Commit f74a02e

Browse files
authored
docs: improve docs of the Configuration class (#671)
- Relates #304 - Relates #670
1 parent b230510 commit f74a02e

1 file changed

Lines changed: 27 additions & 6 deletions

File tree

src/crawlee/configuration.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,23 @@
1515

1616

1717
class Configuration(BaseSettings):
18-
"""Configuration of the Crawler.
18+
"""Configuration settings for the Crawlee project.
1919
20-
Args:
21-
internal_timeout: Timeout for internal operations such as marking a request as processed.
22-
verbose_log: Allows verbose logging.
23-
default_storage_id: The default storage ID.
24-
purge_on_start: Whether to purge the storage on start.
20+
This class stores common configurable parameters for Crawlee. Default values are provided for all settings,
21+
so typically, no adjustments are necessary. However, you may modify settings for specific use cases,
22+
such as changing the default storage directory, the default storage IDs, the timeout for internal
23+
operations, and more.
24+
25+
Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
2526
"""
2627

2728
model_config = SettingsConfigDict(populate_by_name=True)
2829

2930
internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
31+
"""Timeout for internal asynchronous operations."""
3032

3133
verbose_log: Annotated[bool, Field(alias='crawlee_verbose_log')] = False
34+
"""Whether to enable verbose logging."""
3235

3336
default_browser_path: Annotated[
3437
str | None,
@@ -39,6 +42,7 @@ class Configuration(BaseSettings):
3942
)
4043
),
4144
] = None
45+
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""
4246

4347
disable_browser_sandbox: Annotated[
4448
bool,
@@ -49,6 +53,7 @@ class Configuration(BaseSettings):
4953
)
5054
),
5155
] = False
56+
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""
5257

5358
log_level: Annotated[
5459
Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
@@ -60,6 +65,7 @@ class Configuration(BaseSettings):
6065
),
6166
BeforeValidator(lambda value: str(value).upper()),
6267
] = 'INFO'
68+
"""The logging level."""
6369

6470
default_dataset_id: Annotated[
6571
str,
@@ -71,6 +77,7 @@ class Configuration(BaseSettings):
7177
)
7278
),
7379
] = 'default'
80+
"""The default dataset ID."""
7481

7582
default_key_value_store_id: Annotated[
7683
str,
@@ -82,6 +89,7 @@ class Configuration(BaseSettings):
8289
)
8390
),
8491
] = 'default'
92+
"""The default key-value store ID."""
8593

8694
default_request_queue_id: Annotated[
8795
str,
@@ -93,6 +101,7 @@ class Configuration(BaseSettings):
93101
)
94102
),
95103
] = 'default'
104+
"""The default request queue ID."""
96105

97106
purge_on_start: Annotated[
98107
bool,
@@ -103,8 +112,10 @@ class Configuration(BaseSettings):
103112
)
104113
),
105114
] = True
115+
"""Whether to purge the storage on start."""
106116

107117
write_metadata: Annotated[bool, Field(alias='crawlee_write_metadata')] = True
118+
"""Whether to write the storage metadata."""
108119

109120
persist_storage: Annotated[
110121
bool,
@@ -115,6 +126,7 @@ class Configuration(BaseSettings):
115126
)
116127
),
117128
] = True
129+
"""Whether to persist the storage."""
118130

119131
persist_state_interval: Annotated[
120132
timedelta_ms,
@@ -125,6 +137,7 @@ class Configuration(BaseSettings):
125137
)
126138
),
127139
] = timedelta(minutes=1)
140+
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""
128141

129142
system_info_interval: Annotated[
130143
timedelta_ms,
@@ -135,6 +148,7 @@ class Configuration(BaseSettings):
135148
)
136149
),
137150
] = timedelta(seconds=1)
151+
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""
138152

139153
max_used_cpu_ratio: Annotated[
140154
float,
@@ -145,6 +159,7 @@ class Configuration(BaseSettings):
145159
)
146160
),
147161
] = 0.95
162+
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""
148163

149164
memory_mbytes: Annotated[
150165
int | None,
@@ -156,6 +171,7 @@ class Configuration(BaseSettings):
156171
)
157172
),
158173
] = None
174+
"""The maximum memory in megabytes. The `Snapshotter.max_memory_size` is set to this value."""
159175

160176
available_memory_ratio: Annotated[
161177
float,
@@ -166,6 +182,7 @@ class Configuration(BaseSettings):
166182
)
167183
),
168184
] = 0.25
185+
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""
169186

170187
storage_dir: Annotated[
171188
str,
@@ -176,6 +193,7 @@ class Configuration(BaseSettings):
176193
),
177194
),
178195
] = './storage'
196+
"""The path to the storage directory."""
179197

180198
chrome_executable_path: Annotated[
181199
str | None,
@@ -186,6 +204,7 @@ class Configuration(BaseSettings):
186204
)
187205
),
188206
] = None
207+
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""
189208

190209
headless: Annotated[
191210
bool,
@@ -196,6 +215,7 @@ class Configuration(BaseSettings):
196215
)
197216
),
198217
] = True
218+
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""
199219

200220
xvfb: Annotated[
201221
bool,
@@ -206,6 +226,7 @@ class Configuration(BaseSettings):
206226
)
207227
),
208228
] = False
229+
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""
209230

210231
@classmethod
211232
def get_global_configuration(cls) -> Self:

0 commit comments

Comments
 (0)