Commit 367899c

feat: Improve project bootstrapping (#538)
This adds a unified `crawlee/project_template` template. The original `playwright` and `beautifulsoup` templates are kept for compatibility with older versions of the CLI. The user is now prompted for the package manager type (pip, poetry), the crawler type, the start URL, and whether Apify integration should be set up.

- closes #317
- closes #414 (http client selection is now implemented)
- closes #511
- closes #495

### TODO

- [x] http client selection
- [x] disable the poetry option if it isn't installed
- [x] rectify the pip-based setup
  1. **manual dependency installation** - no automatic installation; just dump requirements.txt and tell the user to handle it any way they want
  2. **pip+venv** - dump requirements.txt, make a virtualenv (.venv) using the current Python interpreter, install the requirements and tell the user to activate it - ~~should be disabled if the `venv` module is not present~~ it's stdlib
- [x] test the whole thing on Windows (mainly the various package manager configurations)
- [x] fix how cookiecutter.json is read (it is not present when installing via pip)
1 parent a1cd757 commit 367899c
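
The upshot for users: once every answer is supplied on the command line, `crawlee create` runs fully non-interactively. A sketch of such an invocation, exercised through Typer's test runner (the flag names come from the `_cli.py` diff below; the project name is made up for illustration):

```python
from typer.testing import CliRunner

from crawlee._cli import cli

# Supplying every option up front skips all interactive prompts.
runner = CliRunner()
result = runner.invoke(
    cli,
    [
        'create', 'my-crawler',
        '--crawler-type', 'playwright',
        '--http-client', 'httpx',
        '--package-manager', 'pip',
        '--start-url', 'https://crawlee.dev',
        '--apify',
    ],
)
print(result.output)
```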

24 files changed: 595 additions & 57 deletions

pyproject.toml

Lines changed: 2 additions & 0 deletions
```diff
@@ -102,6 +102,7 @@ crawlee = "crawlee._cli:cli"
 
 [tool.ruff]
 line-length = 120
+extend-exclude = ["project_template"]
 
 [tool.ruff.lint]
 select = ["ALL"]
@@ -189,6 +190,7 @@ timeout = 1200
 [tool.mypy]
 python_version = "3.9"
 plugins = ["pydantic.mypy"]
+exclude = ["project_template"]
 files = ["src", "tests"]
 check_untyped_defs = true
 disallow_incomplete_defs = true
```
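
Both exclusions keep the new template directory out of the repository's own tooling: the files under `project_template` contain Jinja placeholders and `# %` line statements, so Ruff and mypy would otherwise choke on them.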

src/crawlee/_cli.py

Lines changed: 125 additions & 33 deletions
```diff
@@ -1,21 +1,27 @@
 # ruff: noqa: TRY301, FBT002, UP007
 from __future__ import annotations
 
-import os
+import importlib.resources
+import json
 from pathlib import Path
 from typing import Annotated, Optional, cast
 
-import httpx
 import inquirer  # type: ignore[import-untyped]
 import typer
 from cookiecutter.main import cookiecutter  # type: ignore[import-untyped]
 from inquirer.render.console import ConsoleRender  # type: ignore[import-untyped]
 from rich.progress import Progress, SpinnerColumn, TextColumn
 
-TEMPLATE_LIST_URL = 'https://api.github.com/repos/apify/crawlee-python/contents/templates'
-
 cli = typer.Typer(no_args_is_help=True)
 
+template_directory = importlib.resources.files('crawlee') / 'project_template'
+cookiecutter_json = json.load((template_directory / 'cookiecutter.json').open())
+
+crawler_choices = cookiecutter_json['crawler_type']
+http_client_choices = cookiecutter_json['http_client']
+package_manager_choices = cookiecutter_json['package_manager']
+default_start_url = cookiecutter_json['start_url']
+
 
 @cli.callback(invoke_without_command=True)
 def callback(
@@ -64,25 +70,42 @@ def _prompt_for_project_name(initial_project_name: str | None) -> str:
     return project_name
 
 
-def _prompt_for_template() -> str:
-    """Prompt the user to select a template from a list."""
-    # Fetch available templates
-    response = httpx.get(
-        TEMPLATE_LIST_URL,
-        timeout=httpx.Timeout(10),
-        headers=[('Authorization', f'Bearer {os.environ["GH_TOKEN"]}')] if 'GH_TOKEN' in os.environ else [],
+def _prompt_text(message: str, default: str) -> str:
+    return cast(
+        str,
+        ConsoleRender().render(
+            inquirer.Text(
+                name='text',
+                message=message,
+                default=default,
+                validate=lambda _, value: bool(value.strip()),
+            ),
+        ),
     )
-    response.raise_for_status()
-    template_choices = [item['name'] for item in response.json() if item['type'] == 'dir']
 
-    # Prompt for template choice
+
+def _prompt_choice(message: str, choices: list[str]) -> str:
+    """Prompt the user to pick one from a list of choices."""
     return cast(
         str,
         ConsoleRender().render(
             inquirer.List(
-                name='template',
-                message='Please select the template for your new Crawlee project',
-                choices=[(choice[0].upper() + choice[1:], choice) for choice in template_choices],
+                name='choice',
+                message=message,
+                choices=[(choice[0].upper() + choice[1:], choice) for choice in choices],
+            ),
+        ),
+    )
+
+
+def _prompt_bool(message: str, *, default: bool) -> bool:
+    return cast(
+        bool,
+        ConsoleRender().render(
+            inquirer.Confirm(
+                name='confirm',
+                message=message,
+                default=default,
             ),
         ),
     )
@@ -92,26 +115,77 @@ def _prompt_for_template() -> str:
 def create(
     project_name: Optional[str] = typer.Argument(
         default=None,
+        show_default=False,
         help='The name of the project and the directory that will be created to contain it. '
         'If none is given, you will be prompted.',
+    ),
+    crawler_type: Optional[str] = typer.Option(
+        None,
+        '--crawler-type',
+        '--template',
+        show_default=False,
+        help='The library that will be used for crawling in your crawler. If none is given, you will be prompted.',
+    ),
+    http_client: Optional[str] = typer.Option(
+        None,
+        show_default=False,
+        help='The library that will be used to make HTTP requests in your crawler. '
+        'If none is given, you will be prompted.',
+    ),
+    package_manager: Optional[str] = typer.Option(
+        default=None,
         show_default=False,
+        help='Package manager to be used in the new project. If none is given, you will be prompted.',
     ),
-    template: Optional[str] = typer.Option(
+    start_url: Optional[str] = typer.Option(
         default=None,
-        help='The template to be used to create the project. If none is given, you will be prompted.',
         show_default=False,
+        help='The URL where crawling should start. If none is given, you will be prompted.',
+    ),
+    enable_apify_integration: Optional[bool] = typer.Option(
+        None,
+        '--apify/--no-apify',
+        show_default=False,
+        help='Should Apify integration be set up for you? If not given, you will be prompted.',
     ),
 ) -> None:
     """Bootstrap a new Crawlee project."""
     try:
         # Prompt for project name if not provided.
         project_name = _prompt_for_project_name(project_name)
 
-        # Prompt for template choice if not provided.
-        if template is None:
-            template = _prompt_for_template()
+        # Prompt for crawler_type if not provided.
+        if crawler_type is None:
+            crawler_type = _prompt_choice('Please select the Crawler type', crawler_choices)
+
+        # Prompt for http_client if not provided.
+        if http_client is None:
+            http_client = _prompt_choice('Please select the HTTP client', http_client_choices)
+
+        # Prompt for package manager if not provided.
+        if package_manager is None:
+            package_manager = _prompt_choice('Please select the package manager', package_manager_choices)
+
+        # Prompt for start URL
+        if start_url is None:
+            start_url = _prompt_text('Please specify the start URL', default=default_start_url)
+
+        # Ask about Apify integration if not explicitly configured
+        if enable_apify_integration is None:
+            enable_apify_integration = _prompt_bool('Should Apify integration be set up for you?', default=False)
+
+        if all(
+            [
+                project_name,
+                crawler_type,
+                http_client,
+                package_manager,
+                start_url,
+                enable_apify_integration is not None,
+            ]
+        ):
+            package_name = project_name.replace('-', '_')
 
-        if project_name and template:
             # Start the bootstrap process.
             with Progress(
                 SpinnerColumn(),
@@ -120,21 +194,39 @@
             ) as progress:
                 progress.add_task(description='Bootstrapping...', total=None)
                 cookiecutter(
-                    template='gh:apify/crawlee-python',
-                    directory=f'templates/{template}',
+                    template=str(template_directory),
                     no_input=True,
-                    extra_context={'project_name': project_name},
+                    extra_context={
+                        'project_name': project_name,
+                        'package_manager': package_manager,
+                        'crawler_type': crawler_type,
+                        'http_client': http_client,
+                        'enable_apify_integration': enable_apify_integration,
+                        'start_url': start_url,
+                    },
                 )
 
             typer.echo(f'Your project "{project_name}" was created.')
-            typer.echo(
-                f'To run it, navigate to the directory: "cd {project_name}", '
-                'install dependencies with "poetry install", '
-                f'and run it using "poetry run python -m {project_name}".'
-            )
+
+            if package_manager == 'manual':
+                typer.echo(
+                    f'To run it, navigate to the directory: "cd {project_name}", '
+                    f'install the dependencies listed in "requirements.txt" '
+                    f'and run it using "python -m {package_name}".'
+                )
+            elif package_manager == 'pip':
+                typer.echo(
+                    f'To run it, navigate to the directory: "cd {project_name}", '
+                    f'activate the virtual environment in ".venv" ("source .venv/bin/activate") '
+                    f'and run your project using "python -m {package_name}".'
+                )
+            elif package_manager == 'poetry':
+                typer.echo(
+                    f'To run it, navigate to the directory: "cd {project_name}", '
+                    f'and run it using "poetry run python -m {package_name}".'
+                )
+
             typer.echo(f'See the "{project_name}/README.md" for more information.')
 
-    except httpx.HTTPStatusError as exc:
-        typer.echo(f'Failed to fetch templates: {exc}.', err=True)
     except KeyboardInterrupt:
         typer.echo('Operation cancelled by user.')
```
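
Note the key change here: the template now ships inside the installed package and is resolved with `importlib.resources` instead of being fetched from the GitHub API at run time, which is also what fixes the "cookiecutter.json is not present when installing via pip" item from the TODO list. A minimal standalone sketch of the lookup pattern (outside the CLI):

```python
import importlib.resources
import json

# files() resolves package data relative to the installed 'crawlee' package,
# so the lookup works the same for editable, wheel and sdist installs.
template_directory = importlib.resources.files('crawlee') / 'project_template'
config = json.loads((template_directory / 'cookiecutter.json').read_text())

print(config['crawler_type'])  # ['beautifulsoup', 'parsel', 'playwright']
```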
src/crawlee/project_template/cookiecutter.json

Lines changed: 13 additions & 0 deletions

```diff
@@ -0,0 +1,13 @@
+{
+    "project_name": "crawlee-python-project",
+    "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
+    "crawler_type": ["beautifulsoup", "parsel", "playwright"],
+    "http_client": ["httpx", "curl-impersonate"],
+    "package_manager": ["poetry", "pip", "manual"],
+    "enable_apify_integration": false,
+    "start_url": "https://crawlee.dev",
+    "_jinja2_env_vars": {
+        "line_statement_prefix": "# %"
+    },
+    "_extensions": ["jinja2.ext.do"]
+}
```
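
The `_jinja2_env_vars` entry configures cookiecutter's Jinja environment so that any line beginning with `# %` is treated as a Jinja statement. That way, template logic hides inside what Python tooling sees as ordinary comments. A minimal standalone sketch of the mechanism (not using cookiecutter itself):

```python
import jinja2

# Mirrors "line_statement_prefix": "# %" from cookiecutter.json above.
env = jinja2.Environment(line_statement_prefix='# %')

source = (
    "# % if enable_apify_integration\n"
    "from apify import Actor\n"
    "# % endif\n"
    "print('crawling')\n"
)

# The '# % if' line is consumed as a statement, not rendered as text.
print(env.from_string(source).render(enable_apify_integration=True))
```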
src/crawlee/project_template/hooks/post_gen_project.py

Lines changed: 34 additions & 0 deletions

```diff
@@ -0,0 +1,34 @@
+import platform
+import subprocess
+from pathlib import Path
+
+Path('_pyproject.toml').rename('pyproject.toml')
+
+# % if cookiecutter.package_manager == 'poetry'
+Path('requirements.txt').unlink()
+
+subprocess.check_call(['poetry', 'install'])
+# % if cookiecutter.crawler_type == 'playwright'
+subprocess.check_call(['poetry', 'run', 'playwright', 'install'])
+# % endif
+# % elif cookiecutter.package_manager == 'pip'
+import venv  # noqa: E402
+
+# Create a virtual environment
+venv_root = Path('.venv')
+venv.main([str(venv_root)])
+
+if platform.system() == 'Windows':  # noqa: SIM108
+    path = venv_root / 'Scripts'
+else:
+    path = venv_root / 'bin'
+
+# Install requirements and generate requirements.txt as an impromptu lockfile
+subprocess.check_call([str(path / 'pip'), 'install', '-r', 'requirements.txt'])
+with open('requirements.txt', 'w') as requirements_txt:
+    subprocess.check_call([str(path / 'pip'), 'freeze'], stdout=requirements_txt)
+
+# % if cookiecutter.crawler_type == 'playwright'
+subprocess.check_call([str(path / 'playwright'), 'install'])
+# % endif
+# % endif
```
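
In the pip branch, `venv.main([str(venv_root)])` behaves like running `python -m venv .venv` with the interpreter that executes the hook, and pip is bootstrapped into the new environment by default. For illustration, the same effect via the programmatic API (a sketch, not what the hook actually calls):

```python
import venv
from pathlib import Path

# Equivalent in effect to `python -m venv .venv`; with_pip=True makes the
# hook's subsequent `pip install -r requirements.txt` possible.
venv.EnvBuilder(with_pip=True).create(Path('.venv'))
```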
src/crawlee/project_template/hooks/pre_gen_project.py

Lines changed: 12 additions & 0 deletions

```diff
@@ -0,0 +1,12 @@
+# % if cookiecutter.package_manager == 'poetry'
+import subprocess
+import re
+
+try:
+    version = subprocess.check_output(['poetry', '--version']).decode().strip()
+except OSError as exc:
+    raise RuntimeError('You chose to use the Poetry package manager, but it does not seem to be installed') from exc
+
+if not re.match(r'Poetry \(version 1\..*\)', version):
+    raise RuntimeError(f'Poetry 1.x is required, but "{version}" is installed')
+# % endif
```
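
The pre-generation hook gates on Poetry 1.x, assuming `poetry --version` output of the form `Poetry (version 1.x.y)`. A quick check of the pattern against plausible outputs (version strings made up for illustration):

```python
import re

PATTERN = r'Poetry \(version 1\..*\)'

# Hypothetical outputs; the real string depends on the installed Poetry.
print(bool(re.match(PATTERN, 'Poetry (version 1.8.3)')))  # True
print(bool(re.match(PATTERN, 'Poetry (version 2.0.1)')))  # False -> RuntimeError in the hook
```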
src/crawlee/project_template/templates/main.py

Lines changed: 46 additions & 0 deletions

```diff
@@ -0,0 +1,46 @@
+# % if cookiecutter.enable_apify_integration
+from apify import Actor
+# % endif
+# % block import required
+# % endblock
+# % if cookiecutter.http_client == 'curl-impersonate'
+from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient
+# % elif cookiecutter.http_client == 'httpx'
+from crawlee.http_clients._httpx import HttpxHttpClient
+# % endif
+
+from .routes import router
+
+# % filter truncate(0, end='')
+# % block http_client_instantiation
+# % if cookiecutter.http_client == 'curl-impersonate'
+http_client=CurlImpersonateHttpClient(),
+# % elif cookiecutter.http_client == 'httpx'
+http_client=HttpxHttpClient(),
+# % endif
+# % endblock
+# % endfilter
+
+async def main() -> None:
+    """The crawler entry point."""
+    # % filter truncate(0, end='')
+    # % block instantiation required
+    # % endblock
+    # % endfilter
+
+    # % if cookiecutter.enable_apify_integration
+    async with Actor:
+        # % filter indent(width=8, first=False)
+        {{ self.instantiation() }}
+        # % endfilter
+    # % else
+    # % filter indent(width=4, first=False)
+    {{ self.instantiation() }}
+    # % endfilter
+    # % endif
+
+    await crawler.run(
+        [
+            '{{ cookiecutter.start_url }}',
+        ]
+    )
```
src/crawlee/project_template/templates/main_beautifulsoup.py

Lines changed: 12 additions & 0 deletions

```diff
@@ -0,0 +1,12 @@
+# % extends 'main.py'
+
+# % block import
+from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
+# % endblock
+
+# % block instantiation
+crawler = BeautifulSoupCrawler(
+    request_handler=router,
+    max_requests_per_crawl=50,
+    {{ self.http_client_instantiation() }})
+# % endblock
```
src/crawlee/project_template/templates/main_parsel.py

Lines changed: 12 additions & 0 deletions

```diff
@@ -0,0 +1,12 @@
+# % extends 'main.py'
+
+# % block import
+from crawlee.parsel_crawler import ParselCrawler
+# % endblock
+
+# % block instantiation
+crawler = ParselCrawler(
+    request_handler=router,
+    max_requests_per_crawl=50,
+    {{ self.http_client_instantiation() }})
+# % endblock
```
src/crawlee/project_template/templates/main_playwright.py

Lines changed: 13 additions & 0 deletions

```diff
@@ -0,0 +1,13 @@
+# % extends 'main.py'
+
+# % block import
+from crawlee.playwright_crawler import PlaywrightCrawler
+# % endblock
+
+# % block instantiation
+crawler = PlaywrightCrawler(
+    request_handler=router,
+    headless=True,
+    max_requests_per_crawl=50,
+    {{ self.http_client_instantiation() }})
+# % endblock
```
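
Putting the template pieces together: with `crawler_type=beautifulsoup`, `http_client=httpx` and Apify integration disabled, the rendered `main.py` should come out roughly as follows (a sketch of the expected expansion, not verbatim generator output; whitespace may differ):

```python
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
from crawlee.http_clients._httpx import HttpxHttpClient

from .routes import router


async def main() -> None:
    """The crawler entry point."""
    crawler = BeautifulSoupCrawler(
        request_handler=router,
        max_requests_per_crawl=50,
        http_client=HttpxHttpClient())

    await crawler.run(
        [
            'https://crawlee.dev',
        ]
    )
```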
