Skip to content

Commit 9a0725b

Browse files
authored
chore: Fix new dir structure-related issues (#854)
1 parent 2e0fbff commit 9a0725b

15 files changed

Lines changed: 27 additions & 28 deletions

File tree

docs/upgrading/upgrading_to_v0x.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,16 @@ Example update:
4949
### Service locator
5050

5151
- The `crawlee.service_container` was completely refactored and renamed to `crawlee.service_locator`.
52+
- You can use it to set the configuration, event manager or storage client globally. Alternatively, you can pass them to your crawler instance directly, and it will use the service locator under the hood.
5253

5354
### Statistics
5455

5556
- The `crawlee.statistics.Statistics` class does not accept an event manager as an input argument anymore. It uses the default, global one.
57+
- If you want to set your custom event manager, do it either via the service locator or pass it to the crawler.
5658

5759
### Request
5860

59-
- Removed properties `json_` and `order_no`.
61+
- The properties `json_` and `order_no` were removed. They were there only for the internal purpose of the memory storage client; you should not need them.
6062

6163
### Request storages and loaders
6264

src/crawlee/browsers/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from ._playwright_browser_plugin import PlaywrightBrowserPlugin
55
except ImportError as exc:
66
raise ImportError(
7-
"To import anything from this subpackage, you need to install the 'playwright' extra. "
7+
"To import this, you need to install the 'playwright' extra. "
88
"For example, if you use pip, run `pip install 'crawlee[playwright]'`.",
99
) from exc
1010

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -40,23 +40,23 @@ class HttpCrawlerOptions(Generic[TCrawlingContext], BasicCrawlerOptions[TCrawlin
4040
"""Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""
4141

4242
ignore_http_error_status_codes: NotRequired[Iterable[int]]
43-
"""HTTP status codes typically considered errors but to be treated as successful responses."""
43+
"""HTTP status codes that are typically considered errors but should be treated as successful responses."""
4444

4545

4646
@docs_group('Abstract classes')
4747
class AbstractHttpCrawler(Generic[TCrawlingContext, TParseResult], BasicCrawler[TCrawlingContext], ABC):
4848
"""A web crawler for performing HTTP requests.
4949
50-
The `AbstractHttpCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. On top
51-
of that it implements the HTTP communication using the HTTP clients. The class allows integration with
52-
any HTTP client that implements the `BaseHttpClient` interface. The HTTP client is provided to the crawler
53-
as an input parameter to the constructor.
54-
AbstractHttpCrawler is generic class and is expected to be used together with specific parser that will be used to
55-
parse http response and type of expected TCrawlingContext which is available to the user function.
56-
See prepared specific version of it: BeautifulSoupCrawler, ParselCrawler or HttpCrawler for example.
50+
The `AbstractHttpCrawler` builds on top of the `BasicCrawler`, inheriting all its features. Additionally,
51+
it implements HTTP communication using HTTP clients. The class allows integration with any HTTP client
52+
that implements the `BaseHttpClient` interface, provided as an input parameter to the constructor.
5753
58-
The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. However,
59-
if you need to execute client-side JavaScript, consider using a browser-based crawler like the `PlaywrightCrawler`.
54+
`AbstractHttpCrawler` is a generic class intended to be used with a specific parser for parsing HTTP responses
55+
and the expected type of `TCrawlingContext` available to the user function. Examples of specific versions include
56+
`BeautifulSoupCrawler`, `ParselCrawler`, and `HttpCrawler`.
57+
58+
HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. For websites that
59+
require client-side JavaScript execution, consider using a browser-based crawler like the `PlaywrightCrawler`.
6060
"""
6161

6262
def __init__(

src/crawlee/crawlers/_beautifulsoup/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from ._beautifulsoup_parser import BeautifulSoupParserType
55
except ImportError as exc:
66
raise ImportError(
7-
"To import anything from this subpackage, you need to install the 'beautifulsoup' extra. "
7+
"To import this, you need to install the 'beautifulsoup' extra. "
88
"For example, if you use pip, run `pip install 'crawlee[beautifulsoup]'`.",
99
) from exc
1010

src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
3232
### Usage
3333
3434
```python
35-
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
35+
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
3636
3737
crawler = BeautifulSoupCrawler()
3838

src/crawlee/crawlers/_http/_http_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes]):
2222
### Usage
2323
2424
```python
25-
from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext
25+
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
2626
2727
crawler = HttpCrawler()
2828

src/crawlee/crawlers/_parsel/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from ._parsel_crawling_context import ParselCrawlingContext
44
except ImportError as exc:
55
raise ImportError(
6-
"To import anything from this subpackage, you need to install the 'parsel' extra. "
6+
"To import this, you need to install the 'parsel' extra. "
77
"For example, if you use pip, run `pip install 'crawlee[parsel]'`.",
88
) from exc
99

src/crawlee/crawlers/_parsel/_parsel_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector]):
3232
### Usage
3333
3434
```python
35-
from crawlee.parsel_crawler import ParselCrawler, ParselCrawlingContext
35+
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
3636
3737
crawler = ParselCrawler()
3838

src/crawlee/crawlers/_playwright/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
55
except ImportError as exc:
66
raise ImportError(
7-
"To import anything from this subpackage, you need to install the 'playwright' extra. "
7+
"To import this, you need to install the 'playwright' extra. "
88
"For example, if you use pip, run `pip install 'crawlee[playwright]'`.",
99
) from exc
1010

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]):
4545
### Usage
4646
4747
```python
48-
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
48+
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
4949
5050
crawler = PlaywrightCrawler()
5151

0 commit comments

Comments
 (0)