Commit 638b8ea

Refactoring

1 parent 77f85ed  commit 638b8ea

2 files changed: 79 additions & 40 deletions

scripts/check-urls.py

Lines changed: 17 additions & 35 deletions
@@ -11,7 +11,6 @@
 from github_job_summary import JobSummary
 from subdomains import Subdomains
 from curl_wrapper import EXIT_CODES as CURL_EXIT_CODES
-from curl_wrapper import CurlWrapper
 from url_checker import UrlChecker

 """
@@ -21,9 +20,9 @@
 Check them with CURL
 """

-JOIN_TIMEOUT_SEC = 120
+JOIN_TIMEOUT_SEC: int = 120

-CURL_EXIT_CODES_AND_HTTP_CODES = {
+CURL_EXIT_CODES_AND_HTTP_CODES: dict[str, tuple[int, int | None]] = {
     "https://api.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400),
     "https://api.aspose.cloud/v3.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404),
     "https://api.aspose.cloud/v4.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404),
@@ -45,7 +44,7 @@
     ]
 )

-IGNORE_DOMAINS = Subdomains(
+IGNORE_DOMAINS: Subdomains = Subdomains(
     [
         ".android.com",
         ".apache.org",
@@ -82,10 +81,10 @@
     ]
 )

-URL_END_CHARS = r",#\)\"'<>\*\s\\"
-URL_RE_PATTERN = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS)
+URL_END_CHARS: str = r",#\)\"'<>\*\s\\"
+URL_RE_PATTERN: str = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS)
 # print(URL_RE_PATTERN)
-EXTRACT_URL_REGEX = re.compile(URL_RE_PATTERN, re.MULTILINE)
+EXTRACT_URL_REGEX: re.Pattern[str] = re.compile(URL_RE_PATTERN, re.MULTILINE)

 # URL : [Files]
 EXTRACTED_URLS_WITH_FILES: dict[str, list[str]] = {k: [] for k in URLS_TO_IGNORE}
@@ -129,7 +128,7 @@ def url_extractor(text: str, filename: str) -> typing.Generator[str, None, None]
         EXTRACTED_URLS_WITH_FILES[url].append(filename)


-FILES_TO_IGNORE = frozenset(
+FILES_TO_IGNORE: frozenset[str] = frozenset(
     [
         ".jar",
         ".jar",
@@ -154,38 +153,13 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None,
         raise


-def process_finished_task(task) -> None:
-    # print("Finish task:", task.url)
-    expected_ret_code, expected_http_code = CURL_EXIT_CODES_AND_HTTP_CODES.get(task.url, (0, None))
-    if task.ret_code == 0 or task.ret_code == expected_ret_code:
-        print("OK:", "'%s' %.2fs" % (task.url, task.age))
-        JOB_SUMMARY.add_success(task.url)
-        return
-
-    if task.ret_code == CURL_EXIT_CODES.HTTP_RETURNED_ERROR and expected_http_code:
-        # Try parse stderr for HTTP code
-        match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr)
-        assert match, "Unexpected output: %s" % task.stderr
-        http_code = int(match.groupdict()["http_code"])
-        if http_code == expected_http_code:
-            print("OK HTTP:", "'%s' %.2fs" % (task.url, task.age))
-            JOB_SUMMARY.add_success(task.url)
-            return
-
-    print(
-        "Expected %d got %d for '%s': %s" % (expected_ret_code, task.ret_code, task.url, task.stderr),
-        file=sys.stderr,
-    )
-    JOB_SUMMARY.add_error(f"Broken URL '{task.url}': {task.stderr}Files: {EXTRACTED_URLS_WITH_FILES[task.url]}")
-
-
-JOB_SUMMARY = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md"))
+JOB_SUMMARY: JobSummary = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md"))
 JOB_SUMMARY.add_header("Test all URLs")


 def main(files: list[str]) -> int:
     url_checker = UrlChecker(
-        on_finish=process_finished_task,
+        expectations=CURL_EXIT_CODES_AND_HTTP_CODES,
     )

     # Setup signal handlers for graceful shutdown
@@ -212,6 +186,14 @@ def _handle_signal(_sig: int, _frame: typing.Any) -> None:
             flush=True,
         )

+    # Collect results and write summary
+    for res in url_checker.results:
+        if res.ok:
+            JOB_SUMMARY.add_success(res.url)
+        else:
+            files = EXTRACTED_URLS_WITH_FILES.get(res.url, [])
+            JOB_SUMMARY.add_error(f"Broken URL '{res.url}': {res.stderr}Files: {files}")
+
     JOB_SUMMARY.finalize("Checked {total} failed **{failed}**\nGood={success}")
     if JOB_SUMMARY.has_errors:
         print(JOB_SUMMARY, file=sys.stderr, flush=True)
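
For reference, a minimal standalone sketch of how the URL-extraction pattern above behaves. The sample text and the printed output are illustrative only, not taken from the repository:

import re

# Same pattern as in check-urls.py: capture everything after http(s):// up to a
# terminating character (comma, bracket, quote, whitespace, ...); the terminator
# itself is optionally consumed so it never ends up inside the captured URL.
URL_END_CHARS = r",#\)\"'<>\*\s\\"
URL_RE_PATTERN = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS)
EXTRACT_URL_REGEX = re.compile(URL_RE_PATTERN, re.MULTILINE)

sample = "Docs: https://api.aspose.cloud/v3.0) and http://example.com, see README."
print(EXTRACT_URL_REGEX.findall(sample))
# ['https://api.aspose.cloud/v3.0', 'http://example.com']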

scripts/url_checker.py

Lines changed: 62 additions & 5 deletions
@@ -1,9 +1,23 @@
 import contextlib
+import sys
 import time
+from dataclasses import dataclass
 from queue import Queue, Empty
 from typing import Callable, Optional

-from curl_wrapper import CurlWrapper
+from curl_wrapper import CurlWrapper, EXIT_CODES
+
+
+@dataclass
+class CheckResult:
+    url: str
+    ok: bool
+    ret_code: int
+    age: float
+    stderr: str
+    expected_ret_code: int
+    expected_http_code: int | None
+    http_code: int | None


 class UrlChecker:
@@ -12,18 +26,19 @@ def __init__(
         *,
         num_workers: int = 8,
         hard_kill_sec: int = 15,
-        on_finish: Optional[Callable[[CurlWrapper], None]] = None,
+        expectations: dict[str, tuple[int, int | None]] | None = None,
         worker_factory: Optional[Callable[[str], CurlWrapper]] = None,
     ) -> None:
         self.num_workers = num_workers
         self.hard_kill_sec = hard_kill_sec
-        self.on_finish = on_finish
+        self.expectations = expectations or {}
         self.worker_factory = worker_factory or (lambda url: CurlWrapper(url))

         self.queue: Queue[str | None] = Queue()
         self.workers: list[CurlWrapper | None] = [None for _ in range(self.num_workers)]
         self.stop_event = False
         self.next_report_age_sec = 5
+        self.results: list[CheckResult] = []

     def add_url(self, url: str) -> None:
         self.queue.put_nowait(url)
@@ -53,8 +68,7 @@ def run(self) -> None:
                 if task is None:
                     continue
                 if not task.running:
-                    if self.on_finish is not None:
-                        self.on_finish(task)
+                    self._process_finished(task)
                     self.workers[i] = None
                 elif task.age > self.next_report_age_sec:
                     print("Long request: '%s' %.2fs" % (task.url, task.age))
@@ -80,3 +94,46 @@ def run(self) -> None:
                     self.workers[i] = self.worker_factory(url)
             time.sleep(0.2)
         print("Worker finished")
+
+    def _process_finished(self, task: CurlWrapper) -> None:
+        expected_ret_code, expected_http_code = self.expectations.get(task.url, (0, None))
+
+        ok: bool = False
+        http_code_val: int | None = None
+        stderr_out: str = task.stderr
+
+        # Fast path: exact expected ret code or success
+        if task.ret_code == 0 or task.ret_code == expected_ret_code:
+            print("OK:", "'%s' %.2fs" % (task.url, task.age))
+            ok = True
+            stderr_out = ""
+        else:
+            # If curl reports HTTP error (22), attempt to parse HTTP code to compare
+            if task.ret_code == EXIT_CODES.HTTP_RETURNED_ERROR and expected_http_code:
+                match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr)
+                assert match, "Unexpected output: %s" % task.stderr
+                http_code_val = int(match.groupdict()["http_code"])
+                if http_code_val == expected_http_code:
+                    print("OK HTTP:", "'%s' %.2fs" % (task.url, task.age))
+                    ok = True
+
+            if not ok:
+                # Otherwise, report error
+                print(
+                    "Expected %d got %d for '%s': %s" % (expected_ret_code, task.ret_code, task.url, task.stderr),
+                    file=sys.stderr,
+                )
+
+        # Append exactly once
+        self.results.append(
+            CheckResult(
+                url=task.url,
+                ok=ok,
+                ret_code=task.ret_code,
+                age=task.age,
+                stderr=stderr_out,
+                expected_ret_code=expected_ret_code,
+                expected_http_code=expected_http_code,
+                http_code=http_code_val,
+            )
+        )
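
Taken together, the refactoring replaces the on_finish callback with a declarative expectations mapping plus a collected results list. Below is a minimal, hypothetical sketch of the new flow; FakeTask and the sample URLs are invented for illustration, and only the attributes that _process_finished actually reads (url, ret_code, age, stderr) are provided:

from dataclasses import dataclass

from curl_wrapper import EXIT_CODES
from url_checker import UrlChecker


@dataclass
class FakeTask:
    # Hypothetical stand-in for a finished CurlWrapper task; not part of the repository.
    url: str
    ret_code: int
    age: float = 0.1
    stderr: str = ""


checker = UrlChecker(
    # This URL is allowed to fail with curl's HTTP_RETURNED_ERROR and HTTP 404.
    expectations={"https://api.aspose.cloud/v3.0": (EXIT_CODES.HTTP_RETURNED_ERROR, 404)},
)

# Exercise the result classification directly, bypassing the worker threads.
checker._process_finished(FakeTask("https://example.com", ret_code=0))
checker._process_finished(FakeTask("https://no-such-host.invalid", ret_code=6, stderr="curl: (6) ..."))

for res in checker.results:
    print(res.url, res.ok, res.ret_code)
# https://example.com True 0
# https://no-such-host.invalid False 6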
