Commit 638b8ea

Refactoring

1 parent 77f85ed  commit 638b8ea

2 files changed: 79 additions & 40 deletions

scripts/check-urls.py

Lines changed: 17 additions & 35 deletions
@@ -11,7 +11,6 @@
 from github_job_summary import JobSummary
 from subdomains import Subdomains
 from curl_wrapper import EXIT_CODES as CURL_EXIT_CODES
-from curl_wrapper import CurlWrapper
 from url_checker import UrlChecker

 """
@@ -21,9 +20,9 @@
 Check them with CURL
 """

-JOIN_TIMEOUT_SEC = 120
+JOIN_TIMEOUT_SEC: int = 120

-CURL_EXIT_CODES_AND_HTTP_CODES = {
+CURL_EXIT_CODES_AND_HTTP_CODES: dict[str, tuple[int, int | None]] = {
     "https://api.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400),
     "https://api.aspose.cloud/v3.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404),
     "https://api.aspose.cloud/v4.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404),
@@ -45,7 +44,7 @@
     ]
 )

-IGNORE_DOMAINS = Subdomains(
+IGNORE_DOMAINS: Subdomains = Subdomains(
     [
         ".android.com",
         ".apache.org",
@@ -82,10 +81,10 @@
     ]
 )

-URL_END_CHARS = r",#\)\"'<>\*\s\\"
-URL_RE_PATTERN = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS)
+URL_END_CHARS: str = r",#\)\"'<>\*\s\\"
+URL_RE_PATTERN: str = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS)
 # print(URL_RE_PATTERN)
-EXTRACT_URL_REGEX = re.compile(URL_RE_PATTERN, re.MULTILINE)
+EXTRACT_URL_REGEX: re.Pattern[str] = re.compile(URL_RE_PATTERN, re.MULTILINE)

 # URL : [Files]
 EXTRACTED_URLS_WITH_FILES: dict[str, list[str]] = {k: [] for k in URLS_TO_IGNORE}
@@ -129,7 +128,7 @@ def url_extractor(text: str, filename: str) -> typing.Generator[str, None, None]
         EXTRACTED_URLS_WITH_FILES[url].append(filename)


-FILES_TO_IGNORE = frozenset(
+FILES_TO_IGNORE: frozenset[str] = frozenset(
     [
         ".jar",
         ".jar",
@@ -154,38 +153,13 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None,
         raise


-def process_finished_task(task) -> None:
-    # print("Finish task:", task.url)
-    expected_ret_code, expected_http_code = CURL_EXIT_CODES_AND_HTTP_CODES.get(task.url, (0, None))
-    if task.ret_code == 0 or task.ret_code == expected_ret_code:
-        print("OK:", "'%s' %.2fs" % (task.url, task.age))
-        JOB_SUMMARY.add_success(task.url)
-        return
-
-    if task.ret_code == CURL_EXIT_CODES.HTTP_RETURNED_ERROR and expected_http_code:
-        # Try parse stderr for HTTP code
-        match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr)
-        assert match, "Unexpected output: %s" % task.stderr
-        http_code = int(match.groupdict()["http_code"])
-        if http_code == expected_http_code:
-            print("OK HTTP:", "'%s' %.2fs" % (task.url, task.age))
-            JOB_SUMMARY.add_success(task.url)
-            return
-
-    print(
-        "Expected %d got %d for '%s': %s" % (expected_ret_code, task.ret_code, task.url, task.stderr),
-        file=sys.stderr,
-    )
-    JOB_SUMMARY.add_error(f"Broken URL '{task.url}': {task.stderr}Files: {EXTRACTED_URLS_WITH_FILES[task.url]}")
-
-
-JOB_SUMMARY = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md"))
+JOB_SUMMARY: JobSummary = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md"))
 JOB_SUMMARY.add_header("Test all URLs")


 def main(files: list[str]) -> int:
     url_checker = UrlChecker(
-        on_finish=process_finished_task,
+        expectations=CURL_EXIT_CODES_AND_HTTP_CODES,
     )

     # Setup signal handlers for graceful shutdown
@@ -212,6 +186,14 @@ def _handle_signal(_sig: int, _frame: typing.Any) -> None:
             flush=True,
         )

+    # Collect results and write summary
+    for res in url_checker.results:
+        if res.ok:
+            JOB_SUMMARY.add_success(res.url)
+        else:
+            files = EXTRACTED_URLS_WITH_FILES.get(res.url, [])
+            JOB_SUMMARY.add_error(f"Broken URL '{res.url}': {res.stderr}Files: {files}")
+
     JOB_SUMMARY.finalize("Checked {total} failed **{failed}**\nGood={success}")
     if JOB_SUMMARY.has_errors:
         print(JOB_SUMMARY, file=sys.stderr, flush=True)
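
For reference, a minimal standalone sketch of how the URL-extraction pattern above behaves. The sample text and the printed output are illustrative only, not taken from the repository:

import re

# Same pattern as in check-urls.py: capture everything after http(s):// up to a
# terminating character (comma, bracket, quote, whitespace, ...); the terminator
# itself is optionally consumed so it never ends up inside the captured URL.
URL_END_CHARS = r",#\)\"'<>\*\s\\"
URL_RE_PATTERN = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS)
EXTRACT_URL_REGEX = re.compile(URL_RE_PATTERN, re.MULTILINE)

sample = "Docs: https://api.aspose.cloud/v3.0) and http://example.com, see README."
print(EXTRACT_URL_REGEX.findall(sample))
# ['https://api.aspose.cloud/v3.0', 'http://example.com']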

scripts/url_checker.py

Lines changed: 62 additions & 5 deletions
@@ -1,9 +1,23 @@
 import contextlib
+import sys
 import time
+from dataclasses import dataclass
 from queue import Queue, Empty
 from typing import Callable, Optional

-from curl_wrapper import CurlWrapper
+from curl_wrapper import CurlWrapper, EXIT_CODES
+
+
+@dataclass
+class CheckResult:
+    url: str
+    ok: bool
+    ret_code: int
+    age: float
+    stderr: str
+    expected_ret_code: int
+    expected_http_code: int | None
+    http_code: int | None


 class UrlChecker:
@@ -12,18 +26,19 @@ def __init__(
         *,
         num_workers: int = 8,
         hard_kill_sec: int = 15,
-        on_finish: Optional[Callable[[CurlWrapper], None]] = None,
+        expectations: dict[str, tuple[int, int | None]] | None = None,
         worker_factory: Optional[Callable[[str], CurlWrapper]] = None,
     ) -> None:
         self.num_workers = num_workers
         self.hard_kill_sec = hard_kill_sec
-        self.on_finish = on_finish
+        self.expectations = expectations or {}
         self.worker_factory = worker_factory or (lambda url: CurlWrapper(url))

         self.queue: Queue[str | None] = Queue()
         self.workers: list[CurlWrapper | None] = [None for _ in range(self.num_workers)]
         self.stop_event = False
         self.next_report_age_sec = 5
+        self.results: list[CheckResult] = []

     def add_url(self, url: str) -> None:
         self.queue.put_nowait(url)
@@ -53,8 +68,7 @@ def run(self) -> None:
                 if task is None:
                     continue
                 if not task.running:
-                    if self.on_finish is not None:
-                        self.on_finish(task)
+                    self._process_finished(task)
                     self.workers[i] = None
                 elif task.age > self.next_report_age_sec:
                     print("Long request: '%s' %.2fs" % (task.url, task.age))
@@ -80,3 +94,46 @@ def run(self) -> None:
                     self.workers[i] = self.worker_factory(url)
             time.sleep(0.2)
         print("Worker finished")
+
+    def _process_finished(self, task: CurlWrapper) -> None:
+        expected_ret_code, expected_http_code = self.expectations.get(task.url, (0, None))
+
+        ok: bool = False
+        http_code_val: int | None = None
+        stderr_out: str = task.stderr
+
+        # Fast path: exact expected ret code or success
+        if task.ret_code == 0 or task.ret_code == expected_ret_code:
+            print("OK:", "'%s' %.2fs" % (task.url, task.age))
+            ok = True
+            stderr_out = ""
+        else:
+            # If curl reports HTTP error (22), attempt to parse HTTP code to compare
+            if task.ret_code == EXIT_CODES.HTTP_RETURNED_ERROR and expected_http_code:
+                match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr)
+                assert match, "Unexpected output: %s" % task.stderr
+                http_code_val = int(match.groupdict()["http_code"])
+                if http_code_val == expected_http_code:
+                    print("OK HTTP:", "'%s' %.2fs" % (task.url, task.age))
+                    ok = True
+
+            if not ok:
+                # Otherwise, report error
+                print(
+                    "Expected %d got %d for '%s': %s" % (expected_ret_code, task.ret_code, task.url, task.stderr),
+                    file=sys.stderr,
+                )
+
+        # Append exactly once
+        self.results.append(
+            CheckResult(
+                url=task.url,
+                ok=ok,
+                ret_code=task.ret_code,
+                age=task.age,
+                stderr=stderr_out,
+                expected_ret_code=expected_ret_code,
+                expected_http_code=expected_http_code,
+                http_code=http_code_val,
+            )
+        )
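
Taken together, the refactoring replaces the on_finish callback with a declarative expectations mapping plus a collected results list. Below is a minimal, hypothetical sketch of the new flow; FakeTask and the sample URLs are invented for illustration, and only the attributes that _process_finished actually reads (url, ret_code, age, stderr) are provided:

from dataclasses import dataclass

from curl_wrapper import EXIT_CODES
from url_checker import UrlChecker


@dataclass
class FakeTask:
    # Hypothetical stand-in for a finished CurlWrapper task; not part of the repository.
    url: str
    ret_code: int
    age: float = 0.1
    stderr: str = ""


checker = UrlChecker(
    # This URL is allowed to fail with curl's HTTP_RETURNED_ERROR and HTTP 404.
    expectations={"https://api.aspose.cloud/v3.0": (EXIT_CODES.HTTP_RETURNED_ERROR, 404)},
)

# Exercise the result classification directly, bypassing the worker threads.
checker._process_finished(FakeTask("https://example.com", ret_code=0))
checker._process_finished(FakeTask("https://no-such-host.invalid", ret_code=6, stderr="curl: (6) ..."))

for res in checker.results:
    print(res.url, res.ok, res.ret_code)
# https://example.com True 0
# https://no-such-host.invalid False 6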
