Skip to content

Commit 77f85ed

Browse files
committed
Split into files
1 parent 48594fe commit 77f85ed

3 files changed

Lines changed: 196 additions & 121 deletions

File tree

scripts/check-urls.py

Lines changed: 28 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,18 @@
11
import contextlib
22
import fileinput
3+
import signal
34
import os
45
import re
5-
import subprocess
66
import sys
77
import threading
8-
import time
98
import typing
109
import urllib.parse
11-
from queue import Queue, Empty
1210

1311
from github_job_summary import JobSummary
1412
from subdomains import Subdomains
13+
from curl_wrapper import EXIT_CODES as CURL_EXIT_CODES
14+
from curl_wrapper import CurlWrapper
15+
from url_checker import UrlChecker
1516

1617
"""
1718
Read file names from stdin (feed from git ls-files)
@@ -20,33 +21,16 @@
2021
Check them with CURL
2122
"""
2223

23-
# To avoid 403 responses
24-
USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
25-
26-
CONNECT_TIMEOUT_SEC = 5
27-
MAX_TIME_SEC = 10
2824
JOIN_TIMEOUT_SEC = 120
2925

30-
31-
class Curl:
32-
"""
33-
See: https://curl.se/libcurl/c/libcurl-errors.html
34-
"""
35-
36-
CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P<http_code>\d+)")
37-
OK = 0
38-
COULDNT_RESOLVE_HOST = 6
39-
HTTP_RETURNED_ERROR = 22
40-
41-
4226
CURL_EXIT_CODES_AND_HTTP_CODES = {
43-
"https://api.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400),
44-
"https://api.aspose.cloud/v3.0": (Curl.HTTP_RETURNED_ERROR, 404),
45-
"https://api.aspose.cloud/v4.0": (Curl.HTTP_RETURNED_ERROR, 404),
46-
"https://api.aspose.cloud/v4.0/": (Curl.HTTP_RETURNED_ERROR, 404),
47-
"https://id.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400),
27+
"https://api.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400),
28+
"https://api.aspose.cloud/v3.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404),
29+
"https://api.aspose.cloud/v4.0": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404),
30+
"https://api.aspose.cloud/v4.0/": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404),
31+
"https://id.aspose.cloud/connect/token": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 400),
4832
# TODO: Temporary fix
49-
"https://dashboard.aspose.cloud/applications": (Curl.HTTP_RETURNED_ERROR, 404),
33+
"https://dashboard.aspose.cloud/applications": (CURL_EXIT_CODES.HTTP_RETURNED_ERROR, 404),
5034
}
5135

5236
REGEX_TO_IGNORE: list[re.Pattern[str]] = [
@@ -170,69 +154,17 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None,
170154
raise
171155

172156

173-
class Task:
174-
_proc: subprocess.Popen[bytes]
175-
_stderr: str | None
176-
177-
def __init__(self, url: str):
178-
self.url = url
179-
self._proc = subprocess.Popen(
180-
[
181-
"curl",
182-
"-sSf",
183-
"--output",
184-
"-",
185-
"--connect-timeout",
186-
str(CONNECT_TIMEOUT_SEC),
187-
"--max-time",
188-
str(MAX_TIME_SEC),
189-
"--user-agent",
190-
USER_AGENT,
191-
self.url,
192-
],
193-
stdout=open(os.devnull, "w"),
194-
stderr=subprocess.PIPE,
195-
)
196-
self._stderr = None
197-
self._started = time.time()
198-
199-
@property
200-
def running(self) -> bool:
201-
return self._proc.poll() is None
202-
203-
@property
204-
def ret_code(self) -> int:
205-
assert not self.running
206-
return self._proc.returncode
207-
208-
@property
209-
def stderr(self) -> str:
210-
assert not self.running
211-
if self._stderr is None:
212-
self._stderr = self._proc.stderr.read().decode()
213-
return self._stderr
214-
215-
@property
216-
def age(self) -> float:
217-
return time.time() - self._started
218-
219-
220-
def create_new_task(url: str) -> Task:
221-
# print("Create task:", url)
222-
return Task(url)
223-
224-
225-
def process_finished_task(task: Task) -> None:
157+
def process_finished_task(task) -> None:
226158
# print("Finish task:", task.url)
227159
expected_ret_code, expected_http_code = CURL_EXIT_CODES_AND_HTTP_CODES.get(task.url, (0, None))
228160
if task.ret_code == 0 or task.ret_code == expected_ret_code:
229161
print("OK:", "'%s' %.2fs" % (task.url, task.age))
230162
JOB_SUMMARY.add_success(task.url)
231163
return
232164

233-
if task.ret_code == Curl.HTTP_RETURNED_ERROR and expected_http_code:
165+
if task.ret_code == CURL_EXIT_CODES.HTTP_RETURNED_ERROR and expected_http_code:
234166
# Try parse stderr for HTTP code
235-
match = Curl.CURL_STDERR_HTTP_RE.match(task.stderr)
167+
match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr)
236168
assert match, "Unexpected output: %s" % task.stderr
237169
http_code = int(match.groupdict()["http_code"])
238170
if http_code == expected_http_code:
@@ -247,56 +179,31 @@ def process_finished_task(task: Task) -> None:
247179
JOB_SUMMARY.add_error(f"Broken URL '{task.url}': {task.stderr}Files: {EXTRACTED_URLS_WITH_FILES[task.url]}")
248180

249181

250-
WORKER_QUEUE: Queue[str | None] = Queue()
251-
252-
253-
def url_checker(num_workers: int = 8) -> None:
254-
next_report_age_sec = 5
255-
workers: list[Task | None] = [None for _ in range(num_workers)]
256-
257-
queue_is_empty = False
258-
259-
while not queue_is_empty or any(workers):
260-
for i, task in enumerate(workers):
261-
if task is None:
262-
continue
263-
if not task.running:
264-
process_finished_task(task)
265-
workers[i] = None
266-
elif task.age > next_report_age_sec:
267-
print("Long request: '%s' %.2fs" % (task.url, task.age))
268-
next_report_age_sec += 3
269-
270-
if not queue_is_empty:
271-
for i in (i for (i, w) in enumerate(workers) if w is None):
272-
# Avoid blocking forever if the queue is currently empty
273-
try:
274-
item = WORKER_QUEUE.get_nowait()
275-
except Empty:
276-
break
277-
if item is None:
278-
queue_is_empty = True
279-
print("--- url queue is over ---")
280-
break
281-
url = item
282-
workers[i] = create_new_task(url)
283-
time.sleep(0.2)
284-
print("Worker finished")
285-
286-
287182
JOB_SUMMARY = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md"))
288183
JOB_SUMMARY.add_header("Test all URLs")
289184

290185

291186
def main(files: list[str]) -> int:
292-
checker = threading.Thread(target=url_checker, daemon=True)
187+
url_checker = UrlChecker(
188+
on_finish=process_finished_task,
189+
)
190+
191+
# Setup signal handlers for graceful shutdown
192+
def _handle_signal(_sig: int, _frame: typing.Any) -> None:
193+
url_checker.stop()
194+
195+
with contextlib.suppress(Exception):
196+
signal.signal(signal.SIGINT, _handle_signal)
197+
signal.signal(signal.SIGTERM, _handle_signal)
198+
199+
checker = threading.Thread(target=url_checker.run, daemon=True)
293200
checker.start()
294201

295202
for filename, text in text_extractor(files):
296203
for url in url_extractor(text, filename):
297204
# print("In:", url)
298-
WORKER_QUEUE.put_nowait(url)
299-
WORKER_QUEUE.put_nowait(None)
205+
url_checker.add_url(url)
206+
url_checker.close()
300207
checker.join(timeout=JOIN_TIMEOUT_SEC)
301208
if checker.is_alive():
302209
print(

scripts/curl_wrapper.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import contextlib
2+
import os
3+
import re
4+
import subprocess
5+
import time
6+
from typing import Optional
7+
8+
# To avoid 403 responses (default); caller may override per instance
9+
DEFAULT_USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
10+
11+
12+
class EXIT_CODES:
13+
OK = 0
14+
COULDNT_RESOLVE_HOST = 6
15+
HTTP_RETURNED_ERROR = 22
16+
17+
18+
class CurlWrapper:
19+
"""
20+
Encapsulates a single curl execution with timeouts and helpers.
21+
See: https://curl.se/libcurl/c/libcurl-errors.html
22+
"""
23+
24+
CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P<http_code>\d+)")
25+
26+
def __init__(
27+
self,
28+
url: str,
29+
*,
30+
user_agent: str = DEFAULT_USER_AGENT,
31+
connect_timeout: int = 5,
32+
max_time: int = 10,
33+
) -> None:
34+
self.url = url
35+
self._stderr: Optional[str] = None
36+
self._started = time.time()
37+
self._proc = subprocess.Popen(
38+
[
39+
"curl",
40+
"-sSf",
41+
"--output",
42+
"-",
43+
"--connect-timeout",
44+
str(connect_timeout),
45+
"--max-time",
46+
str(max_time),
47+
"--user-agent",
48+
user_agent,
49+
self.url,
50+
],
51+
stdout=open(os.devnull, "w"),
52+
stderr=subprocess.PIPE,
53+
)
54+
55+
@property
56+
def running(self) -> bool:
57+
return self._proc.poll() is None
58+
59+
@property
60+
def ret_code(self) -> int:
61+
assert not self.running
62+
return self._proc.returncode
63+
64+
@property
65+
def stderr(self) -> str:
66+
assert not self.running
67+
if self._stderr is None:
68+
assert self._proc.stderr is not None
69+
self._stderr = self._proc.stderr.read().decode()
70+
return self._stderr
71+
72+
@property
73+
def age(self) -> float:
74+
return time.time() - self._started
75+
76+
def terminate(self, timeout: float | None = None) -> None:
77+
try:
78+
self._proc.terminate()
79+
if timeout is not None:
80+
self._proc.wait(timeout=timeout)
81+
except Exception:
82+
pass
83+
84+
def kill(self) -> None:
85+
with contextlib.suppress(Exception):
86+
self._proc.kill()

scripts/url_checker.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import contextlib
2+
import time
3+
from queue import Queue, Empty
4+
from typing import Callable, Optional
5+
6+
from curl_wrapper import CurlWrapper
7+
8+
9+
class UrlChecker:
10+
def __init__(
11+
self,
12+
*,
13+
num_workers: int = 8,
14+
hard_kill_sec: int = 15,
15+
on_finish: Optional[Callable[[CurlWrapper], None]] = None,
16+
worker_factory: Optional[Callable[[str], CurlWrapper]] = None,
17+
) -> None:
18+
self.num_workers = num_workers
19+
self.hard_kill_sec = hard_kill_sec
20+
self.on_finish = on_finish
21+
self.worker_factory = worker_factory or (lambda url: CurlWrapper(url))
22+
23+
self.queue: Queue[str | None] = Queue()
24+
self.workers: list[CurlWrapper | None] = [None for _ in range(self.num_workers)]
25+
self.stop_event = False
26+
self.next_report_age_sec = 5
27+
28+
def add_url(self, url: str) -> None:
29+
self.queue.put_nowait(url)
30+
31+
def close(self) -> None:
32+
self.queue.put_nowait(None)
33+
34+
def stop(self) -> None:
35+
self.stop_event = True
36+
with contextlib.suppress(Exception):
37+
self.queue.put_nowait(None)
38+
39+
def run(self) -> None:
40+
queue_is_empty = False
41+
while not queue_is_empty or any(self.workers):
42+
# Graceful stop: cancel running curls
43+
if self.stop_event:
44+
queue_is_empty = True
45+
for t in self.workers:
46+
if t is not None and t.running:
47+
t.terminate(timeout=1)
48+
if t.running:
49+
t.kill()
50+
51+
# Tick workers
52+
for i, task in enumerate(self.workers):
53+
if task is None:
54+
continue
55+
if not task.running:
56+
if self.on_finish is not None:
57+
self.on_finish(task)
58+
self.workers[i] = None
59+
elif task.age > self.next_report_age_sec:
60+
print("Long request: '%s' %.2fs" % (task.url, task.age))
61+
self.next_report_age_sec += 3
62+
if task.age > self.hard_kill_sec:
63+
task.terminate(timeout=2)
64+
if task.running:
65+
task.kill()
66+
print("Killed long request: '%s' %.2fs" % (task.url, task.age))
67+
68+
# Fill idle workers
69+
if not queue_is_empty:
70+
for i in (i for (i, w) in enumerate(self.workers) if w is None):
71+
try:
72+
item = self.queue.get_nowait()
73+
except Empty:
74+
break
75+
if item is None:
76+
queue_is_empty = True
77+
print("--- url queue is over ---")
78+
break
79+
url = item
80+
self.workers[i] = self.worker_factory(url)
81+
time.sleep(0.2)
82+
print("Worker finished")

0 commit comments

Comments
 (0)