 import contextlib
 import fileinput
+import signal
 import os
 import re
-import subprocess
 import sys
 import threading
-import time
 import typing
 import urllib.parse
-from queue import Queue, Empty

 from github_job_summary import JobSummary
 from subdomains import Subdomains
+from curl_wrapper import EXIT_CODES as CURL_EXIT_CODES
+from curl_wrapper import CurlWrapper
+from url_checker import UrlChecker

 """
 Read file names from stdin (feed from git ls-files)
 Check them with CURL
 """

-# To avoid 403 responses
-USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
-
-CONNECT_TIMEOUT_SEC = 5
-MAX_TIME_SEC = 10
 JOIN_TIMEOUT_SEC = 120

-
-class Curl:
-    """
-    See: https://curl.se/libcurl/c/libcurl-errors.html
-    """
-
-    CURL_STDERR_HTTP_RE = re.compile(r"^curl: \(22\) The requested URL returned error: (?P<http_code>\d+)")
-    OK = 0
-    COULDNT_RESOLVE_HOST = 6
-    HTTP_RETURNED_ERROR = 22
-
-
 CURL_EXIT_CODES_AND_HTTP_CODES = {
-    "https://api.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400),
-    "https://api.aspose.cloud/v3.0": (Curl.HTTP_RETURNED_ERROR, 404),
-    "https://api.aspose.cloud/v4.0": (Curl.HTTP_RETURNED_ERROR, 404),
-    "https://api.aspose.cloud/v4.0/": (Curl.HTTP_RETURNED_ERROR, 404),
-    "https://id.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400),
27+ "https://api.aspose.cloud/connect/token" : (CURL_EXIT_CODES .HTTP_RETURNED_ERROR , 400 ),
28+ "https://api.aspose.cloud/v3.0" : (CURL_EXIT_CODES .HTTP_RETURNED_ERROR , 404 ),
29+ "https://api.aspose.cloud/v4.0" : (CURL_EXIT_CODES .HTTP_RETURNED_ERROR , 404 ),
30+ "https://api.aspose.cloud/v4.0/" : (CURL_EXIT_CODES .HTTP_RETURNED_ERROR , 404 ),
31+ "https://id.aspose.cloud/connect/token" : (CURL_EXIT_CODES .HTTP_RETURNED_ERROR , 400 ),
4832 # TODO: Temporary fix
49- "https://dashboard.aspose.cloud/applications" : (Curl .HTTP_RETURNED_ERROR , 404 ),
33+ "https://dashboard.aspose.cloud/applications" : (CURL_EXIT_CODES .HTTP_RETURNED_ERROR , 404 ),
5034}
5135
5236REGEX_TO_IGNORE : list [re .Pattern [str ]] = [
@@ -170,69 +154,17 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None,
         raise


-class Task:
-    _proc: subprocess.Popen[bytes]
-    _stderr: str | None
-
-    def __init__(self, url: str):
-        self.url = url
-        self._proc = subprocess.Popen(
-            [
-                "curl",
-                "-sSf",
-                "--output",
-                "-",
-                "--connect-timeout",
-                str(CONNECT_TIMEOUT_SEC),
-                "--max-time",
-                str(MAX_TIME_SEC),
-                "--user-agent",
-                USER_AGENT,
-                self.url,
-            ],
-            stdout=open(os.devnull, "w"),
-            stderr=subprocess.PIPE,
-        )
-        self._stderr = None
-        self._started = time.time()
-
-    @property
-    def running(self) -> bool:
-        return self._proc.poll() is None
-
-    @property
-    def ret_code(self) -> int:
-        assert not self.running
-        return self._proc.returncode
-
-    @property
-    def stderr(self) -> str:
-        assert not self.running
-        if self._stderr is None:
-            self._stderr = self._proc.stderr.read().decode()
-        return self._stderr
-
-    @property
-    def age(self) -> float:
-        return time.time() - self._started
-
-
-def create_new_task(url: str) -> Task:
-    # print("Create task:", url)
-    return Task(url)
-
-
-def process_finished_task(task: Task) -> None:
+def process_finished_task(task) -> None:
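+    # The task is presumably a CurlWrapper instance; this function only relies
+    # on it exposing url, ret_code, stderr and age.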
     # print("Finish task:", task.url)
     expected_ret_code, expected_http_code = CURL_EXIT_CODES_AND_HTTP_CODES.get(task.url, (0, None))
     if task.ret_code == 0 or task.ret_code == expected_ret_code:
         print("OK:", "'%s' %.2fs" % (task.url, task.age))
         JOB_SUMMARY.add_success(task.url)
         return

-    if task.ret_code == Curl.HTTP_RETURNED_ERROR and expected_http_code:
+    if task.ret_code == CURL_EXIT_CODES.HTTP_RETURNED_ERROR and expected_http_code:
         # Try to parse the HTTP code out of stderr
-        match = Curl.CURL_STDERR_HTTP_RE.match(task.stderr)
+        match = CurlWrapper.CURL_STDERR_HTTP_RE.match(task.stderr)
         assert match, "Unexpected output: %s" % task.stderr
         http_code = int(match.groupdict()["http_code"])
         if http_code == expected_http_code:
@@ -247,56 +179,31 @@ def process_finished_task(task: Task) -> None:
     JOB_SUMMARY.add_error(f"Broken URL '{task.url}': {task.stderr} Files: {EXTRACTED_URLS_WITH_FILES[task.url]}")


-WORKER_QUEUE: Queue[str | None] = Queue()
-
-
-def url_checker(num_workers: int = 8) -> None:
-    next_report_age_sec = 5
-    workers: list[Task | None] = [None for _ in range(num_workers)]
-
-    queue_is_empty = False
-
-    while not queue_is_empty or any(workers):
-        for i, task in enumerate(workers):
-            if task is None:
-                continue
-            if not task.running:
-                process_finished_task(task)
-                workers[i] = None
-            elif task.age > next_report_age_sec:
-                print("Long request: '%s' %.2fs" % (task.url, task.age))
-                next_report_age_sec += 3
-
-        if not queue_is_empty:
-            for i in (i for (i, w) in enumerate(workers) if w is None):
-                # Avoid blocking forever if the queue is currently empty
-                try:
-                    item = WORKER_QUEUE.get_nowait()
-                except Empty:
-                    break
-                if item is None:
-                    queue_is_empty = True
-                    print("--- url queue is over ---")
-                    break
-                url = item
-                workers[i] = create_new_task(url)
-        time.sleep(0.2)
-    print("Worker finished")
-
-
 JOB_SUMMARY = JobSummary(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md"))
 JOB_SUMMARY.add_header("Test all URLs")


 def main(files: list[str]) -> int:
-    checker = threading.Thread(target=url_checker, daemon=True)
+    url_checker = UrlChecker(
+        on_finish=process_finished_task,
+    )
+
+    # Set up signal handlers for graceful shutdown
+    def _handle_signal(_sig: int, _frame: typing.Any) -> None:
+        url_checker.stop()
+
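+    # signal.signal() raises ValueError when called outside the main thread,
+    # hence the blanket suppress below.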
+    with contextlib.suppress(Exception):
+        signal.signal(signal.SIGINT, _handle_signal)
+        signal.signal(signal.SIGTERM, _handle_signal)
+
+    checker = threading.Thread(target=url_checker.run, daemon=True)
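+    # Daemon thread: if the checker hangs past the join timeout below, it
+    # won't keep the interpreter alive.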
     checker.start()

     for filename, text in text_extractor(files):
         for url in url_extractor(text, filename):
             # print("In:", url)
-            WORKER_QUEUE.put_nowait(url)
-    WORKER_QUEUE.put_nowait(None)
+            url_checker.add_url(url)
+    url_checker.close()
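+    # close() presumably replaces the old None sentinel: it tells the checker
+    # that no more URLs will arrive.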
     checker.join(timeout=JOIN_TIMEOUT_SEC)
     if checker.is_alive():
         print(