@@ -45,6 +45,10 @@ class Curl:
4545 "https://dashboard.aspose.cloud/applications" : (Curl .HTTP_RETURNED_ERROR , 404 ),
4646}
4747
# URL patterns that are skipped rather than checked: should_check_url returns
# False for any URL that one of these patterns matches from its start.
# The single entry covers https GitHub links of the form
# github.com/<user>/<repo>/(blob|issues|pull)/<non-empty remainder>.
48+ REGEX_TO_IGNORE : list [re .Pattern ] = [
49+ re .compile (r"^https://github\.com/(?P<user>[^/]+)/(?P<repo>[^/]+)/(?:blob|issues|pull)/.+$" ),
50+ ]
51+
4852URLS_TO_IGNORE : frozenset [str ] = frozenset (
4953 [
5054 "https://api.aspose.cloud" ,
@@ -62,6 +66,7 @@ class Curl:
6266 ".dartlang.org" ,
6367 ".getcomposer.org" ,
6468 ".go.dev" ,
69+ ".golang.org" ,
6570 ".google.com" ,
6671 ".gradle.org" ,
6772 ".ietf.org" ,
@@ -83,19 +88,21 @@ class Curl:
8388 ".sonatype.org" ,
8489 ".w3.org" ,
8590 ".wikipedia.org" ,
91+ # Regular domains
92+ "editorconfig.org" ,
8693 ]
8794)
8895
# Characters that end a URL inside free text: comma, '#', ')', double and
# single quotes, angle brackets, '*', any whitespace and backslash. Kept as
# regex-escaped text because it is spliced into a character class below.
URL_END_CHARS = r",#\)\"'<>\*\s\\"

# Group 1 captures "http://" or "https://" followed by one or more
# non-terminator characters; one trailing terminator is consumed if present.
# The scheme is "https?" (optional "s") rather than "https*", which also
# accepted malformed schemes such as "httpss://".
URL_RE_PATTERN = r"(https?://[^{0}]+)[{0}]?".format(URL_END_CHARS)

# Compiled extractor; findall() on it returns the list of group-1 URLs.
EXTRACT_URL_REGEX = re.compile(URL_RE_PATTERN, re.MULTILINE)

# Former name of EXTRACT_URL_REGEX, kept as an alias so existing references
# still resolve.
URL_REGEX = EXTRACT_URL_REGEX
93100
94101# Maps each URL to the list of files recorded for it (URL : [Files]).
# Pre-seeded with an empty list for every entry of URLS_TO_IGNORE.
# NOTE(review): presumably the pre-seeding makes url_extractor's
# `url not in EXTRACTED_URLS_WITH_FILES` test treat those entries as already
# present - confirm against the rest of url_extractor.
95102EXTRACTED_URLS_WITH_FILES : dict [str , list [str ]] = {k : [] for k in URLS_TO_IGNORE }
96103
97104
98- def valid_url (url : str ) -> bool :
105+ def should_check_url (url : str ) -> bool :
99106 try :
100107 parsed : urllib .parse .ParseResult = urllib .parse .urlparse (url )
101108 except :
@@ -113,12 +120,17 @@ def valid_url(url: str) -> bool:
113120 # Ignore templates with {{var}}
114121 return False
115122
123+ for r in REGEX_TO_IGNORE :
124+ if r .match (url ):
125+ # print("Ignore by regex", r.pattern, ":", url, file=sys.stderr)
126+ return False
127+
116128 return True
117129
118130
119131def url_extractor (text : str , filename : str ) -> typing .Generator [str , None , None ]:
120- for url in URL_REGEX .findall (text ):
121- if not valid_url (url ):
132+ for url in EXTRACT_URL_REGEX .findall (text ):
133+ if not should_check_url (url ):
122134 # print("Ignore:", url)
123135 continue
124136 if url not in EXTRACTED_URLS_WITH_FILES :
0 commit comments