Skip to content

Commit d8763a7

Browse files
committed
Exclude some additional URLs from check
1 parent 02ef2f9 commit d8763a7

2 files changed

Lines changed: 24 additions & 7 deletions

File tree

scripts/check-urls.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ class Curl:
4545
"https://dashboard.aspose.cloud/applications": (Curl.HTTP_RETURNED_ERROR, 404),
4646
}
4747

48+
REGEX_TO_IGNORE: list[re.Pattern] = [
49+
re.compile(r"^https://github\.com/(?P<user>[^/]+)/(?P<repo>[^/]+)/(?:blob|issues|pull)/.+$"),
50+
]
51+
4852
URLS_TO_IGNORE: frozenset[str] = frozenset(
4953
[
5054
"https://api.aspose.cloud",
@@ -62,6 +66,7 @@ class Curl:
6266
".dartlang.org",
6367
".getcomposer.org",
6468
".go.dev",
69+
".golang.org",
6570
".google.com",
6671
".gradle.org",
6772
".ietf.org",
@@ -83,19 +88,21 @@ class Curl:
8388
".sonatype.org",
8489
".w3.org",
8590
".wikipedia.org",
91+
# Regular domains
92+
"editorconfig.org",
8693
]
8794
)
8895

8996
URL_END_CHARS = r",#\)\"'<>\*\s\\"
9097
URL_RE_PATTERN = r"(https*://[^{0}]+)[{0}]?".format(URL_END_CHARS)
9198
# print(URL_RE_PATTERN)
92-
URL_REGEX = re.compile(URL_RE_PATTERN, re.MULTILINE)
99+
EXTRACT_URL_REGEX = re.compile(URL_RE_PATTERN, re.MULTILINE)
93100

94101
# URL : [Files]
95102
EXTRACTED_URLS_WITH_FILES: dict[str, list[str]] = {k: [] for k in URLS_TO_IGNORE}
96103

97104

98-
def valid_url(url: str) -> bool:
105+
def should_check_url(url: str) -> bool:
99106
try:
100107
parsed: urllib.parse.ParseResult = urllib.parse.urlparse(url)
101108
except:
@@ -113,12 +120,17 @@ def valid_url(url: str) -> bool:
113120
# Ignore templates with {{var}}
114121
return False
115122

123+
for r in REGEX_TO_IGNORE:
124+
if r.match(url):
125+
# print("Ignore by regex", r.pattern, ":", url, file=sys.stderr)
126+
return False
127+
116128
return True
117129

118130

119131
def url_extractor(text: str, filename: str) -> typing.Generator[str, None, None]:
120-
for url in URL_REGEX.findall(text):
121-
if not valid_url(url):
132+
for url in EXTRACT_URL_REGEX.findall(text):
133+
if not should_check_url(url):
122134
# print("Ignore:", url)
123135
continue
124136
if url not in EXTRACTED_URLS_WITH_FILES:

scripts/subdomains.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,17 @@ def __init__(self, domains: typing.Sequence[str]):
1111
self.plain_domains = set()
1212

1313
tmp_level_with_dom: defaultdict[int, list[tuple[str, ...]]] = collections.defaultdict(list)
14-
for d in domains:
14+
for d in map(self.normalize_domain, domains):
1515
if d.startswith("."):
1616
level, parts = self.get_level(d)
1717
tmp_level_with_dom[level].append(parts)
1818
else:
1919
self.plain_domains.add(d)
20-
2120
# Ensure sorted by level
2221
self.domains_by_levels = tuple((key, tmp_level_with_dom[key]) for key in sorted(tmp_level_with_dom.keys()))
2322

2423
def exists(self, domain_name: str) -> bool:
24+
domain_name = self.normalize_domain(domain_name)
2525
if domain_name in self.plain_domains:
2626
return True
2727

@@ -48,11 +48,16 @@ def get_level(domain_name: str) -> tuple[int, tuple[str, ...]]:
4848
parts = domain_name.strip(".").split(".")
4949
return len(parts), tuple(reversed(parts))
5050

51+
@staticmethod
52+
def normalize_domain(domain_name: str) -> str:
53+
return domain_name.lower()
54+
5155

5256
def test() -> None:
53-
sd = Subdomains([".very.long.domain.name", "android.com", ".google.com"])
57+
sd = Subdomains([".very.long.domain.name", "android.com", ".google.com", "editorconfig.org"])
5458
assert sd.exists("test.google.com")
5559
assert not sd.exists("test.android.com")
60+
assert sd.exists("EditorConfig.org")
5661

5762

5863
if __name__ == "__main__":

0 commit comments

Comments (0)