From 2939b0797341608ad9bcb206ac328969359ff790 Mon Sep 17 00:00:00 2001 From: Denis Averin Date: Mon, 7 Apr 2025 17:44:04 +0700 Subject: [PATCH] Add subdomains to ignore --- scripts/check-urls.py | 78 ++++++++++++++++++++----------------------- scripts/subdomains.py | 59 ++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 42 deletions(-) create mode 100644 scripts/subdomains.py diff --git a/scripts/check-urls.py b/scripts/check-urls.py index 66ce3dc..944ff75 100644 --- a/scripts/check-urls.py +++ b/scripts/check-urls.py @@ -10,6 +10,7 @@ from queue import SimpleQueue from github_job_summary import JobSummary +from subdomains import Subdomains """ Read file names from stdin (feed from git ls-files) @@ -18,6 +19,9 @@ Check them with CURL """ +# To avoid 403 responses +USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)" + class Curl: """ @@ -49,35 +53,36 @@ class Curl: ] ) -IGNORE_DOMAINS: frozenset[str] = frozenset( +IGNORE_DOMAINS = Subdomains( [ - "central.sonatype.org", - "curl.se", - "dart.dev", - "getcomposer.org", - "go.dev", - "maven.apache.org", - "mvnrepository.com", - "mvnrepository.com", - "nodejs.org", - "packagist.org", - "pkg.go.dev", - "pub.dev", - "pypi.org", - "pypi.python.org", - "repo1.maven.org", - "tools.ietf.org", - "urllib3.readthedocs.io", - "www.apache.org", - "www.dartlang.org", - "www.gradle.org", - "www.mojohaus.org", - "www.npmjs.com", - "www.nuget.org", - "www.opensource.org", - "www.php.net", - "www.python.org", - "www.w3.org", + ".android.com", + ".apache.org", + ".curl.se", + ".dart.dev", + ".dartlang.org", + ".getcomposer.org", + ".go.dev", + ".google.com", + ".gradle.org", + ".ietf.org", + ".maven.org", + ".microsoft.com", + ".mojohaus.org", + ".mvnrepository.com", + ".nodejs.org", + ".npmjs.com", + ".nuget.org", + ".opensource.org", + ".packagist.org", + ".php.net", + ".phpunit.de", + ".pub.dev", + ".pypi.org", + ".python.org", + ".readthedocs.io", + ".sonatype.org", + ".w3.org", + ".wikipedia.org", ] ) @@ 
import collections
import typing
from collections import defaultdict


class Subdomains:
    """
    Collection of domain names used to answer "is this host on the list?".

    Two kinds of entries are accepted:

    * ``"example.com"`` (no leading dot) matches only that exact host name.
    * ``".example.com"`` (leading dot) matches ``example.com`` itself and every
      subdomain of it (``www.example.com``, ``a.b.example.com``, ...).

    Matching is case-insensitive and ignores a trailing root dot, because
    ``WWW.Example.COM.`` and ``www.example.com`` name the same host.
    """

    # Exact-match entries, stored lowercased and without a trailing dot.
    plain_domains: set[str]
    # Suffix entries grouped by number of labels and sorted by that number (ascending).
    # Every suffix is stored as reversed labels: ".google.com" -> ("com", "google").
    domains_by_levels: tuple[tuple[int, list[tuple[str, ...]]], ...]

    def __init__(self, domains: typing.Iterable[str]):
        """
        Build the lookup structures.

        :param domains: entries to register; a leading "." marks a suffix entry
            (domain plus all subdomains), anything else is an exact-match entry.
        """
        self.plain_domains = set()

        suffixes_by_level: defaultdict[int, list[tuple[str, ...]]] = collections.defaultdict(list)
        for entry in domains:
            # Host names are case-insensitive, normalize once at registration time
            entry = entry.lower()
            if entry.startswith("."):
                level, parts = self.get_level(entry)
                suffixes_by_level[level].append(parts)
            else:
                self.plain_domains.add(entry.rstrip("."))

        # Sorted by level so that exists() can stop at the first level deeper than the query
        self.domains_by_levels = tuple((level, suffixes_by_level[level]) for level in sorted(suffixes_by_level))

    def __contains__(self, domain_name: object) -> bool:
        """Support ``domain in subdomains``; non-string values are never contained."""
        return isinstance(domain_name, str) and self.exists(domain_name)

    def exists(self, domain_name: str) -> bool:
        """
        Check whether a host name is covered by any registered entry.

        :param domain_name: host name to check, e.g. ``"www.google.com"``.
        :return: True when the name equals an exact-match entry, or equals / is
            a subdomain of a suffix entry; False otherwise.
        """
        name = domain_name.lower()
        if name.rstrip(".") in self.plain_domains:
            return True

        level, parts = self.get_level(name)
        for known_level, suffixes in self.domains_by_levels:
            if known_level > level:
                # A suffix with more labels than the name can not match it, and
                # levels are sorted ascending, so nothing further can match either
                return False

            # Reversed labels turn "ends with suffix" into a prefix comparison
            if parts[:known_level] in suffixes:
                return True

        return False

    @staticmethod
    def get_level(domain_name: str) -> tuple[int, tuple[str, ...]]:
        """
        Split a domain name into labels.

        Leading and trailing dots are ignored.

        :return: ``(number_of_labels, labels_from_top_level_down)``,
            e.g. ``"a.google.com"`` -> ``(3, ("com", "google", "a"))``.
        """
        parts = domain_name.strip(".").split(".")
        return len(parts), tuple(reversed(parts))


def test() -> None:
    """Minimal self-check, executed when the module is run as a script."""
    sd = Subdomains([".very.long.domain.name", "android.com", ".google.com"])
    assert sd.exists("test.google.com")
    assert sd.exists("TEST.Google.com")
    assert not sd.exists("test.android.com")


if __name__ == "__main__":
    test()