Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 36 additions & 42 deletions scripts/check-urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from queue import SimpleQueue

from github_job_summary import JobSummary
from subdomains import Subdomains

"""
Read file names from stdin (feed from git ls-files)
Expand All @@ -18,6 +19,9 @@
Check them with CURL
"""

# To avoid 403 responses
USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"


class Curl:
"""
Expand Down Expand Up @@ -49,35 +53,36 @@ class Curl:
]
)

IGNORE_DOMAINS: frozenset[str] = frozenset(
IGNORE_DOMAINS = Subdomains(
[
"central.sonatype.org",
"curl.se",
"dart.dev",
"getcomposer.org",
"go.dev",
"maven.apache.org",
"mvnrepository.com",
"mvnrepository.com",
"nodejs.org",
"packagist.org",
"pkg.go.dev",
"pub.dev",
"pypi.org",
"pypi.python.org",
"repo1.maven.org",
"tools.ietf.org",
"urllib3.readthedocs.io",
"www.apache.org",
"www.dartlang.org",
"www.gradle.org",
"www.mojohaus.org",
"www.npmjs.com",
"www.nuget.org",
"www.opensource.org",
"www.php.net",
"www.python.org",
"www.w3.org",
".android.com",
".apache.org",
".curl.se",
".dart.dev",
".dartlang.org",
".getcomposer.org",
".go.dev",
".google.com",
".gradle.org",
".ietf.org",
".maven.org",
".microsoft.com",
".mojohaus.org",
".mvnrepository.com",
".nodejs.org",
".npmjs.com",
".nuget.org",
".opensource.org",
".packagist.org",
".php.net",
".phpunit.de",
".pub.dev",
".pypi.org",
".python.org",
".readthedocs.io",
".sonatype.org",
".w3.org",
".wikipedia.org",
]
)

Expand All @@ -101,16 +106,7 @@ def valid_url(url: str) -> bool:
if "." not in domain:
# Ignore "localhost" and other domains without .
return False
if domain in IGNORE_DOMAINS:
return False

if (
domain.endswith("android.com")
or domain.endswith(".google.com")
or domain.endswith(".microsoft.com")
or domain.endswith(".wikipedia.org")
):
# Ignore popular domain
if IGNORE_DOMAINS.exists(domain):
return False

if "{{" in url or "}}" in url:
Expand Down Expand Up @@ -159,8 +155,6 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None,
class Task:
_proc: subprocess.Popen[bytes]
_stderr: str | None
# To avoid 403 responses
USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"

def __init__(self, url: str):
self.url = url
Expand All @@ -171,7 +165,7 @@ def __init__(self, url: str):
"--output",
"-",
"--user-agent",
self.USER_AGENT,
USER_AGENT,
self.url,
],
stdout=open(os.devnull, "w"),
Expand Down Expand Up @@ -256,7 +250,7 @@ def url_checker(num_workers: int = 8) -> None:
item = WORKER_QUEUE.get()
if item is None:
queue_is_empty = True
print("URL queue is over")
print("--- url queue is over ---")
break
url = item
workers[i] = create_new_task(url)
Expand Down
59 changes: 59 additions & 0 deletions scripts/subdomains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import collections
import typing
from collections import defaultdict


class Subdomains:
    """Membership test for domain names with wildcard-subdomain support.

    Entries that start with "." (e.g. ".google.com") match the bare domain
    itself and any of its subdomains; plain entries (e.g. "android.com")
    match only exactly.
    """

    # Exact-match entries (no leading dot).
    plain_domains: set[str]
    # Wildcard entries grouped by label count, ascending. Each stored entry
    # is the domain's labels reversed (TLD first) so a match is a simple
    # prefix comparison against the queried domain's reversed labels.
    domains_by_levels: tuple[tuple[int, frozenset[tuple[str, ...]]], ...]

    def __init__(self, domains: typing.Sequence[str]):
        self.plain_domains = set()

        by_level: defaultdict[int, set[tuple[str, ...]]] = defaultdict(set)
        for dom in domains:
            if dom.startswith("."):
                level, parts = self.get_level(dom)
                by_level[level].add(parts)
            else:
                self.plain_domains.add(dom)

        # Sorted ascending by level so exists() can stop scanning early.
        self.domains_by_levels = tuple(
            (level, frozenset(by_level[level])) for level in sorted(by_level)
        )

    def exists(self, domain_name: str) -> bool:
        """Return True if domain_name matches any configured entry."""
        if domain_name in self.plain_domains:
            return True

        level: int
        parts: tuple[str, ...]
        level, parts = self.get_level(domain_name)

        for known_level, candidates in self.domains_by_levels:
            if known_level > level:
                # Levels are sorted ascending: every remaining pattern has
                # more labels than domain_name, so nothing further can match.
                return False
            # O(1) set lookup (the original re-built a tuple and scanned a
            # list on every iteration).
            if parts[:known_level] in candidates:
                return True

        return False

    @staticmethod
    def get_level(domain_name: str) -> tuple[int, tuple[str, ...]]:
        """Split a domain into (label count, labels reversed TLD-first)."""
        parts = domain_name.strip(".").split(".")
        return len(parts), tuple(reversed(parts))


def test() -> None:
    """Smoke-test the Subdomains matching rules."""
    sd = Subdomains([".very.long.domain.name", "android.com", ".google.com"])
    # Wildcard entries match subdomains...
    assert sd.exists("test.google.com")
    # ...and the bare domain itself.
    assert sd.exists("google.com")
    # Plain entries match only exactly, never subdomains.
    assert sd.exists("android.com")
    assert not sd.exists("test.android.com")
    # Unrelated domains never match.
    assert not sd.exists("example.com")
    # Deep wildcard patterns require all labels to line up.
    assert sd.exists("sub.very.long.domain.name")


# Allow running this module directly as a quick self-check.
if __name__ == "__main__":
    test()