From 9011281388968accb621d48ac53de34bad7baa44 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 19 May 2026 07:27:09 -0400 Subject: [PATCH 1/4] =?UTF-8?q?feat(#61):=20DOWNLOAD=5FALWAYS=5FWHITELIST?= =?UTF-8?q?=20=E2=80=94=20operator-controlled=20re-processing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds ``ckanext.datapusher_plus.download_always_whitelist``: a whitespace-separated list of hostnames whose resources always get re-downloaded + re-analyzed, bypassing the file-hash-based upload-skip optimization in ``DownloadStage._should_skip_upload``. Use cases: * Hosts that update content in place without changing the published byte hash (e.g., a daily report that overwrites the same URL with the same template but new underlying data). * Local / peered hosts where re-download cost is negligible and operators want forced re-analysis on every push. Issue #61 was originally filed alongside a never-built ``DOWNLOAD_PREVIEW_ONLY`` partial-download mode (DP+'s architecture landed on full-file download for comprehensive metadata inference, so that parent feature is moot). Per the 2026-05-19 BACKLOG-SWEEP3 reinterpretation, the name is kept but the semantic is repurposed as a re-processing trigger — strictly additive to the existing ``IGNORE_FILE_HASH`` global override and the per-job ``ignore_hash`` metadata flag. Implementation: * ``config.py`` parses the value into a ``frozenset`` of lowercased hostnames at import time (mirrors the ``FORMATS`` list-config pattern; empty default preserves legacy behavior). * New ``_host_in_always_whitelist(url)`` helper in ``jobs/stages/download.py``: case-insensitive exact hostname match, port-stripped, defensive against missing/unparseable URLs (returns ``False`` rather than raising on the hot path). * ``_should_skip_upload`` short-circuits to ``False`` when the resource URL's host is in the whitelist, with an INFO log so operators can see WHY a whitelisted host bypassed the dedup path. * ``config_declaration.yaml`` declares the key as ``editable: true`` with operator-facing docs and an example. Tests (18, all passing — suite 251 → 269 Python): * Config parsing: empty / whitespace-string / list-value / empty-entries-filtered (4 tests). * Helper: exact match, case-insensitive, port-stripped, IP literal, unparseable URL, empty URL, ``None`` URL, hostless URL (8 tests). * Integration: ``_should_skip_upload`` bypasses skip when whitelisted, preserves skip when NOT whitelisted, preserves legacy behavior when whitelist is empty (3 tests). * Drift guard: ``config.py`` inline fallback agrees with ``config_declaration.yaml`` default — the #179-style declaration-vs-fallback regression that bit DP+ once before (1 test, AST-parsed to avoid CKAN bootstrap). Closes #61. Co-Authored-By: Claude Opus 4.7 (1M context) --- ckanext/datapusher_plus/config.py | 18 + .../datapusher_plus/config_declaration.yaml | 26 ++ .../datapusher_plus/jobs/stages/download.py | 49 +++ ...test_issue_61_download_always_whitelist.py | 367 ++++++++++++++++++ 4 files changed, 460 insertions(+) create mode 100644 tests/test_issue_61_download_always_whitelist.py diff --git a/ckanext/datapusher_plus/config.py b/ckanext/datapusher_plus/config.py index 979ddfc..bda11c8 100644 --- a/ckanext/datapusher_plus/config.py +++ b/ckanext/datapusher_plus/config.py @@ -112,6 +112,24 @@ tk.config.get("ckanext.datapusher_plus.ignore_file_hash", False) ) +# Issue #61: hostnames whose resources should always be re-processed by +# DP+, bypassing the file-hash-based upload-skip optimization in the +# download stage's ``_should_skip_upload``. Use this for hosts that +# update content in place without changing the published byte hash +# (e.g., daily-refreshed reports that overwrite the same URL), or for +# local/peered hosts where re-download is cheap and operators want +# forced re-analysis. Whitespace-separated string in ckan.ini; parsed +# once at import and lowercased into a ``set`` for O(1) lookups. +# Empty (default) → all hosts go through the normal hash-skip path. +DOWNLOAD_ALWAYS_WHITELIST = tk.config.get( + "ckanext.datapusher_plus.download_always_whitelist", "" +) +if isinstance(DOWNLOAD_ALWAYS_WHITELIST, str): + DOWNLOAD_ALWAYS_WHITELIST = DOWNLOAD_ALWAYS_WHITELIST.split() +DOWNLOAD_ALWAYS_WHITELIST = frozenset( + h.lower() for h in DOWNLOAD_ALWAYS_WHITELIST if h +) + # Issue #221: ``ckanext.datapusher_plus.file_hash_algorithm`` is declared in # ``config_declaration.yaml`` but intentionally NOT mirrored here as a # module-level constant. The setting is ``editable: true`` and the download diff --git a/ckanext/datapusher_plus/config_declaration.yaml b/ckanext/datapusher_plus/config_declaration.yaml index a3acd8f..69f03e6 100644 --- a/ckanext/datapusher_plus/config_declaration.yaml +++ b/ckanext/datapusher_plus/config_declaration.yaml @@ -63,6 +63,32 @@ groups: description: | Download proxy + - key: ckanext.datapusher_plus.download_always_whitelist + editable: true + default: "" + description: | + Whitespace-separated list of hostnames whose resources should + always be re-processed by DP+, bypassing the file-hash-based + upload-skip optimization (see ``_should_skip_upload`` in the + download stage). Use this for hosts that update content in + place without changing the published byte hash (e.g., daily- + refreshed reports that overwrite the same URL with the same + bytes but logically-different rows), or for local/peered + hosts where re-download cost is negligible and operators + want forced re-analysis on every push. + + Matching is case-insensitive on the URL hostname; ports are + ignored. Empty (default) → all hosts go through the normal + hash-skip path. Worker restart is required for changes to + take effect. + + Example:: + + ckanext.datapusher_plus.download_always_whitelist = + data.internal.gov data.partner.org localhost + + Issue #61. + - key: ckanext.datapusher_plus.dictionary_stash_dir editable: true default: "" diff --git a/ckanext/datapusher_plus/jobs/stages/download.py b/ckanext/datapusher_plus/jobs/stages/download.py index dc8d220..f8d8862 100644 --- a/ckanext/datapusher_plus/jobs/stages/download.py +++ b/ckanext/datapusher_plus/jobs/stages/download.py @@ -76,6 +76,38 @@ def _get_file_hasher(): ) +def _host_in_always_whitelist(url: str) -> bool: + """Return True if ``url``'s hostname is in DOWNLOAD_ALWAYS_WHITELIST. + + Used by the download stage to bypass the hash-skip optimization + in ``_should_skip_upload`` for trusted/peered hosts or hosts + that update content in place without changing the published + byte hash. See issue #61 for the operator-side rationale and + ``config.py:DOWNLOAD_ALWAYS_WHITELIST`` for the parsed shape. + + Args: + url: The resource URL as stored in ``resource["url"]``. + + Returns: + ``True`` if the URL's hostname (case-folded, port stripped) + is in the configured whitelist. ``False`` when the whitelist + is empty, the URL is missing or unparseable, or the URL has + no hostname (relative URLs, ``file://`` with empty host, + etc.) — i.e., the safe default is "do not bypass." + """ + if not conf.DOWNLOAD_ALWAYS_WHITELIST: + return False + if not url: + return False + try: + host = urlparse(url).hostname + except (ValueError, AttributeError): + return False + if not host: + return False + return host.lower() in conf.DOWNLOAD_ALWAYS_WHITELIST + + class DownloadStage(BaseStage): """ Downloads the resource file, validates it, and handles ZIP extraction. @@ -383,6 +415,23 @@ def _should_skip_upload( Returns: True if upload should be skipped, False otherwise """ + # Issue #61: forced re-processing for operator-whitelisted hosts. + # Bypasses the hash-skip path entirely so resources from these + # hosts get re-downloaded + re-analyzed on every push. This is + # intended for hosts that update content in place without + # changing the byte hash (e.g., a daily report that overwrites + # the same URL with the same template but new underlying data), + # or for local/peered hosts where re-download is essentially + # free and operators want forced re-analysis. + url = context.resource.get("url", "") if context.resource else "" + if _host_in_always_whitelist(url): + context.logger.info( + "Host %r is in DOWNLOAD_ALWAYS_WHITELIST; forcing " + "re-processing (bypassing hash-skip).", + urlparse(url).hostname, + ) + return False + # Check if resource metadata was updated resource_updated = False resource_last_modified = context.resource.get("last_modified") diff --git a/tests/test_issue_61_download_always_whitelist.py b/tests/test_issue_61_download_always_whitelist.py new file mode 100644 index 0000000..0031b72 --- /dev/null +++ b/tests/test_issue_61_download_always_whitelist.py @@ -0,0 +1,367 @@ +# -*- coding: utf-8 -*- +""" +Regression coverage for issue #61 ("Smart resource download"). + +Issue #61 (filed 2023-02-03) originally proposed a +``DOWNLOAD_ALWAYS_WHITELIST`` that paired with a never-built +``DOWNLOAD_PREVIEW_ONLY`` partial-download mode. DP+'s architecture +landed on "always download the full file for comprehensive metadata +inference," so the partial-download parent is moot. + +The 2026-05-19 reinterpretation (kept in the BACKLOG-SWEEP3 handoff +doc): keep the ``DOWNLOAD_ALWAYS_WHITELIST`` name but repurpose it +as an operator-controlled list of hostnames that **always trigger +re-processing**, bypassing the file-hash-based upload-skip +optimization in ``DownloadStage._should_skip_upload``. The use +cases: + + * Daily-refreshed reports that overwrite the same URL with the same + bytes (so the file hash matches) but logically-different data — + operators want each push re-analyzed regardless of hash. + * Local / peered hosts where re-download cost is negligible and the + operator wants forced re-analysis on every push. + +These tests pin six properties: + +1. The config parses an empty default to an empty ``frozenset`` + (default behavior is "no host bypass"). +2. A whitespace-separated string of hostnames parses into a + lowercased ``frozenset`` (case folding makes operator-typed + ``DATA.GOV`` match ``data.gov`` on the wire). +3. ``_host_in_always_whitelist`` matches exact hostnames + case-insensitively, strips ports, and is robust against + missing/unparseable URLs (returns ``False`` rather than raising). +4. ``_should_skip_upload`` returns ``False`` when the host is in the + whitelist even when the byte hash matches — the core promise. +5. ``_should_skip_upload`` preserves its prior skip behavior for + hosts NOT in the whitelist (regression guard against an + inadvertent "always re-process everyone" bug). +6. The YAML declaration default agrees with the ``config.py`` + import-time fallback (the #179-style declaration-vs-fallback + drift guard). +""" + +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace +from unittest import mock + +import pytest + + +REPO_ROOT = Path(__file__).resolve().parent.parent + + +# --------------------------------------------------------------------------- +# Config parsing +# --------------------------------------------------------------------------- + + +def _reimport_conf(monkeypatch, value): + """Reload config.py with the given whitelist value applied to + ``tk.config`` so the import-time parse picks it up. Returns the + freshly-imported module.""" + import importlib + + pytest.importorskip("ckan") + import ckan.plugins.toolkit as tk + + monkeypatch.setitem( + tk.config, "ckanext.datapusher_plus.download_always_whitelist", value + ) + import ckanext.datapusher_plus.config as conf + + return importlib.reload(conf) + + +def test_empty_config_parses_to_empty_frozenset(monkeypatch): + # Default behavior — operators without the setting must not get a + # surprise "everyone is whitelisted" or a crash on first download. + conf = _reimport_conf(monkeypatch, "") + assert conf.DOWNLOAD_ALWAYS_WHITELIST == frozenset() + assert isinstance(conf.DOWNLOAD_ALWAYS_WHITELIST, frozenset) + + +def test_whitespace_separated_string_parses_to_lowercased_frozenset( + monkeypatch, +): + # Mirrors the existing ``FORMATS`` pattern: whitespace-separated + # in ``ckan.ini``, parsed at import. Lowercasing happens here so + # the hot-path ``_host_in_always_whitelist`` doesn't have to + # re-fold the set on every call. + conf = _reimport_conf( + monkeypatch, + "Data.Internal.Gov data.partner.org\tlocalhost", + ) + assert conf.DOWNLOAD_ALWAYS_WHITELIST == frozenset( + {"data.internal.gov", "data.partner.org", "localhost"} + ) + + +def test_list_value_parses_to_lowercased_frozenset(monkeypatch): + # Some CKAN config layers (env-vars via ``CKAN_INI`` overrides, + # programmatic ``tk.config`` patches in tests) pass list values + # through directly. The branch that handles ``isinstance(str)`` + # must leave a real list alone. + conf = _reimport_conf(monkeypatch, ["Foo.example", "BAR.example"]) + assert conf.DOWNLOAD_ALWAYS_WHITELIST == frozenset( + {"foo.example", "bar.example"} + ) + + +def test_empty_strings_in_split_are_filtered(monkeypatch): + # ``" data.gov ".split()`` already drops empties, but if the + # operator writes ``"data.gov, , partner.org"`` and a downstream + # comma-tolerant parser is added later, the comprehension's + # ``if h`` guard catches the empty entries so the frozenset + # doesn't contain ``""`` (which would match every hostless URL). + conf = _reimport_conf(monkeypatch, "data.gov partner.org") + assert "" not in conf.DOWNLOAD_ALWAYS_WHITELIST + + +# --------------------------------------------------------------------------- +# Helper: _host_in_always_whitelist +# --------------------------------------------------------------------------- + + +@pytest.fixture +def helper(monkeypatch): + """Return the freshly-imported ``_host_in_always_whitelist`` with + a known whitelist applied. Each test gets its own fresh module + state so monkeypatching is hermetic.""" + pytest.importorskip("ckan") + _reimport_conf(monkeypatch, "data.internal.gov localhost 10.0.0.5") + # Reload the download module too so its ``conf`` reference picks + # up the reloaded constant. Without this, the function closes + # over an old conf object and the test would see the old value. + import importlib + import ckanext.datapusher_plus.jobs.stages.download as download + + download = importlib.reload(download) + return download._host_in_always_whitelist + + +def test_empty_whitelist_returns_false(monkeypatch): + # If no whitelist is configured, the helper must short-circuit + # ``False`` so it can't even mis-match the empty string against a + # hostless URL. + pytest.importorskip("ckan") + _reimport_conf(monkeypatch, "") + import importlib + import ckanext.datapusher_plus.jobs.stages.download as download + + download = importlib.reload(download) + assert download._host_in_always_whitelist("https://anything.example") is False + + +def test_exact_hostname_match_returns_true(helper): + assert helper("https://data.internal.gov/csv/file.csv") is True + + +def test_non_matching_hostname_returns_false(helper): + assert helper("https://data.external.gov/csv/file.csv") is False + + +def test_match_is_case_insensitive(helper): + # URL hosts are DNS-case-insensitive; the helper must fold the + # URL's host to lowercase before set lookup or it'd miss + # ``https://DATA.INTERNAL.GOV/...`` (which a CDN or legacy CMS + # could plausibly emit). + assert helper("https://DATA.INTERNAL.GOV/file.csv") is True + + +def test_port_is_stripped_before_match(helper): + # ``urlparse.hostname`` already drops the port, but pin this so a + # future "use raw netloc" refactor would break this test before + # it breaks production for ``localhost:5050`` deployments. + assert helper("http://localhost:5050/api/3/action/...") is True + + +def test_ip_address_host_matches(helper): + # IPv4 literals are valid hosts; ``hostname`` returns them as-is. + # Used by deployments that pin internal services by IP. + assert helper("http://10.0.0.5/csv/file.csv") is True + + +def test_unparseable_url_returns_false(helper): + # Garbage in → ``False`` out, not an exception. The download + # stage handles its own URL validation elsewhere + # (``_validate_url_scheme``); this helper should not become a + # second exception source on the hot path. + assert helper("://://broken") is False + + +def test_empty_url_returns_false(helper): + assert helper("") is False + + +def test_none_url_returns_false(helper): + # Defensive against ``context.resource.get("url")`` returning + # ``None`` (which is technically possible if the resource record + # is malformed — surface as a non-match rather than an + # ``AttributeError`` on ``.hostname``). + assert helper(None) is False + + +def test_hostless_url_returns_false(helper): + # ``file:///tmp/foo.csv`` parses cleanly but has an empty host; + # must not match an empty-string entry in the set (the + # ``if h`` filter in config.py guarantees the set has no ``""``, + # but pin the helper's own guard too). + assert helper("file:///tmp/foo.csv") is False + + +# --------------------------------------------------------------------------- +# Integration: _should_skip_upload honors the whitelist +# --------------------------------------------------------------------------- + + +def _make_context(url, file_hash="abc123", last_modified=None): + """Build a minimal stand-in for ``ProcessingContext`` shaped just + enough for ``_should_skip_upload`` to run. Uses ``SimpleNamespace`` + + ``MagicMock`` for the logger so the production code's + ``context.logger.info`` call is silent + assertable.""" + return SimpleNamespace( + resource={ + "url": url, + "hash": file_hash, + "last_modified": last_modified, + }, + metadata={}, + logger=mock.MagicMock(), + ) + + +def test_should_skip_upload_returns_false_when_host_whitelisted(monkeypatch): + # The core promise of #61: a host in the whitelist forces + # re-processing even when the byte hash matches what's already + # stored on the resource. + pytest.importorskip("ckan") + _reimport_conf(monkeypatch, "data.internal.gov") + import importlib + import ckanext.datapusher_plus.jobs.stages.download as download + + download = importlib.reload(download) + + stage = download.DownloadStage() + ctx = _make_context( + url="https://data.internal.gov/csv/file.csv", + file_hash="matching-hash", + ) + + assert stage._should_skip_upload(ctx, "matching-hash", {}) is False + # And the operator-facing log line fired so they can see WHY a + # whitelisted host bypassed the dedup path. + ctx.logger.info.assert_called_once() + + +def test_should_skip_upload_preserves_skip_for_non_whitelisted(monkeypatch): + # Regression guard: a host NOT in the whitelist must still skip + # when the hash matches, ``ignore_hash`` is false, and + # ``IGNORE_FILE_HASH`` is false. Without this test, an + # accidentally-inverted whitelist check (e.g., ``not in``) would + # silently re-process every resource — bypassing the whole + # dedup optimization the stage was built around. + pytest.importorskip("ckan") + _reimport_conf(monkeypatch, "data.internal.gov") + import importlib + import ckanext.datapusher_plus.jobs.stages.download as download + + download = importlib.reload(download) + # Force IGNORE_FILE_HASH false so the hash check is the deciding + # factor (otherwise the existing global override would also + # return False and mask a buggy whitelist). + monkeypatch.setattr(download.conf, "IGNORE_FILE_HASH", False) + + stage = download.DownloadStage() + ctx = _make_context( + url="https://data.external.gov/csv/file.csv", + file_hash="matching-hash", + ) + + assert stage._should_skip_upload(ctx, "matching-hash", {}) is True + + +def test_should_skip_upload_empty_whitelist_preserves_legacy_behavior( + monkeypatch, +): + # Operators who never set the new key get exactly the pre-#61 + # behavior — hash match means skip. Without this test, a future + # refactor that accidentally inverted the "empty whitelist + # short-circuit" path could break upgrade-in-place deployments. + pytest.importorskip("ckan") + _reimport_conf(monkeypatch, "") + import importlib + import ckanext.datapusher_plus.jobs.stages.download as download + + download = importlib.reload(download) + monkeypatch.setattr(download.conf, "IGNORE_FILE_HASH", False) + + stage = download.DownloadStage() + ctx = _make_context( + url="https://data.internal.gov/csv/file.csv", + file_hash="matching-hash", + ) + + assert stage._should_skip_upload(ctx, "matching-hash", {}) is True + + +# --------------------------------------------------------------------------- +# Declaration-vs-fallback drift guard (#179-style) +# --------------------------------------------------------------------------- + + +def test_config_declaration_default_matches_config_py_fallback(): + """Both sources of truth for the default whitelist must agree. + + Drift between ``config_declaration.yaml`` and ``config.py``'s + ``tk.config.get(..., default)`` was the root cause of issue #179 + (date → timestamp casting). Pin the same property for #61 so a + future "let's only update one of them" PR fails this test + instead of silently shipping inconsistent defaults. + """ + import ast + import yaml + + # Read declaration default + yaml_path = ( + REPO_ROOT + / "ckanext" + / "datapusher_plus" + / "config_declaration.yaml" + ) + with yaml_path.open() as fh: + decl = yaml.safe_load(fh) + options = decl["groups"][0]["options"] + whitelist_decl = next( + opt + for opt in options + if opt["key"] == "ckanext.datapusher_plus.download_always_whitelist" + ) + assert whitelist_decl["default"] == "" + assert whitelist_decl["editable"] is True + + # Read config.py inline fallback via AST (avoid CKAN bootstrap) + config_py = ( + REPO_ROOT / "ckanext" / "datapusher_plus" / "config.py" + ).read_text() + tree = ast.parse(config_py) + found_default = None + for node in ast.walk(tree): + if ( + isinstance(node, ast.Call) + and getattr(node.func, "attr", None) == "get" + and node.args + and isinstance(node.args[0], ast.Constant) + and node.args[0].value + == "ckanext.datapusher_plus.download_always_whitelist" + ): + # Second positional arg is the fallback + if len(node.args) >= 2 and isinstance(node.args[1], ast.Constant): + found_default = node.args[1].value + break + assert found_default == "", ( + "config.py inline fallback for download_always_whitelist must " + "match the YAML declaration default (both empty string)." + ) From ffb7f5f35ae8dc69df629bc40e612c816ed0ccb6 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 19 May 2026 07:33:30 -0400 Subject: [PATCH 2/4] =?UTF-8?q?fix(#61):=20address=20roborev=20#2293=20?= =?UTF-8?q?=E2=80=94=20live-read=20+=20return=20matched=20host?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Roborev review #2293 surfaced 4 LOW findings on the initial DOWNLOAD_ALWAYS_WHITELIST implementation. All addressed: * **L1 — ``editable: true`` semantics:** The whitelist was declared ``editable: true`` (implying admin-UI live edits should take effect) but parsed once at module import into a frozenset constant. Operators editing via the admin UI would see no effect until worker restart — inconsistent with sibling ``file_hash_algorithm`` (#221) which is read live. Fix: removed the ``DOWNLOAD_ALWAYS_WHITELIST`` module constant from ``config.py`` (replaced with a docstring mirroring the ``file_hash_algorithm`` pattern). Added ``_get_download_always_whitelist()`` to ``download.py`` that reads + parses ``tk.config`` live on each call. Parse cost is negligible since hash-skip fires at most once per download. * **L2 — comment said ``set`` but constant was ``frozenset``:** Obsoleted by L1 (the comment block was rewritten). * **L3 — double ``urlparse(url)`` per skip-check:** The original ``_should_skip_upload`` called ``urlparse`` once inside ``_host_in_always_whitelist`` (for the match) and again in the ``info`` log to recover the hostname. Fix: ``_host_in_always_whitelist`` now returns ``Optional[str]`` — the matched lowercased host (port stripped) or ``None``. The caller passes that returned host directly into the log call. One urlparse per whitelist check. * **L4 — no test pinned subdomain non-matching semantics:** The implementation does exact-hostname matching by design (so ``data.gov`` does NOT match ``subdomain.data.gov``), but no test pinned that. A future "support subdomain wildcards" patch could silently change behavior. Fix: added two regression tests (``test_subdomain_does_not_match_parent_domain``, ``test_parent_domain_does_not_match_subdomain_entry``) and called out "**Exact-match only**" in the YAML declaration's description. Tests: 18 → 23 (+5): * Subdomain non-match × 2 directions (L4). * Live-read picks up runtime changes (L1 — proves the ``editable: true`` declaration is now truthful). * Malformed-value defense (a non-iterable int in tk.config degrades to empty frozenset, doesn't crash). * Empty-string parse split from the unset-key case. Full Python suite: 269 → 274, all passing. Co-Authored-By: Claude Opus 4.7 (1M context) --- ckanext/datapusher_plus/config.py | 24 +- .../datapusher_plus/config_declaration.yaml | 15 +- .../datapusher_plus/jobs/stages/download.py | 69 +++- ...test_issue_61_download_always_whitelist.py | 389 ++++++++++++------ 4 files changed, 323 insertions(+), 174 deletions(-) diff --git a/ckanext/datapusher_plus/config.py b/ckanext/datapusher_plus/config.py index bda11c8..4f89146 100644 --- a/ckanext/datapusher_plus/config.py +++ b/ckanext/datapusher_plus/config.py @@ -112,23 +112,13 @@ tk.config.get("ckanext.datapusher_plus.ignore_file_hash", False) ) -# Issue #61: hostnames whose resources should always be re-processed by -# DP+, bypassing the file-hash-based upload-skip optimization in the -# download stage's ``_should_skip_upload``. Use this for hosts that -# update content in place without changing the published byte hash -# (e.g., daily-refreshed reports that overwrite the same URL), or for -# local/peered hosts where re-download is cheap and operators want -# forced re-analysis. Whitespace-separated string in ckan.ini; parsed -# once at import and lowercased into a ``set`` for O(1) lookups. -# Empty (default) → all hosts go through the normal hash-skip path. -DOWNLOAD_ALWAYS_WHITELIST = tk.config.get( - "ckanext.datapusher_plus.download_always_whitelist", "" -) -if isinstance(DOWNLOAD_ALWAYS_WHITELIST, str): - DOWNLOAD_ALWAYS_WHITELIST = DOWNLOAD_ALWAYS_WHITELIST.split() -DOWNLOAD_ALWAYS_WHITELIST = frozenset( - h.lower() for h in DOWNLOAD_ALWAYS_WHITELIST if h -) +# Issue #61: ``ckanext.datapusher_plus.download_always_whitelist`` is +# declared in ``config_declaration.yaml`` (``editable: true``) but is +# intentionally NOT mirrored here as a module-level constant. The +# download stage's ``_host_in_always_whitelist`` reads it from +# ``tk.config`` live at each call so a runtime change via the admin UI +# (adding/removing a host mid-incident, e.g.) takes effect without a +# worker restart. Mirrors the ``file_hash_algorithm`` pattern below. # Issue #221: ``ckanext.datapusher_plus.file_hash_algorithm`` is declared in # ``config_declaration.yaml`` but intentionally NOT mirrored here as a diff --git a/ckanext/datapusher_plus/config_declaration.yaml b/ckanext/datapusher_plus/config_declaration.yaml index 69f03e6..448dc8c 100644 --- a/ckanext/datapusher_plus/config_declaration.yaml +++ b/ckanext/datapusher_plus/config_declaration.yaml @@ -77,10 +77,17 @@ groups: hosts where re-download cost is negligible and operators want forced re-analysis on every push. - Matching is case-insensitive on the URL hostname; ports are - ignored. Empty (default) → all hosts go through the normal - hash-skip path. Worker restart is required for changes to - take effect. + The download stage reads this value live from ``tk.config`` + (per the ``file_hash_algorithm`` pattern from #221), so + admin-UI edits take effect on the next download — no worker + restart needed. + + Matching is case-insensitive on the URL hostname and ports + are ignored. **Exact-match only** — ``data.gov`` in the + whitelist does NOT match ``subdomain.data.gov``; list each + host separately if you need multiple subdomains. Empty + (default) → all hosts go through the normal hash-skip + path. Example:: diff --git a/ckanext/datapusher_plus/jobs/stages/download.py b/ckanext/datapusher_plus/jobs/stages/download.py index f8d8862..4d5bc49 100644 --- a/ckanext/datapusher_plus/jobs/stages/download.py +++ b/ckanext/datapusher_plus/jobs/stages/download.py @@ -76,36 +76,70 @@ def _get_file_hasher(): ) -def _host_in_always_whitelist(url: str) -> bool: - """Return True if ``url``'s hostname is in DOWNLOAD_ALWAYS_WHITELIST. +def _get_download_always_whitelist(): + """Return the parsed DOWNLOAD_ALWAYS_WHITELIST from live config. + + Reads ``ckanext.datapusher_plus.download_always_whitelist`` from + ``tk.config`` live (declared ``editable: true`` in + ``config_declaration.yaml``) so admin-UI edits take effect without + a worker restart. Mirrors the ``_get_file_hasher`` pattern from + issue #221. Parses on each call; the cost is negligible since the + hash-skip check fires at most once per download. + + Returns: + ``frozenset`` of lowercased hostnames. Empty set if the + config is unset, empty, or the value is malformed. + """ + import ckan.plugins.toolkit as tk + + raw = tk.config.get( + "ckanext.datapusher_plus.download_always_whitelist", "" + ) + if isinstance(raw, str): + raw = raw.split() + try: + return frozenset(h.lower() for h in raw if h) + except (TypeError, AttributeError): + # Defensive: malformed config (e.g., a non-iterable) should + # degrade to "no host bypass" rather than crash the download. + return frozenset() + + +def _host_in_always_whitelist(url): + """Return the matched hostname if ``url``'s host is whitelisted, else ``None``. Used by the download stage to bypass the hash-skip optimization in ``_should_skip_upload`` for trusted/peered hosts or hosts that update content in place without changing the published - byte hash. See issue #61 for the operator-side rationale and - ``config.py:DOWNLOAD_ALWAYS_WHITELIST`` for the parsed shape. + byte hash. See issue #61 for the operator-side rationale. + + Returning the matched host (rather than a bare ``bool``) lets + the caller log the host without a second ``urlparse`` call. Args: url: The resource URL as stored in ``resource["url"]``. Returns: - ``True`` if the URL's hostname (case-folded, port stripped) - is in the configured whitelist. ``False`` when the whitelist - is empty, the URL is missing or unparseable, or the URL has - no hostname (relative URLs, ``file://`` with empty host, - etc.) — i.e., the safe default is "do not bypass." + The lowercased hostname (with port stripped) when matched; + ``None`` when the whitelist is empty, the URL is missing or + unparseable, the URL has no hostname (relative URLs, + ``file://`` with empty host, etc.), or the host is not in + the whitelist. Matching is exact: ``data.gov`` in the + whitelist does NOT match ``subdomain.data.gov``. """ - if not conf.DOWNLOAD_ALWAYS_WHITELIST: - return False + whitelist = _get_download_always_whitelist() + if not whitelist: + return None if not url: - return False + return None try: host = urlparse(url).hostname except (ValueError, AttributeError): - return False + return None if not host: - return False - return host.lower() in conf.DOWNLOAD_ALWAYS_WHITELIST + return None + host = host.lower() + return host if host in whitelist else None class DownloadStage(BaseStage): @@ -424,11 +458,12 @@ def _should_skip_upload( # or for local/peered hosts where re-download is essentially # free and operators want forced re-analysis. url = context.resource.get("url", "") if context.resource else "" - if _host_in_always_whitelist(url): + matched_host = _host_in_always_whitelist(url) + if matched_host: context.logger.info( "Host %r is in DOWNLOAD_ALWAYS_WHITELIST; forcing " "re-processing (bypassing hash-skip).", - urlparse(url).hostname, + matched_host, ) return False diff --git a/tests/test_issue_61_download_always_whitelist.py b/tests/test_issue_61_download_always_whitelist.py index 0031b72..1e43dad 100644 --- a/tests/test_issue_61_download_always_whitelist.py +++ b/tests/test_issue_61_download_always_whitelist.py @@ -21,24 +21,29 @@ * Local / peered hosts where re-download cost is negligible and the operator wants forced re-analysis on every push. -These tests pin six properties: +These tests pin seven properties: -1. The config parses an empty default to an empty ``frozenset`` - (default behavior is "no host bypass"). -2. A whitespace-separated string of hostnames parses into a - lowercased ``frozenset`` (case folding makes operator-typed +1. The live-read parser handles an unset / empty value as + "no host bypass" (the default behavior). +2. Whitespace-separated strings and direct list values both parse + into a lowercased ``frozenset`` (case folding makes operator-typed ``DATA.GOV`` match ``data.gov`` on the wire). 3. ``_host_in_always_whitelist`` matches exact hostnames - case-insensitively, strips ports, and is robust against - missing/unparseable URLs (returns ``False`` rather than raising). -4. ``_should_skip_upload`` returns ``False`` when the host is in the + case-insensitively, strips ports, returns the matched host (so + the caller can log it without a second ``urlparse``), and is + robust against missing/unparseable URLs (returns ``None`` + rather than raising). +4. Matching is exact — ``data.gov`` in the whitelist does NOT + match ``subdomain.data.gov``. Pinned so a future "support + subdomains" patch breaks this test rather than quietly + changing behavior. +5. ``_should_skip_upload`` returns ``False`` when the host is in the whitelist even when the byte hash matches — the core promise. -5. ``_should_skip_upload`` preserves its prior skip behavior for +6. ``_should_skip_upload`` preserves its prior skip behavior for hosts NOT in the whitelist (regression guard against an inadvertent "always re-process everyone" bug). -6. The YAML declaration default agrees with the ``config.py`` - import-time fallback (the #179-style declaration-vs-fallback - drift guard). +7. Live-read takes effect on the next call (roborev #2293 LOW — + the ``editable: true`` declaration is now truthful). """ from __future__ import annotations @@ -54,162 +59,271 @@ # --------------------------------------------------------------------------- -# Config parsing +# Fixtures: set the whitelist via tk.config, since reads are now live # --------------------------------------------------------------------------- -def _reimport_conf(monkeypatch, value): - """Reload config.py with the given whitelist value applied to - ``tk.config`` so the import-time parse picks it up. Returns the - freshly-imported module.""" - import importlib - +@pytest.fixture +def set_whitelist(monkeypatch): + """Set ``ckanext.datapusher_plus.download_always_whitelist`` live + on ``tk.config``. Returns a setter the test can call multiple + times to simulate admin-UI runtime edits.""" pytest.importorskip("ckan") import ckan.plugins.toolkit as tk - monkeypatch.setitem( - tk.config, "ckanext.datapusher_plus.download_always_whitelist", value - ) - import ckanext.datapusher_plus.config as conf + def _set(value): + monkeypatch.setitem( + tk.config, + "ckanext.datapusher_plus.download_always_whitelist", + value, + ) + + return _set + + +@pytest.fixture +def download_module(): + """Return the download stage module without reloading (live-read + means no reload dance is needed).""" + pytest.importorskip("ckan") + import ckanext.datapusher_plus.jobs.stages.download as download - return importlib.reload(conf) + return download -def test_empty_config_parses_to_empty_frozenset(monkeypatch): +# --------------------------------------------------------------------------- +# Live-read parsing +# --------------------------------------------------------------------------- + + +def test_unset_config_parses_to_empty_frozenset(monkeypatch, download_module): # Default behavior — operators without the setting must not get a # surprise "everyone is whitelisted" or a crash on first download. - conf = _reimport_conf(monkeypatch, "") - assert conf.DOWNLOAD_ALWAYS_WHITELIST == frozenset() - assert isinstance(conf.DOWNLOAD_ALWAYS_WHITELIST, frozenset) + pytest.importorskip("ckan") + import ckan.plugins.toolkit as tk + + monkeypatch.delitem( + tk.config, + "ckanext.datapusher_plus.download_always_whitelist", + raising=False, + ) + parsed = download_module._get_download_always_whitelist() + assert parsed == frozenset() + assert isinstance(parsed, frozenset) + + +def test_empty_string_parses_to_empty_frozenset(set_whitelist, download_module): + set_whitelist("") + assert download_module._get_download_always_whitelist() == frozenset() def test_whitespace_separated_string_parses_to_lowercased_frozenset( - monkeypatch, + set_whitelist, download_module ): # Mirrors the existing ``FORMATS`` pattern: whitespace-separated - # in ``ckan.ini``, parsed at import. Lowercasing happens here so - # the hot-path ``_host_in_always_whitelist`` doesn't have to - # re-fold the set on every call. - conf = _reimport_conf( - monkeypatch, - "Data.Internal.Gov data.partner.org\tlocalhost", - ) - assert conf.DOWNLOAD_ALWAYS_WHITELIST == frozenset( + # in ``ckan.ini``, parsed by the live-read function. Lowercasing + # happens here so the hot-path lookup can use ``in`` directly + # without re-folding. + set_whitelist("Data.Internal.Gov data.partner.org\tlocalhost") + assert download_module._get_download_always_whitelist() == frozenset( {"data.internal.gov", "data.partner.org", "localhost"} ) -def test_list_value_parses_to_lowercased_frozenset(monkeypatch): +def test_list_value_parses_to_lowercased_frozenset( + set_whitelist, download_module +): # Some CKAN config layers (env-vars via ``CKAN_INI`` overrides, # programmatic ``tk.config`` patches in tests) pass list values - # through directly. The branch that handles ``isinstance(str)`` - # must leave a real list alone. - conf = _reimport_conf(monkeypatch, ["Foo.example", "BAR.example"]) - assert conf.DOWNLOAD_ALWAYS_WHITELIST == frozenset( + # through directly. The ``isinstance(str)`` branch must leave a + # real list alone. + set_whitelist(["Foo.example", "BAR.example"]) + assert download_module._get_download_always_whitelist() == frozenset( {"foo.example", "bar.example"} ) -def test_empty_strings_in_split_are_filtered(monkeypatch): - # ``" data.gov ".split()`` already drops empties, but if the - # operator writes ``"data.gov, , partner.org"`` and a downstream - # comma-tolerant parser is added later, the comprehension's - # ``if h`` guard catches the empty entries so the frozenset - # doesn't contain ``""`` (which would match every hostless URL). - conf = _reimport_conf(monkeypatch, "data.gov partner.org") - assert "" not in conf.DOWNLOAD_ALWAYS_WHITELIST +def test_empty_strings_in_split_are_filtered(set_whitelist, download_module): + # If a future comma-tolerant parser is added, the ``if h`` guard + # catches empty entries so the frozenset doesn't contain ``""`` + # (which would match every hostless URL). + set_whitelist("data.gov partner.org") + assert "" not in download_module._get_download_always_whitelist() -# --------------------------------------------------------------------------- -# Helper: _host_in_always_whitelist -# --------------------------------------------------------------------------- +def test_malformed_value_degrades_to_empty_frozenset( + set_whitelist, download_module +): + # Defensive: a non-iterable / non-string value (e.g., someone + # pokes an int into the live config via a broken admin UI) must + # not crash the download stage — falls back to "no host bypass." + set_whitelist(12345) + assert download_module._get_download_always_whitelist() == frozenset() -@pytest.fixture -def helper(monkeypatch): - """Return the freshly-imported ``_host_in_always_whitelist`` with - a known whitelist applied. Each test gets its own fresh module - state so monkeypatching is hermetic.""" - pytest.importorskip("ckan") - _reimport_conf(monkeypatch, "data.internal.gov localhost 10.0.0.5") - # Reload the download module too so its ``conf`` reference picks - # up the reloaded constant. Without this, the function closes - # over an old conf object and the test would see the old value. - import importlib - import ckanext.datapusher_plus.jobs.stages.download as download - - download = importlib.reload(download) - return download._host_in_always_whitelist +# --------------------------------------------------------------------------- +# Helper: _host_in_always_whitelist returns matched host (or None) +# --------------------------------------------------------------------------- -def test_empty_whitelist_returns_false(monkeypatch): +def test_empty_whitelist_returns_none(set_whitelist, download_module): # If no whitelist is configured, the helper must short-circuit - # ``False`` so it can't even mis-match the empty string against a + # ``None`` so it can't even mis-match the empty string against a # hostless URL. - pytest.importorskip("ckan") - _reimport_conf(monkeypatch, "") - import importlib - import ckanext.datapusher_plus.jobs.stages.download as download - - download = importlib.reload(download) - assert download._host_in_always_whitelist("https://anything.example") is False + set_whitelist("") + assert ( + download_module._host_in_always_whitelist("https://anything.example") + is None + ) -def test_exact_hostname_match_returns_true(helper): - assert helper("https://data.internal.gov/csv/file.csv") is True +def test_exact_hostname_match_returns_lowercased_host( + set_whitelist, download_module +): + # Returning the matched host (not just True) lets the caller + # log it without a second urlparse call (roborev #2293 LOW). + set_whitelist("data.internal.gov localhost 10.0.0.5") + matched = download_module._host_in_always_whitelist( + "https://data.internal.gov/csv/file.csv" + ) + assert matched == "data.internal.gov" -def test_non_matching_hostname_returns_false(helper): - assert helper("https://data.external.gov/csv/file.csv") is False +def test_non_matching_hostname_returns_none(set_whitelist, download_module): + set_whitelist("data.internal.gov") + assert ( + download_module._host_in_always_whitelist( + "https://data.external.gov/csv/file.csv" + ) + is None + ) -def test_match_is_case_insensitive(helper): +def test_match_is_case_insensitive(set_whitelist, download_module): # URL hosts are DNS-case-insensitive; the helper must fold the # URL's host to lowercase before set lookup or it'd miss # ``https://DATA.INTERNAL.GOV/...`` (which a CDN or legacy CMS # could plausibly emit). - assert helper("https://DATA.INTERNAL.GOV/file.csv") is True + set_whitelist("data.internal.gov") + matched = download_module._host_in_always_whitelist( + "https://DATA.INTERNAL.GOV/file.csv" + ) + assert matched == "data.internal.gov" -def test_port_is_stripped_before_match(helper): +def test_port_is_stripped_before_match(set_whitelist, download_module): # ``urlparse.hostname`` already drops the port, but pin this so a # future "use raw netloc" refactor would break this test before # it breaks production for ``localhost:5050`` deployments. - assert helper("http://localhost:5050/api/3/action/...") is True + set_whitelist("localhost") + assert ( + download_module._host_in_always_whitelist( + "http://localhost:5050/api/3/action/..." + ) + == "localhost" + ) -def test_ip_address_host_matches(helper): +def test_ip_address_host_matches(set_whitelist, download_module): # IPv4 literals are valid hosts; ``hostname`` returns them as-is. # Used by deployments that pin internal services by IP. - assert helper("http://10.0.0.5/csv/file.csv") is True + set_whitelist("10.0.0.5") + assert ( + download_module._host_in_always_whitelist( + "http://10.0.0.5/csv/file.csv" + ) + == "10.0.0.5" + ) + + +def test_subdomain_does_not_match_parent_domain( + set_whitelist, download_module +): + # roborev #2293 LOW pin: matching is exact-hostname, not + # suffix. Listing ``data.internal.gov`` must NOT also match + # ``sub.data.internal.gov``. Operators who want subdomain + # behavior list each host separately. A future "support + # subdomain wildcards" change should break this test rather + # than quietly altering the semantic. + set_whitelist("data.internal.gov") + assert ( + download_module._host_in_always_whitelist( + "https://sub.data.internal.gov/file.csv" + ) + is None + ) -def test_unparseable_url_returns_false(helper): - # Garbage in → ``False`` out, not an exception. The download +def test_parent_domain_does_not_match_subdomain_entry( + set_whitelist, download_module +): + # The reverse direction — listing the subdomain must not match + # the parent. Catches a naive ``endswith`` implementation. + set_whitelist("sub.data.internal.gov") + assert ( + download_module._host_in_always_whitelist( + "https://data.internal.gov/file.csv" + ) + is None + ) + + +def test_unparseable_url_returns_none(set_whitelist, download_module): + # Garbage in → ``None`` out, not an exception. The download # stage handles its own URL validation elsewhere # (``_validate_url_scheme``); this helper should not become a # second exception source on the hot path. - assert helper("://://broken") is False + set_whitelist("data.gov") + assert ( + download_module._host_in_always_whitelist("://://broken") is None + ) -def test_empty_url_returns_false(helper): - assert helper("") is False +def test_empty_url_returns_none(set_whitelist, download_module): + set_whitelist("data.gov") + assert download_module._host_in_always_whitelist("") is None -def test_none_url_returns_false(helper): +def test_none_url_returns_none(set_whitelist, download_module): # Defensive against ``context.resource.get("url")`` returning # ``None`` (which is technically possible if the resource record # is malformed — surface as a non-match rather than an # ``AttributeError`` on ``.hostname``). - assert helper(None) is False + set_whitelist("data.gov") + assert download_module._host_in_always_whitelist(None) is None -def test_hostless_url_returns_false(helper): +def test_hostless_url_returns_none(set_whitelist, download_module): # ``file:///tmp/foo.csv`` parses cleanly but has an empty host; - # must not match an empty-string entry in the set (the - # ``if h`` filter in config.py guarantees the set has no ``""``, - # but pin the helper's own guard too). - assert helper("file:///tmp/foo.csv") is False + # must not match an empty-string entry in the set. + set_whitelist("data.gov") + assert ( + download_module._host_in_always_whitelist("file:///tmp/foo.csv") + is None + ) + + +def test_live_read_picks_up_runtime_changes(set_whitelist, download_module): + # The live-read pattern (issue #221 / file_hash_algorithm) must + # actually pick up a runtime change without any reload. This is + # the "editable: true is truthful" property — without it, the + # admin-UI knob would be a lie. + set_whitelist("first.example") + assert ( + download_module._host_in_always_whitelist("https://first.example/x") + == "first.example" + ) + # Operator edits the whitelist via the admin UI… + set_whitelist("second.example") + # …and the next call sees the new value, no restart needed. + assert ( + download_module._host_in_always_whitelist("https://first.example/x") + is None + ) + assert ( + download_module._host_in_always_whitelist("https://second.example/x") + == "second.example" + ) # --------------------------------------------------------------------------- @@ -233,18 +347,15 @@ def _make_context(url, file_hash="abc123", last_modified=None): ) -def test_should_skip_upload_returns_false_when_host_whitelisted(monkeypatch): +def test_should_skip_upload_returns_false_when_host_whitelisted( + set_whitelist, download_module, monkeypatch +): # The core promise of #61: a host in the whitelist forces # re-processing even when the byte hash matches what's already # stored on the resource. - pytest.importorskip("ckan") - _reimport_conf(monkeypatch, "data.internal.gov") - import importlib - import ckanext.datapusher_plus.jobs.stages.download as download + set_whitelist("data.internal.gov") - download = importlib.reload(download) - - stage = download.DownloadStage() + stage = download_module.DownloadStage() ctx = _make_context( url="https://data.internal.gov/csv/file.csv", file_hash="matching-hash", @@ -252,29 +363,30 @@ def test_should_skip_upload_returns_false_when_host_whitelisted(monkeypatch): assert stage._should_skip_upload(ctx, "matching-hash", {}) is False # And the operator-facing log line fired so they can see WHY a - # whitelisted host bypassed the dedup path. + # whitelisted host bypassed the dedup path. The matched host + # is one of the positional args (so the log adapter formats it + # in regardless of %-vs-{} style). ctx.logger.info.assert_called_once() + log_args = ctx.logger.info.call_args + assert "data.internal.gov" in log_args.args -def test_should_skip_upload_preserves_skip_for_non_whitelisted(monkeypatch): +def test_should_skip_upload_preserves_skip_for_non_whitelisted( + set_whitelist, download_module, monkeypatch +): # Regression guard: a host NOT in the whitelist must still skip # when the hash matches, ``ignore_hash`` is false, and # ``IGNORE_FILE_HASH`` is false. Without this test, an # accidentally-inverted whitelist check (e.g., ``not in``) would # silently re-process every resource — bypassing the whole # dedup optimization the stage was built around. - pytest.importorskip("ckan") - _reimport_conf(monkeypatch, "data.internal.gov") - import importlib - import ckanext.datapusher_plus.jobs.stages.download as download - - download = importlib.reload(download) + set_whitelist("data.internal.gov") # Force IGNORE_FILE_HASH false so the hash check is the deciding # factor (otherwise the existing global override would also # return False and mask a buggy whitelist). - monkeypatch.setattr(download.conf, "IGNORE_FILE_HASH", False) + monkeypatch.setattr(download_module.conf, "IGNORE_FILE_HASH", False) - stage = download.DownloadStage() + stage = download_module.DownloadStage() ctx = _make_context( url="https://data.external.gov/csv/file.csv", file_hash="matching-hash", @@ -284,21 +396,16 @@ def test_should_skip_upload_preserves_skip_for_non_whitelisted(monkeypatch): def test_should_skip_upload_empty_whitelist_preserves_legacy_behavior( - monkeypatch, + set_whitelist, download_module, monkeypatch ): # Operators who never set the new key get exactly the pre-#61 # behavior — hash match means skip. Without this test, a future # refactor that accidentally inverted the "empty whitelist # short-circuit" path could break upgrade-in-place deployments. - pytest.importorskip("ckan") - _reimport_conf(monkeypatch, "") - import importlib - import ckanext.datapusher_plus.jobs.stages.download as download - - download = importlib.reload(download) - monkeypatch.setattr(download.conf, "IGNORE_FILE_HASH", False) + set_whitelist("") + monkeypatch.setattr(download_module.conf, "IGNORE_FILE_HASH", False) - stage = download.DownloadStage() + stage = download_module.DownloadStage() ctx = _make_context( url="https://data.internal.gov/csv/file.csv", file_hash="matching-hash", @@ -308,18 +415,22 @@ def test_should_skip_upload_empty_whitelist_preserves_legacy_behavior( # --------------------------------------------------------------------------- -# Declaration-vs-fallback drift guard (#179-style) +# Declaration-vs-config drift guard (#179-style) # --------------------------------------------------------------------------- -def test_config_declaration_default_matches_config_py_fallback(): +def test_config_declaration_default_matches_live_read_fallback(): """Both sources of truth for the default whitelist must agree. - Drift between ``config_declaration.yaml`` and ``config.py``'s - ``tk.config.get(..., default)`` was the root cause of issue #179 + Drift between ``config_declaration.yaml`` and the live-read + fallback in ``download.py`` was the root cause of issue #179 (date → timestamp casting). Pin the same property for #61 so a future "let's only update one of them" PR fails this test instead of silently shipping inconsistent defaults. + + Now that the value is read live (roborev #2293 LOW resolution), + the fallback lives in ``download.py:_get_download_always_whitelist`` + rather than ``config.py`` — AST-parse that file instead. """ import ast import yaml @@ -342,11 +453,16 @@ def test_config_declaration_default_matches_config_py_fallback(): assert whitelist_decl["default"] == "" assert whitelist_decl["editable"] is True - # Read config.py inline fallback via AST (avoid CKAN bootstrap) - config_py = ( - REPO_ROOT / "ckanext" / "datapusher_plus" / "config.py" + # Read download.py live-read fallback via AST (avoid CKAN bootstrap) + download_py = ( + REPO_ROOT + / "ckanext" + / "datapusher_plus" + / "jobs" + / "stages" + / "download.py" ).read_text() - tree = ast.parse(config_py) + tree = ast.parse(download_py) found_default = None for node in ast.walk(tree): if ( @@ -362,6 +478,7 @@ def test_config_declaration_default_matches_config_py_fallback(): found_default = node.args[1].value break assert found_default == "", ( - "config.py inline fallback for download_always_whitelist must " - "match the YAML declaration default (both empty string)." + "download.py live-read fallback for " + "download_always_whitelist must match the YAML declaration " + "default (both empty string)." ) From f00bbd8b303e45894ec9d28fea01d9939aa19687 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 19 May 2026 10:57:45 -0400 Subject: [PATCH 3/4] =?UTF-8?q?fix(#61):=20address=20Copilot=20review=20on?= =?UTF-8?q?=20PR=20#323=20=E2=80=94=20drift-guard=20test=20robustness?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two LOW nits on tests/test_issue_61_download_always_whitelist.py from Copilot's review of PR #323: * **Brittle group-index lookup**: ``decl["groups"][0]["options"]`` assumed the target option lived in the first group, breaking the drift-guard if ``config_declaration.yaml`` ever grows additional groups or the key is moved between groups. Switched to iterating all ``groups`` / ``options`` to find the target key by name — matches the pattern in ``test_issue_112_decimal_comma.py:175-185``. Also added a descriptive assertion message ("key is missing from config_declaration.yaml — issue #61 should declare it") if the declaration is ever removed. * **Unconditional ``import yaml``**: Other drift-guard tests in the repo (``test_issue_112_*``, ``test_issue_142_*``) wrap ``import yaml`` in ``try/except ImportError → pytest.skip(...)`` so CI environments running unit tests without the full extension dep set degrade cleanly. Adopted the same pattern. Tests: still 23 in the file, full Python suite still 274 passing. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...test_issue_61_download_always_whitelist.py | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/tests/test_issue_61_download_always_whitelist.py b/tests/test_issue_61_download_always_whitelist.py index 1e43dad..dbbdbdf 100644 --- a/tests/test_issue_61_download_always_whitelist.py +++ b/tests/test_issue_61_download_always_whitelist.py @@ -433,22 +433,37 @@ def test_config_declaration_default_matches_live_read_fallback(): rather than ``config.py`` — AST-parse that file instead. """ import ast - import yaml - # Read declaration default + # PyYAML import guarded the same way the #112 / #142 drift-guards + # do — CI environments that run pytest without the extension's + # full dep set still get a useful skip rather than an ImportError. + try: + import yaml + except ImportError: + pytest.skip("PyYAML not available") + + # Iterate all groups + options so this guard survives a future + # ``config_declaration.yaml`` restructure (additional groups, key + # moved between groups, etc.) — same shape as #112's drift guard. yaml_path = ( REPO_ROOT / "ckanext" / "datapusher_plus" / "config_declaration.yaml" ) - with yaml_path.open() as fh: - decl = yaml.safe_load(fh) - options = decl["groups"][0]["options"] - whitelist_decl = next( - opt - for opt in options - if opt["key"] == "ckanext.datapusher_plus.download_always_whitelist" + decl = yaml.safe_load(yaml_path.read_text(encoding="utf-8")) + target_key = "ckanext.datapusher_plus.download_always_whitelist" + whitelist_decl = None + for group in decl.get("groups", []): + for opt in group.get("options", []): + if opt.get("key") == target_key: + whitelist_decl = opt + break + if whitelist_decl is not None: + break + assert whitelist_decl is not None, ( + f"{target_key} is missing from config_declaration.yaml — " + "issue #61 should declare it." ) assert whitelist_decl["default"] == "" assert whitelist_decl["editable"] is True From 9cb043bb83e242440d8b0842586dbc480162916f Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 19 May 2026 13:09:05 -0400 Subject: [PATCH 4/4] ci: pin importlib_metadata to unblock prefect worker startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prefect 3.7.1's ``workers/base.py`` imports ``importlib_metadata`` unconditionally but doesn't declare it as a direct dependency. Until this morning the import was satisfied transitively via ``opentelemetry-api`` 1.41.x; ``opentelemetry-api`` 1.42.0 (released 2026-05-19, ~11:00 UTC) dropped that transitive, so every PR CI run since started failing in the "Start Prefect worker" step: ModuleNotFoundError: No module named 'importlib_metadata' Verified by diffing the ``pip install`` output between the last successful main run (26077880669) and PR #323's failing runs — the sole packaging difference is ``importlib-metadata-8.7.1`` present in the former, absent in the latter, alongside ``opentelemetry-api`` bumping 1.41.1 → 1.42.0. Fix: declare ``importlib_metadata`` as a direct DP+ requirement so its presence doesn't depend on which transitive deps Prefect's sibling packages happen to pull in this week. Pinned loose because the backport's API is stable across versions. This is functionally the same problem class as issue #149 — a runtime dependency on a transitive package that was working by accident until an upstream version bump exposed it. Different package, same shape. Co-Authored-By: Claude Opus 4.7 (1M context) --- requirements.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/requirements.txt b/requirements.txt index a6331b2..0ee1502 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,17 @@ pandas==2.2.3 shapely==2.1.0 pyproj>=3.7.1 prefect>=3.7,<3.8 +# importlib_metadata: declared explicitly because prefect 3.7.1's +# ``workers/base.py`` imports it unconditionally but doesn't list it +# as a direct dep — it was being satisfied transitively by +# ``opentelemetry-api`` until 1.42.0 (released 2026-05-19) dropped +# that transitive. Without this pin, ``prefect worker start`` fails +# with ``ModuleNotFoundError: No module named 'importlib_metadata'`` +# and breaks the integration-test workflow. Pinned loose because +# the backport's API is stable across versions; the stdlib has +# ``importlib.metadata`` since Python 3.8 but Prefect specifically +# imports the backport spelling. +importlib_metadata>=4.6 # blake3: default for ckanext.datapusher_plus.file_hash_algorithm (issue #221). # Pure-Python fallback is in stdlib's hashlib for sha256/md5; blake3 is the # only one that needs an external package. Pinned loose because the wheel