Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions ckanext/datapusher_plus/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,14 @@
tk.config.get("ckanext.datapusher_plus.ignore_file_hash", False)
)

# Issue #61: ``ckanext.datapusher_plus.download_always_whitelist`` is
# declared in ``config_declaration.yaml`` (``editable: true``) but is
# intentionally NOT mirrored here as a module-level constant. The
# download stage's ``_host_in_always_whitelist`` reads it from
# ``tk.config`` live at each call so a runtime change via the admin UI
# (adding/removing a host mid-incident, e.g.) takes effect without a
# worker restart. Mirrors the ``file_hash_algorithm`` pattern below.

# Issue #221: ``ckanext.datapusher_plus.file_hash_algorithm`` is declared in
# ``config_declaration.yaml`` but intentionally NOT mirrored here as a
# module-level constant. The setting is ``editable: true`` and the download
Expand Down
33 changes: 33 additions & 0 deletions ckanext/datapusher_plus/config_declaration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,39 @@ groups:
description: |
Download proxy

- key: ckanext.datapusher_plus.download_always_whitelist
editable: true
default: ""
description: |
Whitespace-separated list of hostnames whose resources should
always be re-processed by DP+, bypassing the file-hash-based
upload-skip optimization (see ``_should_skip_upload`` in the
download stage). Use this for hosts that update content in
place without changing the published byte hash (e.g., daily-
refreshed reports that overwrite the same URL with the same
bytes but logically-different rows), or for local/peered
hosts where re-download cost is negligible and operators
want forced re-analysis on every push.

The download stage reads this value live from ``tk.config``
(per the ``file_hash_algorithm`` pattern from #221), so
admin-UI edits take effect on the next download — no worker
restart needed.

Matching is case-insensitive on the URL hostname and ports
are ignored. **Exact-match only** — ``data.gov`` in the
whitelist does NOT match ``subdomain.data.gov``; list each
host separately if you need multiple subdomains. Empty
(default) → all hosts go through the normal hash-skip
path.

Example::

ckanext.datapusher_plus.download_always_whitelist =
data.internal.gov data.partner.org localhost

Issue #61.

- key: ckanext.datapusher_plus.dictionary_stash_dir
editable: true
default: ""
Expand Down
84 changes: 84 additions & 0 deletions ckanext/datapusher_plus/jobs/stages/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,72 @@ def _get_file_hasher():
)


def _get_download_always_whitelist():
"""Return the parsed DOWNLOAD_ALWAYS_WHITELIST from live config.

Reads ``ckanext.datapusher_plus.download_always_whitelist`` from
``tk.config`` live (declared ``editable: true`` in
``config_declaration.yaml``) so admin-UI edits take effect without
a worker restart. Mirrors the ``_get_file_hasher`` pattern from
issue #221. Parses on each call; the cost is negligible since the
hash-skip check fires at most once per download.

Returns:
``frozenset`` of lowercased hostnames. Empty set if the
config is unset, empty, or the value is malformed.
"""
import ckan.plugins.toolkit as tk

raw = tk.config.get(
"ckanext.datapusher_plus.download_always_whitelist", ""
)
if isinstance(raw, str):
raw = raw.split()
try:
return frozenset(h.lower() for h in raw if h)
except (TypeError, AttributeError):
# Defensive: malformed config (e.g., a non-iterable) should
# degrade to "no host bypass" rather than crash the download.
return frozenset()


def _host_in_always_whitelist(url):
"""Return the matched hostname if ``url``'s host is whitelisted, else ``None``.

Used by the download stage to bypass the hash-skip optimization
in ``_should_skip_upload`` for trusted/peered hosts or hosts
that update content in place without changing the published
byte hash. See issue #61 for the operator-side rationale.

Returning the matched host (rather than a bare ``bool``) lets
the caller log the host without a second ``urlparse`` call.

Args:
url: The resource URL as stored in ``resource["url"]``.

Returns:
The lowercased hostname (with port stripped) when matched;
``None`` when the whitelist is empty, the URL is missing or
unparseable, the URL has no hostname (relative URLs,
``file://`` with empty host, etc.), or the host is not in
the whitelist. Matching is exact: ``data.gov`` in the
whitelist does NOT match ``subdomain.data.gov``.
"""
whitelist = _get_download_always_whitelist()
if not whitelist:
return None
if not url:
return None
try:
host = urlparse(url).hostname
except (ValueError, AttributeError):
return None
if not host:
return None
host = host.lower()
return host if host in whitelist else None


class DownloadStage(BaseStage):
"""
Downloads the resource file, validates it, and handles ZIP extraction.
Expand Down Expand Up @@ -383,6 +449,24 @@ def _should_skip_upload(
Returns:
True if upload should be skipped, False otherwise
"""
# Issue #61: forced re-processing for operator-whitelisted hosts.
# Bypasses the hash-skip path entirely so resources from these
# hosts get re-downloaded + re-analyzed on every push. This is
# intended for hosts that update content in place without
# changing the byte hash (e.g., a daily report that overwrites
# the same URL with the same template but new underlying data),
# or for local/peered hosts where re-download is essentially
# free and operators want forced re-analysis.
url = context.resource.get("url", "") if context.resource else ""
matched_host = _host_in_always_whitelist(url)
if matched_host:
context.logger.info(
"Host %r is in DOWNLOAD_ALWAYS_WHITELIST; forcing "
"re-processing (bypassing hash-skip).",
matched_host,
)
return False

# Check if resource metadata was updated
resource_updated = False
resource_last_modified = context.resource.get("last_modified")
Expand Down
11 changes: 11 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,17 @@ pandas==2.2.3
shapely==2.1.0
pyproj>=3.7.1
prefect>=3.7,<3.8
# importlib_metadata: declared explicitly because prefect 3.7.1's
# ``workers/base.py`` imports it unconditionally but doesn't list it
# as a direct dep — it was being satisfied transitively by
# ``opentelemetry-api`` until 1.42.0 (released 2026-05-19) dropped
# that transitive. Without this pin, ``prefect worker start`` fails
# with ``ModuleNotFoundError: No module named 'importlib_metadata'``
# and breaks the integration-test workflow. Pinned loose because
# the backport's API is stable across versions; the stdlib has
# ``importlib.metadata`` since Python 3.8 but Prefect specifically
# imports the backport spelling.
importlib_metadata>=4.6
# blake3: default for ckanext.datapusher_plus.file_hash_algorithm (issue #221).
# Pure-Python fallback is in stdlib's hashlib for sha256/md5; blake3 is the
# only one that needs an external package. Pinned loose because the wheel
Expand Down
Loading
Loading