From 13b90322b279cf80924788b850208c04b5af7569 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 8 May 2026 16:44:10 -0500 Subject: [PATCH 1/7] Surface real failures in paginated waterdata responses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs in _walk_pages (and the parallel get_stats_data pagination loop) caused silent or misleading failures when a paginated request was interrupted mid-walk: 1. The except handler called _error_body(resp) — but when client.request() itself raised (ConnectionError, Timeout, etc.), `resp` still pointed at the *previous successful page*. The "incomplete" log message therefore described the wrong request, and — if that stale page's 200 body wasn't JSON — _error_body would itself raise inside resp.json() and bury the original failure. 2. No status-code check was performed on paginated responses. The initial request guards `if resp.status_code != 200`, but the loop doesn't. A 5xx body that didn't include "numberReturned" was silently turned into an empty DataFrame by _get_resp_data, the `next` link wasn't found, and the loop quietly exited — handing the user truncated data with no error logged anywhere. This affects every paginated waterdata getter (get_daily, get_continuous, get_monitoring_locations, …). Fix: - Guard the loop body with `if resp.status_code != 200: raise RuntimeError(_error_body(resp))`, mirroring the initial request. - Capture the exception with `as e` and log `e` instead of touching the stale `resp`. Same change in get_stats_data. Behavior preserved: on failure the loop still logs and returns whatever pages were collected ("best effort"). The change is purely to make the failure observable and the log message accurate. Two new tests cover (a) network-exception mid-pagination and (b) 5xx mid-pagination, asserting that the actual failure surfaces in the error log. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 + dataretrieval/waterdata/utils.py | 22 +++++--- tests/waterdata_utils_test.py | 87 ++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2faaeb42..548d4878 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +**05/08/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved. + **05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**. **05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`. 
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 784f2969..7756e101 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -635,11 +635,18 @@ def _walk_pages( headers=headers, data=content if method == "POST" else None, ) + # Mirror the initial-request check; otherwise a 5xx body + # without "numberReturned" silently yields an empty frame + # and the loop quietly stops. + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) dfs.append(_get_resp_data(resp, geopd=geopd)) curr_url = _next_req_url(resp) - except Exception: # noqa: BLE001 - error_text = _error_body(resp) - logger.error("Request incomplete. %s", error_text) + except Exception as e: # noqa: BLE001 + # Report the *actual* failure — not _error_body(resp) on a + # stale prior-page response (which would describe the wrong + # request and can itself raise on non-JSON bodies). + logger.error("Request incomplete: %s", e) logger.warning( "Request failed for URL: %s. Data download interrupted.", curr_url ) @@ -1099,14 +1106,15 @@ def get_stats_data( params=args, headers=headers, ) + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) body = resp.json() all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) next_token = body["next"] - except Exception: # noqa: BLE001 - error_text = _error_body(resp) - logger.error("Request incomplete. %s", error_text) + except Exception as e: # noqa: BLE001 + logger.error("Request incomplete: %s", e) logger.warning( - "Request failed for URL: %s. Data download interrupted.", resp.url + "Request failed for URL: %s. 
Data download interrupted.", url ) next_token = None diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 8151ab37..3008be15 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -86,6 +86,93 @@ def test_walk_pages_multiple_mocked(): assert mock_client.request.call_args[0][1] == "https://example.com/page2" +def _resp_ok(features): + """Build a 200-OK mock response carrying the given features list.""" + resp = mock.MagicMock() + resp.json.return_value = { + "numberReturned": len(features), + "features": features, + "links": [{"rel": "next", "href": "https://example.com/page2"}] + if features + else [], + } + resp.headers = {} + resp.links = {"next": {"url": "https://example.com/page2"}} if features else {} + resp.status_code = 200 + resp.url = "https://example.com/page1" + return resp + + +def _walk_pages_with_failure(failure_resp_or_exc): + """Run _walk_pages where page 1 succeeds and page 2 fails as given.""" + resp1 = _resp_ok([{"id": "1", "properties": {"val": "a"}}]) + + mock_client = mock.MagicMock(spec=requests.Session) + mock_client.send.return_value = resp1 + if isinstance(failure_resp_or_exc, BaseException): + mock_client.request.side_effect = failure_resp_or_exc + else: + mock_client.request.return_value = failure_resp_or_exc + + mock_req = mock.MagicMock(spec=requests.PreparedRequest) + mock_req.method = "GET" + mock_req.headers = {} + mock_req.url = "https://example.com/page1" + + return _walk_pages(geopd=False, req=mock_req, client=mock_client) + + +def test_walk_pages_logs_actual_exception_when_request_raises(caplog): + """When client.request() itself raises (e.g. 
a connection error), the + logged failure must reflect that exception — not _error_body() on the + stale prior-page response, which described the wrong request and could + itself crash on non-JSON bodies.""" + import logging + + caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils") + + df, _ = _walk_pages_with_failure(requests.ConnectionError("boom")) + + # First page's data is preserved (best-effort behavior). + assert list(df["val"]) == ["a"] + # Logged error mentions the actual ConnectionError, not a stale page body. + error_messages = [ + r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR + ] + assert any("boom" in m for m in error_messages), error_messages + + +def test_walk_pages_surfaces_5xx_mid_pagination(caplog): + """A non-200 response on a paginated page must be detected; previously + the loop happily called _get_resp_data() on the 5xx body, which — + lacking 'numberReturned' — silently produced an empty DataFrame and + the loop quietly stopped with no error logged.""" + import logging + + caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils") + + page2_500 = mock.MagicMock() + page2_500.status_code = 503 + page2_500.json.return_value = { + "code": "ServiceUnavailable", + "description": "upstream timeout", + } + page2_500.url = "https://example.com/page2" + + df, _ = _walk_pages_with_failure(page2_500) + + assert list(df["val"]) == ["a"] + error_messages = [ + r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR + ] + # The 5xx is now visible in the error log (it would previously have + # been silently swallowed because _get_resp_data returned an empty + # frame and the loop stopped quietly). 
+ assert any("503" in m or "ServiceUnavailable" in m for m in error_messages), ( + error_messages + ) + + def test_handle_stats_nesting_tolerates_missing_drop_columns(): """If the upstream stats response shape ever changes such that one of the columns we try to drop ("type", "properties.data") is absent, the From 13a63fc4e030cb4b4c9d2106f75caf36debd6869 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 12 May 2026 19:48:48 -0500 Subject: [PATCH 2/7] Extract _raise_for_non_200 helper and trim PR #273 comments The status-code guard appeared four times in waterdata/utils.py (initial-request and pagination paths in both _walk_pages and get_stats_data) with bit-identical bodies. Extract into a single named helper; the helper's docstring carries the "silent-empty-frame" WHY that the inline comment was apologizing for. Also: - trim the second multi-line comment in _walk_pages to a single line noting the actual constraint (`resp` may be stale) - hoist `import logging` to module level in waterdata_utils_test.py - condense the two new test docstrings to one-line contract statements; the bug history lives in the commit message Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 30 ++++++++++++++++-------------- tests/waterdata_utils_test.py | 15 +++------------ 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 7756e101..c36bbe33 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -368,6 +368,17 @@ def _error_body(resp: requests.Response): ) +def _raise_for_non_200(resp: requests.Response) -> None: + """Raise ``RuntimeError(_error_body(resp))`` if ``resp`` is not 200. + + Used by both the initial-request gate and the pagination loops; without + the loop-side check, a 5xx body lacking ``numberReturned`` would be + silently treated as an empty page and pagination would stop quietly. 
+ """ + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) + + def _construct_api_requests( service: str, properties: list[str] | None = None, @@ -612,8 +623,7 @@ def _walk_pages( client = client or requests.Session() try: resp = client.send(req) - if resp.status_code != 200: - raise RuntimeError(_error_body(resp)) + _raise_for_non_200(resp) # Store the initial response for metadata initial_response = resp @@ -635,17 +645,11 @@ def _walk_pages( headers=headers, data=content if method == "POST" else None, ) - # Mirror the initial-request check; otherwise a 5xx body - # without "numberReturned" silently yields an empty frame - # and the loop quietly stops. - if resp.status_code != 200: - raise RuntimeError(_error_body(resp)) + _raise_for_non_200(resp) dfs.append(_get_resp_data(resp, geopd=geopd)) curr_url = _next_req_url(resp) except Exception as e: # noqa: BLE001 - # Report the *actual* failure — not _error_body(resp) on a - # stale prior-page response (which would describe the wrong - # request and can itself raise on non-JSON bodies). + # `resp` may be stale (prior page) if client.request() raised. logger.error("Request incomplete: %s", e) logger.warning( "Request failed for URL: %s. 
Data download interrupted.", curr_url @@ -1079,8 +1083,7 @@ def get_stats_data( try: resp = client.send(req) - if resp.status_code != 200: - raise RuntimeError(_error_body(resp)) + _raise_for_non_200(resp) # Store the initial response for metadata initial_response = resp @@ -1106,8 +1109,7 @@ def get_stats_data( params=args, headers=headers, ) - if resp.status_code != 200: - raise RuntimeError(_error_body(resp)) + _raise_for_non_200(resp) body = resp.json() all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) next_token = body["next"] diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 3008be15..37d97c43 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -1,3 +1,4 @@ +import logging from unittest import mock import pandas as pd @@ -123,12 +124,7 @@ def _walk_pages_with_failure(failure_resp_or_exc): def test_walk_pages_logs_actual_exception_when_request_raises(caplog): - """When client.request() itself raises (e.g. a connection error), the - logged failure must reflect that exception — not _error_body() on the - stale prior-page response, which described the wrong request and could - itself crash on non-JSON bodies.""" - import logging - + """Exception from client.request() must be logged with its actual message.""" caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils") df, _ = _walk_pages_with_failure(requests.ConnectionError("boom")) @@ -143,12 +139,7 @@ def test_walk_pages_logs_actual_exception_when_request_raises(caplog): def test_walk_pages_surfaces_5xx_mid_pagination(caplog): - """A non-200 response on a paginated page must be detected; previously - the loop happily called _get_resp_data() on the 5xx body, which — - lacking 'numberReturned' — silently produced an empty DataFrame and - the loop quietly stopped with no error logged.""" - import logging - + """A non-200 mid-pagination response must be logged, not silently swallowed.""" caplog.set_level(logging.ERROR, 
logger="dataretrieval.waterdata.utils") page2_500 = mock.MagicMock() From 404b8ce7f378797a69470df9070ff63adf036fc3 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Thu, 14 May 2026 21:08:05 -0500 Subject: [PATCH 3/7] Add get_stats_data failure tests and tighten PR #273 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up cleanups on top of the silent-truncation fix: - New tests for the get_stats_data parallel pagination loop, mirroring the _walk_pages ConnectionError + 5xx pair (closes coverage gap from the prior commits). - Lift the hard-coded logger-name string to a module-level _LOGGER_NAME constant derived from the utils module's __name__ (self-syncing). - logger.error → logger.exception in both pagination except clauses so the traceback is automatically attached to the log record. - _raise_for_non_200 docstring: drop the historical bug narration, add a one-line note distinguishing it from Response.raise_for_status (we route through _error_body for USGS-API-aware messages). - Drop the inconsistent "resp may be stale" comment; the `as e` and variable naming already convey intent, and get_stats_data already sidesteps it by logging `url` instead of `resp.url`. - _run_get_stats_data_with_failure: make monkeypatch a required arg (both callers pass it), and swap the lambda for MagicMock so future signature drift in _handle_stats_nesting isn't silently masked. - Tighten test docstrings that narrated the PR/bug. - Extract _error_log_messages(caplog) helper (used 4 times). - Lift the conditional links ternary out of the dict literal in _resp_ok. - NEWS.md: bump date to today. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 +- dataretrieval/waterdata/utils.py | 12 ++-- tests/waterdata_utils_test.py | 107 +++++++++++++++++++++++++------ 3 files changed, 96 insertions(+), 25 deletions(-) diff --git a/NEWS.md b/NEWS.md index 548d4878..1359fa9a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -**05/08/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved. +**05/14/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved. 
**05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**. diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index c36bbe33..94515b84 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -371,9 +371,10 @@ def _error_body(resp: requests.Response): def _raise_for_non_200(resp: requests.Response) -> None: """Raise ``RuntimeError(_error_body(resp))`` if ``resp`` is not 200. - Used by both the initial-request gate and the pagination loops; without - the loop-side check, a 5xx body lacking ``numberReturned`` would be - silently treated as an empty page and pagination would stop quietly. + Routes through ``_error_body`` (USGS-API-aware: handles 429/403 + specially, extracts ``code``/``description`` from JSON error bodies) + rather than ``Response.raise_for_status``, which raises + ``HTTPError`` with a generic message. """ if resp.status_code != 200: raise RuntimeError(_error_body(resp)) @@ -649,8 +650,7 @@ def _walk_pages( dfs.append(_get_resp_data(resp, geopd=geopd)) curr_url = _next_req_url(resp) except Exception as e: # noqa: BLE001 - # `resp` may be stale (prior page) if client.request() raised. - logger.error("Request incomplete: %s", e) + logger.exception("Request incomplete: %s", e) logger.warning( "Request failed for URL: %s. Data download interrupted.", curr_url ) @@ -1114,7 +1114,7 @@ def get_stats_data( all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) next_token = body["next"] except Exception as e: # noqa: BLE001 - logger.error("Request incomplete: %s", e) + logger.exception("Request incomplete: %s", e) logger.warning( "Request failed for URL: %s. 
Data download interrupted.", url ) diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 37d97c43..be27d8e3 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -4,6 +4,7 @@ import pandas as pd import requests +import dataretrieval.waterdata.utils as _utils_module from dataretrieval.waterdata.utils import ( _arrange_cols, _format_api_dates, @@ -12,6 +13,8 @@ _walk_pages, ) +_LOGGER_NAME = _utils_module.__name__ + def test_get_args_basic(): local_vars = { @@ -89,21 +92,25 @@ def test_walk_pages_multiple_mocked(): def _resp_ok(features): """Build a 200-OK mock response carrying the given features list.""" + links = [{"rel": "next", "href": "https://example.com/page2"}] if features else [] resp = mock.MagicMock() resp.json.return_value = { "numberReturned": len(features), "features": features, - "links": [{"rel": "next", "href": "https://example.com/page2"}] - if features - else [], + "links": links, } resp.headers = {} - resp.links = {"next": {"url": "https://example.com/page2"}} if features else {} resp.status_code = 200 resp.url = "https://example.com/page1" return resp +def _error_log_messages(caplog): + """Pull ERROR-and-above message strings out of caplog. 
The four + pagination-failure tests below all assert against the same shape.""" + return [r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR] + + def _walk_pages_with_failure(failure_resp_or_exc): """Run _walk_pages where page 1 succeeds and page 2 fails as given.""" resp1 = _resp_ok([{"id": "1", "properties": {"val": "a"}}]) @@ -125,22 +132,20 @@ def _walk_pages_with_failure(failure_resp_or_exc): def test_walk_pages_logs_actual_exception_when_request_raises(caplog): """Exception from client.request() must be logged with its actual message.""" - caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils") + caplog.set_level(logging.ERROR, logger=_LOGGER_NAME) df, _ = _walk_pages_with_failure(requests.ConnectionError("boom")) # First page's data is preserved (best-effort behavior). assert list(df["val"]) == ["a"] # Logged error mentions the actual ConnectionError, not a stale page body. - error_messages = [ - r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR - ] - assert any("boom" in m for m in error_messages), error_messages + messages = _error_log_messages(caplog) + assert any("boom" in m for m in messages), messages def test_walk_pages_surfaces_5xx_mid_pagination(caplog): """A non-200 mid-pagination response must be logged, not silently swallowed.""" - caplog.set_level(logging.ERROR, logger="dataretrieval.waterdata.utils") + caplog.set_level(logging.ERROR, logger=_LOGGER_NAME) page2_500 = mock.MagicMock() page2_500.status_code = 503 @@ -153,16 +158,82 @@ def test_walk_pages_surfaces_5xx_mid_pagination(caplog): df, _ = _walk_pages_with_failure(page2_500) assert list(df["val"]) == ["a"] - error_messages = [ - r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR - ] - # The 5xx is now visible in the error log (it would previously have - # been silently swallowed because _get_resp_data returned an empty - # frame and the loop stopped quietly). 
- assert any("503" in m or "ServiceUnavailable" in m for m in error_messages), ( - error_messages + messages = _error_log_messages(caplog) + assert any("503" in m or "ServiceUnavailable" in m for m in messages), messages + + +def _stats_initial_ok(): + """A 200-OK initial stats response: empty data list, signals one more page.""" + resp = mock.MagicMock() + resp.status_code = 200 + resp.json.return_value = { + "next": "tok2", + "features": [], + } + resp.headers = {} + resp.url = "https://example.com/stats?service=foo" + return resp + + +def _run_get_stats_data_with_failure(failure_resp_or_exc, monkeypatch): + """Exercise get_stats_data where the initial response succeeds and the + paginated follow-up fails as given. Mirrors _walk_pages_with_failure. + `monkeypatch` stubs ``_handle_stats_nesting`` so the synthetic minimal + response body doesn't need to parse — these tests only assert on the + pagination loop's error surfacing.""" + from dataretrieval.waterdata.utils import get_stats_data + + monkeypatch.setattr( + _utils_module, + "_handle_stats_nesting", + mock.MagicMock(return_value=pd.DataFrame()), ) + mock_client = mock.MagicMock(spec=requests.Session) + mock_client.send.return_value = _stats_initial_ok() + if isinstance(failure_resp_or_exc, BaseException): + mock_client.request.side_effect = failure_resp_or_exc + else: + mock_client.request.return_value = failure_resp_or_exc + + return get_stats_data( + args={"monitoring_location_id": "USGS-1"}, + service="observationNormals", + expand_percentiles=False, + client=mock_client, + ) + + +def test_get_stats_data_logs_actual_exception_when_request_raises(caplog, monkeypatch): + """get_stats_data variant of the connection-error scenario.""" + caplog.set_level(logging.ERROR, logger=_LOGGER_NAME) + + _run_get_stats_data_with_failure( + requests.ConnectionError("stats-boom"), + monkeypatch, + ) + + messages = _error_log_messages(caplog) + assert any("stats-boom" in m for m in messages), messages + + +def 
test_get_stats_data_surfaces_5xx_mid_pagination(caplog, monkeypatch): + """get_stats_data variant of the mid-pagination 5xx scenario.""" + caplog.set_level(logging.ERROR, logger=_LOGGER_NAME) + + page2_503 = mock.MagicMock() + page2_503.status_code = 503 + page2_503.json.return_value = { + "code": "ServiceUnavailable", + "description": "upstream timeout", + } + page2_503.url = "https://example.com/stats?service=foo&next_token=tok2" + + _run_get_stats_data_with_failure(page2_503, monkeypatch) + + messages = _error_log_messages(caplog) + assert any("503" in m or "ServiceUnavailable" in m for m in messages), messages + def test_handle_stats_nesting_tolerates_missing_drop_columns(): """If the upstream stats response shape ever changes such that one of From 200262a8da27a6f7ba20d62cf6a703d415104fe9 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Thu, 14 May 2026 21:15:17 -0500 Subject: [PATCH 4/7] Address Copilot PR #273 review: include next_token in pagination warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot noted that the get_stats_data pagination-failure warning logs only the base statistics URL (constant across all pages), dropping the next_token cursor that uniquely identifies which page in the sequence failed. The original `resp.url` reference was abandoned because `resp` may be stale when client.request() itself raises — but next_token is the loop variable and is always current. Add `(next_token=)` to the warning, plus a focused test that asserts the cursor surfaces in the WARNING-level log entry. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 5 ++++- tests/waterdata_utils_test.py | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index e6006cd5..7cfcad0f 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -1149,7 +1149,10 @@ def get_stats_data( except Exception as e: # noqa: BLE001 logger.exception("Request incomplete: %s", e) logger.warning( - "Request failed for URL: %s. Data download interrupted.", url + "Request failed for URL: %s (next_token=%s). " + "Data download interrupted.", + url, + next_token, ) next_token = None diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index fa8ff4e3..2c96e89f 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -236,6 +236,26 @@ def test_get_stats_data_surfaces_5xx_mid_pagination(caplog, monkeypatch): assert any("503" in m or "ServiceUnavailable" in m for m in messages), messages +def test_get_stats_data_warning_includes_next_token(caplog, monkeypatch): + """The pagination-failure warning includes the next_token so operators + can identify which page in the sequence failed. (Addresses Copilot's + PR #273 review note: the base URL alone drops cursor context.)""" + caplog.set_level(logging.WARNING, logger=_LOGGER_NAME) + + page2_503 = mock.MagicMock() + page2_503.status_code = 503 + page2_503.json.return_value = { + "code": "ServiceUnavailable", + "description": "upstream timeout", + } + + _run_get_stats_data_with_failure(page2_503, monkeypatch) + + warnings_ = [r.getMessage() for r in caplog.records if r.levelno == logging.WARNING] + # The initial response from _stats_initial_ok carries next=tok2. 
+ assert any("tok2" in m for m in warnings_), warnings_ + + def test_handle_stats_nesting_tolerates_missing_drop_columns(): """If the upstream stats response shape ever changes such that one of the columns we try to drop ("type", "properties.data") is absent, the From 9c6481f7154e11e3192581f44734b58e425a92de Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Thu, 14 May 2026 21:42:05 -0500 Subject: [PATCH 5/7] =?UTF-8?q?Rename=20page2=5F500=20=E2=86=92=20page2=5F?= =?UTF-8?q?503=20to=20match=20the=20mocked=20status=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot PR #273 review: the variable name `page2_500` is misleading because the mocked response is a 503 (ServiceUnavailable), not a 500. Renamed for clarity so future readers debugging a failure don't have to mentally reconcile the name against the actual status code. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/waterdata_utils_test.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 2c96e89f..506a3bd2 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -148,15 +148,15 @@ def test_walk_pages_surfaces_5xx_mid_pagination(caplog): """A non-200 mid-pagination response must be logged, not silently swallowed.""" caplog.set_level(logging.ERROR, logger=_LOGGER_NAME) - page2_500 = mock.MagicMock() - page2_500.status_code = 503 - page2_500.json.return_value = { + page2_503 = mock.MagicMock() + page2_503.status_code = 503 + page2_503.json.return_value = { "code": "ServiceUnavailable", "description": "upstream timeout", } - page2_500.url = "https://example.com/page2" + page2_503.url = "https://example.com/page2" - df, _ = _walk_pages_with_failure(page2_500) + df, _ = _walk_pages_with_failure(page2_503) assert list(df["val"]) == ["a"] messages = _error_log_messages(caplog) From e4e428170b99fdcaccf8cb4f577a3adc498f2ee7 Mon Sep 17 
00:00:00 2001 From: thodson-usgs Date: Fri, 15 May 2026 08:32:43 -0500 Subject: [PATCH 6/7] Apply Copilot PR #273 review: logger.error + future-proof helper docstring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two small follow-ups from Copilot's review of 9c6481f: - Revert `logger.exception` -> `logger.error` in both `_walk_pages` and `get_stats_data` except blocks. The PR description always advertised `logger.error("Request incomplete: %s", e)`; the simplify pass had switched to `logger.exception(...)` (which auto-attaches a traceback) but mid-pagination failures are expected, best-effort cases — the exception text is already in the message via `%s`, and a per-page traceback would be noisy. Reverting matches the PR description and the original intent. - Drop the literal count from `_error_log_messages`'s docstring. The prior wording ("the four pagination-failure tests below") was already stale-prone, and Copilot flagged it. Replaced with a count-free phrasing so future test additions don't render it misleading. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 4 ++-- tests/waterdata_utils_test.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 7cfcad0f..9b66de15 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -683,7 +683,7 @@ def _walk_pages( dfs.append(_get_resp_data(resp, geopd=geopd)) curr_url = _next_req_url(resp) except Exception as e: # noqa: BLE001 - logger.exception("Request incomplete: %s", e) + logger.error("Request incomplete: %s", e) logger.warning( "Request failed for URL: %s. 
Data download interrupted.", curr_url ) @@ -1147,7 +1147,7 @@ def get_stats_data( all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) next_token = body["next"] except Exception as e: # noqa: BLE001 - logger.exception("Request incomplete: %s", e) + logger.error("Request incomplete: %s", e) logger.warning( "Request failed for URL: %s (next_token=%s). " "Data download interrupted.", diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 506a3bd2..1abcca56 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -107,8 +107,8 @@ def _resp_ok(features): def _error_log_messages(caplog): - """Pull ERROR-and-above message strings out of caplog. The four - pagination-failure tests below all assert against the same shape.""" + """Pull ERROR-and-above message strings out of caplog. The pagination- + failure tests below all assert against the same shape.""" return [r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR] From dcd2ec55c6d2641b32c7eb5fea242dfee553ce1f Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 15 May 2026 08:49:08 -0500 Subject: [PATCH 7/7] Avoid hyphenating "pagination-failure" across a line break Copilot PR #273 review: the docstring of `_error_log_messages` wrapped mid-compound-word as `pagination-\n failure`, which renders as "pagination- failure" in help output. Rephrased so the term stays on one line. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/waterdata_utils_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 1abcca56..b05587e2 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -107,8 +107,8 @@ def _resp_ok(features): def _error_log_messages(caplog): - """Pull ERROR-and-above message strings out of caplog. The pagination- - failure tests below all assert against the same shape.""" + """Pull ERROR-and-above message strings out of caplog. 
Shared by the + pagination-failure tests below.""" return [r.getMessage() for r in caplog.records if r.levelno >= logging.ERROR]