From 90c1dd23a34f5655204ab4c5e32013ada3228d31 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 3 May 2026 12:49:19 -0500 Subject: [PATCH 1/4] Fix _format_api_dates: parse ISO 8601, anchor duration check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A single ISO 8601 datetime such as "2018-02-12T23:20:50Z" — the format the docstrings of every getter cite as a valid `time` example — was silently dropped to None. The function only tried "%Y-%m-%d %H:%M:%S" and "%Y-%m-%d", neither of which matches a string with a `T` separator and trailing `Z`. `_construct_api_requests` then set `params["time"] = None`, requests dropped the param, and the user's time filter was silently discarded — the API fell back to "most recent year". Replace the cascading try/except with a single helper that walks the supported formats in order: ISO 8601 with offset and/or fractional seconds, plain ISO 8601, the legacy space-separated form, and date-only. Tz-aware inputs convert directly to UTC; naive inputs continue to be interpreted in the runner's local zone for backwards compatibility. Also tighten the duration / interval pass-through. The previous `re.search(r"P", ..., re.IGNORECASE)` matched any string containing a `p` or `P` anywhere — words like "Apr" or "sample" would skip parsing entirely. Anchor the check to `^[Pp]\d`. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 102 ++++++++++++++++++------------- tests/waterdata_utils_test.py | 53 ++++++++++++++++ 2 files changed, 112 insertions(+), 43 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 7070da50..3f457ad1 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -128,6 +128,34 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s return [p for p in properties if p not in ["geometry", service_id]] +_DATETIME_FORMATS = ( + "%Y-%m-%dT%H:%M:%S.%f%z", + "%Y-%m-%dT%H:%M:%S%z", + "%Y-%m-%dT%H:%M:%S.%f", + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%d %H:%M:%S.%f", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d", +) + + +def _parse_datetime(value: str) -> datetime | None: + """Parse a single datetime string against the supported formats. + + Returns a ``datetime`` (tz-aware iff the input carried a UTC offset), + or ``None`` if no format matched. + """ + # ``datetime.strptime`` accepts a numeric offset like ``+00:00`` but not + # the ``Z`` shorthand, so normalize trailing ``Z`` first. + candidate = value[:-1] + "+00:00" if value.endswith("Z") else value + for fmt in _DATETIME_FORMATS: + try: + return datetime.strptime(candidate, fmt) + except ValueError: + continue + return None + + def _format_api_dates( datetime_input: str | list[str], date: bool = False ) -> str | None: @@ -141,8 +169,8 @@ def _format_api_dates( ---------- datetime_input : Union[str, List[str]] A single date/datetime string or a list of one or two date/datetime - strings. Accepts formats like "%Y-%m-%d %H:%M:%S", ISO 8601, or relative - periods (e.g., "P7D"). + strings. Accepts formats like "%Y-%m-%d %H:%M:%S", ISO 8601 (with or + without ``Z``/numeric offset), or relative periods (e.g., "P7D"). date : bool, optional If True, uses only the date portion ("YYYY-MM-DD"). If False (default), returns full datetime in UTC ISO 8601 format ("YYYY-MM-DDTHH:MM:SSZ"). @@ -168,7 +196,9 @@ def _format_api_dates( - Supports relative period strings (e.g., "P7D") and passes them through unchanged. - Converts datetimes to UTC and formats as ISO 8601 with 'Z' suffix when - `date` is False. + `date` is False. Inputs with an explicit offset (``Z`` or ``+HH:MM``) are + converted from that offset to UTC; naive inputs are interpreted in the + local time zone for backwards compatibility. - For date ranges, replaces "nan" with ".." in the output. """ # Get timezone @@ -182,48 +212,34 @@ def _format_api_dates( if all(pd.isna(dt) or dt == "" or dt is None for dt in datetime_input): return None - if len(datetime_input) <= 2: - # If the list is of length 1, first look for things like "P7D" or dates - # already formatted in ISO08601. Otherwise, try to coerce to datetime - if len(datetime_input) == 1 and ( - re.search(r"P", datetime_input[0], re.IGNORECASE) - or "/" in datetime_input[0] - ): - return datetime_input[0] - # Otherwise, use list comprehension to parse dates - else: - try: - # Parse to naive datetime - parsed_dates = [ - datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") # noqa: DTZ007 - for dt in datetime_input - ] - except ValueError: - # Parse to date only - try: - parsed_dates = [ - datetime.strptime(dt, "%Y-%m-%d") # noqa: DTZ007 - for dt in datetime_input - ] - except ValueError: - return None - # If the service only accepts dates for this input, not - # datetimes (e.g. "daily"), return just the dates separated by a - # "/", otherwise, return the datetime in UTC format. - if date: - return "/".join(dt.strftime("%Y-%m-%d") for dt in parsed_dates) - else: - parsed_locals = [ - dt.replace(tzinfo=local_timezone) for dt in parsed_dates - ] - formatted = "/".join( - dt.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") - for dt in parsed_locals - ) - return formatted - else: + if len(datetime_input) > 2: raise ValueError("datetime_input should only include 1-2 values") + # Pass through duration ("P7D") and pre-formatted interval ("a/b") strings + # untouched. Anchor the duration check so the bare letter ``P`` / ``p`` + # appearing inside a normal word doesn't accidentally bypass parsing. + if len(datetime_input) == 1: + single = datetime_input[0] + if re.match(r"^[Pp]\d", single) or "/" in single: + return single + + parsed_dates = [_parse_datetime(dt) for dt in datetime_input] + if any(dt is None for dt in parsed_dates): + return None + + if date: + return "/".join(dt.strftime("%Y-%m-%d") for dt in parsed_dates) + + # Localize naive datetimes to the runner's local zone before converting + # to UTC; tz-aware datetimes are converted directly. + utc_dates = [ + (dt if dt.tzinfo is not None else dt.replace(tzinfo=local_timezone)).astimezone( + ZoneInfo("UTC") + ) + for dt in parsed_dates + ] + return "/".join(dt.strftime("%Y-%m-%dT%H:%M:%SZ") for dt in utc_dates) + def _cql2_param(args: dict[str, Any]) -> str: """ diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 36150be8..13091968 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -3,6 +3,7 @@ import requests from dataretrieval.waterdata.utils import ( + _format_api_dates, _get_args, _walk_pages, ) @@ -80,3 +81,55 @@ def test_walk_pages_multiple_mocked(): assert mock_client.send.called assert mock_client.request.called assert mock_client.request.call_args[0][1] == "https://example.com/page2" + + +# --- _format_api_dates ------------------------------------------------------- + + +def test_format_api_dates_iso8601_with_z(): + """ISO 8601 datetimes with a 'Z' suffix must be parsed, not dropped to None.""" + assert _format_api_dates("2018-02-12T23:20:50Z") == "2018-02-12T23:20:50Z" + + +def test_format_api_dates_iso8601_with_fractional_seconds(): + assert _format_api_dates("2018-02-12T23:20:50.123Z") == "2018-02-12T23:20:50Z" + + +def test_format_api_dates_iso8601_with_offset(): + """Numeric offsets must be converted to UTC.""" + assert _format_api_dates("2018-02-12T19:20:50-04:00") == "2018-02-12T23:20:50Z" + + +def test_format_api_dates_iso8601_pair(): + """A list of two ISO 8601 datetimes must be parsed into a UTC interval.""" + result = _format_api_dates(["2018-02-12T23:20:50Z", "2018-03-18T12:31:12Z"]) + assert result == "2018-02-12T23:20:50Z/2018-03-18T12:31:12Z" + + +def test_format_api_dates_passthrough_interval(): + assert _format_api_dates("2018-02-12T00:00:00Z/..") == "2018-02-12T00:00:00Z/.." + + +def test_format_api_dates_passthrough_duration(): + assert _format_api_dates("P7D") == "P7D" + + +def test_format_api_dates_word_with_p_is_not_a_duration(): + """Strings containing the letter 'p' must not be misclassified as durations.""" + assert _format_api_dates("Apr") is None + + +def test_format_api_dates_date_only(): + assert _format_api_dates("2024-01-01", date=True) == "2024-01-01" + + +def test_format_api_dates_date_only_pair(): + assert ( + _format_api_dates(["2024-01-01", "2024-02-01"], date=True) + == "2024-01-01/2024-02-01" + ) + + +def test_format_api_dates_space_separated_still_works(): + """The legacy space-separated format must still parse.""" + assert _format_api_dates("2024-01-01 00:00:00", date=True) == "2024-01-01" From 9bde9e106b32699678277d843bb71dc66b6f21ea Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 4 May 2026 10:06:49 -0500 Subject: [PATCH 2/4] Accept ISO 8601 PT durations and open-ended range NA endpoints Per copilot review on PR #247: - Broaden the duration passthrough regex from `^[Pp]\\d` to `^[Pp]T?\\d` so time-only durations like `PT36H` (documented in get_continuous and get_daily as valid `time`/`last_modified` values) are preserved instead of being parsed as datetimes and silently dropped. - Per-element NA handling: a `None` / `NaN` / empty endpoint in a 2-value range now becomes `..` in the output, matching the docstring contract and supporting half-bounded intervals like `[date, None]` and `[None, date]`. Previously, a non-string element would raise from `.endswith` inside `_parse_datetime`. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 47 ++++++++++++++++++-------------- tests/waterdata_utils_test.py | 11 ++++++++ 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 3f457ad1..de445948 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -215,30 +215,37 @@ def _format_api_dates( if len(datetime_input) > 2: raise ValueError("datetime_input should only include 1-2 values") - # Pass through duration ("P7D") and pre-formatted interval ("a/b") strings - # untouched. Anchor the duration check so the bare letter ``P`` / ``p`` - # appearing inside a normal word doesn't accidentally bypass parsing. - if len(datetime_input) == 1: + # Pass through duration ("P7D", "PT36H") and pre-formatted interval ("a/b") + # strings untouched. Anchor the duration check so the bare letter ``P`` + # appearing inside a normal word doesn't bypass parsing; allow the optional + # ``T`` so time-only durations like ``PT36H`` are recognized. + if len(datetime_input) == 1 and isinstance(datetime_input[0], str): single = datetime_input[0] - if re.match(r"^[Pp]\d", single) or "/" in single: + if re.match(r"^[Pp]T?\d", single) or "/" in single: return single - parsed_dates = [_parse_datetime(dt) for dt in datetime_input] - if any(dt is None for dt in parsed_dates): + # Per-element: NA endpoints become ".." in the output for half-bounded + # ranges; otherwise parse. If any non-NA element fails to parse, return + # None overall. + def _format_one(dt) -> str | None: + if pd.isna(dt) or dt == "" or dt is None: + return ".." + parsed = _parse_datetime(dt) + if parsed is None: + return None + if date: + return parsed.strftime("%Y-%m-%d") + utc = ( + parsed + if parsed.tzinfo is not None + else parsed.replace(tzinfo=local_timezone) + ).astimezone(ZoneInfo("UTC")) + return utc.strftime("%Y-%m-%dT%H:%M:%SZ") + + formatted = [_format_one(dt) for dt in datetime_input] + if any(f is None for f in formatted): return None - - if date: - return "/".join(dt.strftime("%Y-%m-%d") for dt in parsed_dates) - - # Localize naive datetimes to the runner's local zone before converting - # to UTC; tz-aware datetimes are converted directly. - utc_dates = [ - (dt if dt.tzinfo is not None else dt.replace(tzinfo=local_timezone)).astimezone( - ZoneInfo("UTC") - ) - for dt in parsed_dates - ] - return "/".join(dt.strftime("%Y-%m-%dT%H:%M:%SZ") for dt in utc_dates) + return "/".join(formatted) def _cql2_param(args: dict[str, Any]) -> str: diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 13091968..e1e1d103 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -114,6 +114,11 @@ def test_format_api_dates_passthrough_duration(): assert _format_api_dates("P7D") == "P7D" +def test_format_api_dates_passthrough_time_only_duration(): + """ISO 8601 time-only durations (PT...) are passed through unchanged.""" + assert _format_api_dates("PT36H") == "PT36H" + + def test_format_api_dates_word_with_p_is_not_a_duration(): """Strings containing the letter 'p' must not be misclassified as durations.""" assert _format_api_dates("Apr") is None @@ -133,3 +138,9 @@ def test_format_api_dates_date_only_pair(): def test_format_api_dates_space_separated_still_works(): """The legacy space-separated format must still parse.""" assert _format_api_dates("2024-01-01 00:00:00", date=True) == "2024-01-01" + + +def test_format_api_dates_open_ended_range_with_none(): + """A None / NaN endpoint becomes '..' in the output range.""" + assert _format_api_dates(["2024-01-01", None], date=True) == "2024-01-01/.." + assert _format_api_dates([None, "2024-01-01"], date=True) == "../2024-01-01" From 74625a0ae7a889b0b9964b73e98ab2095ccdd56c Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 4 May 2026 15:08:51 -0500 Subject: [PATCH 3/4] Tidy _format_api_dates per /simplify review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per /simplify review on PR #247: - Lift `_format_one` from inside `_format_api_dates` to module scope so it sits next to `_parse_datetime` like the file's other helpers, and pass `date` / `local_tz` explicitly instead of capturing them via closure. - Split the parenthesized ternary `(parsed if … else parsed.replace(...)).astimezone(...)` into a named `aware` local plus an explicit `.astimezone(...)` call. - Hoist the duration regex into a module-level `_DURATION_RE = re.compile(r"^[Pp]T?\d")` with the rationale comment attached to the constant rather than buried inside the call site. - Trim the now-redundant comment block before the `_format_one` loop. Validated against the live waterdata API: ISO 8601 with `Z`, plain dates, half-bounded ranges, `P7D` duration passthrough, and numeric- offset-to-UTC conversion all produce correct URLs and matching row counts. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 48 +++++++++++++++++--------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index f77d93f8..0894b3ef 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -138,6 +138,11 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s "%Y-%m-%d", ) +# Anchored to ``[Pp]\d`` so a normal word containing ``p`` (e.g. ``"Apr"``) +# doesn't get mis-classified as an ISO 8601 duration; the optional ``T`` +# admits time-only forms like ``PT36H``. +_DURATION_RE = re.compile(r"^[Pp]T?\d") + def _parse_datetime(value: str) -> datetime | None: """Parse a single datetime string against the supported formats. @@ -156,6 +161,19 @@ def _parse_datetime(value: str) -> datetime | None: return None +def _format_one(dt, *, date: bool, local_tz) -> str | None: + """Format a single datetime element for inclusion in the API time arg.""" + if pd.isna(dt) or dt == "" or dt is None: + return ".." + parsed = _parse_datetime(dt) + if parsed is None: + return None + if date: + return parsed.strftime("%Y-%m-%d") + aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=local_tz) + return aware.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") + + def _format_api_dates( datetime_input: str | list[str], date: bool = False ) -> str | None: @@ -216,33 +234,17 @@ def _format_api_dates( raise ValueError("datetime_input should only include 1-2 values") # Pass through duration ("P7D", "PT36H") and pre-formatted interval ("a/b") - # strings untouched. Anchor the duration check so the bare letter ``P`` - # appearing inside a normal word doesn't bypass parsing; allow the optional - # ``T`` so time-only durations like ``PT36H`` are recognized. + # strings untouched. if len(datetime_input) == 1 and isinstance(datetime_input[0], str): single = datetime_input[0] - if re.match(r"^[Pp]T?\d", single) or "/" in single: + if _DURATION_RE.match(single) or "/" in single: return single - # Per-element: NA endpoints become ".." in the output for half-bounded - # ranges; otherwise parse. If any non-NA element fails to parse, return - # None overall. - def _format_one(dt) -> str | None: - if pd.isna(dt) or dt == "" or dt is None: - return ".." - parsed = _parse_datetime(dt) - if parsed is None: - return None - if date: - return parsed.strftime("%Y-%m-%d") - utc = ( - parsed - if parsed.tzinfo is not None - else parsed.replace(tzinfo=local_timezone) - ).astimezone(ZoneInfo("UTC")) - return utc.strftime("%Y-%m-%dT%H:%M:%SZ") - - formatted = [_format_one(dt) for dt in datetime_input] + # Half-bounded ranges: NA endpoints render as ".."; any unparseable non-NA + # element invalidates the range. + formatted = [ + _format_one(dt, date=date, local_tz=local_timezone) for dt in datetime_input + ] if any(f is None for f in formatted): return None return "/".join(formatted) From 3316f7a47ce14dfb221ba13a68f99c373da97844 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 4 May 2026 15:17:48 -0500 Subject: [PATCH 4/4] Address Copilot review on PR #247: tighten _format_api_dates docstring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Widen the `datetime_input` annotation to `str | list[str | None] | None` to match the actual contract — range endpoints may be `None`/`NaN`/empty for half-bounded ranges. - Rewrite the Notes bullets to spell out the per-element NA behavior (single value -> None; range endpoint -> ".."; range is None only when every element is NA or a non-NA element fails to parse). - Drop the stale "replaces 'nan' with '..' in the output" bullet — the implementation no longer string-replaces, it handles NA per-element. - Document `PT...` time-only durations alongside `P7D` in the parameter description. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 0894b3ef..d78387b4 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -175,7 +175,7 @@ def _format_one(dt, *, date: bool, local_tz) -> str | None: def _format_api_dates( - datetime_input: str | list[str], date: bool = False + datetime_input: str | list[str | None] | None, date: bool = False ) -> str | None: """ Formats date or datetime input(s) for use with an API. @@ -185,10 +185,12 @@ def _format_api_dates( Parameters ---------- - datetime_input : Union[str, List[str]] + datetime_input : Union[str, List[Optional[str]], None] A single date/datetime string or a list of one or two date/datetime strings. Accepts formats like "%Y-%m-%d %H:%M:%S", ISO 8601 (with or - without ``Z``/numeric offset), or relative periods (e.g., "P7D"). + without ``Z``/numeric offset), or relative periods (e.g., "P7D" / + "PT36H"). Range endpoints may be ``None``/``NaN``/empty to denote a + half-bounded range. date : bool, optional If True, uses only the date portion ("YYYY-MM-DD"). If False (default), returns full datetime in UTC ISO 8601 format ("YYYY-MM-DDTHH:MM:SSZ"). @@ -210,14 +212,16 @@ def _format_api_dates( Notes ----- - - Handles blank or NA values by returning None. - - Supports relative period strings (e.g., "P7D") and passes them through - unchanged. + - A single blank/NA value returns None. In a two-value range, a blank/NA + endpoint is rendered as ``".."`` to denote an open bound (e.g. + ``"2024-01-01/.."``); the range is only None when *every* element is + blank/NA or any non-NA element fails to parse. + - Supports ISO 8601 durations such as "P7D" and "PT36H" and pre-formatted + intervals containing ``"/"``; both are passed through unchanged. - Converts datetimes to UTC and formats as ISO 8601 with 'Z' suffix when `date` is False. Inputs with an explicit offset (``Z`` or ``+HH:MM``) are converted from that offset to UTC; naive inputs are interpreted in the local time zone for backwards compatibility. - - For date ranges, replaces "nan" with ".." in the output. """ # Get timezone local_timezone = datetime.now().astimezone().tzinfo