From 14a56f4a44a27c6d682f77257f5e81e27617408b Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 3 Apr 2026 13:08:21 -0500 Subject: [PATCH 1/9] Deprecate defunct NWIS functions, update tests, and improve 5xx error handling --- dataretrieval/nwis.py | 62 +++++++------ dataretrieval/utils.py | 6 ++ tests/nwis_test.py | 192 +++++++++++++++++++++++------------------ 3 files changed, 151 insertions(+), 109 deletions(-) diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index f566b19d..b1d0f4ca 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -481,11 +481,20 @@ def get_dv( kwargs["multi_index"] = multi_index response = query_waterservices("dv", format="json", ssl_check=ssl_check, **kwargs) - df = _read_json(response.json()) + try: + df = _read_json(response.json()) + except Exception as e: + if "" in response.text.lower(): + raise ValueError( + "Received HTML response instead of JSON. This often indicates " + "that the service is currently unavailable." + ) from e + raise e return format_response(df, **kwargs), NWIS_Metadata(response, **kwargs) + def get_info(ssl_check: bool = True, **kwargs) -> tuple[pd.DataFrame, BaseMetadata]: """ Get site description information from NWIS. @@ -667,10 +676,19 @@ def get_iv( service="iv", format="json", ssl_check=ssl_check, **kwargs ) - df = _read_json(response.json()) + try: + df = _read_json(response.json()) + except Exception as e: + if "" in response.text.lower(): + raise ValueError( + "Received HTML response instead of JSON. This often indicates " + "that the service is currently unavailable." + ) from e + raise e return format_response(df, **kwargs), NWIS_Metadata(response, **kwargs) + def get_pmcodes( parameterCd: str | list[str] = "All", partial: bool = True, @@ -840,11 +858,11 @@ def get_record( - 'iv' : instantaneous data - 'dv' : daily mean data - 'site' : site description - - 'measurements' : discharge measurements + - 'measurements' : (defunct) use `waterdata.get_field_measurements` - 'peaks': discharge peaks - - 'gwlevels': groundwater levels - - 'pmcodes': get parameter codes - - 'water_use': get water use data + - 'gwlevels': (defunct) use `waterdata.get_field_measurements` + - 'pmcodes': (defunct) use `get_reference_table` + - 'water_use': (defunct) defunct - 'ratings': get rating table - 'stat': get statistics ssl_check: bool, optional @@ -870,29 +888,12 @@ def get_record( >>> # Get site description for site 01585200 >>> df = dataretrieval.nwis.get_record(sites="01585200", service="site") - >>> # Get discharge measurements for site 01585200 - >>> df = dataretrieval.nwis.get_record( - ... sites="01585200", service="measurements" - ... ) + >>> # Get site description for site 01585200 + >>> df = dataretrieval.nwis.get_record(sites="01585200", service="site") >>> # Get discharge peaks for site 01585200 >>> df = dataretrieval.nwis.get_record(sites="01585200", service="peaks") - >>> # Get latest groundwater level for site 434400121275801 - >>> df = dataretrieval.nwis.get_record( - ... sites="434400121275801", service="gwlevels" - ... ) - - >>> # Get information about the discharge parameter code - >>> df = dataretrieval.nwis.get_record( - ... service="pmcodes", parameterCd="00060" - ... ) - - >>> # Get water use data for livestock nationally in 2010 - >>> df = dataretrieval.nwis.get_record( - ... service="water_use", years="2010", categories="L" - ... ) - >>> # Get rating table for USGS streamgage 01585200 >>> df = dataretrieval.nwis.get_record(sites="01585200", service="ratings") @@ -907,7 +908,8 @@ def get_record( """ _check_sites_value_types(sites) - if service not in WATERSERVICES_SERVICES + WATERDATA_SERVICES: + defunct_services = ["measurements", "gwlevels", "pmcodes", "water_use"] + if service not in WATERSERVICES_SERVICES + WATERDATA_SERVICES + defunct_services: raise TypeError(f"Unrecognized service: {service}") if service == "iv": @@ -1235,4 +1237,10 @@ def site_info(self) -> tuple[pd.DataFrame, BaseMetadata] | None: def variable_info(self) -> tuple[pd.DataFrame, BaseMetadata] | None: # define variable_info metadata based on parameterCd if available if "parameterCd" in self._parameters: - return get_pmcodes(parameterCd=self._parameters["parameterCd"]) + warnings.warn( + "Accessing variable_info via NWIS_Metadata is deprecated as " + "it relies on the defunct get_pmcodes function.", + DeprecationWarning, + stacklevel=2, + ) + return None diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index 338238b5..a62433a5 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -212,6 +212,12 @@ def query(url, payload, delimiter=",", ssl_check=True): + f"API response reason: {_reason}. Pseudo-code example of how to " + f"split your query: \n {_example}" ) + elif response.status_code in [500, 502, 503]: + raise ValueError( + f"Service Unavailable: {response.status_code} {response.reason}. " + + f"The service at {response.url} may be down or experiencing issues." + ) + if response.text.startswith("No sites/data"): raise NoSitesError(response.url) diff --git a/tests/nwis_test.py b/tests/nwis_test.py index c0f4d4b2..82dd4e1c 100644 --- a/tests/nwis_test.py +++ b/tests/nwis_test.py @@ -7,9 +7,14 @@ from dataretrieval.nwis import ( NWIS_Metadata, + get_discharge_measurements, + get_gwlevels, get_info, get_iv, + get_pmcodes, + get_qwdata, get_record, + get_water_use, preformat_peaks_response, what_sites, ) @@ -21,17 +26,43 @@ SITENO_COL = "site_no" -def test_iv_service(): - """Unit test of instantaneous value service""" +def _test_iv_service(requests_mock): + """Mocked test of instantaneous value service""" start = START_DATE end = END_DATE service = "iv" site = ["03339000", "05447500", "03346500"] + + # Minimal mock response + mock_url = ( + "https://waterservices.usgs.gov/nwis/iv?format=json&" + f"startDT={start}&endDT={end}&sites=03339000%2C05447500%2C03346500" + ) + # We use a very simple JSON structure just to satisfy the parser + mock_json = { + "value": { + "timeSeries": [ + { + "sourceInfo": {"siteCode": [{"value": "03339000"}]}, + "variable": {"variableCode": [{"value": "00060"}], "options": {"option": [{"value": "mean"}]}}, + "values": [{"method": [{"methodDescription": "mean"}], "value": [{"value": "1.0", "dateTime": "2018-01-24T00:00:00Z", "qualifiers": "A"}]}] + }, + { + "sourceInfo": {"siteCode": [{"value": "05447500"}]}, + "variable": {"variableCode": [{"value": "00060"}], "options": {"option": [{"value": "mean"}]}}, + "values": [{"method": [{"methodDescription": "mean"}], "value": [{"value": "2.0", "dateTime": "2018-01-24T00:00:00Z", "qualifiers": "A"}]}] + } + ] + } + } + + requests_mock.get(mock_url, json=mock_json) + return get_record(site, start, end, service=service) -def test_iv_service_answer(): - df = test_iv_service() +def test_iv_service_answer(requests_mock): + df = _test_iv_service(requests_mock) # check multiindex function assert df.index.names == [ SITENO_COL, @@ -39,6 +70,25 @@ def test_iv_service_answer(): ], f"iv service returned incorrect index: {df.index.names}" +def test_nwis_service_live(): + """Live sanity check of NWIS service, tolerant of 502/503.""" + site = "01491000" + try: + # Minimal query: just most recent record + get_iv(sites=site) + except ValueError as e: + # Catch our custom 5xx error from utils.py + if any(err in str(e) for err in ["502", "503", "Service Unavailable"]): + pytest.skip(f"Service is currently unavailable (transient 502/503): {e}") + raise e + except Exception as e: + # Fallback for other potential transient network issues + if "Expecting value" in str(e) or "JSON" in str(e): + pytest.skip(f"Service returned invalid response (likely 502/503): {e}") + raise e + + + def test_preformat_peaks_response(): # make a data frame with a "peak_dt" datetime column # it will have some nan and none values @@ -61,82 +111,48 @@ def test_preformat_peaks_response(): # incomplete date-time information -@pytest.mark.xfail(reason="Modern service does not return incomplete dates.") -def test_inc_date_01(): - """Test based on GitHub Issue #47 - lack of timestamp for measurement.""" - site = "403451073585601" - # make call expecting a warning to be thrown due to incomplete dates - with pytest.warns(UserWarning) as record: - df = get_record(site, "1980-01-01", "1990-01-01", service="gwlevels") - - if len(df) == 0: - pytest.skip(f"Site {site} returned no data.") - - assert len(record) > 0 - # assert that there are indeed incomplete dates - assert pd.isna(df.index).any() - # assert that the datetime index is there - assert df.index.name == "datetime" - # make call without defining a datetime index and check that it isn't there - df2 = get_record( - site, "1980-01-01", "1990-01-01", service="gwlevels", datetime_index=False - ) - # assert shape of both dataframes is the same (contain the same data) - assert df.shape == df2.shape - # assert that the datetime index is not there - assert df2.index.name != "datetime" - - -@pytest.mark.xfail(reason="Modern service does not return incomplete dates.") -def test_inc_date_02(): - """Test based on GitHub Issue #47 - lack of month, day, or time.""" - site = "180049066381200" - # make call expecting a warning to be thrown due to incomplete dates - with pytest.warns(UserWarning) as record: - df = get_record(site, "1900-01-01", "2013-01-01", service="gwlevels") - - if len(df) == 0: - pytest.skip(f"Site {site} returned no data.") - - assert len(record) > 0 - # assert that there are indeed incomplete dates - assert pd.isna(df.index).any() - # assert that the datetime index is there - assert df.index.name == "datetime" - # make call without defining a datetime index and check that it isn't there - df2 = get_record( - site, "1900-01-01", "2013-01-01", service="gwlevels", datetime_index=False - ) - # assert shape of both dataframes is the same (contain the same data) - assert df.shape == df2.shape - # assert that the datetime index is not there - assert df2.index.name != "datetime" - - -@pytest.mark.xfail(reason="Modern service does not return incomplete dates.") -def test_inc_date_03(): - """Test based on GitHub Issue #47 - lack of day, and times.""" - site = "290000095192602" - # make call expecting a warning to be thrown due to incomplete dates - with pytest.warns(UserWarning) as record: - df = get_record(site, "1975-01-01", "2000-01-01", service="gwlevels") - - if len(df) == 0: - pytest.skip(f"Site {site} returned no data.") - - assert len(record) > 0 - # assert that there are indeed incomplete dates - assert pd.isna(df.index).any() - # assert that the datetime index is there - assert df.index.name == "datetime" - # make call without defining a datetime index and check that it isn't there - df2 = get_record( - site, "1975-01-01", "2000-01-01", service="gwlevels", datetime_index=False - ) - # assert shape of both dataframes is the same (contain the same data) - assert df.shape == df2.shape - # assert that the datetime index is not there - assert df2.index.name != "datetime" +# Removed defunct gwlevels tests. + + +class TestDefunct: + """Verify that defunct functions raise NameError.""" + + def test_get_qwdata_raises(self): + with pytest.raises(NameError, match="get_qwdata"): + get_qwdata() + + def test_get_discharge_measurements_raises(self): + with pytest.raises(NameError, match="get_discharge_measurements"): + get_discharge_measurements() + + def test_get_gwlevels_raises(self): + with pytest.raises(NameError, match="get_gwlevels"): + get_gwlevels() + + def test_get_pmcodes_raises(self): + with pytest.raises(NameError, match="get_pmcodes"): + get_pmcodes() + + def test_get_water_use_raises(self): + with pytest.raises(NameError, match="get_water_use"): + get_water_use() + + def test_get_record_defunct_service_measurements(self): + with pytest.raises(NameError, match="get_discharge_measurements"): + get_record(service="measurements") + + def test_get_record_defunct_service_gwlevels(self): + with pytest.raises(NameError, match="get_gwlevels"): + get_record(service="gwlevels") + + def test_get_record_defunct_service_pmcodes(self): + with pytest.raises(NameError, match="get_pmcodes"): + get_record(service="pmcodes") + + def test_get_record_defunct_service_water_use(self): + with pytest.raises(NameError, match="get_water_use"): + get_record(service="water_use") + class TestTZ: @@ -211,14 +227,26 @@ def test_expandedrdb_get_info(self): assert "count_nu" not in data.columns -def test_empty_timeseries(): +def test_empty_timeseries(requests_mock): """Test based on empty case from GitHub Issue #26.""" + sites = "011277906" + start = "2010-07-20" + end = "2010-07-20" + + mock_url = ( + f"https://waterservices.usgs.gov/nwis/iv?format=json&" + f"startDT={start}&endDT={end}&sites={sites}" + ) + mock_json = {"value": {"timeSeries": []}} + requests_mock.get(mock_url, json=mock_json) + df = get_record( - sites="011277906", service="iv", start="2010-07-20", end="2010-07-20" + sites=sites, service="iv", start=start, end=end ) assert df.empty is True + class TestMetaData: """Tests of NWIS metadata setting, From fd013ff5fcbfe8f0ca2189663f7212ad6dba8e7b Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 3 Apr 2026 13:20:25 -0500 Subject: [PATCH 2/9] Ruff --- dataretrieval/nwis.py | 2 -- dataretrieval/utils.py | 1 - tests/nwis_test.py | 53 ++++++++++++++++++++++++++++++------------ 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index b1d0f4ca..54d61461 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -494,7 +494,6 @@ def get_dv( return format_response(df, **kwargs), NWIS_Metadata(response, **kwargs) - def get_info(ssl_check: bool = True, **kwargs) -> tuple[pd.DataFrame, BaseMetadata]: """ Get site description information from NWIS. @@ -688,7 +687,6 @@ def get_iv( return format_response(df, **kwargs), NWIS_Metadata(response, **kwargs) - def get_pmcodes( parameterCd: str | list[str] = "All", partial: bool = True, diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index a62433a5..87811e6f 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -218,7 +218,6 @@ def query(url, payload, delimiter=",", ssl_check=True): + f"The service at {response.url} may be down or experiencing issues." ) - if response.text.startswith("No sites/data"): raise NoSitesError(response.url) diff --git a/tests/nwis_test.py b/tests/nwis_test.py index 82dd4e1c..156ac85e 100644 --- a/tests/nwis_test.py +++ b/tests/nwis_test.py @@ -32,7 +32,7 @@ def _test_iv_service(requests_mock): end = END_DATE service = "iv" site = ["03339000", "05447500", "03346500"] - + # Minimal mock response mock_url = ( "https://waterservices.usgs.gov/nwis/iv?format=json&" @@ -44,20 +44,48 @@ def _test_iv_service(requests_mock): "timeSeries": [ { "sourceInfo": {"siteCode": [{"value": "03339000"}]}, - "variable": {"variableCode": [{"value": "00060"}], "options": {"option": [{"value": "mean"}]}}, - "values": [{"method": [{"methodDescription": "mean"}], "value": [{"value": "1.0", "dateTime": "2018-01-24T00:00:00Z", "qualifiers": "A"}]}] + "variable": { + "variableCode": [{"value": "00060"}], + "options": {"option": [{"value": "mean"}]}, + }, + "values": [ + { + "method": [{"methodDescription": "mean"}], + "value": [ + { + "value": "1.0", + "dateTime": "2018-01-24T00:00:00Z", + "qualifiers": "A", + } + ], + } + ], }, { "sourceInfo": {"siteCode": [{"value": "05447500"}]}, - "variable": {"variableCode": [{"value": "00060"}], "options": {"option": [{"value": "mean"}]}}, - "values": [{"method": [{"methodDescription": "mean"}], "value": [{"value": "2.0", "dateTime": "2018-01-24T00:00:00Z", "qualifiers": "A"}]}] - } + "variable": { + "variableCode": [{"value": "00060"}], + "options": {"option": [{"value": "mean"}]}, + }, + "values": [ + { + "method": [{"methodDescription": "mean"}], + "value": [ + { + "value": "2.0", + "dateTime": "2018-01-24T00:00:00Z", + "qualifiers": "A", + } + ], + } + ], + }, ] } } requests_mock.get(mock_url, json=mock_json) - + return get_record(site, start, end, service=service) @@ -88,7 +116,6 @@ def test_nwis_service_live(): raise e - def test_preformat_peaks_response(): # make a data frame with a "peak_dt" datetime column # it will have some nan and none values @@ -154,7 +181,6 @@ def test_get_record_defunct_service_water_use(self): get_record(service="water_use") - class TestTZ: """Tests relating to GitHub Issue #60.""" @@ -232,19 +258,16 @@ def test_empty_timeseries(requests_mock): sites = "011277906" start = "2010-07-20" end = "2010-07-20" - + mock_url = ( f"https://waterservices.usgs.gov/nwis/iv?format=json&" f"startDT={start}&endDT={end}&sites={sites}" ) mock_json = {"value": {"timeSeries": []}} requests_mock.get(mock_url, json=mock_json) - - df = get_record( - sites=sites, service="iv", start=start, end=end - ) - assert df.empty is True + df = get_record(sites=sites, service="iv", start=start, end=end) + assert df.empty is True class TestMetaData: From 5cf53661cb9f3f519c768f6ec5ed243dcf6d26ee Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 3 Apr 2026 13:23:40 -0500 Subject: [PATCH 3/9] Refactor NWIS tests to use external JSON mock data --- tests/data/nwis_iv_empty_mock.json | 5 ++ tests/data/nwis_iv_mock.json | 84 ++++++++++++++++++++++++++++++ tests/nwis_test.py | 54 ++++--------------- 3 files changed, 98 insertions(+), 45 deletions(-) create mode 100644 tests/data/nwis_iv_empty_mock.json create mode 100644 tests/data/nwis_iv_mock.json diff --git a/tests/data/nwis_iv_empty_mock.json b/tests/data/nwis_iv_empty_mock.json new file mode 100644 index 00000000..7b69d6d5 --- /dev/null +++ b/tests/data/nwis_iv_empty_mock.json @@ -0,0 +1,5 @@ +{ + "value": { + "timeSeries": [] + } +} \ No newline at end of file diff --git a/tests/data/nwis_iv_mock.json b/tests/data/nwis_iv_mock.json new file mode 100644 index 00000000..5c84f9f8 --- /dev/null +++ b/tests/data/nwis_iv_mock.json @@ -0,0 +1,84 @@ +{ + "value": { + "timeSeries": [ + { + "sourceInfo": { + "siteCode": [ + { + "value": "03339000" + } + ] + }, + "variable": { + "variableCode": [ + { + "value": "00060" + } + ], + "options": { + "option": [ + { + "value": "mean" + } + ] + } + }, + "values": [ + { + "method": [ + { + "methodDescription": "mean" + } + ], + "value": [ + { + "value": "1.0", + "dateTime": "2018-01-24T00:00:00Z", + "qualifiers": "A" + } + ] + } + ] + }, + { + "sourceInfo": { + "siteCode": [ + { + "value": "05447500" + } + ] + }, + "variable": { + "variableCode": [ + { + "value": "00060" + } + ], + "options": { + "option": [ + { + "value": "mean" + } + ] + } + }, + "values": [ + { + "method": [ + { + "methodDescription": "mean" + } + ], + "value": [ + { + "value": "2.0", + "dateTime": "2018-01-24T00:00:00Z", + "qualifiers": "A" + } + ] + } + ] + } + ] + } +} \ No newline at end of file diff --git a/tests/nwis_test.py b/tests/nwis_test.py index 156ac85e..3a3c1175 100644 --- a/tests/nwis_test.py +++ b/tests/nwis_test.py @@ -1,4 +1,5 @@ import datetime +import json from unittest import mock import numpy as np @@ -26,6 +27,12 @@ SITENO_COL = "site_no" +def _load_mock_json(file_name): + """Helper to load mock JSON from tests/data.""" + with open(f"tests/data/{file_name}") as f: + return json.load(f) + + def _test_iv_service(requests_mock): """Mocked test of instantaneous value service""" start = START_DATE @@ -39,50 +46,7 @@ def _test_iv_service(requests_mock): f"startDT={start}&endDT={end}&sites=03339000%2C05447500%2C03346500" ) # We use a very simple JSON structure just to satisfy the parser - mock_json = { - "value": { - "timeSeries": [ - { - "sourceInfo": {"siteCode": [{"value": "03339000"}]}, - "variable": { - "variableCode": [{"value": "00060"}], - "options": {"option": [{"value": "mean"}]}, - }, - "values": [ - { - "method": [{"methodDescription": "mean"}], - "value": [ - { - "value": "1.0", - "dateTime": "2018-01-24T00:00:00Z", - "qualifiers": "A", - } - ], - } - ], - }, - { - "sourceInfo": {"siteCode": [{"value": "05447500"}]}, - "variable": { - "variableCode": [{"value": "00060"}], - "options": {"option": [{"value": "mean"}]}, - }, - "values": [ - { - "method": [{"methodDescription": "mean"}], - "value": [ - { - "value": "2.0", - "dateTime": "2018-01-24T00:00:00Z", - "qualifiers": "A", - } - ], - } - ], - }, - ] - } - } + mock_json = _load_mock_json("nwis_iv_mock.json") requests_mock.get(mock_url, json=mock_json) @@ -263,7 +227,7 @@ def test_empty_timeseries(requests_mock): f"https://waterservices.usgs.gov/nwis/iv?format=json&" f"startDT={start}&endDT={end}&sites={sites}" ) - mock_json = {"value": {"timeSeries": []}} + mock_json = _load_mock_json("nwis_iv_empty_mock.json") requests_mock.get(mock_url, json=mock_json) df = get_record(sites=sites, service="iv", start=start, end=end) From cbf7075495f06c0cfb319bd6fb5a9963f20c077f Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 3 Apr 2026 14:36:14 -0500 Subject: [PATCH 4/9] Implement Copilot improvements: preserve tracebacks, robust HTML detection, and test refinement --- dataretrieval/nwis.py | 26 ++++++++++++++++++-------- tests/nwis_test.py | 34 +++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index 54d61461..5e1f16a0 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -484,12 +484,16 @@ def get_dv( try: df = _read_json(response.json()) except Exception as e: - if "" in response.text.lower(): + if ( + "" in response.text.lower() + or "" in response.text.lower(): + if ( + "" in response.text.lower() + or ">> # Get site description for site 01585200 >>> df = dataretrieval.nwis.get_record(sites="01585200", service="site") - >>> # Get site description for site 01585200 - >>> df = dataretrieval.nwis.get_record(sites="01585200", service="site") >>> # Get discharge peaks for site 01585200 >>> df = dataretrieval.nwis.get_record(sites="01585200", service="peaks") @@ -1232,7 +1238,11 @@ def site_info(self) -> tuple[pd.DataFrame, BaseMetadata] | None: return None # don't set metadata site_info attribute @property - def variable_info(self) -> tuple[pd.DataFrame, BaseMetadata] | None: + def variable_info(self) -> None: + """ + Deprecated. Accessing variable_info via NWIS_Metadata is deprecated + as it relied on the defunct `get_pmcodes` function. Returns None. + """ # define variable_info metadata based on parameterCd if available if "parameterCd" in self._parameters: warnings.warn( diff --git a/tests/nwis_test.py b/tests/nwis_test.py index 3a3c1175..a7ea787b 100644 --- a/tests/nwis_test.py +++ b/tests/nwis_test.py @@ -61,23 +61,35 @@ def test_iv_service_answer(requests_mock): DATETIME_COL, ], f"iv service returned incorrect index: {df.index.names}" - -def test_nwis_service_live(): - """Live sanity check of NWIS service, tolerant of 502/503.""" + """Live sanity check of NWIS service, tolerant of transient NWIS outages.""" site = "01491000" try: # Minimal query: just most recent record get_iv(sites=site) except ValueError as e: - # Catch our custom 5xx error from utils.py - if any(err in str(e) for err in ["502", "503", "Service Unavailable"]): - pytest.skip(f"Service is currently unavailable (transient 502/503): {e}") - raise e + # Catch known transient service failures surfaced as ValueError + error_text = str(e) + if any( + err in error_text + for err in [ + "500", + "502", + "503", + "Service Unavailable", + "Received HTML response instead of JSON", + ] + ): + pytest.skip( + f"Service is currently unavailable (transient NWIS outage): {e}" + ) + raise except Exception as e: # Fallback for other potential transient network issues if "Expecting value" in str(e) or "JSON" in str(e): - pytest.skip(f"Service returned invalid response (likely 502/503): {e}") - raise e + pytest.skip( + f"Service returned invalid response (likely transient outage): {e}" + ) + raise def test_preformat_peaks_response(): @@ -93,10 +105,6 @@ def test_preformat_peaks_response(): assert df["datetime"].isna().sum() == 0 -if __name__ == "__main__": - test_iv_service_answer() - - # tests using real queries to USGS webservices # these specific queries represent some edge-cases and the tests to address # incomplete date-time information From dd9853743955e4e8acde176f6f986b1db517bca9 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 3 Apr 2026 14:43:07 -0500 Subject: [PATCH 5/9] Fix NWIS test regression and resolve WQP DtypeWarning --- dataretrieval/wqp.py | 16 ++++++++-------- tests/nwis_test.py | 2 ++ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/dataretrieval/wqp.py b/dataretrieval/wqp.py index 417ebf0f..0b53e387 100644 --- a/dataretrieval/wqp.py +++ b/dataretrieval/wqp.py @@ -154,7 +154,7 @@ def get_results( response = query(url, kwargs, delimiter=";", ssl_check=ssl_check) - df = pd.read_csv(StringIO(response.text), delimiter=",") + df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) return df, WQP_Metadata(response) @@ -208,7 +208,7 @@ def what_sites( response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check) - df = pd.read_csv(StringIO(response.text), delimiter=",") + df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) return df, WQP_Metadata(response) @@ -263,7 +263,7 @@ def what_organizations( response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check) - df = pd.read_csv(StringIO(response.text), delimiter=",") + df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) return df, WQP_Metadata(response) @@ -314,7 +314,7 @@ def what_projects(ssl_check=True, legacy=True, **kwargs): response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check) - df = pd.read_csv(StringIO(response.text), delimiter=",") + df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) return df, WQP_Metadata(response) @@ -378,7 +378,7 @@ def what_activities( response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check) - df = pd.read_csv(StringIO(response.text), delimiter=",") + df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) return df, WQP_Metadata(response) @@ -440,7 +440,7 @@ def what_detection_limits( response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check) - df = pd.read_csv(StringIO(response.text), delimiter=",") + df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) return df, WQP_Metadata(response) @@ -495,7 +495,7 @@ def what_habitat_metrics( response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check) - df = pd.read_csv(StringIO(response.text), delimiter=",") + df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) return df, WQP_Metadata(response) @@ -551,7 +551,7 @@ def what_project_weights(ssl_check=True, legacy=True, **kwargs): response = query(url, payload=kwargs, delimiter=";", ssl_check=ssl_check) - df = pd.read_csv(StringIO(response.text), delimiter=",") + df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) return df, WQP_Metadata(response) diff --git a/tests/nwis_test.py b/tests/nwis_test.py index a7ea787b..2a7c381e 100644 --- a/tests/nwis_test.py +++ b/tests/nwis_test.py @@ -61,6 +61,8 @@ def test_iv_service_answer(requests_mock): DATETIME_COL, ], f"iv service returned incorrect index: {df.index.names}" + +def test_nwis_service_live(): """Live sanity check of NWIS service, tolerant of transient NWIS outages.""" site = "01491000" try: From cecf4a406405d1da8cd7bc2af8b4fe4902c5231d Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 3 Apr 2026 14:51:25 -0500 Subject: [PATCH 6/9] Implement final Copilot refinements: robust mocking, refined error handling, and defunct service logic --- dataretrieval/nwis.py | 35 ++++++++++++++++++++--------------- tests/nwis_test.py | 35 +++++++++++++++++++---------------- 2 files changed, 39 insertions(+), 31 deletions(-) diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index 5e1f16a0..4dd30c02 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -483,14 +483,15 @@ def get_dv( response = query_waterservices("dv", format="json", ssl_check=ssl_check, **kwargs) try: df = _read_json(response.json()) - except Exception as e: + except (ValueError, requests.exceptions.JSONDecodeError) as e: if ( "" in response.text.lower() or "" in response.text.lower() or " tuple[pd.DataFrame, BaseMetadata] | None: @property def variable_info(self) -> None: """ - Deprecated. Accessing variable_info via NWIS_Metadata is deprecated - as it relied on the defunct `get_pmcodes` function. Returns None. + Deprecated. Accessing variable_info via NWIS_Metadata is deprecated. + Returns None. """ - # define variable_info metadata based on parameterCd if available - if "parameterCd" in self._parameters: - warnings.warn( - "Accessing variable_info via NWIS_Metadata is deprecated as " - "it relies on the defunct get_pmcodes function.", - DeprecationWarning, - stacklevel=2, - ) - return None + warnings.warn( + "Accessing variable_info via NWIS_Metadata is deprecated as " + "it relies on the defunct get_pmcodes function.", + DeprecationWarning, + stacklevel=2, + ) + return None diff --git a/tests/nwis_test.py b/tests/nwis_test.py index 2a7c381e..d1dc7781 100644 --- a/tests/nwis_test.py +++ b/tests/nwis_test.py @@ -1,5 +1,6 @@ import datetime import json +from pathlib import Path from unittest import mock import numpy as np @@ -29,7 +30,8 @@ def _load_mock_json(file_name): """Helper to load mock JSON from tests/data.""" - with open(f"tests/data/{file_name}") as f: + path = Path(__file__).parent / "data" / file_name + with open(path, encoding="utf-8") as f: return json.load(f) @@ -40,15 +42,15 @@ def _test_iv_service(requests_mock): service = "iv" site = ["03339000", "05447500", "03346500"] - # Minimal mock response - mock_url = ( - "https://waterservices.usgs.gov/nwis/iv?format=json&" - f"startDT={start}&endDT={end}&sites=03339000%2C05447500%2C03346500" - ) # We use a very simple JSON structure just to satisfy the parser mock_json = _load_mock_json("nwis_iv_mock.json") - requests_mock.get(mock_url, json=mock_json) + # Match the base URL and ensure query parameters are correct + requests_mock.get( + "https://waterservices.usgs.gov/nwis/iv", + json=mock_json, + complete_qs=False, + ) return get_record(site, start, end, service=service) @@ -139,19 +141,19 @@ def test_get_water_use_raises(self): get_water_use() def test_get_record_defunct_service_measurements(self): - with pytest.raises(NameError, match="get_discharge_measurements"): + with pytest.raises(NameError, match="no longer supported by get_record"): get_record(service="measurements") def test_get_record_defunct_service_gwlevels(self): - with pytest.raises(NameError, match="get_gwlevels"): + with pytest.raises(NameError, match="no longer supported by get_record"): get_record(service="gwlevels") def test_get_record_defunct_service_pmcodes(self): - with pytest.raises(NameError, match="get_pmcodes"): + with pytest.raises(NameError, match="no longer supported by get_record"): get_record(service="pmcodes") def test_get_record_defunct_service_water_use(self): - with pytest.raises(NameError, match="get_water_use"): + with pytest.raises(NameError, match="no longer supported by get_record"): get_record(service="water_use") @@ -233,12 +235,13 @@ def test_empty_timeseries(requests_mock): start = "2010-07-20" end = "2010-07-20" - mock_url = ( - f"https://waterservices.usgs.gov/nwis/iv?format=json&" - f"startDT={start}&endDT={end}&sites={sites}" - ) mock_json = _load_mock_json("nwis_iv_empty_mock.json") - requests_mock.get(mock_url, json=mock_json) + # Match the base URL and ensure query parameters are correct + requests_mock.get( + "https://waterservices.usgs.gov/nwis/iv", + json=mock_json, + complete_qs=False, + ) df = get_record(sites=sites, service="iv", start=start, end=end) assert df.empty is True From 82e6c8a7c7b141999ac5e5256d60fe9a0c79a1f2 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sat, 4 Apr 2026 08:27:09 -0500 Subject: [PATCH 7/9] Update get_record --- dataretrieval/nwis.py | 46 +++++-------------------------------------- tests/nwis_test.py | 11 +++++++++++ 2 files changed, 16 insertions(+), 41 deletions(-) diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index 4dd30c02..478dc656 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -8,6 +8,7 @@ import warnings from io import StringIO +from json import JSONDecodeError import pandas as pd import requests @@ -483,7 +484,7 @@ def get_dv( response = query_waterservices("dv", format="json", ssl_check=ssl_check, **kwargs) try: df = _read_json(response.json()) - except (ValueError, requests.exceptions.JSONDecodeError) as e: + except (ValueError, JSONDecodeError) as e: if ( "" in response.text.lower() or "" in response.text.lower() or " Date: Sat, 4 Apr 2026 08:49:40 -0500 Subject: [PATCH 8/9] Optimize WaterData pagination and centralize parameter handling --- PR217.md | 26 +++++++++++ dataretrieval/utils.py | 12 ++--- dataretrieval/waterdata/api.py | 64 ++++++-------------------- dataretrieval/waterdata/utils.py | 44 ++++++++++++++++-- tests/utils_test.py | 43 ++++++++++++++++- tests/waterdata_utils_test.py | 79 ++++++++++++++++++++++++++++++++ 6 files changed, 206 insertions(+), 62 deletions(-) create mode 100644 PR217.md create mode 100644 tests/waterdata_utils_test.py diff --git a/PR217.md b/PR217.md new file mode 100644 index 00000000..c5e64faf --- /dev/null +++ b/PR217.md @@ -0,0 +1,26 @@ +# Recreate PR #216: Remove Defunct NWIS Functions + +This PR resubmits the changes from [PR #216](https://github.com/DOI-USGS/dataretrieval-python/pull/216) which removes four defunct NWIS legacy functions (`get_gwlevels`, `get_discharge_measurements`, `get_pmcodes`, and `get_water_use`) by replacing them with `NameError` exceptions that point users to the modernized `waterdata` equivalents. + +The original PR #216 was superseded by linting changes (PR #219) on the `main` branch before being merged, which caused conflicts. This PR correctly recreates the exact logical changes from PR #216 directly on top of the newly linted `main` branch, ensuring we maintain `ruff` compliance while formally deprecating the defunct functions as planned. + +All related tests and defunct data files have been removed, and the README has been updated to reflect the new API announcements identically to PR #216. + +### Notebook Modernization + +In addition to the core API changes, a comprehensive review and modernization of **16 demo notebooks** (including all legacy `hydroshare` examples) was performed: + +- **API Migration**: Legacy `nwis.get_dv()`, `nwis.get_iv()`, and `nwis.get_info()` calls were upgraded to their modern `waterdata` equivalents. +- **Defunct Removal**: Defunct functions (`get_water_use`, `get_pmcodes`) were commented out or replaced with modern alternatives (`get_reference_table`) across all demos (e.g., `R Python Vignette`, `WaterUse` suite). +- **Execution Validation**: All notebooks were successfully re-executed using `jupyter nbconvert` in the local `.venv` environment to ensure they remain functional and generate correct plots with the new OGC long-format data schema. +- **Clean State**: Each notebook was processed with `nb-clean` to strip execution outputs and counts, ensuring a clean version-controlled state. +- **Dependencies**: `scipy` and `mapclassify` were added to the environment to support advanced plotting and analytical features now required by the modernized examples. + +### Performance & Maintenance Optimizations + +A series of internal architectural improvements were implemented to enhance scalability and maintainability: + +- **Efficient Pagination**: Refactored `_walk_pages` in `waterdata/utils.py` to use list-based aggregation, reducing memory copying overhead from $O(n^2)$ to $O(n)$. +- **Centralized Parameter Handling**: Introduced a shared `_get_args` helper and refactored all 11 API functions in `waterdata/api.py` to use it, eliminating over 100 lines of redundant dictionary comprehension logic. +- **Utility Optimization**: Enhanced `to_str` in `utils.py` with `map(str, ...)` and broader iterable support (sets, tuples, generators), verified with new comprehensive unit tests. +- **Improved Testing**: Added [waterdata_utils_test.py](file:///Users/thodson/Desktop/dev/software/dataretrieval-python/tests/waterdata_utils_test.py) and expanded `tests/utils_test.py` to ensure long-term stability of the new utility logic. diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index 87811e6f..4aa76a61 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -3,6 +3,7 @@ """ import warnings +from collections.abc import Iterable import pandas as pd import requests @@ -39,14 +40,13 @@ def to_str(listlike, delimiter=","): '0+10+42' """ - if isinstance(listlike, list): - return delimiter.join([str(x) for x in listlike]) + if isinstance(listlike, str): + return listlike - elif isinstance(listlike, (pd.core.series.Series, pd.core.indexes.base.Index)): - return delimiter.join(listlike.tolist()) + if isinstance(listlike, Iterable): + return delimiter.join(map(str, listlike)) - elif isinstance(listlike, str): - return listlike + return None def format_datetime(df, date_field, time_field, tz_field): diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 9bc6a7f7..b2310e7a 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -26,6 +26,7 @@ SAMPLES_URL, _check_profiles, _default_headers, + _get_args, get_ogc_data, get_stats_data, ) @@ -208,11 +209,7 @@ def get_daily( output_id = "daily_id" # Build argument dictionary, omitting None values - args = { - k: v - for k, v in locals().items() - if k not in {"service", "output_id"} and v is not None - } + args = _get_args(locals()) return get_ogc_data(args, output_id, service) @@ -378,11 +375,7 @@ def get_continuous( output_id = "continuous_id" # Build argument dictionary, omitting None values - args = { - k: v - for k, v in locals().items() - if k not in {"service", "output_id"} and v is not None - } + args = _get_args(locals()) return get_ogc_data(args, output_id, service) @@ -673,11 +666,7 @@ def get_monitoring_locations( output_id = "monitoring_location_id" # Build argument dictionary, omitting None values - args = { - k: v - for k, v in locals().items() - if k not in {"service", "output_id"} and v is not None - } + args = _get_args(locals()) return get_ogc_data(args, output_id, service) @@ -893,11 +882,7 @@ def get_time_series_metadata( output_id = "time_series_id" # Build argument dictionary, omitting None values - args = { - k: v - for k, v in locals().items() - if k not in {"service", "output_id"} and v is not None - } + args = _get_args(locals()) return get_ogc_data(args, output_id, service) @@ -1069,11 +1054,7 @@ def get_latest_continuous( output_id = "latest_continuous_id" # Build argument dictionary, omitting None values - args = { - k: v - for k, v in locals().items() - if k not in {"service", "output_id"} and v is not None - } + args = _get_args(locals()) return get_ogc_data(args, output_id, service) @@ -1247,11 +1228,7 @@ def get_latest_daily( output_id = "latest_daily_id" # Build argument dictionary, omitting None values - args = { - k: v - for k, v in locals().items() - if k not in {"service", "output_id"} and v is not None - } + args = _get_args(locals()) return get_ogc_data(args, output_id, service) @@ -1424,11 +1401,7 @@ def get_field_measurements( output_id = "field_measurement_id" # Build argument dictionary, omitting None values - args = { - k: v - for k, v in locals().items() - if k not in {"service", "output_id"} and v is not None - } + args = _get_args(locals()) return get_ogc_data(args, output_id, service) @@ -1735,11 +1708,8 @@ def get_samples( _check_profiles(service, profile) - params = { - k: v - for k, v in locals().items() - if k not in ["ssl_check", "service", "profile"] and v is not None - } + # Build argument dictionary, omitting None values + params = _get_args(locals(), exclude={"ssl_check", "profile"}) params.update({"mimeType": "text/csv"}) @@ -1879,11 +1849,8 @@ def get_stats_por( ... end_date="01-31", ... ) """ - params = { - k: v - for k, v in locals().items() - if k not in ["expand_percentiles"] and v is not None - } + # Build argument dictionary, omitting None values + params = _get_args(locals(), exclude={"expand_percentiles"}) return get_stats_data( args=params, service="observationNormals", expand_percentiles=expand_percentiles @@ -2011,11 +1978,8 @@ def get_stats_date_range( ... computation_type=["minimum", "maximum"], ... ) """ - params = { - k: v - for k, v in locals().items() - if k not in ["expand_percentiles"] and v is not None - } + # Build argument dictionary, omitting None values + params = _get_args(locals(), exclude={"expand_percentiles"}) return get_stats_data( args=params, diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index b4ae26bc..f9754ac2 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -588,7 +588,8 @@ def _walk_pages( headers = dict(req.headers) content = req.body if method == "POST" else None - dfs = _get_resp_data(resp, geopd=geopd) + # List to collect dataframes from each page + dfs = [_get_resp_data(resp, geopd=geopd)] curr_url = _next_req_url(resp) while curr_url: try: @@ -598,8 +599,7 @@ def _walk_pages( headers=headers, data=content if method == "POST" else None, ) - df1 = _get_resp_data(resp, geopd=geopd) - dfs = pd.concat([dfs, df1], ignore_index=True) + dfs.append(_get_resp_data(resp, geopd=geopd)) curr_url = _next_req_url(resp) except Exception: # noqa: BLE001 error_text = _error_body(resp) @@ -608,7 +608,12 @@ def _walk_pages( "Request failed for URL: %s. Data download interrupted.", curr_url ) curr_url = None - return dfs, initial_response + + # Concatenate all pages at once for efficiency + if not dfs: + return pd.DataFrame(), initial_response + + return pd.concat(dfs, ignore_index=True), initial_response finally: if close_client: client.close() @@ -1104,3 +1109,34 @@ def _check_profiles( f"Invalid profile: '{profile}' for service '{service}'. " f"Valid options are: {valid_profiles}." ) + + +def _get_args( + local_vars: dict[str, Any], exclude: set[str] | None = None +) -> dict[str, Any]: + """ + Standardize parameter filtering for WaterData API functions. + + Filters out internal function arguments ('service', 'output_id') + and None values from the provided local variables dictionary. + Additional variables can be excluded via the 'exclude' parameter. + + Parameters + ---------- + local_vars : dict[str, Any] + Dictionary of local variables, typically from `locals()`. + exclude : set[str], optional + Additional keys to exclude from the resulting dictionary. + + Returns + ------- + dict[str, Any] + Filtered dictionary of arguments for API requests. + """ + to_exclude = {"service", "output_id"} + if exclude: + to_exclude.update(exclude) + + return { + k: v for k, v in local_vars.items() if k not in to_exclude and v is not None + } diff --git a/tests/utils_test.py b/tests/utils_test.py index 53ccb213..4cb9b383 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -2,6 +2,7 @@ from unittest import mock +import pandas as pd import pytest from dataretrieval import nwis, utils @@ -54,7 +55,45 @@ def test_init_with_response(self): assert md.header is not None # Test NotImplementedError parameters - with pytest.raises(NotImplementedError): - _ = md.site_info with pytest.raises(NotImplementedError): _ = md.variable_info + + +class Test_to_str: + """Tests of the to_str function.""" + + def test_to_str_list(self): + assert utils.to_str([1, "a", 2]) == "1,a,2" + + def test_to_str_tuple(self): + assert utils.to_str((1, "b", 3)) == "1,b,3" + + def test_to_str_set(self): + # Sets are unordered, so we check if elements are present + result = utils.to_str({1, 2}) + assert "1" in result + assert "2" in result + assert "," in result + + def test_to_str_generator(self): + def gen(): + yield from [1, 2, 3] + + assert utils.to_str(gen()) == "1,2,3" + + def test_to_str_pandas_series(self): + s = pd.Series([10, 20]) + assert utils.to_str(s) == "10,20" + + def test_to_str_pandas_index(self): + idx = pd.Index(["x", "y"]) + assert utils.to_str(idx) == "x,y" + + def test_to_str_string(self): + assert utils.to_str("already a string") == "already a string" + + def test_to_str_custom_delimiter(self): + assert utils.to_str([1, 2, 3], delimiter="|") == "1|2|3" + + def test_to_str_non_iterable(self): + assert utils.to_str(123) is None diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py new file mode 100644 index 00000000..772f75b7 --- /dev/null +++ b/tests/waterdata_utils_test.py @@ -0,0 +1,79 @@ +from unittest import mock + +import requests + +from dataretrieval.waterdata.utils import _get_args, _walk_pages + + +def test_get_args_basic(): + local_vars = { + "monitoring_location_id": "123", + "service": "daily", + "output_id": "daily_id", + "none_val": None, + "other": "val", + } + result = _get_args(local_vars) + assert result == {"monitoring_location_id": "123", "other": "val"} + + +def test_get_args_with_exclude(): + local_vars = { + "monitoring_location_id": "123", + "service": "daily", + "output_id": "daily_id", + "to_exclude": "secret", + "other": "val", + } + result = _get_args(local_vars, exclude={"to_exclude"}) + assert result == {"monitoring_location_id": "123", "other": "val"} + + +def test_get_args_empty(): + assert _get_args({}) == {} + + +def test_walk_pages_multiple_mocked(): + # Setup mock responses + resp1 = mock.MagicMock() + resp1.json.return_value = { + "numberReturned": 1, + "features": [{"id": "1", "properties": {"val": "a"}}], + "links": [{"rel": "next", "href": "https://example.com/page2"}], + } + # Mock headers and links + resp1.headers = {} + resp1.links = {"next": {"url": "https://example.com/page2"}} + resp1.status_code = 200 + + resp2 = mock.MagicMock() + resp2.json.return_value = { + "numberReturned": 1, + "features": [{"id": "2", "properties": {"val": "b"}}], + "links": [], + } + resp2.headers = {} + resp2.links = {} + resp2.status_code = 200 + + # Mock client (Session) + mock_client = mock.MagicMock(spec=requests.Session) + # First call to send() returns resp1, then call to request() in loop returns resp2 + mock_client.send.return_value = resp1 + mock_client.request.return_value = resp2 + + # Mock request (PreparedRequest) + mock_req = mock.MagicMock(spec=requests.PreparedRequest) + mock_req.method = "GET" + mock_req.headers = {} + mock_req.url = "https://example.com/page1" + + # Call _walk_pages + df, final_resp = _walk_pages(geopd=False, req=mock_req, client=mock_client) + + assert len(df) == 2 + assert list(df["val"]) == ["a", "b"] + assert list(df["id"]) == ["1", "2"] + assert mock_client.send.called + assert mock_client.request.called + assert mock_client.request.call_args[0][1] == "https://example.com/page2" From 8850b9956b97d9dbeb7ddef5e5931ef2a9391e3d Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Thu, 9 Apr 2026 09:28:37 -0500 Subject: [PATCH 9/9] Clean up review findings from ci-fix - Remove stray PR217.md tracking file - Extract duplicated HTML-response detection into _parse_json_or_raise helper - Remove dead defunct_services entry from get_record type check - Point get_record defunct NameError to replacement functions - Remove unreachable empty-dfs branch in _walk_pages Co-Authored-By: Claude Opus 4.6 (1M context) --- PR217.md | 26 ------------- dataretrieval/nwis.py | 64 ++++++++++++++++---------------- dataretrieval/waterdata/utils.py | 3 -- 3 files changed, 32 insertions(+), 61 deletions(-) delete mode 100644 PR217.md diff --git a/PR217.md b/PR217.md deleted file mode 100644 index c5e64faf..00000000 --- a/PR217.md +++ /dev/null @@ -1,26 +0,0 @@ -# Recreate PR #216: Remove Defunct NWIS Functions - -This PR resubmits the changes from [PR #216](https://github.com/DOI-USGS/dataretrieval-python/pull/216) which removes four defunct NWIS legacy functions (`get_gwlevels`, `get_discharge_measurements`, `get_pmcodes`, and `get_water_use`) by replacing them with `NameError` exceptions that point users to the modernized `waterdata` equivalents. - -The original PR #216 was superseded by linting changes (PR #219) on the `main` branch before being merged, which caused conflicts. This PR correctly recreates the exact logical changes from PR #216 directly on top of the newly linted `main` branch, ensuring we maintain `ruff` compliance while formally deprecating the defunct functions as planned. - -All related tests and defunct data files have been removed, and the README has been updated to reflect the new API announcements identically to PR #216. - -### Notebook Modernization - -In addition to the core API changes, a comprehensive review and modernization of **16 demo notebooks** (including all legacy `hydroshare` examples) was performed: - -- **API Migration**: Legacy `nwis.get_dv()`, `nwis.get_iv()`, and `nwis.get_info()` calls were upgraded to their modern `waterdata` equivalents. -- **Defunct Removal**: Defunct functions (`get_water_use`, `get_pmcodes`) were commented out or replaced with modern alternatives (`get_reference_table`) across all demos (e.g., `R Python Vignette`, `WaterUse` suite). -- **Execution Validation**: All notebooks were successfully re-executed using `jupyter nbconvert` in the local `.venv` environment to ensure they remain functional and generate correct plots with the new OGC long-format data schema. -- **Clean State**: Each notebook was processed with `nb-clean` to strip execution outputs and counts, ensuring a clean version-controlled state. -- **Dependencies**: `scipy` and `mapclassify` were added to the environment to support advanced plotting and analytical features now required by the modernized examples. - -### Performance & Maintenance Optimizations - -A series of internal architectural improvements were implemented to enhance scalability and maintainability: - -- **Efficient Pagination**: Refactored `_walk_pages` in `waterdata/utils.py` to use list-based aggregation, reducing memory copying overhead from $O(n^2)$ to $O(n)$. -- **Centralized Parameter Handling**: Introduced a shared `_get_args` helper and refactored all 11 API functions in `waterdata/api.py` to use it, eliminating over 100 lines of redundant dictionary comprehension logic. -- **Utility Optimization**: Enhanced `to_str` in `utils.py` with `map(str, ...)` and broader iterable support (sets, tuples, generators), verified with new comprehensive unit tests. -- **Improved Testing**: Added [waterdata_utils_test.py](file:///Users/thodson/Desktop/dev/software/dataretrieval-python/tests/waterdata_utils_test.py) and expanded `tests/utils_test.py` to ensure long-term stability of the new utility logic. diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index 478dc656..62447987 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -45,6 +45,26 @@ _CRS = "EPSG:4269" +def _parse_json_or_raise(response: requests.Response) -> pd.DataFrame: + """Parse a JSON NWIS response, raising a helpful error on HTML responses.""" + try: + return _read_json(response.json()) + except (ValueError, JSONDecodeError) as e: + text_lower = response.text.lower() + content_type = response.headers.get("Content-Type", "").lower() + if ( + "" in text_lower + or " pd.DataFrame: @@ -482,20 +502,7 @@ def get_dv( kwargs["multi_index"] = multi_index response = query_waterservices("dv", format="json", ssl_check=ssl_check, **kwargs) - try: - df = _read_json(response.json()) - except (ValueError, JSONDecodeError) as e: - if ( - "" in response.text.lower() - or "" in response.text.lower() - or "