From 38c1fed31c6549dd18bab3ba6824a83ae1f4b3f1 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 3 May 2026 15:20:00 -0500 Subject: [PATCH 1/4] Fix _arrange_cols: stop mutating the caller's properties list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_arrange_cols` did `properties.append("geometry")` and `properties[properties.index("id")] = output_id` in place. Calling any getter twice with the same `properties=[...]` therefore caused that list to grow unboundedly and have `id` rewritten across calls — a real action-at-a-distance defect for any code that re-uses a `properties` list across requests. Take a local copy of the list and mutate that. Caller's list is untouched. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 12 +++++---- tests/waterdata_utils_test.py | 46 ++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 7070da50..e9641434 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -687,15 +687,17 @@ def _arrange_cols( # If properties are provided, filter to only those columns # plus geometry if skip_geometry is False if properties and not all(pd.isna(properties)): + # Work on a local copy so we don't mutate the caller's list across calls. + local_properties = list(properties) # Make sure geometry stays in the dataframe if skip_geometry is False - if "geometry" in df.columns and "geometry" not in properties: - properties.append("geometry") + if "geometry" in df.columns and "geometry" not in local_properties: + local_properties.append("geometry") # id is technically a valid column from the service, but these # functions make the name more specific. So, if someone requests # 'id', give them the output_id column - if "id" in properties: - properties[properties.index("id")] = output_id - df = df.loc[:, [col for col in properties if col in df.columns]] + if "id" in local_properties: + local_properties[local_properties.index("id")] = output_id + df = df.loc[:, [col for col in local_properties if col in df.columns]] # Move meaningless-to-user, extra id columns to the end # of the dataframe, if they exist diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 36150be8..e4ecf326 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -1,8 +1,10 @@ from unittest import mock +import pandas as pd import requests from dataretrieval.waterdata.utils import ( + _arrange_cols, _get_args, _walk_pages, ) @@ -80,3 +82,47 @@ def test_walk_pages_multiple_mocked(): assert mock_client.send.called assert mock_client.request.called assert mock_client.request.call_args[0][1] == "https://example.com/page2" + + +# --- _arrange_cols ---------------------------------------------------------- + + +def test_arrange_cols_does_not_mutate_caller_properties(): + """`_arrange_cols` must not mutate the caller's `properties` list. + + Regression: previously the function did + ``properties.append("geometry")`` and + ``properties[properties.index("id")] = output_id`` in place, so the + caller's list grew and was rewritten across successive calls. + """ + df = pd.DataFrame( + { + "id": ["a", "b"], + "value": [1.0, 2.0], + "geometry": ["p1", "p2"], + } + ) + properties = ["id", "value"] + snapshot = list(properties) + + _arrange_cols(df, properties, output_id="daily_id") + _arrange_cols(df, properties, output_id="daily_id") + + assert properties == snapshot, ( + f"caller's properties list was mutated: {properties!r} != {snapshot!r}" + ) + + +def test_arrange_cols_swaps_id_in_returned_columns(): + """`'id'` in `properties` should still resolve to the output_id column.""" + df = pd.DataFrame({"id": ["a"], "value": [1.0]}) + result = _arrange_cols(df, ["id", "value"], output_id="daily_id") + assert "daily_id" in result.columns + assert "id" not in result.columns + + +def test_arrange_cols_keeps_geometry_when_present(): + """Geometry must come along even if the caller didn't list it.""" + df = pd.DataFrame({"id": ["a"], "value": [1.0], "geometry": ["p1"]}) + result = _arrange_cols(df, ["value"], output_id="daily_id") + assert "geometry" in result.columns From b813541df120714174131ccf1e891c0806d063db Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 3 May 2026 15:30:21 -0500 Subject: [PATCH 2/4] Tighten _arrange_cols comments Drop the regression-narrative phrasing for the local-copy comment and trim the multi-line "id is technically a valid column..." block to a two-line WHY explaining the rename intent. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index e9641434..b391a892 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -687,14 +687,12 @@ def _arrange_cols( # If properties are provided, filter to only those columns # plus geometry if skip_geometry is False if properties and not all(pd.isna(properties)): - # Work on a local copy so we don't mutate the caller's list across calls. + # Don't alias the caller's list — we mutate below. local_properties = list(properties) - # Make sure geometry stays in the dataframe if skip_geometry is False if "geometry" in df.columns and "geometry" not in local_properties: local_properties.append("geometry") - # id is technically a valid column from the service, but these - # functions make the name more specific. So, if someone requests - # 'id', give them the output_id column + # 'id' is a valid service column, but expose it under the + # service-specific output_id name instead. if "id" in local_properties: local_properties[local_properties.index("id")] = output_id df = df.loc[:, [col for col in local_properties if col in df.columns]] From 0af4d35dc10fb5df901524dcf3e4346c5eb0178b Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 4 May 2026 10:11:08 -0500 Subject: [PATCH 3/4] Fix _arrange_cols comment: drop stale skip_geometry reference Per copilot review on PR #254. _arrange_cols doesn't take skip_geometry - it conditionally keeps the geometry column based on whether one is present in the DataFrame. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index b391a892..ea592568 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -684,8 +684,8 @@ def _arrange_cols( # Rename id column to output_id df = df.rename(columns={"id": output_id}) - # If properties are provided, filter to only those columns - # plus geometry if skip_geometry is False + # If properties are provided, filter to only those columns, + # appending the geometry column when present in the DataFrame. if properties and not all(pd.isna(properties)): # Don't alias the caller's list — we mutate below. local_properties = list(properties) From 15d985c2cf0b67f38895a471f4a2723becfd8728 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 5 May 2026 10:19:49 -0500 Subject: [PATCH 4/4] Drop redundant WHAT-comment per /simplify review The "If properties are provided, filter to only those columns, appending the geometry column when present" block restated what the code below is already self-evidently doing. The two retained comments in the body are the load-bearing WHY-comments (defensive copy rationale, id-rename rationale). Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 8b101995..c4f4b5bf 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -684,8 +684,6 @@ def _arrange_cols( # Rename id column to output_id df = df.rename(columns={"id": output_id}) - # If properties are provided, filter to only those columns, - # appending the geometry column when present in the DataFrame. if properties and not all(pd.isna(properties)): # Don't alias the caller's list — we mutate below. local_properties = list(properties)