From 38c1fed31c6549dd18bab3ba6824a83ae1f4b3f1 Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Sun, 3 May 2026 15:20:00 -0500
Subject: [PATCH 1/4] Fix _arrange_cols: stop mutating the caller's properties
 list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`_arrange_cols` did `properties.append("geometry")` and
`properties[properties.index("id")] = output_id` in place. Calling any
getter twice with the same `properties=[...]` therefore caused that
list to grow unboundedly and have `id` rewritten across calls — a
real action-at-a-distance defect for any code that re-uses a
`properties` list across requests.

Take a local copy of the list and mutate that. Caller's list is
untouched.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dataretrieval/waterdata/utils.py | 12 +++++----
 tests/waterdata_utils_test.py    | 46 ++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index 7070da50..e9641434 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -687,15 +687,17 @@ def _arrange_cols(
     # If properties are provided, filter to only those columns
     # plus geometry if skip_geometry is False
     if properties and not all(pd.isna(properties)):
+        # Work on a local copy so we don't mutate the caller's list across calls.
+        local_properties = list(properties)
         # Make sure geometry stays in the dataframe if skip_geometry is False
-        if "geometry" in df.columns and "geometry" not in properties:
-            properties.append("geometry")
+        if "geometry" in df.columns and "geometry" not in local_properties:
+            local_properties.append("geometry")
         # id is technically a valid column from the service, but these
         # functions make the name more specific. So, if someone requests
         # 'id', give them the output_id column
-        if "id" in properties:
-            properties[properties.index("id")] = output_id
-        df = df.loc[:, [col for col in properties if col in df.columns]]
+        if "id" in local_properties:
+            local_properties[local_properties.index("id")] = output_id
+        df = df.loc[:, [col for col in local_properties if col in df.columns]]
 
     # Move meaningless-to-user, extra id columns to the end
     # of the dataframe, if they exist
diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py
index 36150be8..e4ecf326 100644
--- a/tests/waterdata_utils_test.py
+++ b/tests/waterdata_utils_test.py
@@ -1,8 +1,10 @@
 from unittest import mock
 
+import pandas as pd
 import requests
 
 from dataretrieval.waterdata.utils import (
+    _arrange_cols,
     _get_args,
     _walk_pages,
 )
@@ -80,3 +82,47 @@ def test_walk_pages_multiple_mocked():
     assert mock_client.send.called
     assert mock_client.request.called
     assert mock_client.request.call_args[0][1] == "https://example.com/page2"
+
+
+# --- _arrange_cols ----------------------------------------------------------
+
+
+def test_arrange_cols_does_not_mutate_caller_properties():
+    """`_arrange_cols` must not mutate the caller's `properties` list.
+
+    Regression: previously the function did
+    ``properties.append("geometry")`` and
+    ``properties[properties.index("id")] = output_id`` in place, so the
+    caller's list grew and was rewritten across successive calls.
+    """
+    df = pd.DataFrame(
+        {
+            "id": ["a", "b"],
+            "value": [1.0, 2.0],
+            "geometry": ["p1", "p2"],
+        }
+    )
+    properties = ["id", "value"]
+    snapshot = list(properties)
+
+    _arrange_cols(df, properties, output_id="daily_id")
+    _arrange_cols(df, properties, output_id="daily_id")
+
+    assert properties == snapshot, (
+        f"caller's properties list was mutated: {properties!r} != {snapshot!r}"
+    )
+
+
+def test_arrange_cols_swaps_id_in_returned_columns():
+    """`'id'` in `properties` should still resolve to the output_id column."""
+    df = pd.DataFrame({"id": ["a"], "value": [1.0]})
+    result = _arrange_cols(df, ["id", "value"], output_id="daily_id")
+    assert "daily_id" in result.columns
+    assert "id" not in result.columns
+
+
+def test_arrange_cols_keeps_geometry_when_present():
+    """Geometry must come along even if the caller didn't list it."""
+    df = pd.DataFrame({"id": ["a"], "value": [1.0], "geometry": ["p1"]})
+    result = _arrange_cols(df, ["value"], output_id="daily_id")
+    assert "geometry" in result.columns

From b813541df120714174131ccf1e891c0806d063db Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Sun, 3 May 2026 15:30:21 -0500
Subject: [PATCH 2/4] Tighten _arrange_cols comments

Drop the regression-narrative phrasing for the local-copy comment and
trim the multi-line "id is technically a valid column..." block to a
two-line WHY explaining the rename intent.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dataretrieval/waterdata/utils.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index e9641434..b391a892 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -687,14 +687,12 @@ def _arrange_cols(
     # If properties are provided, filter to only those columns
     # plus geometry if skip_geometry is False
     if properties and not all(pd.isna(properties)):
-        # Work on a local copy so we don't mutate the caller's list across calls.
+        # Don't alias the caller's list — we mutate below.
         local_properties = list(properties)
-        # Make sure geometry stays in the dataframe if skip_geometry is False
         if "geometry" in df.columns and "geometry" not in local_properties:
             local_properties.append("geometry")
-        # id is technically a valid column from the service, but these
-        # functions make the name more specific. So, if someone requests
-        # 'id', give them the output_id column
+        # 'id' is a valid service column, but expose it under the
+        # service-specific output_id name instead.
         if "id" in local_properties:
             local_properties[local_properties.index("id")] = output_id
         df = df.loc[:, [col for col in local_properties if col in df.columns]]

From 0af4d35dc10fb5df901524dcf3e4346c5eb0178b Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Mon, 4 May 2026 10:11:08 -0500
Subject: [PATCH 3/4] Fix _arrange_cols comment: drop stale skip_geometry
 reference

Per copilot review on PR #254. _arrange_cols doesn't take skip_geometry
- it conditionally keeps the geometry column based on whether one is
present in the DataFrame.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dataretrieval/waterdata/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index b391a892..ea592568 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -684,8 +684,8 @@ def _arrange_cols(
     # Rename id column to output_id
     df = df.rename(columns={"id": output_id})
 
-    # If properties are provided, filter to only those columns
-    # plus geometry if skip_geometry is False
+    # If properties are provided, filter to only those columns,
+    # appending the geometry column when present in the DataFrame.
     if properties and not all(pd.isna(properties)):
         # Don't alias the caller's list — we mutate below.
         local_properties = list(properties)

From 15d985c2cf0b67f38895a471f4a2723becfd8728 Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Tue, 5 May 2026 10:19:49 -0500
Subject: [PATCH 4/4] Drop redundant WHAT-comment per /simplify review

The "If properties are provided, filter to only those columns,
appending the geometry column when present" block restated what the
code below is already self-evidently doing. The two retained comments
in the body are the load-bearing WHY-comments (defensive copy
rationale, id-rename rationale).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dataretrieval/waterdata/utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index 8b101995..c4f4b5bf 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -684,8 +684,6 @@ def _arrange_cols(
     # Rename id column to output_id
     df = df.rename(columns={"id": output_id})
 
-    # If properties are provided, filter to only those columns,
-    # appending the geometry column when present in the DataFrame.
     if properties and not all(pd.isna(properties)):
         # Don't alias the caller's list — we mutate below.
         local_properties = list(properties)