Skip to content

Commit 32f6528

Browse files
ESS-2234-2: Improvements to fetching aggregate data (#122)
* Added option to use period without number
* Adding progress bar
* Adding tqdm to pyproject.toml
* Increased the default max_page_size
* Reformatting, every time
* Fixed docs
1 parent 4261440 commit 32f6528

5 files changed

Lines changed: 37 additions & 16 deletions

File tree

datareservoirio/_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def as_binary_csv(self):
6767

6868
# Translation of user input parameters of the samples/aggregate method for more convenient use (matching pandas)
6969

70-
function_translation = {"std": "Stdev", "mean": "Avg"}
70+
function_translation = {"std": "Stdev", "mean": "Avg", "min": "Min", "max": "Max"}
7171

7272
period_translation = {
7373
"hours": "h",

datareservoirio/client.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
wait_chain,
2020
wait_fixed,
2121
)
22+
from tqdm.auto import tqdm
2223

2324
from ._logging import log_decorator
2425
from ._utils import function_translation, period_translation
@@ -39,6 +40,8 @@
3940

4041
_TIMEOUT_DEAULT = 120
4142

43+
_DEFAULT_MAX_PAGE_SIZE = 30000
44+
4245

4346
class Client:
4447
"""
@@ -424,7 +427,7 @@ def get_samples_aggregate(
424427
end=None,
425428
aggregation_period=None,
426429
aggregation_function=None,
427-
max_page_size=None,
430+
max_page_size=_DEFAULT_MAX_PAGE_SIZE,
428431
):
429432
"""
430433
Retrieve a series from DataReservoir.io using the samples/aggregate endpoint.
@@ -440,7 +443,7 @@ def get_samples_aggregate(
440443
Stop time (exclusive) of the aggregated series given as anything
441444
pandas.to_datetime is able to parse. Date must be within the past 90 days.
442445
aggregation_function : str
443-
One of "Avg", "Min", "Max", "Stdev".
446+
One of "mean", "min", "max", "std".
444447
aggregation_period : str
445448
Used in combination with aggregation function to specify the period for aggregation.
446449
Aggregation period is maximum 24 hours. Values can be in units of h, m, s, ms,
@@ -482,6 +485,9 @@ def get_samples_aggregate(
482485
if aggregation_function in function_translation:
483486
aggregation_function = function_translation[aggregation_function]
484487

488+
if not aggregation_period[0].isnumeric():
489+
aggregation_period = "1" + aggregation_period
490+
485491
for period_unit in period_translation:
486492
if (
487493
aggregation_period.endswith(period_unit)
@@ -493,18 +499,19 @@ def get_samples_aggregate(
493499
)
494500
break
495501

496-
start = pd.to_datetime(start, dayfirst=True, unit="ns", utc=True).isoformat()
497-
end = pd.to_datetime(end, dayfirst=True, unit="ns", utc=True).isoformat()
502+
start = pd.to_datetime(start, dayfirst=True, unit="ns", utc=True)
503+
end = pd.to_datetime(end, dayfirst=True, unit="ns", utc=True)
498504

499-
params = {}
505+
if start.value >= end.value:
506+
raise ValueError("Start must be before end.")
500507

501-
if max_page_size:
502-
params["maxPageSize"] = max_page_size
508+
params = {}
503509

510+
params["maxPageSize"] = max_page_size
504511
params["aggregationPeriod"] = aggregation_period
505512
params["aggregationFunction"] = aggregation_function
506-
params["start"] = start
507-
params["end"] = end
513+
params["start"] = start.isoformat()
514+
params["end"] = end.isoformat()
508515

509516
next_page_link = f"{environment.api_base_url}reservoir/timeseries/{series_id}/samples/aggregate?{urlencode(params)}"
510517

@@ -535,23 +542,32 @@ def get_samples_aggregate_page(url):
535542
timeout=_TIMEOUT_DEAULT,
536543
)
537544

545+
progress_bar = tqdm(unit=" pages", desc="Downloading aggregate data")
538546
while next_page_link:
539547
response = get_samples_aggregate_page(next_page_link)
540548
response.raise_for_status()
541549
response_json = response.json()
542550
next_page_link = response_json.get("@odata.nextLink", None)
543551

544552
content = [
545-
(pd.to_datetime(sample["Timestamp"], utc=True), sample["Value"])
553+
(
554+
pd.to_datetime(sample["Timestamp"], unit="ns", utc=True),
555+
sample["Value"],
556+
)
546557
for sample in response_json["value"]
547558
]
548559

560+
# update the progress bar
561+
if content:
562+
progress_bar.update(1)
563+
549564
new_df = pd.DataFrame(
550565
content, columns=("index", "values"), copy=False
551566
).astype({"values": "float64"}, errors="ignore")
552567

553568
df = pd.concat([df, new_df])
554569

570+
progress_bar.close()
555571
series = df.set_index("index").squeeze("columns").copy(deep=True)
556572

557573
return series

docs/user_guide/manage_series.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,9 @@ You can also access any data you have ``TimeSeriesId`` (and authorization) for w
132132
.. code-block:: python
133133
134134
# Get entire timeseries
135-
timeseries = client.get_samples_aggregate(series_id, start='2018-01-01',
136-
end='2018-01-02', aggregation_period='15m',
137-
aggregation_function='Avg')
135+
timeseries = client.get_samples_aggregate(series_id, start='2024-01-01',
136+
end='2024-01-02', aggregation_period='15m',
137+
aggregation_function='mean')
138138
139139
.. note::
140140

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ dependencies = [
3030
"importlib_resources",
3131
"opencensus-ext-azure",
3232
"tenacity",
33-
"urllib3 > 2"
33+
"urllib3 > 2",
34+
"tqdm"
3435
]
3536

3637
[project.urls]

tests/test_client.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -890,7 +890,8 @@ def test_tries_error_does_not_throw_retry(self, client_with_invalid_json_error):
890890

891891
@pytest.mark.response_irrelevant
892892
@pytest.mark.parametrize(
893-
"aggregation_function, expected", [("mean", "Avg"), ("std", "Stdev")]
893+
"aggregation_function, expected",
894+
[("mean", "Avg"), ("std", "Stdev"), ("min", "Min"), ("max", "Max")],
894895
)
895896
def test_aggregation_function_gets_translated(
896897
self, client, mock_requests, aggregation_function, expected, response_cases
@@ -912,6 +913,9 @@ def test_aggregation_function_gets_translated(
912913
@pytest.mark.parametrize(
913914
"aggregation_period, expected",
914915
[
916+
("min", "1m"),
917+
("tick", "1tick"),
918+
("s", "1s"),
915919
("15minutes", "15m"),
916920
("15minute", "15m"),
917921
("15min", "15m"),

0 commit comments

Comments (0)