Skip to content

Commit 4101ba5

Browse files
Ensure sorted data on upload of data from python (#126)
1 parent beab3dc commit 4101ba5

4 files changed

Lines changed: 120 additions & 9 deletions

File tree

datareservoirio/client.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def create(self, series=None, wait_on_verification=True):
101101
----------
102102
series : pandas.Series, optional
103103
Series with index (as DatetimeIndex-like or integer array). Default
104-
is None.
104+
is None. Needs to be sorted on index.
105105
wait_on_verification : bool (optional)
106106
All series are subjected to a server-side data validation before
107107
they are made available for consumption; failing validation will
@@ -127,6 +127,11 @@ def create(self, series=None, wait_on_verification=True):
127127
response.raise_for_status()
128128
return response.json()
129129

130+
if not series.index.is_monotonic_increasing:
131+
raise ValueError(
132+
"Index not sorted. Please sort series on index before creating a timeseries."
133+
)
134+
130135
df = self._verify_and_prepare_series(series)
131136

132137
response_file = self._auth_session.post(
@@ -163,7 +168,7 @@ def append(self, series, series_id, wait_on_verification=True):
163168
Parameters
164169
----------
165170
series : pandas.Series
166-
Series with index (as DatetimeIndex-like or integer array).
171+
Series with index (as DatetimeIndex-like or integer array). Needs to be sorted on index.
167172
series_id : string
168173
The identifier of the existing series.
169174
wait_on_verification : bool (optional)
@@ -182,6 +187,10 @@ def append(self, series, series_id, wait_on_verification=True):
182187
dict
183188
The response from DataReservoir.io.
184189
"""
190+
if not series.index.is_monotonic_increasing:
191+
raise ValueError(
192+
"Index not sorted. Please sort series on index before appending data."
193+
)
185194
df = self._verify_and_prepare_series(series)
186195

187196
response_file = self._auth_session.post(
@@ -411,7 +420,21 @@ def get(
411420
else:
412421
df = pd.DataFrame(columns=("index", "values")).astype({"index": "int64"})
413422

414-
series = df.set_index("index").squeeze("columns").loc[start:end].copy(deep=True)
423+
try:
424+
series = (
425+
df.set_index("index").squeeze("columns").loc[start:end].copy(deep=True)
426+
)
427+
except KeyError as e:
428+
logging.warning(
429+
"The time series you requested is not properly ordered. The data will be sorted to attempt to resolve the issue. Please note that this operation may take some time."
430+
)
431+
series = (
432+
df.set_index("index")
433+
.sort_index()
434+
.squeeze("columns")
435+
.loc[start:end]
436+
.copy(deep=True)
437+
)
415438
series.index.name = None
416439

417440
if series.empty and raise_empty: # may become empty after slicing

docs/user_guide/index.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ explained below.
3838
Series
3939
------
4040
A series is a one-dimensional sequence with numeric values (64-bit float) and
41-
unique indicies (64-bit integer). (Consequently, each numeric value is natively
42-
represented with 128-bits.) Each series is assigned a unique identifier
41+
unique indices (64-bit integer). Consequently, each numeric value is natively
42+
represented with 128 bits. Each series is assigned a unique identifier
4343
`TimeSeriesId` (guid) for convenient access. Furthermore, a series can be
4444
enriched with :ref:`metadata <metadata>`.
4545

@@ -76,8 +76,8 @@ But, that responsibility is left to the user/app/service that uses
7676

7777
Metadata entries are organized using ``namespace`` and ``key``. A ``namespace``
7878
can be thought of as a table and ``key`` is the row index. Then a row can have
79-
any number of arbitrary number of columns. (Note that rows in a table do not
80-
have to share the columns!). This resembles "table storage" paradigm for those
79+
any number of arbitrary columns (note that rows in a table do not
80+
have to share the columns). This resembles the "table storage" paradigm for those
8181
who are familiar with that.
8282

8383
Thus, a ``namespace`` and ``key`` combination uniquely defines a metadata

docs/user_guide/manage_series.rst

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ information is returned:
4848
converted to UTC and therefore, time zone information is lost when data is
4949
stored in `DataReservoir.io`_.
5050

51-
You can also store a sequence of data. However, you are required to define an
52-
integer index. (This is useful when appending and updating the data later.)
51+
You can also store a sequence of data. However, you are required to define an increasing
52+
integer index. This is useful when appending and updating the data later.
5353

5454
Store sequence:
5555

@@ -75,6 +75,9 @@ data:
7575
series_id = response['TimeSeriesId']
7676
response = client.append(series, series_id)
7777
78+
.. important::
79+
80+
The index of the series must be sorted in increasing order. Sorted data is also more efficient to access later.
7881

7982
Data verification process
8083
-------------------------

tests/test_client.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import types
55
from encodings.utf_8 import encode
66
from pathlib import Path
7+
from unittest.mock import MagicMock, patch
78

89
import pandas as pd
910
import pytest
@@ -58,6 +59,10 @@ def fail_with_invalid_json_error(self, url, timeout):
5859
raise InvalidJSONError()
5960

6061

62+
def _mock_blob_sequence_days(response_json):
63+
return {1: "file1", 2: "file2"}
64+
65+
6166
class Test_Client:
6267
"""
6368
Tests the ``datareservoirio.Client`` class.
@@ -158,6 +163,53 @@ def test_get_raise_empty(self, client):
158163
with pytest.raises(ValueError):
159164
client.get("e3d82cda-4737-4af9-8d17-d9dfda8703d0", raise_empty=True)
160165

166+
def test_get_keyerror(self, client):
167+
series_id = "test_series_id"
168+
start = "2023-01-01"
169+
end = "2023-01-02"
170+
171+
response_mock = MagicMock()
172+
response_mock.status_code = 200
173+
response_mock.json.return_value = {
174+
"Files": [
175+
{"Chunks": "file1"},
176+
{"Chunks": "file2"},
177+
] # mock files with correct structure
178+
}
179+
180+
client._auth_session.get = MagicMock(return_value=response_mock)
181+
182+
def mock_storage_get(blob_sequence_i):
183+
if blob_sequence_i == "file1":
184+
return pd.DataFrame(
185+
{
186+
"index": [1672358410000000000, 1672358400000000000],
187+
"values": [100, 200],
188+
}
189+
)
190+
elif blob_sequence_i == "file2":
191+
return pd.DataFrame(
192+
{
193+
"index": [1672358410000000000, 1672358420000000000],
194+
"values": [200, 400],
195+
}
196+
)
197+
else:
198+
raise ValueError("Unexpected blob_sequence_i value")
199+
200+
client._storage.get = MagicMock(side_effect=mock_storage_get)
201+
202+
with patch(
203+
"datareservoirio.client._blob_sequence_days",
204+
side_effect=_mock_blob_sequence_days,
205+
):
206+
with patch("datareservoirio.logging.warning") as mock_logging_warning:
207+
result = client.get(series_id, start, end)
208+
mock_logging_warning.assert_called_once_with(
209+
"The time series you requested is not properly ordered. The data will be sorted to attempt to resolve the issue. Please note that this operation may take some time."
210+
)
211+
assert isinstance(result, pd.Series)
212+
161213
def test_get_raises_end_not_after_start(self, client):
162214
start = 1672358400000000000
163215
end = start - 1
@@ -427,6 +479,22 @@ def test_create_upload_raises(self, client, data_float, response_cases):
427479
with pytest.raises(HTTPError):
428480
client.create(series=data_float.as_series(), wait_on_verification=True)
429481

482+
def test_create_raises_valueerror_unsorted_index(self, client):
483+
data = pd.Series(
484+
[1, 2, 3],
485+
index=[
486+
pd.to_datetime("2022-04-04"),
487+
pd.to_datetime("2022-04-03"),
488+
pd.to_datetime("2022-04-05"),
489+
],
490+
)
491+
with pytest.raises(ValueError) as e:
492+
client.create(data)
493+
assert (
494+
str(e.value)
495+
== "Index not sorted. Please sort series on index before creating a timeseries."
496+
)
497+
430498
def test_append(
431499
self, client, data_float, mock_requests, bytesio_with_memory, response_cases
432500
):
@@ -501,6 +569,23 @@ def test_append_upload_raises(self, client, data_float, response_cases):
501569
with pytest.raises(HTTPError):
502570
client.append(data_float.as_series(), series_id, wait_on_verification=True)
503571

572+
def test_append_raises_valueerror_unsorted_index(self, client):
573+
series_id = "d30519af-5035-4093-a425-dafd857ad0ef"
574+
data = pd.Series(
575+
[1, 2, 3],
576+
index=[
577+
pd.to_datetime("2022-04-04"),
578+
pd.to_datetime("2022-04-03"),
579+
pd.to_datetime("2022-04-05"),
580+
],
581+
)
582+
with pytest.raises(ValueError) as e:
583+
client.append(data, series_id)
584+
assert (
585+
str(e.value)
586+
== "Index not sorted. Please sort series on index before appending data."
587+
)
588+
504589
@pytest.mark.parametrize("data", ("data_float", "data_string"))
505590
def test__verify_and_prepare_series(self, client, data, request):
506591
data = request.getfixturevalue(data)

0 commit comments

Comments
 (0)