Implement data loading for Libre and Dexcom devices

staskh · staskh · commit ef0fad37a7d1 · 2025-06-25T11:12:37.000+03:00
diff --git a/iglu_python/__init__.py b/iglu_python/__init__.py
@@ -41,6 +41,7 @@
 from .sd_roc import sd_roc
 from .summary_glu import summary_glu
 from .utils import CGMS2DayByDay, check_data_columns, gd2d_to_df, is_iglu_r_compatible, set_iglu_r_compatible
+from .extension.load_data import load_libre, load_dexcom
 
 __all__ = [
     "above_percent",
@@ -74,6 +75,8 @@
     "iqr_glu",
     "j_index",
     "lbgi",
+    "load_dexcom",
+    "load_libre",
     "mad_glu",
     "mag",
     "mage",
diff --git a/iglu_python/extension/load_data.py b/iglu_python/extension/load_data.py
@@ -0,0 +1,178 @@
+"""
+This module loads CGM time series from device-specific files.
+It is inspired by https://github.com/cafoala/diametrics/blob/main/src/diametrics/transform.py
+"""
+
+from pathlib import Path
+import pandas as pd
+
+
+def load_libre(file_path: str) -> pd.Series:
+    """
+    Load a Libre glucose time series from a device export file.
+
+    The third row of the export is used as the header row. Four column
+    layouts are recognised (Meter/Device timestamp, mmol/L or mg/dL); the
+    layout also selects the timestamp format. Readings in mmol/L are
+    converted to mg/dL.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the Libre device file.
+
+    Returns
+    -------
+    pd.Series
+        Historic glucose values in mg/dL, indexed by timestamp and sorted
+        by time. Rows with a missing time or non-numeric glucose are dropped.
+
+    Raises
+    ------
+    KeyError
+        If the columns of the selected layout are not all present.
+
+    Examples
+    --------
+    >>> load_libre("tests/data/libre_amer_01.csv")
+    """
+    raw = _open_file(file_path)
+
+    # The third row carries the real column names; data starts on the fourth.
+    raw.columns = raw.iloc[2]
+    data = raw.iloc[3:].reset_index(drop=True)
+
+    # (timestamp column, glucose column, scan column, time format, is mmol/L)
+    layouts = (
+        ('Meter Timestamp', 'Historic Glucose(mmol/L)', 'Scan Glucose(mmol/L)', '%d-%m-%Y %H:%M', True),
+        ('Meter Timestamp', 'Historic Glucose(mg/dL)', 'Scan Glucose(mg/dL)', '%m-%d-%Y %H:%M', False),
+        ('Device Timestamp', 'Historic Glucose mmol/L', 'Scan Glucose mmol/L', '%d-%m-%Y %I:%M %p', True),
+        ('Device Timestamp', 'Historic Glucose mg/dL', 'Scan Glucose mg/dL', '%m-%d-%Y %I:%M %p', False),
+    )
+    # Pick the first layout whose glucose column exists; fall back to the last.
+    time_col, glc_col, scan_col, time_format, convert = next(
+        (layout for layout in layouts if layout[1] in data.columns), layouts[-1]
+    )
+    # Explicit copy so the column assignments below work on an independent frame.
+    data = data.loc[:, [time_col, glc_col, scan_col]].copy()
+    data.columns = ['time', 'glc', 'scan_glc']
+
+    data['time'] = pd.to_datetime(data['time'], format=time_format)
+    # Header rows were read as data, so cells may be text; make them numeric
+    # before scaling (unparseable cells become NaN and are dropped below).
+    data['glc'] = pd.to_numeric(data['glc'], errors='coerce')
+    if convert:
+        data['glc'] = data['glc'] * 18.01559  # mmol/L -> mg/dL
+    data = data.dropna(subset=['time', 'glc']).sort_values('time').reset_index(drop=True)
+    return data.set_index('time')['glc']
+
+
+
+
+def load_dexcom(file_path: str) -> pd.Series:
+    """
+    Read a Dexcom export and return its glucose readings as a time series.
+
+    The first row of the file supplies the column names. The first column
+    whose name contains 'Timestamp' and the first whose name contains
+    'Glucose' are used; if the glucose column name mentions 'mmol/L' the
+    readings are multiplied by 18.01559 to express them in mg/dL.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the Dexcom device file.
+
+    Returns
+    -------
+    pd.Series
+        Series with datetime index and glucose values (in mg/dL), sorted by
+        time. Rows whose time or glucose cannot be parsed are dropped.
+
+    Raises
+    ------
+    ValueError
+        If no timestamp column or no glucose column is found.
+
+    Examples
+    --------
+    >>> load_dexcom("tests/data/dexcom_eur_01.xlsx")
+    """
+    table = _open_file(file_path)
+
+    # Promote the first row to column names and keep the remaining rows
+    table.columns = table.iloc[0]
+    table = table.iloc[1:]
+    table.reset_index(inplace=True, drop=True)
+
+    def first_column_containing(fragment):
+        # First column label whose text contains `fragment`, else None
+        return next((label for label in table.columns if fragment in str(label)), None)
+
+    time_label = first_column_containing('Timestamp')
+    if time_label is None:
+        raise ValueError("No timestamp column found in Dexcom data")
+
+    glucose_label = first_column_containing('Glucose')
+    if glucose_label is None:
+        raise ValueError("No glucose column found in Dexcom data")
+
+    # A unit of mmol/L in the header means values must be scaled to mg/dL
+    is_mmol = 'mmol/L' in str(glucose_label)
+
+    # Keep only the two columns of interest under fixed names
+    table = table.loc[:, [time_label, glucose_label]]
+    table.columns = ['time', 'glc']
+
+    # Unparseable timestamps / readings become NaT / NaN
+    table['time'] = pd.to_datetime(table['time'], errors='coerce')
+    table['glc'] = pd.to_numeric(table['glc'], errors='coerce')
+
+    if is_mmol:
+        table['glc'] = table['glc'] * 18.01559
+
+    # Discard incomplete rows, order chronologically, index by time
+    cleaned = table.dropna(subset=['time', 'glc']).sort_values('time')
+    cleaned = cleaned.reset_index(drop=True)
+    return cleaned.set_index('time')['glc']
+
+
+
+def _open_file(filepath: str) -> pd.DataFrame:
+    """
+    Read a CSV, Excel or tab-separated file into a headerless DataFrame.
+
+    No row is treated as a header: columns are labelled 0..19 and callers
+    pick the header row themselves.
+
+    Args:
+        filepath (str): The path to the file.
+
+    Returns:
+        pandas.DataFrame: The DataFrame containing the file data.
+
+    Raises:
+        FileNotFoundError: If ``filepath`` does not exist.
+        ValueError: If the extension is unsupported or the file cannot be read.
+    """
+    # TODO: handle S3 path
+    path = Path(filepath)
+    if not path.exists():
+        raise FileNotFoundError(f"File not found: {filepath}")
+
+    # Compare case-insensitively so exports named e.g. "DATA.CSV" are accepted
+    readers = {
+        '.csv': pd.read_csv,
+        '.xls': pd.read_excel,
+        '.xlsx': pd.read_excel,
+        '.txt': pd.read_table,
+        '.tsv': pd.read_table,
+    }
+    reader = readers.get(path.suffix.lower())
+    if reader is None:
+        # Raised outside the try so this message is not masked by the wrapper
+        raise ValueError(f"Unsupported file extension: {path.suffix}")
+    try:
+        return reader(filepath, header=None, names=list(range(20)))
+    except Exception as e:
+        raise ValueError(f"Error reading file: {filepath}") from e
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,7 +26,8 @@ classifiers = [
 dependencies = [
     "numpy>=2.2.6",
     "pandas>=2.2.3",
-    "tzlocal>=5.3.1"
+    "tzlocal>=5.3.1",
+    "openpyxl >= 3.1.5"
 ]
 
 [project.urls]
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
 pandas >= 2.2.3
 numpy >= 2.2.6
-scipy >= 1.15.0
-tzlocal >= 5.3.1
+tzlocal >= 5.3.1
+openpyxl >= 3.1.5
diff --git a/tests/test_load_data.py b/tests/test_load_data.py

Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,8 @@ classifiers = [`
`26`	`26`	`dependencies = [`
`27`	`27`	`"numpy>=2.2.6",`
`28`	`28`	`"pandas>=2.2.3",`
`29`		`- "tzlocal>=5.3.1"`
	`29`	`+ "tzlocal>=5.3.1",`
	`30`	`+ "openpyxl >= 3.1.5"`
`30`	`31`	`]`
`31`	`32`
`32`	`33`	`[project.urls]`