import boto3
import json
import pandas as pd
from datetime import datetime, timedelta

# --- Configuration ---
S3_BUCKET = "eligibility-signposting-api-dev-dq-metrics"
LOCAL_HYPER_PATH = "converted.hyper"
LOOKBACK_MONTHS = 3
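# NOTE: boto3 resolves AWS credentials via its default chain (environment
# variables, shared credentials/config files, or an attached IAM role);
# nothing is hard-coded here.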


def get_filtered_s3_data():
    s3 = boto3.client('s3')
    all_data = []

    # Calculate the date threshold and corresponding date range
    # (months are approximated as 30 days for the lookback window)
    start_date = datetime.now() - timedelta(days=LOOKBACK_MONTHS * 30)
    end_date = datetime.now()
    threshold_date = start_date.strftime('%Y%m%d')
    print(f"Filtering for data where processing_date >= {threshold_date}...")

    # List objects in the bucket, narrowed by the expected
    # processing_date=YYYYMMDD/ partition prefixes
    paginator = s3.get_paginator('list_objects_v2')
    current_date = start_date
    while current_date <= end_date:
        date_prefix = current_date.strftime('%Y%m%d')
        s3_prefix = f"processing_date={date_prefix}/"

        for page in paginator.paginate(Bucket=S3_BUCKET, Prefix=s3_prefix):
            for obj in page.get('Contents', []):
                key = obj['Key']

                # Parse the partition value from the key
                # (e.g. processing_date=20260303/datamart=cohorts/file.json)
                try:
                    if 'processing_date=' in key and key.endswith('.json'):
                        date_part = key.split('processing_date=')[1].split('/')[0]

                        # Keep the threshold check as a safeguard; YYYYMMDD
                        # strings compare correctly lexicographically
                        if date_part >= threshold_date:
                            # Read the JSON file from S3
                            response = s3.get_object(Bucket=S3_BUCKET, Key=key)
                            body = response['Body'].read().decode('utf-8')

                            # Handle multiple JSON objects per file (JSONL)
                            for json_line in body.strip().splitlines():
                                if json_line:
                                    all_data.append(json.loads(json_line))
                except Exception as e:
                    print(f"Skipping key {key} due to error: {e}")

        current_date += timedelta(days=1)
    return pd.DataFrame(all_data)

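# Each metrics record is assumed to look roughly like this (illustrative
# values only; the real contents come from the DQ metrics JSON in S3):
#
#   {"timestamp": "2026-03-03 12:00:00", "datamart": "cohorts",
#    "attribute_type": "column", "attribute": "nhs_number",
#    "dimension": "completeness", "success_count": 980,
#    "total_rows": 1000, "success_percent": 98.0,
#    "processing_date": "20260303"}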

def create_hyper_from_df(df, hyper_path):
    from tableauhyperapi import (
        Connection, CreateMode, HyperProcess, Inserter,
        TableDefinition, Telemetry, SqlType, TableName, SchemaName,
        Date, Timestamp
    )

    # Define the schema and table names clearly
    SCHEMA_NAME = "Extract"
    TABLE_NAME = "Extract"

    with HyperProcess(Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
        with Connection(endpoint=hyper.endpoint,
                        database=hyper_path,
                        create_mode=CreateMode.CREATE_AND_REPLACE) as connection:
            # Align TableName with the schema we are about to create
            target_table = TableName(SCHEMA_NAME, TABLE_NAME)

            schema_def = TableDefinition(target_table, [
                TableDefinition.Column("timestamp", SqlType.timestamp()),
                TableDefinition.Column("datamart", SqlType.text()),
                TableDefinition.Column("attribute_type", SqlType.text()),
                TableDefinition.Column("attribute", SqlType.text()),
                TableDefinition.Column("dimension", SqlType.text()),
                TableDefinition.Column("success_count", SqlType.big_int()),
                TableDefinition.Column("total_rows", SqlType.big_int()),
                TableDefinition.Column("success_percent", SqlType.double()),
                TableDefinition.Column("processing_date", SqlType.date()),
            ])

            # Create the schema first, then the table within that schema
            connection.catalog.create_schema(SchemaName(SCHEMA_NAME))
            connection.catalog.create_table(schema_def)

            with Inserter(connection, schema_def) as inserter:
                for _, row in df.iterrows():
                    # pd.to_datetime copes with second- and sub-second
                    # precision timestamp strings alike
                    ts = pd.to_datetime(row['timestamp']).to_pydatetime()
                    pd_str = str(row['processing_date'])
                    pd_dt = datetime.strptime(pd_str, "%Y%m%d")

                    inserter.add_row([
                        Timestamp(ts.year, ts.month, ts.day,
                                  ts.hour, ts.minute, ts.second),
                        row.get('datamart'),
                        row.get('attribute_type'),
                        row.get('attribute', ''),
                        row.get('dimension'),
                        int(row.get('success_count', 0)),
                        int(row.get('total_rows', 0)),
                        float(row.get('success_percent', 0.0)),
                        Date(pd_dt.year, pd_dt.month, pd_dt.day)
                    ])
                inserter.execute()
    print(f"Hyper file created at {hyper_path} with schema '{SCHEMA_NAME}'")

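# A shorter alternative sketch, assuming the third-party pantab package is
# installed and its default column-type inference is acceptable:
#
#   import pantab
#   pantab.frame_to_hyper(df, LOCAL_HYPER_PATH, table="Extract")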


# --- Main Execution ---
if __name__ == "__main__":
    df = get_filtered_s3_data()
    if not df.empty:
        create_hyper_from_df(df, LOCAL_HYPER_PATH)
    else:
        print("No data found for the selected date range.")