Commit 41cdc7f

ELI-545 | github workflow to refresh tableau data programmatically (#618)
* ELI-545 | Tableau automatic update
* ELI-545 | dummy commit
* Apply suggestion from @Copilot
* Apply suggestion from @Copilot
* Apply suggestion from @Copilot
* Apply suggestion from @Copilot
* ELI-545 | copilot suggestions
* ELI-545 | copilot suggestions
* ELI-545 | name fix

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 31ec71c commit 41cdc7f

3 files changed

Lines changed: 312 additions & 0 deletions


Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
name: "Daily Tableau Data Update"

on:
  schedule:
    # This runs at 10:00 AM UTC every day
    - cron: '0 10 * * *'
  workflow_dispatch: # manual run for testing

jobs:
  metadata:
    name: "Set CI/CD metadata"
    runs-on: ubuntu-latest
    timeout-minutes: 1
    outputs:
      build_datetime: ${{ steps.variables.outputs.build_datetime }}
      build_timestamp: ${{ steps.variables.outputs.build_timestamp }}
      build_epoch: ${{ steps.variables.outputs.build_epoch }}
      nodejs_version: ${{ steps.variables.outputs.nodejs_version }}
      python_version: ${{ steps.variables.outputs.python_version }}
      terraform_version: ${{ steps.variables.outputs.terraform_version }}
      version: ${{ steps.variables.outputs.version }}
    steps:
      - name: "Checkout code"
        uses: actions/checkout@v6
        with:
          ref: ${{ github.ref_name }}

      - name: "Set CI/CD variables"
        id: variables
        run: |
          datetime=$(date -u +'%Y-%m-%dT%H:%M:%S%z')
          echo "build_datetime=$datetime" >> $GITHUB_OUTPUT
          echo "build_timestamp=$(date --date=$datetime -u +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT
          echo "build_epoch=$(date --date=$datetime -u +'%s')" >> $GITHUB_OUTPUT
          echo "nodejs_version=$(grep '^nodejs' .tool-versions | cut -f2 -d' ')" >> $GITHUB_OUTPUT
          echo "python_version=$(grep '^python' .tool-versions | cut -f2 -d' ')" >> $GITHUB_OUTPUT
          echo "terraform_version=$(grep '^terraform' .tool-versions | cut -f2 -d' ')" >> $GITHUB_OUTPUT
          echo "version=dev-$(date +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT

      - name: "List variables"
        run: |
          echo "Deploying to: DEV"
          echo "VERSION=${{ steps.variables.outputs.version }}"

  publish:
    name: "Publish to dev"
    runs-on: ubuntu-latest
    needs: [metadata]
    timeout-minutes: 30
    environment: "dev"
    permissions:
      id-token: write
      contents: write
    steps:
      - name: "Set up Python"
        uses: actions/setup-python@v6
        with:
          python-version: '3.13'

      - name: "Checkout Repository"
        uses: actions/checkout@v6
        with:
          ref: ${{ github.ref_name }}

      - name: "Install Dependencies"
        run: |
          pip install boto3 pandas tableauserverclient tableauhyperapi requests

      - name: "Configure AWS Credentials (Main Deployment Role)"
        uses: aws-actions/configure-aws-credentials@v6
        with:
          role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/service-roles/github-actions-api-deployment-role
          aws-region: eu-west-2

      - name: "Run S3 to Hyper Script"
        run: python scripts/tableau/generate_tableau_data.py

      - name: "Publish to Tableau"
        env:
          TABLEAU_TOKEN_NAME: ${{ secrets.TABLEAU_TOKEN_NAME }}
          TABLEAU_TOKEN_VALUE: ${{ secrets.TABLEAU_TOKEN_VALUE }}
          TABLEAU_SITE_ID: ${{ vars.TABLEAU_SITE_ID }}
          TABLEAU_SERVER_URL: ${{ vars.TABLEAU_SERVER_URL }}
          TABLEAU_DATASOURCE_ID: ${{ secrets.TABLEAU_DATASOURCE_ID }}
        run: python scripts/tableau/tableau_refresh.py
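
Because the workflow also declares `workflow_dispatch`, a run can be kicked off on demand rather than waiting for the 10:00 UTC cron. Below is a minimal sketch that calls GitHub's workflow-dispatch REST endpoint from Python; the repository slug and workflow file name are assumptions (the diff does not show the workflow's path), and GITHUB_TOKEN must hold a token authorised for the Actions API.

import os

import requests

REPO = "NHSDigital/eligibility-signposting-api"   # assumed repository slug
WORKFLOW_FILE = "daily-tableau-data-update.yaml"  # assumed workflow file name

response = requests.post(
    f"https://api.github.com/repos/{REPO}/actions/workflows/{WORKFLOW_FILE}/dispatches",
    headers={
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
    },
    json={"ref": "main"},  # branch the workflow should run against
    timeout=30,
)
response.raise_for_status()  # GitHub answers 204 No Content on success
print("workflow_dispatch accepted")

The same trigger is available interactively from the GitHub CLI via gh workflow run.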
scripts/tableau/generate_tableau_data.py

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
import boto3
import json
import pandas as pd
from datetime import datetime, timedelta

# --- Configuration ---
S3_BUCKET = "eligibility-signposting-api-dev-dq-metrics"
LOCAL_HYPER_PATH = "converted.hyper"
LOOKBACK_MONTHS = 3


def get_filtered_s3_data():
    s3 = boto3.client('s3')
    all_data = []

    # Calculate the date threshold (3 months ago) and corresponding date range
    start_date = datetime.now() - timedelta(days=LOOKBACK_MONTHS * 30)
    end_date = datetime.now()
    threshold_date = start_date.strftime('%Y%m%d')
    print(f"Filtering for data where processing_date >= {threshold_date}...")

    # List objects in the bucket, narrowed by expected processing_date=YYYYMMDD/ prefixes
    paginator = s3.get_paginator('list_objects_v2')
    current_date = start_date
    while current_date <= end_date:
        date_prefix = current_date.strftime('%Y%m%d')
        s3_prefix = f"processing_date={date_prefix}/"

        for page in paginator.paginate(Bucket=S3_BUCKET, Prefix=s3_prefix):
            for obj in page.get('Contents', []):
                key = obj['Key']

                # Parse partition from Key (e.g. processing_date=20260303/datamart=cohorts/file.json)
                # We check if the key contains our date pattern
                try:
                    if 'processing_date=' in key and key.endswith('.json'):
                        date_part = key.split('processing_date=')[1].split('/')[0]

                        # Keep the original threshold check as a safeguard
                        if date_part >= threshold_date:
                            # Read JSON file
                            response = s3.get_object(Bucket=S3_BUCKET, Key=key)
                            line = response['Body'].read().decode('utf-8')

                            # Handle multiple JSON objects in one file (JSONL) if necessary
                            for json_line in line.strip().split('\n'):
                                if json_line:
                                    all_data.append(json.loads(json_line))
                except Exception as e:
                    print(f"Skipping key {key} due to error: {e}")

        current_date += timedelta(days=1)
    return pd.DataFrame(all_data)


def create_hyper_from_df(df, hyper_path):
    from tableauhyperapi import (
        Connection, CreateMode, HyperProcess, Inserter,
        TableDefinition, Telemetry, SqlType, TableName, Date, Timestamp
    )

    # Define the schema and table names clearly
    SCHEMA_NAME = "Extract"
    TABLE_NAME = "Extract"

    with HyperProcess(Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
        with Connection(endpoint=hyper.endpoint,
                        database=hyper_path,
                        create_mode=CreateMode.CREATE_AND_REPLACE) as connection:
            # Align TableName with the schema we are about to create
            target_table = TableName(SCHEMA_NAME, TABLE_NAME)

            schema_def = TableDefinition(target_table, [
                TableDefinition.Column("timestamp", SqlType.timestamp()),
                TableDefinition.Column("datamart", SqlType.text()),
                TableDefinition.Column("attribute_type", SqlType.text()),
                TableDefinition.Column("attribute", SqlType.text()),
                TableDefinition.Column("dimension", SqlType.text()),
                TableDefinition.Column("success_count", SqlType.big_int()),
                TableDefinition.Column("total_rows", SqlType.big_int()),
                TableDefinition.Column("success_percent", SqlType.double()),
                TableDefinition.Column("processing_date", SqlType.date()),
            ])

            # Create the schema first, then the table within that schema
            connection.catalog.create_schema(SCHEMA_NAME)
            connection.catalog.create_table(schema_def)

            with Inserter(connection, schema_def) as inserter:
                for _, row in df.iterrows():
                    ts = datetime.strptime(str(row['timestamp']), "%Y-%m-%d %H:%M:%S")
                    pd_str = str(row['processing_date'])
                    pd_dt = datetime.strptime(pd_str, "%Y%m%d")

                    inserter.add_row([
                        Timestamp(ts.year, ts.month, ts.day, ts.hour, ts.minute, ts.second),
                        row.get('datamart'),
                        row.get('attribute_type'),
                        row.get('attribute', ''),
                        row.get('dimension'),
                        int(row.get('success_count', 0)),
                        int(row.get('total_rows', 0)),
                        float(row.get('success_percent', 0.0)),
                        Date(pd_dt.year, pd_dt.month, pd_dt.day)
                    ])
                inserter.execute()
    print(f"Hyper file created at {hyper_path} with schema '{SCHEMA_NAME}'")


# --- Main Execution ---
df = get_filtered_s3_data()
if not df.empty:
    create_hyper_from_df(df, LOCAL_HYPER_PATH)
else:
    print("No data found for the selected date range.")

scripts/tableau/tableau_refresh.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
import argparse
import os
import sys

import requests
import tableauserverclient as TSC

SUPPORTED_EXTENSIONS = {".tds", ".tdsx", ".tde", ".hyper", ".parquet"}


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Publish a Tableau datasource to Tableau Server."
    )
    # Added a flag to control whether cache refresh failure should crash the script
    parser.add_argument(
        "--ignore-refresh-failure",
        action="store_true",
        help="Do not exit with error if the cache refresh ping fails."
    )
    return parser.parse_args()


def validate_file_type(file_path: str) -> None:
    file_extension = os.path.splitext(file_path)[1].lower()
    if file_extension not in SUPPORTED_EXTENSIONS:
        raise ValueError(
            f"Unsupported datasource file type '{file_extension}'. "
            f"Supported types: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
        )


def main() -> None:
    args = parse_args()
    new_data_file = "./converted.hyper"

    if not os.path.isfile(new_data_file):
        raise FileNotFoundError(f"Datasource file not found: {new_data_file}")

    validate_file_type(new_data_file)

    # --- Environment Variable Validation ---
    tableau_token_name = os.getenv("TABLEAU_TOKEN_NAME")
    tableau_token_value = os.getenv("TABLEAU_TOKEN_VALUE")
    tableau_server_url = os.getenv("TABLEAU_SERVER_URL")
    datasource_id = os.getenv("TABLEAU_DATASOURCE_ID")
    tableau_site = os.getenv("TABLEAU_SITE_ID", "NHSD_DEV")

    missing_vars = []
    if not tableau_token_name:
        missing_vars.append("TABLEAU_TOKEN_NAME")
    if not tableau_token_value:
        missing_vars.append("TABLEAU_TOKEN_VALUE")
    if not tableau_server_url:
        missing_vars.append("TABLEAU_SERVER_URL")
    if not datasource_id:
        missing_vars.append("TABLEAU_DATASOURCE_ID")

    if missing_vars:
        raise EnvironmentError(
            f"Missing required environment variables: {', '.join(missing_vars)}. "
            "Please ensure your environment is configured correctly."
        )

    # --- Tableau Operations ---
    tableau_auth = TSC.PersonalAccessTokenAuth(
        tableau_token_name,
        tableau_token_value,
        tableau_site,
    )

    server = TSC.Server(tableau_server_url, use_server_version=True)

    with server.auth.sign_in(tableau_auth):
        print(f"Signing into {tableau_server_url} (Site: {tableau_site})...")
        target_item = server.datasources.get_by_id(datasource_id)

        # Publish (overwrite)
        server.datasources.publish(
            target_item,
            new_data_file,
            mode=TSC.Server.PublishMode.Overwrite,
        )
        print(f"Successfully overwrote datasource ID: {datasource_id}")

        # --- Cache Refresh Ping ---
        workbook_name = "EligibilityData-DQMetrics"
        view_name = "DataQualityMetrics"
        base_url = tableau_server_url.rstrip('/')
        ping_url = f"{base_url}/views/{workbook_name}/{view_name}?:refresh=y"

        print(f"Pinging Tableau Server for cache refresh: {view_name}...")

        headers = {"X-Tableau-Auth": server.auth_token}

        try:
            response = requests.get(ping_url, headers=headers, timeout=30)
            response.raise_for_status()  # Raises HTTPError for 4xx/5xx responses
            print("Cache refresh triggered successfully.")

        except Exception as e:
            error_msg = f"CRITICAL: Cache refresh failed: {e}"
            if args.ignore_refresh_failure:
                print(f"WARNING: {error_msg} (Continuing due to --ignore-refresh-failure)")
            else:
                print(error_msg)
                sys.exit(1)  # Exit with error code for CI visibility

    print("-" * 30)
    print("FINISHED: Data overwritten and refresh processed.")


if __name__ == "__main__":
    main()
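
The ?:refresh=y ping only forces the named view to bypass its cache. Where the datasource supports server-side extract refreshes, tableauserverclient exposes that directly; the sketch below is an alternative to the ping, not what the merged script does, and it assumes the same environment variables as tableau_refresh.py.

import os

import tableauserverclient as TSC

auth = TSC.PersonalAccessTokenAuth(
    os.environ["TABLEAU_TOKEN_NAME"],
    os.environ["TABLEAU_TOKEN_VALUE"],
    os.getenv("TABLEAU_SITE_ID", "NHSD_DEV"),
)
server = TSC.Server(os.environ["TABLEAU_SERVER_URL"], use_server_version=True)

with server.auth.sign_in(auth):
    datasource = server.datasources.get_by_id(os.environ["TABLEAU_DATASOURCE_ID"])
    job = server.datasources.refresh(datasource)  # queues an asynchronous extract refresh
    print(f"Extract refresh queued as job {job.id}")
    server.jobs.wait_for_job(job)  # blocks until the job finishes; raises on failure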
