Skip to content

Commit 7d50ebe

Browse files
committed
feat: update retry mechanism for nightlies
1 parent 583195d commit 7d50ebe

1 file changed

Lines changed: 116 additions & 84 deletions

File tree

.github/workflows/NightlyDispatcher.yml

Lines changed: 116 additions & 84 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,9 @@
11
name: Nightly Dispatcher
22

3+
permissions:
4+
actions: write
5+
contents: read
6+
37
on:
48
schedule:
59
# Run main branch at 10:00 PM (22:00 UTC)
@@ -12,7 +16,7 @@ jobs:
1216
if: github.event.schedule == '0 22 * * *'
1317
runs-on: ubuntu-latest
1418
outputs:
15-
run-id: ${{ steps.trigger.outputs.run-id }}
19+
run-id: ${{ steps.trigger.outputs.run_id }}
1620
steps:
1721
- name: "Trigger Native Pipeline on main"
1822
id: trigger
@@ -30,52 +34,62 @@ jobs:
3034
}
3135
});
3236
37+
console.log('Workflow dispatch triggered successfully');
38+
3339
// Wait longer and find the correct run
3440
let runId = null;
3541
let attempts = 0;
36-
const maxAttempts = 12; // 2 minutes total
42+
const maxAttempts = 18; // Increase to 3 minutes
3743
3844
while (!runId && attempts < maxAttempts) {
3945
await new Promise(resolve => setTimeout(resolve, 10000)); // Wait 10 seconds
4046
attempts++;
47+
console.log(`Attempt ${attempts}/${maxAttempts} to find triggered run...`);
4148
42-
const runs = await github.rest.actions.listWorkflowRuns({
43-
owner: context.repo.owner,
44-
repo: context.repo.repo,
45-
workflow_id: 'NativePipeline.yml',
46-
branch: 'main',
47-
per_page: 5
48-
});
49-
50-
// Find run created in the last 5 minutes
51-
const fiveMinutesAgo = new Date();
52-
fiveMinutesAgo.setMinutes(fiveMinutesAgo.getMinutes() - 5);
53-
54-
const recentRun = runs.data.workflow_runs.find(run => {
55-
const runDate = new Date(run.created_at);
56-
return runDate > fiveMinutesAgo && run.event === 'workflow_dispatch';
57-
});
58-
59-
if (recentRun) {
60-
runId = recentRun.id;
61-
console.log(`Found triggered run ID: ${runId}`);
62-
break;
49+
try {
50+
const runs = await github.rest.actions.listWorkflowRuns({
51+
owner: context.repo.owner,
52+
repo: context.repo.repo,
53+
workflow_id: 'NativePipeline.yml',
54+
branch: 'main',
55+
per_page: 10
56+
});
57+
58+
// Find run created in the last 10 minutes
59+
const tenMinutesAgo = new Date();
60+
tenMinutesAgo.setMinutes(tenMinutesAgo.getMinutes() - 10);
61+
62+
const recentRun = runs.data.workflow_runs.find(run => {
63+
const runDate = new Date(run.created_at);
64+
return runDate > tenMinutesAgo && run.event === 'workflow_dispatch';
65+
});
66+
67+
if (recentRun) {
68+
runId = recentRun.id;
69+
console.log(`Found triggered run ID: ${runId}`);
70+
break;
71+
}
72+
} catch (error) {
73+
console.log(`Error finding run: ${error.message}`);
6374
}
6475
}
6576
6677
if (!runId) {
67-
core.setFailed('Could not find the triggered workflow run');
78+
console.log('Could not find the triggered workflow run - will continue without monitoring');
79+
core.setOutput('dispatch_success', 'false');
80+
core.setOutput('run_id', '');
6881
return;
6982
}
7083
71-
core.setOutput('run-id', runId);
84+
core.setOutput('dispatch_success', 'true');
85+
core.setOutput('run_id', runId);
7286
return runId;
7387
7488
dispatch-version-mx-10:
7589
if: github.event.schedule == '0 4 * * *'
7690
runs-on: ubuntu-latest
7791
outputs:
78-
run-id: ${{ steps.trigger.outputs.run-id }}
92+
run-id: ${{ steps.trigger.outputs.run_id }}
7993
steps:
8094
- name: "Trigger Native Pipeline on version/mx/10"
8195
id: trigger
@@ -93,50 +107,60 @@ jobs:
93107
}
94108
});
95109
110+
console.log('Workflow dispatch triggered successfully');
111+
96112
// Wait longer and find the correct run
97113
let runId = null;
98114
let attempts = 0;
99-
const maxAttempts = 12; // 2 minutes total
115+
const maxAttempts = 18; // Increase to 3 minutes
100116
101117
while (!runId && attempts < maxAttempts) {
102118
await new Promise(resolve => setTimeout(resolve, 10000)); // Wait 10 seconds
103119
attempts++;
120+
console.log(`Attempt ${attempts}/${maxAttempts} to find triggered run...`);
104121
105-
const runs = await github.rest.actions.listWorkflowRuns({
106-
owner: context.repo.owner,
107-
repo: context.repo.repo,
108-
workflow_id: 'NativePipeline.yml',
109-
branch: 'version/mx/10',
110-
per_page: 5
111-
});
112-
113-
// Find run created in the last 5 minutes
114-
const fiveMinutesAgo = new Date();
115-
fiveMinutesAgo.setMinutes(fiveMinutesAgo.getMinutes() - 5);
116-
117-
const recentRun = runs.data.workflow_runs.find(run => {
118-
const runDate = new Date(run.created_at);
119-
return runDate > fiveMinutesAgo && run.event === 'workflow_dispatch';
120-
});
121-
122-
if (recentRun) {
123-
runId = recentRun.id;
124-
console.log(`Found triggered run ID: ${runId}`);
125-
break;
122+
try {
123+
const runs = await github.rest.actions.listWorkflowRuns({
124+
owner: context.repo.owner,
125+
repo: context.repo.repo,
126+
workflow_id: 'NativePipeline.yml',
127+
branch: 'version/mx/10',
128+
per_page: 10
129+
});
130+
131+
// Find run created in the last 10 minutes
132+
const tenMinutesAgo = new Date();
133+
tenMinutesAgo.setMinutes(tenMinutesAgo.getMinutes() - 10);
134+
135+
const recentRun = runs.data.workflow_runs.find(run => {
136+
const runDate = new Date(run.created_at);
137+
return runDate > tenMinutesAgo && run.event === 'workflow_dispatch';
138+
});
139+
140+
if (recentRun) {
141+
runId = recentRun.id;
142+
console.log(`Found triggered run ID: ${runId}`);
143+
break;
144+
}
145+
} catch (error) {
146+
console.log(`Error finding run: ${error.message}`);
126147
}
127148
}
128149
129150
if (!runId) {
130-
core.setFailed('Could not find the triggered workflow run');
151+
console.log('Could not find the triggered workflow run - will continue without monitoring');
152+
core.setOutput('dispatch_success', 'false');
153+
core.setOutput('run_id', '');
131154
return;
132155
}
133156
134-
core.setOutput('run-id', runId);
157+
core.setOutput('dispatch_success', 'true');
158+
core.setOutput('run_id', runId);
135159
return runId;
136160
137161
auto-retry-main:
138162
needs: dispatch-main
139-
if: always() && needs.dispatch-main.result == 'success' && github.event.schedule == '0 22 * * *'
163+
if: always() && needs.dispatch-main.outputs.run-id != '' && github.event.schedule == '0 22 * * *'
140164
runs-on: ubuntu-latest
141165
env:
142166
RETRY_COUNT: 0 # Track retry attempts
@@ -148,12 +172,12 @@ jobs:
148172
const runId = '${{ needs.dispatch-main.outputs.run-id }}';
149173
const MAX_RETRIES = 1; // Only retry once
150174
151-
if (!runId || runId === 'null') {
152-
core.setFailed('No run ID available from dispatch job');
175+
if (!runId || runId === 'null' || runId === '') {
176+
console.log('No run ID available from dispatch job - skipping monitoring');
153177
return;
154178
}
155179
156-
console.log(`Monitoring run ID: ${runId}`);
180+
console.log(`Starting monitoring for run ID: ${runId}`);
157181
158182
// Poll for completion with timeout
159183
let run;
@@ -175,28 +199,32 @@ jobs:
175199
repo: context.repo.repo,
176200
run_id: runId
177201
});
178-
console.log(`Poll #${pollAttempts}: Run status: ${run.data.status}, conclusion: ${run.data.conclusion || 'N/A'}`);
202+
console.log(`Poll #${pollAttempts}: Run status: ${run.data.status}, conclusion: ${run.data.conclusion || 'N/A'}, attempt: ${run.data.run_attempt}`);
179203
} catch (error) {
180204
console.log(`Error getting run status: ${error.message}`);
181205
continue;
182206
}
183207
} while (run.data.status === 'in_progress' || run.data.status === 'queued');
184208
185-
// Check if we should retry (only once)
209+
// Check if we should retry
186210
if (run.data.conclusion === 'failure') {
187-
console.log('Pipeline failed. Checking retry count...');
211+
console.log(`Pipeline failed on attempt ${run.data.run_attempt}. Checking if retry needed...`);
188212
189-
// Use GitHub's run attempt number to track retries
190-
if (run.data.run_attempt <= MAX_RETRIES) {
191-
console.log(`Triggering retry (attempt ${run.data.run_attempt}/${MAX_RETRIES})...`);
192-
await github.rest.actions.reRunWorkflowFailedJobs({
193-
owner: context.repo.owner,
194-
repo: context.repo.repo,
195-
run_id: runId
196-
});
197-
console.log('Retry triggered for failed jobs only.');
213+
// Only retry if this is the first attempt
214+
if (run.data.run_attempt === 1) {
215+
console.log('Triggering retry for failed jobs...');
216+
try {
217+
await github.rest.actions.reRunWorkflowFailedJobs({
218+
owner: context.repo.owner,
219+
repo: context.repo.repo,
220+
run_id: runId
221+
});
222+
console.log('Retry triggered successfully for failed jobs only.');
223+
} catch (error) {
224+
console.log(`Failed to trigger retry: ${error.message}`);
225+
}
198226
} else {
199-
console.log(`Maximum retries (${MAX_RETRIES}) reached. Not retrying again.`);
227+
console.log('This was already a retry attempt. Not retrying again.');
200228
}
201229
} else if (run.data.conclusion === 'success') {
202230
console.log('Pipeline completed successfully!');
@@ -206,7 +234,7 @@ jobs:
206234
207235
auto-retry-version-mx-10:
208236
needs: dispatch-version-mx-10
209-
if: always() && needs.dispatch-version-mx-10.result == 'success' && github.event.schedule == '0 4 * * *'
237+
if: always() && needs.dispatch-version-mx-10.outputs.run-id != '' && github.event.schedule == '0 4 * * *'
210238
runs-on: ubuntu-latest
211239
env:
212240
RETRY_COUNT: 0 # Track retry attempts
@@ -218,12 +246,12 @@ jobs:
218246
const runId = '${{ needs.dispatch-version-mx-10.outputs.run-id }}';
219247
const MAX_RETRIES = 1; // Only retry once
220248
221-
if (!runId || runId === 'null') {
222-
core.setFailed('No run ID available from dispatch job');
249+
if (!runId || runId === 'null' || runId === '') {
250+
console.log('No run ID available from dispatch job - skipping monitoring');
223251
return;
224252
}
225253
226-
console.log(`Monitoring run ID: ${runId}`);
254+
console.log(`Starting monitoring for run ID: ${runId}`);
227255
228256
// Poll for completion with timeout
229257
let run;
@@ -245,28 +273,32 @@ jobs:
245273
repo: context.repo.repo,
246274
run_id: runId
247275
});
248-
console.log(`Poll #${pollAttempts}: Run status: ${run.data.status}, conclusion: ${run.data.conclusion || 'N/A'}`);
276+
console.log(`Poll #${pollAttempts}: Run status: ${run.data.status}, conclusion: ${run.data.conclusion || 'N/A'}, attempt: ${run.data.run_attempt}`);
249277
} catch (error) {
250278
console.log(`Error getting run status: ${error.message}`);
251279
continue;
252280
}
253281
} while (run.data.status === 'in_progress' || run.data.status === 'queued');
254282
255-
// Check if we should retry (only once)
283+
// Check if we should retry
256284
if (run.data.conclusion === 'failure') {
257-
console.log('Pipeline failed. Checking retry count...');
285+
console.log(`Pipeline failed on attempt ${run.data.run_attempt}. Checking if retry needed...`);
258286
259-
// Use GitHub's run attempt number to track retries
260-
if (run.data.run_attempt <= MAX_RETRIES) {
261-
console.log(`Triggering retry (attempt ${run.data.run_attempt}/${MAX_RETRIES})...`);
262-
await github.rest.actions.reRunWorkflowFailedJobs({
263-
owner: context.repo.owner,
264-
repo: context.repo.repo,
265-
run_id: runId
266-
});
267-
console.log('Retry triggered for failed jobs only.');
287+
// Only retry if this is the first attempt
288+
if (run.data.run_attempt === 1) {
289+
console.log('Triggering retry for failed jobs...');
290+
try {
291+
await github.rest.actions.reRunWorkflowFailedJobs({
292+
owner: context.repo.owner,
293+
repo: context.repo.repo,
294+
run_id: runId
295+
});
296+
console.log('Retry triggered successfully for failed jobs only.');
297+
} catch (error) {
298+
console.log(`Failed to trigger retry: ${error.message}`);
299+
}
268300
} else {
269-
console.log(`Maximum retries (${MAX_RETRIES}) reached. Not retrying again.`);
301+
console.log('This was already a retry attempt. Not retrying again.');
270302
}
271303
} else if (run.data.conclusion === 'success') {
272304
console.log('Pipeline completed successfully!');

0 commit comments

Comments
 (0)