Skip to content

Commit 7b7bcd6

Browse files
authored
fix: invert watchdog op order — mutation before comment strip (#53)
* [worktree:scai-watchdog-fix] test: add pipeline_watchdog tests — mutation-first, failure suppression, drift-repair * fix: invert watchdog op order — mutation before comment strip Fixes the bug where the pipeline-watchdog posted the strip-announcement comment before attempting the label mutation. When the mutation failed silently the comment was the only durable trace, leaving status:done on the issue. Three changes: 1. Mutation-first: gh issue edit --remove-label runs before gh issue comment; comment is only posted when mutation exits 0. 2. Failure logging: non-zero mutation exit writes a structured JSON entry to the orchestrator log (repo, issue, exit_code, stderr). 3. Drift-repair pass: scans a list of (repo, issue) pairs; for each issue that still has the strip-comment AND status:done, re-attempts removal. Closes #46 * chore: fix import cleanup — add stat, remove unused pytest import --------- Co-authored-by: Original Gary <276612211+OpenGaryBot@users.noreply.github.com>
1 parent 97e162f commit 7b7bcd6

2 files changed

Lines changed: 572 additions & 0 deletions

File tree

scripts/pipeline_watchdog.py

Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
#!/usr/bin/env python3
2+
"""Pipeline watchdog — strips status:done labels from GitHub issues applied
3+
without commit or PR evidence.
4+
5+
Operation ordering (the fix):
6+
1. Attempt label mutation first: ``gh issue edit --remove-label status:done``
7+
2. Only post the strip-announcement comment if mutation exits 0.
8+
3. On non-zero exit, write a structured JSON entry to the orchestrator log.
9+
10+
The previous (buggy) ordering posted the comment first, then attempted the
11+
mutation. When the mutation failed silently the comment became the only
12+
durable trace, leaving the label in place.
13+
14+
Drift-repair pass:
15+
Re-scans a list of known (repo, issue) pairs. For each issue that still
16+
carries the strip-comment AND the status:done label, re-attempts label
17+
removal. This heals issues left in the broken state by the original bug.
18+
19+
Usage (programmatic):
20+
from pipeline_watchdog import strip_status_done, drift_repair_pass
21+
22+
strip_status_done(
23+
repo="Open-Paws/context",
24+
issue_number=82,
25+
log_path=Path("pipeline/watchdog.log"),
26+
)
27+
28+
drift_repair_pass(
29+
repos_and_issues=[("Open-Paws/context", 82)],
30+
log_path=Path("pipeline/watchdog.log"),
31+
)
32+
33+
Usage (CLI):
34+
python scripts/pipeline_watchdog.py strip --repo Open-Paws/context \\
35+
--issue 82 --log pipeline/watchdog.log
36+
37+
python scripts/pipeline_watchdog.py drift-repair \\
38+
--log pipeline/watchdog.log \\
39+
Open-Paws/context:82 Open-Paws/platform:87
40+
"""
41+
42+
from __future__ import annotations
43+
44+
import argparse
45+
import json
46+
import subprocess
47+
import sys
48+
from datetime import datetime, timezone
49+
from pathlib import Path
50+
from typing import Sequence
51+
52+
# Leading text of every strip-announcement comment the watchdog posts.
# The drift-repair pass searches issue comments for this marker to spot
# issues that were announced as stripped but still carry the label.
STRIP_COMMENT_MARKER = "[pipeline-watchdog] Stripped `status:done`"

# Executable used for GitHub CLI calls unless a caller overrides it.
_DEFAULT_GH = "gh"
57+
58+
59+
# ---------------------------------------------------------------------------
60+
# Core operations
61+
# ---------------------------------------------------------------------------
62+
63+
64+
def strip_status_done(
    repo: str,
    issue_number: int,
    log_path: Path,
    gh_path: str = _DEFAULT_GH,
) -> bool:
    """Remove the ``status:done`` label, then announce the removal.

    The label mutation runs first; the announcement comment is posted only
    after ``gh issue edit`` exits 0, so a comment never claims a strip that
    did not happen.

    Args:
        repo: Repository in ``owner/repo`` form, passed to ``gh --repo``.
        issue_number: Number of the issue to edit.
        log_path: File that receives one JSON line per failure.
        gh_path: Name or path of the ``gh`` executable.

    Returns:
        True when the label mutation exited 0. A failure to post the
        follow-up comment is logged but does not change the result, because
        the label has already been removed at that point.
        False when the mutation failed (comment suppressed, failure logged).
    """
    # Step 1 — mutate label FIRST
    mutation = subprocess.run(
        [gh_path, "issue", "edit", str(issue_number),
         "--repo", repo,
         "--remove-label", "status:done"],
        capture_output=True,
        text=True,
    )

    if mutation.returncode != 0:
        # Mutation failed — log the failure, suppress the comment
        _log_failure(
            log_path=log_path,
            repo=repo,
            issue=issue_number,
            exit_code=mutation.returncode,
            stderr=mutation.stderr.strip(),
        )
        return False

    # Step 2 — mutation succeeded, now post the announcement
    comment = subprocess.run(
        [gh_path, "issue", "comment", str(issue_number),
         "--repo", repo,
         "--body",
         f"{STRIP_COMMENT_MARKER} — applied without commit/PR evidence "
         "(no closing PR, no `Closes #N` reference on default branch). "
         "Re-walking through triage/plan/impl on next /run."],
        capture_output=True,
        text=True,
    )

    if comment.returncode != 0:
        # The label is already gone, so the strip itself succeeded; record
        # the missing announcement instead of discarding the exit status.
        _log_comment_failure(
            log_path=log_path,
            repo=repo,
            issue=issue_number,
            exit_code=comment.returncode,
            stderr=comment.stderr.strip(),
        )
    return True


def _log_comment_failure(
    log_path: Path,
    repo: str,
    issue: int,
    exit_code: int,
    stderr: str,
) -> None:
    """Append a JSON line recording that the announcement comment failed."""
    entry = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        # Distinct event name so log readers can tell this apart from a
        # failed label mutation.
        "event": "watchdog_comment_post_failed",
        "repo": repo,
        "issue": issue,
        "exit_code": exit_code,
        "stderr": stderr,
    }
    log_path.parent.mkdir(parents=True, exist_ok=True)
    with log_path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(entry) + "\n")
107+
108+
109+
def drift_repair_pass(
    repos_and_issues: Sequence[tuple[str, int]],
    log_path: Path,
    gh_path: str = _DEFAULT_GH,
) -> int:
    """Retry the label strip on every listed issue that is still stale.

    An issue is retried when ``_is_stale`` reports it: the strip-comment is
    present and ``status:done`` is still attached. Each retry goes through
    ``strip_status_done``; its outcome is not inspected here.

    Returns the number of issues for which a retry was attempted.
    """
    attempts = 0
    for repo, issue_number in repos_and_issues:
        if not _is_stale(repo, issue_number, gh_path):
            continue
        strip_status_done(
            repo=repo,
            issue_number=issue_number,
            log_path=log_path,
            gh_path=gh_path,
        )
        attempts += 1
    return attempts
132+
133+
134+
# ---------------------------------------------------------------------------
135+
# Helpers
136+
# ---------------------------------------------------------------------------
137+
138+
139+
def _is_stale(repo: str, issue_number: int, gh_path: str) -> bool:
    """Return True if the issue has the strip-comment AND status:done label.

    Runs ``gh issue view --json labels,comments`` and inspects the result.
    Anything that prevents a definite answer yields False: a non-zero exit,
    output that is not valid JSON, a payload that is not a JSON object, or
    null/missing ``labels``, ``comments`` or comment ``body`` fields.

    Args:
        repo: Repository in ``owner/repo`` form, passed to ``gh --repo``.
        issue_number: Number of the issue to inspect.
        gh_path: Name or path of the ``gh`` executable.
    """
    result = subprocess.run(
        [gh_path, "issue", "view", str(issue_number),
         "--repo", repo,
         "--json", "labels,comments"],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        return False

    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError:
        return False

    # Valid JSON that is not an object (e.g. a list) has no labels/comments
    # to look at; answer "not stale" instead of raising on ``.get``.
    if not isinstance(data, dict):
        return False

    # Labels may be plain strings or objects with a "name" key. ``or []``
    # covers an explicit JSON null; other entry shapes are ignored.
    labels = []
    for lbl in data.get("labels") or []:
        if isinstance(lbl, str):
            labels.append(lbl)
        elif isinstance(lbl, dict):
            labels.append(lbl.get("name", ""))
    if "status:done" not in labels:
        return False

    for c in data.get("comments") or []:
        body = c.get("body", "") if isinstance(c, dict) else str(c)
        # A null or non-string body cannot contain the marker.
        if isinstance(body, str) and STRIP_COMMENT_MARKER in body:
            return True
    return False
166+
167+
168+
def _log_failure(
    log_path: Path,
    repo: str,
    issue: int,
    exit_code: int,
    stderr: str,
) -> None:
    """Record a failed label mutation as one JSON line in the orchestrator log.

    The log's parent directory is created when missing, and the file is
    opened in append mode so earlier entries are kept.
    """
    record = dict(
        timestamp=datetime.now(timezone.utc).isoformat(),
        event="watchdog_label_mutation_failed",
        repo=repo,
        issue=issue,
        exit_code=exit_code,
        stderr=stderr,
    )
    directory = log_path.parent
    directory.mkdir(parents=True, exist_ok=True)
    with log_path.open("a", encoding="utf-8") as handle:
        handle.write(f"{json.dumps(record)}\n")
187+
188+
189+
# ---------------------------------------------------------------------------
190+
# CLI entry point
191+
# ---------------------------------------------------------------------------
192+
193+
194+
def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI parser with its ``strip`` and ``drift-repair`` subcommands."""

    def add_common_options(cmd: argparse.ArgumentParser) -> None:
        # Both subcommands take the same log destination and gh override.
        cmd.add_argument("--log", required=True, help="path to orchestrator log file")
        cmd.add_argument("--gh", default=_DEFAULT_GH, help="path to gh binary")

    root = argparse.ArgumentParser(
        description="Pipeline watchdog — strips status:done labels applied without evidence.",
        prog="pipeline_watchdog",
    )
    commands = root.add_subparsers(dest="command", required=True)

    single = commands.add_parser("strip", help="Strip status:done from a single issue.")
    single.add_argument("--repo", required=True, help="owner/repo")
    single.add_argument("--issue", type=int, required=True, help="issue number")
    add_common_options(single)

    repair = commands.add_parser("drift-repair", help="Re-attempt removal for stale issues.")
    add_common_options(repair)
    repair.add_argument(
        "issues",
        metavar="OWNER/REPO:NUMBER",
        nargs="+",
        help="repo:issue pairs to scan, e.g. Open-Paws/context:82",
    )

    return root
218+
219+
220+
def main(argv: list[str] | None = None) -> int:
    """Run the watchdog CLI and return a process exit code.

    Args:
        argv: Arguments to parse; ``None`` is handed to ``parse_args`` as-is,
            which makes argparse fall back to the process arguments.

    Returns:
        ``strip``: 0 when the label mutation succeeded, 1 when it failed.
        ``drift-repair``: 2 when any positional is not ``OWNER/REPO:NUMBER``
        (no repair is attempted in that case), otherwise 0.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.command == "strip":
        ok = strip_status_done(
            repo=args.repo,
            issue_number=args.issue,
            log_path=Path(args.log),
            gh_path=args.gh,
        )
        return 0 if ok else 1

    if args.command == "drift-repair":
        # Validate every pair before touching any issue, so a typo in a
        # later argument cannot leave the pass half-applied.
        pairs: list[tuple[str, int]] = []
        for item in args.issues:
            # rpartition splits on the LAST colon, so the number is always
            # whatever follows the final ":".
            repo, _, num = item.rpartition(":")
            # str.isdigit() alone also accepts characters such as "²" that
            # int() rejects; requiring ASCII keeps int() below from raising.
            if not repo or not (num.isascii() and num.isdigit()):
                print(f"Invalid format (expected OWNER/REPO:NUMBER): {item}", file=sys.stderr)
                return 2
            pairs.append((repo, int(num)))
        count = drift_repair_pass(
            repos_and_issues=pairs,
            log_path=Path(args.log),
            gh_path=args.gh,
        )
        print(f"drift-repair: {count} issue(s) re-attempted")
        return 0

    # Not reachable through the parser (subcommands are required); kept as
    # a defensive fallback.
    return 1
250+
251+
252+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

0 commit comments

Comments
 (0)