From a10a9f1f1fae33fe74883c7f4477a90c23b0d9e0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 1 May 2026 00:19:29 +0000 Subject: [PATCH] fix: orchestration monitor checks events.jsonl for lazy placeholder workers After app relaunch, MonitorAndSynthesizeAsync could see workers as idle before InvokeOnUI had set IsProcessing=true (race between background thread and UI thread dispatch). This caused the monitor to collect empty/stale results and abort orchestration. Fix: - Add IsWorkerIdleForMonitor() that checks events.jsonl directly for lazy placeholder workers (Session == null), bypassing the InvokeOnUI race condition - Wait for IsRestoring to become false before starting the poll loop - Add 2s settle time for queued InvokeOnUI callbacks Tests: 10 new tests (8 behavioral + 2 structural guards) Fixes #400 Co-authored-by: copilot-agentic-workflow[bot] <224017+copilot-agentic-workflow[bot]@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../OrchestrationRelaunchTests.cs | 57 ++++ PolyPilot.Tests/OrchestrationRelaunchTests.cs | 318 ++++++++++++++++++ .../Services/CopilotService.Organization.cs | 43 ++- 3 files changed, 417 insertions(+), 1 deletion(-) create mode 100644 PolyPilot.IntegrationTests/OrchestrationRelaunchTests.cs create mode 100644 PolyPilot.Tests/OrchestrationRelaunchTests.cs diff --git a/PolyPilot.IntegrationTests/OrchestrationRelaunchTests.cs b/PolyPilot.IntegrationTests/OrchestrationRelaunchTests.cs new file mode 100644 index 000000000..35ed47112 --- /dev/null +++ b/PolyPilot.IntegrationTests/OrchestrationRelaunchTests.cs @@ -0,0 +1,57 @@ +using PolyPilot.IntegrationTests.Fixtures; + +namespace PolyPilot.IntegrationTests; + +/// +/// Integration tests for orchestration relaunch resilience (issue #400). +/// Verifies the multi-agent group UI and orchestration resume indicators +/// are accessible through the live Blazor UI via DevFlow CDP. +/// +[Collection("PolyPilot")] +[Trait("Category", "OrchestrationRelaunch")] +public class OrchestrationRelaunchTests : IntegrationTestBase +{ + public OrchestrationRelaunchTests(AppFixture app, ITestOutputHelper output) + : base(app, output) { } + + [Fact] + public async Task Dashboard_MultiAgentGroupsRenderable() + { + await WaitForCdpReadyAsync(); + // The dashboard should be able to render multi-agent groups. + // This doesn't require an active orchestration — just verifies the + // rendering pipeline handles the group list without errors. + var dashboardExists = await ExistsAsync("#dashboard"); + Assert.True(dashboardExists, "Dashboard should load successfully — multi-agent group rendering must not crash"); + await ScreenshotAsync("dashboard-multiagent-groups"); + } + + [Fact] + public async Task Dashboard_OrchestratorPhaseIndicatorRenderable() + { + await WaitForCdpReadyAsync(); + // Verify the dashboard component that shows orchestrator phase + // (Planning → Dispatching → WaitingForWorkers → Synthesizing → Complete → Resuming) + // doesn't crash even with no active orchestration. + var dashboardExists = await ExistsAsync("#dashboard"); + Assert.True(dashboardExists, + "Dashboard should load without errors — phase indicator rendering must not crash when no orchestration is active"); + await ScreenshotAsync("dashboard-no-active-orchestration"); + } + + [Fact] + public async Task Settings_HasConnectionModeForReconnect() + { + await WaitForCdpReadyAsync(); + var navigated = await NavigateToAsync("Settings", "#settings-page"); + if (navigated) + { + await ScreenshotAsync("settings-reconnect-mode"); + // Settings page should have reconnect capability — needed for + // orchestration resume after relaunch + var content = await GetTextAsync("#settings-page"); + Assert.False(string.IsNullOrWhiteSpace(content), + "Settings page should have visible content for connection mode configuration"); + } + } +} diff --git a/PolyPilot.Tests/OrchestrationRelaunchTests.cs b/PolyPilot.Tests/OrchestrationRelaunchTests.cs new file mode 100644 index 000000000..1720cd237 --- /dev/null +++ b/PolyPilot.Tests/OrchestrationRelaunchTests.cs @@ -0,0 +1,318 @@ +using System.Reflection; +using System.Text.Json; +using Microsoft.Extensions.DependencyInjection; +using PolyPilot.Models; +using PolyPilot.Services; + +namespace PolyPilot.Tests; + +/// +/// Behavioral tests for the orchestration relaunch fix (issue #400). +/// Verifies that MonitorAndSynthesizeAsync correctly detects active workers +/// that are lazy placeholders (Session == null) after app relaunch, instead of +/// prematurely considering them idle due to the InvokeOnUI race condition. +/// +[Collection("BaseDir")] +public class OrchestrationRelaunchTests +{ + private readonly StubChatDatabase _chatDb = new(); + private readonly StubServerManager _serverManager = new(); + private readonly StubWsBridgeClient _bridgeClient = new(); + private readonly StubDemoService _demoService = new(); + private readonly IServiceProvider _serviceProvider; + + private static readonly BindingFlags NonPublic = BindingFlags.NonPublic | BindingFlags.Instance; + private static readonly BindingFlags NonPublicStatic = BindingFlags.NonPublic | BindingFlags.Static; + private static readonly BindingFlags AnyInstance = BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance; + + public OrchestrationRelaunchTests() + { + _serviceProvider = new ServiceCollection().BuildServiceProvider(); + } + + private CopilotService CreateService() => + new CopilotService(_chatDb, _serverManager, _bridgeClient, new RepoManager(), _serviceProvider, _demoService); + + #region Helpers + + /// Get the SessionStatePath used by CopilotService (redirected to test temp dir). + private static string GetSessionStatePath() + { + var prop = typeof(CopilotService).GetProperty("SessionStatePath", NonPublicStatic)!; + return (string)prop.GetValue(null)!; + } + + /// Get the SessionState type (private nested class). + private static Type GetSessionStateType() + { + return typeof(CopilotService).GetNestedType("SessionState", BindingFlags.NonPublic)!; + } + + /// Create a SessionState with the given AgentSessionInfo via reflection. + /// GetUninitializedObject bypasses constructors, so Session is null (lazy placeholder). + /// We manually initialize fields that would be set by field initializers. + private static object CreateSessionState(AgentSessionInfo info) + { + var stateType = GetSessionStateType(); + var state = System.Runtime.CompilerServices.RuntimeHelpers.GetUninitializedObject(stateType); + stateType.GetProperty("Info")!.SetValue(state, info); + + // Initialize readonly field that would normally be set by the field initializer + var signalField = stateType.GetField("PrematureIdleSignal", AnyInstance); + signalField?.SetValue(state, new ManualResetEventSlim(initialState: false)); + + return state; + } + + /// Add a session to the CopilotService._sessions dictionary. + private static void AddSession(CopilotService svc, string name, object sessionState) + { + var field = typeof(CopilotService).GetField("_sessions", NonPublic)!; + var dict = field.GetValue(svc)!; + dict.GetType().GetMethod("TryAdd")!.Invoke(dict, new[] { name, sessionState }); + } + + /// Create an events.jsonl file for a given session ID in the test directory. + private static string CreateEventsFile(string sessionId, params string[] lines) + { + var sessionDir = Path.Combine(GetSessionStatePath(), sessionId); + Directory.CreateDirectory(sessionDir); + var eventsFile = Path.Combine(sessionDir, "events.jsonl"); + File.WriteAllLines(eventsFile, lines); + return eventsFile; + } + + /// Build a JSONL event line with the given type. + private static string BuildEventLine(string type, object data) + { + var obj = new { type, data, timestamp = DateTimeOffset.UtcNow.ToString("o") }; + return JsonSerializer.Serialize(obj); + } + + #endregion + + #region IsWorkerIdleForMonitor behavioral tests + + [Fact] + public void IsWorkerIdleForMonitor_SessionNotFound_ReturnsTrue() + { + // Worker name not in _sessions → treat as idle (handled in result collection) + var svc = CreateService(); + Assert.True(svc.IsWorkerIdleForMonitor("nonexistent-worker")); + } + + [Fact] + public void IsWorkerIdleForMonitor_ConnectedAndProcessing_ReturnsFalse() + { + // Worker has IsProcessing=true → not idle regardless of Session state + var svc = CreateService(); + var sessionId = Guid.NewGuid().ToString(); + var info = new AgentSessionInfo + { + Name = "worker-1", + Model = "gpt-4", + SessionId = sessionId, + WorkingDirectory = "/tmp" + }; + info.IsProcessing = true; + + var state = CreateSessionState(info); + AddSession(svc, "worker-1", state); + + Assert.False(svc.IsWorkerIdleForMonitor("worker-1")); + } + + [Fact] + public void IsWorkerIdleForMonitor_ConnectedAndIdle_ReturnsTrue() + { + // Worker has IsProcessing=false and Session would be connected → idle + var svc = CreateService(); + var sessionId = Guid.NewGuid().ToString(); + var info = new AgentSessionInfo + { + Name = "worker-1", + Model = "gpt-4", + SessionId = sessionId, + WorkingDirectory = "/tmp" + }; + info.IsProcessing = false; + + var state = CreateSessionState(info); + AddSession(svc, "worker-1", state); + + // No events.jsonl at all → IsSessionStillProcessing returns false + Assert.True(svc.IsWorkerIdleForMonitor("worker-1")); + } + + [Fact] + public void IsWorkerIdleForMonitor_LazyPlaceholderWithActiveCli_ReturnsFalse() + { + // KEY FIX TEST: Worker is a lazy placeholder (Session == null) with + // IsProcessing=false (InvokeOnUI hasn't fired yet), but the CLI + // events.jsonl shows active tool execution → NOT idle. + var svc = CreateService(); + var sessionId = Guid.NewGuid().ToString(); + var info = new AgentSessionInfo + { + Name = "worker-1", + Model = "gpt-4", + SessionId = sessionId, + WorkingDirectory = "/tmp" + }; + info.IsProcessing = false; // InvokeOnUI hasn't set this yet + + var state = CreateSessionState(info); + // Session is null (lazy placeholder) — GetUninitializedObject doesn't call constructor + AddSession(svc, "worker-1", state); + + // Create events.jsonl showing active tool execution + CreateEventsFile(sessionId, + BuildEventLine("assistant.turn_start", new { }), + BuildEventLine("tool.execution_start", new { toolCallId = "tc1", name = "bash" })); + + Assert.False(svc.IsWorkerIdleForMonitor("worker-1")); + } + + [Fact] + public void IsWorkerIdleForMonitor_LazyPlaceholderWithIdleCli_ReturnsTrue() + { + // Worker is a lazy placeholder but CLI has completed (terminal event) → idle + var svc = CreateService(); + var sessionId = Guid.NewGuid().ToString(); + var info = new AgentSessionInfo + { + Name = "worker-1", + Model = "gpt-4", + SessionId = sessionId, + WorkingDirectory = "/tmp" + }; + info.IsProcessing = false; + + var state = CreateSessionState(info); + AddSession(svc, "worker-1", state); + + // Create events.jsonl showing session shutdown (terminal) + CreateEventsFile(sessionId, + BuildEventLine("assistant.turn_start", new { }), + BuildEventLine("assistant.message", new { content = "Done!" }), + BuildEventLine("session.shutdown", new { })); + + Assert.True(svc.IsWorkerIdleForMonitor("worker-1")); + } + + [Fact] + public void IsWorkerIdleForMonitor_LazyPlaceholderWithNoEvents_ReturnsTrue() + { + // Worker is a lazy placeholder with no events.jsonl → idle + var svc = CreateService(); + var sessionId = Guid.NewGuid().ToString(); + var info = new AgentSessionInfo + { + Name = "worker-1", + Model = "gpt-4", + SessionId = sessionId, + WorkingDirectory = "/tmp" + }; + info.IsProcessing = false; + + var state = CreateSessionState(info); + AddSession(svc, "worker-1", state); + + // No events.jsonl created → IsSessionStillProcessing returns false + Assert.True(svc.IsWorkerIdleForMonitor("worker-1")); + } + + [Fact] + public void IsWorkerIdleForMonitor_LazyPlaceholderWithStaleEvents_ReturnsTrue() + { + // Worker is a lazy placeholder but events.jsonl is old → CLI finished long ago → idle + var svc = CreateService(); + var sessionId = Guid.NewGuid().ToString(); + var info = new AgentSessionInfo + { + Name = "worker-1", + Model = "gpt-4", + SessionId = sessionId, + WorkingDirectory = "/tmp" + }; + info.IsProcessing = false; + + var state = CreateSessionState(info); + AddSession(svc, "worker-1", state); + + // Create events.jsonl showing active event BUT backdate the file + var eventsFile = CreateEventsFile(sessionId, + BuildEventLine("tool.execution_start", new { toolCallId = "tc1", name = "bash" })); + + // Backdate to be older than the staleness threshold + var staleTime = DateTime.UtcNow.AddSeconds(-(CopilotService.WatchdogToolExecutionTimeoutSeconds + 60)); + File.SetLastWriteTimeUtc(eventsFile, staleTime); + + Assert.True(svc.IsWorkerIdleForMonitor("worker-1")); + } + + [Fact] + public void IsWorkerIdleForMonitor_LazyPlaceholderNoSessionId_ReturnsTrue() + { + // Worker is a lazy placeholder with no SessionId → can't check events → idle + var svc = CreateService(); + var info = new AgentSessionInfo + { + Name = "worker-1", + Model = "gpt-4", + SessionId = null!, + WorkingDirectory = "/tmp" + }; + info.IsProcessing = false; + + var state = CreateSessionState(info); + AddSession(svc, "worker-1", state); + + Assert.True(svc.IsWorkerIdleForMonitor("worker-1")); + } + + #endregion + + #region MonitorAndSynthesizeAsync source structure (supplementary invariant guards) + + private static string GetRepoRoot() + { + var dir = AppContext.BaseDirectory; + while (dir != null && !File.Exists(Path.Combine(dir, "PolyPilot.slnx"))) + dir = Path.GetDirectoryName(dir); + return dir ?? throw new InvalidOperationException("Could not find repo root"); + } + + [Fact] + public void MonitorAndSynthesizeAsync_CallsIsWorkerIdleForMonitor() + { + // Structural guard: MonitorAndSynthesizeAsync must use IsWorkerIdleForMonitor + // instead of directly checking state.Info.IsProcessing. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs")); + + var methodStart = source.IndexOf("private async Task MonitorAndSynthesizeAsync"); + Assert.True(methodStart >= 0, "MonitorAndSynthesizeAsync method not found"); + + var methodBody = source.Substring(methodStart, Math.Min(2000, source.Length - methodStart)); + + Assert.Contains("IsWorkerIdleForMonitor", methodBody); + } + + [Fact] + public void MonitorAndSynthesizeAsync_WaitsForRestoreToComplete() + { + // Structural guard: MonitorAndSynthesizeAsync must wait for IsRestoring to become false + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs")); + + var methodStart = source.IndexOf("private async Task MonitorAndSynthesizeAsync"); + Assert.True(methodStart >= 0, "MonitorAndSynthesizeAsync method not found"); + + var methodBody = source.Substring(methodStart, Math.Min(2000, source.Length - methodStart)); + + Assert.Contains("IsRestoring", methodBody); + } + + #endregion +} diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs index 13d064685..22c4b95d6 100644 --- a/PolyPilot/Services/CopilotService.Organization.cs +++ b/PolyPilot/Services/CopilotService.Organization.cs @@ -3222,18 +3222,59 @@ internal async Task ResumeOrchestrationIfPendingAsync(CancellationToken ct = def }); } + /// + /// Checks whether a worker session is idle for purposes of orchestration resume monitoring. + /// A worker is NOT idle if: + /// 1. Its in-memory IsProcessing flag is true, OR + /// 2. It's a lazy placeholder (Session == null) and the CLI events.jsonl shows active processing. + /// + /// Case 2 handles the race condition (issue #400) where IsProcessing is set via InvokeOnUI + /// (async UI thread dispatch) but MonitorAndSynthesizeAsync checks it from a background thread + /// before the callback fires. + /// + internal bool IsWorkerIdleForMonitor(string workerName) + { + if (!_sessions.TryGetValue(workerName, out var state)) + return true; // session not found — will be handled in result collection + + if (state.Info.IsProcessing) + return false; + + // Lazy placeholder workers: check events.jsonl directly. + // IsProcessing may not be set yet (InvokeOnUI hasn't fired on UI thread). + if (state.Session == null && !string.IsNullOrEmpty(state.Info.SessionId) + && IsSessionStillProcessing(state.Info.SessionId)) + { + Debug($"[DISPATCH] Worker '{workerName}' is a lazy placeholder with active CLI session — treating as not idle"); + return false; + } + + return true; + } + private async Task MonitorAndSynthesizeAsync(PendingOrchestration pending, CancellationToken ct) { var timeout = TimeSpan.FromMinutes(15); var started = DateTime.UtcNow; + // Wait for session restore to complete before checking worker state. + // Worker IsProcessing flags are set via InvokeOnUI during restore, which is + // async (queued on UI thread). Without this wait, we may see workers as idle + // before the restore loop has set their processing state (issue #400). + var restoreWaitStart = DateTime.UtcNow; + while (IsRestoring && !ct.IsCancellationRequested && (DateTime.UtcNow - restoreWaitStart).TotalSeconds < 30) + await Task.Delay(500, ct); + + // Extra settle time for queued InvokeOnUI callbacks to execute on the UI thread + await Task.Delay(2000, ct); + // Poll every 5 seconds until all workers are idle while (!ct.IsCancellationRequested && (DateTime.UtcNow - started) < timeout) { var allIdle = true; foreach (var workerName in pending.WorkerNames) { - if (_sessions.TryGetValue(workerName, out var state) && state.Info.IsProcessing) + if (!IsWorkerIdleForMonitor(workerName)) { allIdle = false; break;