88
99import { execFileSync } from 'child_process'
1010import { Bot , InlineKeyboard , type Context } from 'grammy'
11- import { JOB_STATUS , type Job , type JobEvent } from '@wright/shared'
11+ import { JOB_STATUS , REAPER_INTERVAL_MS , STALE_HEARTBEAT_MS , STALE_CLAIMED_MS , type Job , type JobEvent } from '@wright/shared'
1212import {
13+ getSupabase ,
1314 insertJob ,
1415 getJob ,
1516 getJobByPrefix ,
@@ -889,6 +890,138 @@ bot.catch((err) => {
889890 console . error ( 'Unhandled bot error:' , err )
890891} )
891892
893+ // ---------------------------------------------------------------------------
894+ // Stale job reaper — detects dead workers via heartbeat expiry
895+ // ---------------------------------------------------------------------------
896+
/**
 * Starts the periodic stale-job reaper.
 *
 * Every `REAPER_INTERVAL_MS` the reaper runs two independent passes against
 * the `job_queue` table:
 *
 * 1. Running jobs whose `heartbeat_at` is older than `STALE_HEARTBEAT_MS`
 *    (or that have no heartbeat and a `started_at` older than that) are
 *    re-queued with `attempt + 1` while attempts remain, otherwise marked
 *    `failed`.
 * 2. Jobs that have been `claimed` for longer than `STALE_CLAIMED_MS` are
 *    put back to `queued`.
 *
 * Every update is guarded by a compare-and-set on `status`, and its result is
 * checked before logging, notifying, or waking the worker. The passes are
 * isolated from each other so that an error or an empty result in one never
 * prevents the other from running.
 */
function startReaper(): void {
  const sb = getSupabase()

  /** Sends a Telegram message on a best-effort basis; delivery failures are deliberately ignored. */
  const notify = async (chatId: number | string, html: string): Promise<void> => {
    try {
      await bot.api.sendMessage(chatId, html, { parse_mode: 'HTML' })
    } catch {
      // Best effort notification
    }
  }

  /** Re-queues or permanently fails running jobs whose heartbeat has expired. */
  const reapStaleRunning = async (): Promise<void> => {
    const cutoff = new Date(Date.now() - STALE_HEARTBEAT_MS).toISOString()

    // Find running jobs with stale or missing heartbeats
    const { data: staleJobs, error } = await sb
      .from('job_queue')
      .select('id, attempt, max_attempts, worker_id, telegram_chat_id, task, heartbeat_at, started_at')
      .eq('status', 'running')
      .or(`heartbeat_at.lt.${cutoff},and(heartbeat_at.is.null,started_at.lt.${cutoff})`)

    if (error) {
      console.error('[reaper] Query error:', error.message)
      return
    }

    if (!staleJobs || staleJobs.length === 0) return

    console.log(`[reaper] Found ${staleJobs.length} stale running job(s)`)

    for (const job of staleJobs) {
      const shortId = job.id.slice(0, 8)

      if (job.attempt < job.max_attempts) {
        // Re-queue for retry
        const nextAttempt = job.attempt + 1
        const { data: updated, error: updateError } = await sb
          .from('job_queue')
          .update({
            status: 'queued',
            worker_id: null,
            claimed_at: null,
            started_at: null,
            heartbeat_at: null,
            attempt: nextAttempt,
            error: `Re-queued by reaper: worker stopped responding (attempt ${nextAttempt}/${job.max_attempts})`,
          })
          .eq('id', job.id)
          .eq('status', 'running') // CAS: only if still running
          .select('id')

        if (updateError) {
          console.error(`[reaper] Failed to re-queue job ${job.id}:`, updateError.message)
          continue
        }

        // Zero rows back means the job left 'running' in the meantime.
        if (!updated || updated.length === 0) continue

        console.log(`[reaper] Re-queued job ${job.id} (attempt ${nextAttempt}/${job.max_attempts})`)

        if (job.telegram_chat_id) {
          await notify(
            job.telegram_chat_id,
            `<b>[${shortId}]</b> Worker stopped responding. `
              + `Re-queuing automatically (attempt ${nextAttempt}/${job.max_attempts}).`,
          )
        }

        wakeWorker()
      } else {
        // Max attempts exceeded — mark as permanently failed
        const { data: updated, error: updateError } = await sb
          .from('job_queue')
          .update({
            status: 'failed',
            completed_at: new Date().toISOString(),
            heartbeat_at: null,
            error: `Failed: worker stopped responding after ${job.max_attempts} attempts`,
          })
          .eq('id', job.id)
          .eq('status', 'running') // CAS
          .select('id')

        if (updateError) {
          console.error(`[reaper] Failed to mark job ${job.id} as failed:`, updateError.message)
          continue
        }

        if (!updated || updated.length === 0) continue

        console.log(`[reaper] Job ${job.id} permanently failed (max attempts)`)

        if (job.telegram_chat_id) {
          await notify(
            job.telegram_chat_id,
            `<b>[${shortId}]</b> Worker stopped responding. `
              + `Job has failed permanently after ${job.max_attempts} attempts.`,
          )
        }
      }
    }
  }

  /** Resets jobs that were claimed but never transitioned to running. */
  const reapStaleClaimed = async (): Promise<void> => {
    const claimedCutoff = new Date(Date.now() - STALE_CLAIMED_MS).toISOString()

    const { data: staleClaimed, error } = await sb
      .from('job_queue')
      .select('id, worker_id')
      .eq('status', 'claimed')
      .lt('claimed_at', claimedCutoff)

    if (error) {
      console.error('[reaper] Claimed query error:', error.message)
      return
    }

    if (!staleClaimed || staleClaimed.length === 0) return

    let resetCount = 0

    for (const job of staleClaimed) {
      const { data: updated, error: updateError } = await sb
        .from('job_queue')
        .update({
          status: 'queued',
          worker_id: null,
          claimed_at: null,
          heartbeat_at: null,
          error: `Re-queued by reaper: claimed by ${job.worker_id} but never started`,
        })
        .eq('id', job.id)
        .eq('status', 'claimed') // CAS
        .select('id')

      if (updateError) {
        console.error(`[reaper] Failed to reset claimed job ${job.id}:`, updateError.message)
        continue
      }

      // Only report a reset when the CAS actually matched a row.
      if (!updated || updated.length === 0) continue

      resetCount += 1
      console.log(`[reaper] Reset stale claimed job ${job.id}`)
    }

    // Wake the worker once, and only if something was put back on the queue.
    if (resetCount > 0) wakeWorker()
  }

  setInterval(async () => {
    // Each pass gets its own try/catch so a failure (or an early return) in
    // the running-jobs pass can never skip the claimed-jobs pass.
    for (const pass of [reapStaleRunning, reapStaleClaimed]) {
      try {
        await pass()
      } catch (err) {
        console.error('[reaper] Unexpected error:', err)
      }
    }
  }, REAPER_INTERVAL_MS)

  console.log(
    `[reaper] Stale job reaper started (interval: ${REAPER_INTERVAL_MS}ms, staleness: ${STALE_HEARTBEAT_MS}ms)`,
  )
}
1024+
8921025// ---------------------------------------------------------------------------
8931026// Startup
8941027// ---------------------------------------------------------------------------
@@ -901,6 +1034,9 @@ async function main(): Promise<void> {
9011034 // intentional -- we want a loud failure at startup.
9021035 startRealtimeBridge ( )
9031036
1037+ // Start the stale job reaper — detects dead workers via heartbeat expiry
1038+ startReaper ( )
1039+
9041040 // Start long polling. This will block until the process is stopped.
9051041 console . log ( 'Bot is now polling for updates.' )
9061042 await bot . start ( {
0 commit comments