-
Notifications
You must be signed in to change notification settings - Fork 112
Expand file tree
/
Copy pathrp_scale.py
More file actions
273 lines (213 loc) · 9.68 KB
/
rp_scale.py
File metadata and controls
273 lines (213 loc) · 9.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
"""
runpod | serverless | rp_scale.py
Provides the functionality for scaling the runpod serverless worker.
"""
import asyncio
import signal
import sys
import traceback
from typing import Any, Dict
from ...http_client import AsyncClientSession, ClientSession, TooManyRequests
from .rp_job import get_job, handle_job
from .rp_logger import RunPodLogger
from .worker_state import JobsProgress, IS_LOCAL_TEST
log = RunPodLogger()  # module-level logger shared by all functions/classes in this file
def _handle_uncaught_exception(exc_type, exc_value, exc_traceback):
    """Replacement ``sys.excepthook``: route any uncaught exception to the
    worker logger (as a formatted traceback) instead of stderr."""
    formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
    log.error("Uncaught exception | " + str(formatted))
def _default_concurrency_modifier(current_concurrency: int) -> int:
"""
Default concurrency modifier.
This function returns the current concurrency without any modification.
Args:
current_concurrency (int): The current concurrency.
Returns:
int: The current concurrency.
"""
return current_concurrency
class JobScaler:
    """
    Job Scaler. This class is responsible for scaling the number of concurrent requests.

    Two long-running coroutines cooperate through ``jobs_queue``:
    ``get_jobs`` fetches work from the server and enqueues it, while
    ``run_jobs`` dequeues jobs and processes them concurrently up to
    ``current_concurrency``. A shutdown signal (SIGTERM/SIGINT) sets
    ``_shutdown_event``; both loops then drain and exit gracefully.
    """

    def __init__(self, config: Dict[str, Any]) -> None:
        """Initialize scaler state from *config*.

        Recognized config keys: ``concurrency_modifier`` (always honored),
        and — only when running locally (``IS_LOCAL_TEST``) — ``jobs_fetcher``,
        ``jobs_fetcher_timeout``, and ``jobs_handler`` overrides.
        """
        # Set when the worker should shut down; checked via is_alive().
        self._shutdown_event = asyncio.Event()
        # Target number of jobs processed concurrently; adjusted by set_scale().
        self.current_concurrency = 1
        self.config = config

        self.job_progress = JobsProgress()  # Cache the singleton instance
        # Buffer between the fetch loop (producer) and the run loop (consumer).
        self.jobs_queue = asyncio.Queue()

        # Defaults; each may be overridden below.
        self.concurrency_modifier = _default_concurrency_modifier
        self.jobs_fetcher = get_job
        self.jobs_fetcher_timeout = 90  # seconds to wait on a single fetch call
        self.jobs_handler = handle_job

        if concurrency_modifier := config.get("concurrency_modifier"):
            self.concurrency_modifier = concurrency_modifier

        if not IS_LOCAL_TEST:
            # below cannot be changed unless local
            return

        if jobs_fetcher := self.config.get("jobs_fetcher"):
            self.jobs_fetcher = jobs_fetcher

        if jobs_fetcher_timeout := self.config.get("jobs_fetcher_timeout"):
            self.jobs_fetcher_timeout = jobs_fetcher_timeout

        if jobs_handler := self.config.get("jobs_handler"):
            self.jobs_handler = jobs_handler

    async def set_scale(self) -> None:
        """Apply the concurrency modifier, resizing only when idle.

        Blocks (polling once per second) until no jobs are queued or in
        progress before committing the new concurrency value.
        """
        new_concurrency = self.concurrency_modifier(self.current_concurrency)

        if new_concurrency == self.current_concurrency:
            # no need to resize
            return

        while self.current_occupancy() > 0:
            # not safe to scale when jobs are in flight
            await asyncio.sleep(1)
            continue

        self.current_concurrency = new_concurrency
        log.debug(f"JobScaler.set_scale | New concurrency set to: {self.current_concurrency}")

    def start(self) -> None:
        """
        This is required for the worker to be able to shut down gracefully
        when the user sends a SIGTERM or SIGINT signal. This is typically
        the case when the worker is running in a container.
        """
        # Route any uncaught exception through the worker logger.
        sys.excepthook = _handle_uncaught_exception

        try:
            # Register signal handlers for graceful shutdown
            signal.signal(signal.SIGTERM, self.handle_shutdown)
            signal.signal(signal.SIGINT, self.handle_shutdown)
        except ValueError:
            # signal.signal raises ValueError outside the main thread.
            log.warn("Signal handling is only supported in the main thread.")

        # Start the main loop
        # Run forever until the worker is signalled to shut down.
        asyncio.run(self.run())

    def handle_shutdown(self, signum: int, frame) -> None:
        """
        Called when the worker is signalled to shut down.
        This function is called when the worker receives a signal to shut down, such as
        SIGTERM or SIGINT. It sets the shutdown event, which will cause the worker to
        exit its main loop and shut down gracefully.
        Args:
            signum: The signal number that was received.
            frame: The current stack frame.
        """
        log.debug(f"Received shutdown signal: {signum}.")
        self.kill_worker()

    async def run(self) -> None:
        """Run the fetch loop and the processing loop until shutdown."""
        # Create an async session that will be closed when the worker is killed.
        async with AsyncClientSession() as session:
            # Create tasks for getting and running jobs.
            jobtake_task = asyncio.create_task(self.get_jobs(session))
            jobrun_task = asyncio.create_task(self.run_jobs(session))

            tasks = [jobtake_task, jobrun_task]

            # Concurrently run both tasks and wait for both to finish.
            await asyncio.gather(*tasks)

    def is_alive(self) -> bool:
        """
        Return whether the worker is alive or not.
        """
        return not self._shutdown_event.is_set()

    def kill_worker(self) -> None:
        """
        Whether to kill the worker.
        """
        log.debug("Kill worker.")
        self._shutdown_event.set()

    def current_occupancy(self) -> int:
        """Return the total number of jobs queued plus jobs in progress."""
        current_queue_count = self.jobs_queue.qsize()
        current_progress_count = self.job_progress.get_job_count()
        log.debug(
            f"JobScaler.status | concurrency: {self.current_concurrency}; queue: {current_queue_count}; progress: {current_progress_count}"
        )
        return current_progress_count + current_queue_count

    async def get_jobs(self, session: ClientSession) -> None:
        """
        Retrieve multiple jobs from the server in batches using blocking requests.

        Runs the block in an infinite loop while the worker is alive.

        Adds jobs to the JobsQueue
        """
        while self.is_alive():
            # Re-evaluate desired concurrency before each fetch round.
            await self.set_scale()

            # Only fetch enough jobs to fill the remaining capacity.
            jobs_needed = self.current_concurrency - self.current_occupancy()
            if jobs_needed <= 0:
                log.debug("JobScaler.get_jobs | Queue is full. Retrying soon.")
                await asyncio.sleep(1)  # don't go rapidly
                continue

            try:
                log.debug("JobScaler.get_jobs | Starting job acquisition.")

                # Keep the connection to the blocking call with timeout
                acquired_jobs = await asyncio.wait_for(
                    self.jobs_fetcher(session, jobs_needed),
                    timeout=self.jobs_fetcher_timeout,
                )

                if not acquired_jobs:
                    log.debug("JobScaler.get_jobs | No jobs acquired.")
                    continue

                for job in acquired_jobs:
                    await self.jobs_queue.put(job)
                    log.debug("Job Queued", job["id"])

                log.info(f"Jobs in queue: {self.jobs_queue.qsize()}")

            except TooManyRequests:
                # Server-side rate limit: back off before the next attempt.
                log.debug(
                    "JobScaler.get_jobs | Too many requests. Debounce for 5 seconds."
                )
                await asyncio.sleep(5)  # debounce for 5 seconds
            except asyncio.CancelledError:
                log.debug("JobScaler.get_jobs | Request was cancelled.")
                raise  # CancelledError is a BaseException
            except asyncio.TimeoutError:
                # Fetch exceeded jobs_fetcher_timeout; simply retry.
                log.debug("JobScaler.get_jobs | Job acquisition timed out. Retrying.")
            except TypeError as error:
                log.debug(f"JobScaler.get_jobs | Unexpected error: {error}.")
            except Exception as error:
                # Catch-all so the fetch loop never dies; log and keep going.
                log.error(
                    f"Failed to get job. | Error Type: {type(error).__name__} | Error Message: {str(error)}"
                )
            finally:
                # Yield control back to the event loop
                await asyncio.sleep(0.1)

    async def run_jobs(self, session: ClientSession) -> None:
        """
        Retrieve jobs from the jobs queue and process them concurrently.

        Runs the block in an infinite loop while the worker is alive or jobs queue is not empty.
        Keeps looping after shutdown is signalled until the queue is drained
        and every in-flight task has completed.
        """
        tasks: set[asyncio.Task] = set()  # Store the tasks for concurrent job processing

        while self.is_alive() or not self.jobs_queue.empty() or tasks:
            # Fetch as many jobs as the concurrency allows
            while len(tasks) < self.current_concurrency and not self.jobs_queue.empty():
                job = await self.jobs_queue.get()
                # Mark as in-progress before scheduling so current_occupancy()
                # counts it immediately.
                self.job_progress.add(job)
                log.debug(f"Dequeued job {job['id']}, now running. Queue size: {self.jobs_queue.qsize()}")

                # Create a new task for each job and add it to the task list
                task = asyncio.create_task(self.handle_job(session, job))
                tasks.add(task)

            # 2. If jobs are running, wait a little for completions
            if tasks:
                # Wait for at least one task to finish
                done, pending = await asyncio.wait(
                    tasks,
                    timeout=0.1,
                    return_when=asyncio.FIRST_COMPLETED,
                )
                # Remove completed tasks
                tasks.difference_update(done)
            else:
                # Nothing running — don’t spin CPU
                await asyncio.sleep(0.5)

        # Ensure all remaining tasks finish before stopping
        await asyncio.gather(*tasks)

    async def handle_job(self, session: ClientSession, job: dict) -> None:
        """
        Process an individual job. This function is run concurrently for multiple jobs.

        Delegates to ``jobs_handler``; on any failure, logs and re-raises.
        Always marks the queue task done and removes the job from progress
        tracking, even on error.
        """
        try:
            log.debug("Handling Job", job["id"])

            await self.jobs_handler(session, self.config, job)

            if self.config.get("refresh_worker", False):
                # One-job-per-worker mode: shut down after a single job.
                self.kill_worker()

        except Exception as err:
            log.error(f"Error handling job: {err}", job["id"])
            raise err

        finally:
            # Inform Queue of a task completion
            self.jobs_queue.task_done()

            # Job is no longer in progress
            self.job_progress.remove(job)
            log.debug("Finished Job", job["id"])