9393from jetstream .core .proto import jetstream_pb2_grpc
9494from jetstream .core .utils import async_multifuture
9595from jetstream .engine import engine_api
96- import numpy as np
9796
97+ import numpy as np
98+ import prometheus_client
9899
99100root = logging .getLogger ()
100101root .setLevel (logging .DEBUG )
@@ -209,6 +210,9 @@ class Driver:
209210 # todo: remove jax_padding after all the engines migrate to np padding
210211 _jax_padding = True
211212
213+ # Record metrics for prefill_backlog size
214+ _prefill_backlog_size_metric : prometheus_client .Gauge
215+
212216 def __init__ (
213217 self ,
214218 prefill_engines : Optional [list [engine_api .Engine ]] = None ,
@@ -242,6 +246,10 @@ def __init__(
242246 # Stage 1
243247 # At first, a request is placed here in order to get prefilled.
244248 self ._prefill_backlog = queue .Queue ()
249+ self ._prefill_backlog_size_metric = prometheus_client .Gauge (
250+ "jetstream_prefill_backlog_size" , "Size of prefill queue"
251+ )
252+
245253 # Stage 2
246254 # After prefilling, it is placed here in order to get transferred to
247255 # one of the generate backlogs.
@@ -421,6 +429,7 @@ def place_request_on_prefill_queue(self, request: ActiveRequest):
421429 """Used to place new requests for prefilling and generation."""
422430 # Don't block so we can fail and shed load when the queue is full.
423431 self ._prefill_backlog .put (request , block = False )
432+ self ._prefill_backlog_size_metric .set (self ._prefill_backlog .qsize ())
424433
425434 def _load_cache_history (self , path : str ) -> Union [None , Any ]:
426435 """Loads previous kv cache for a longer conversation."""
@@ -442,6 +451,8 @@ def _prefill_thread(self, idx: int):
442451 my_transfer_backlog = self ._transfer_backlogs [idx ]
443452 # The prefill thread can just sleep until it has work to do.
444453 request = self ._prefill_backlog .get (block = True )
454+ self ._prefill_backlog_size_metric .set (self ._prefill_backlog .qsize ())
455+
445456 if request is None :
446457 break
447458 # Tokenize, and introduce a leading dimension
0 commit comments