Skip to content

Commit 2f8924d

Browse files
authored
Prometheus Metrics (#71)
* initial commit * updated requirements.txt * correct parameters for Gauge * Update orchestrator.py * Update orchestrator.py * Update server_lib.py * Fix prometheus client port collision * prometheus port 9100 -> 9090 * Pyink formatting * Addressed above comment * logging in wrong place * Conditional fix * Update server_lib.py * reformat
1 parent ec78937 commit 2f8924d

5 files changed

Lines changed: 36 additions & 3 deletions

File tree

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@ google_jetstream.egg-info/
99
data/
1010
logs/
1111
tmp/
12-
venv/
12+
venv/
13+
.vscode/

jetstream/core/orchestrator.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,9 @@
9393
from jetstream.core.proto import jetstream_pb2_grpc
9494
from jetstream.core.utils import async_multifuture
9595
from jetstream.engine import engine_api
96-
import numpy as np
9796

97+
import numpy as np
98+
import prometheus_client
9899

99100
root = logging.getLogger()
100101
root.setLevel(logging.DEBUG)
@@ -209,6 +210,9 @@ class Driver:
209210
# todo: remove jax_padding after all then engine migrate to np padding
210211
_jax_padding = True
211212

213+
# Record metrics for prefill_backlog size
214+
_prefill_backlog_size_metric: prometheus_client.Gauge
215+
212216
def __init__(
213217
self,
214218
prefill_engines: Optional[list[engine_api.Engine]] = None,
@@ -242,6 +246,10 @@ def __init__(
242246
# Stage 1
243247
# At first, a request is placed here in order to get prefilled.
244248
self._prefill_backlog = queue.Queue()
249+
self._prefill_backlog_size_metric = prometheus_client.Gauge(
250+
"jetstream_prefill_backlog_size", "Size of prefill queue"
251+
)
252+
245253
# Stage 2
246254
# After prefilling, it is placed here in order to get transferred to
247255
# one of the generate backlogs.
@@ -421,6 +429,7 @@ def place_request_on_prefill_queue(self, request: ActiveRequest):
421429
"""Used to place new requests for prefilling and generation."""
422430
# Don't block so we can fail and shed load when the queue is full.
423431
self._prefill_backlog.put(request, block=False)
432+
self._prefill_backlog_size_metric.set(self._prefill_backlog.qsize())
424433

425434
def _load_cache_history(self, path: str) -> Union[None, Any]:
426435
"""Loads previous kv cache for a longer conversation."""
@@ -442,6 +451,8 @@ def _prefill_thread(self, idx: int):
442451
my_transfer_backlog = self._transfer_backlogs[idx]
443452
# The prefill thread can just sleep until it has work to do.
444453
request = self._prefill_backlog.get(block=True)
454+
self._prefill_backlog_size_metric.set(self._prefill_backlog.qsize())
455+
445456
if request is None:
446457
break
447458
# Tokenize, and introduce a leading dimension

jetstream/core/server_lib.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import asyncio
2121
from concurrent import futures
2222
import logging
23+
import os
2324
import threading
2425
from typing import Any, Type
2526

@@ -29,8 +30,14 @@
2930
from jetstream.core import orchestrator
3031
from jetstream.core.proto import jetstream_pb2_grpc
3132

33+
from prometheus_client import start_http_server
3234

3335
_HOST = "[::]"
36+
PROMETHEUS_ENABLED_ON_PORT = (
37+
int(os.getenv("PROMETHEUS_ENABLED_ON_PORT"))
38+
if os.getenv("PROMETHEUS_ENABLED_ON_PORT")
39+
else None
40+
)
3441

3542

3643
class JetStreamServer:
@@ -130,6 +137,17 @@ def run(
130137
logging.info("Starting server on port %d with %d threads", port, threads)
131138

132139
jetstream_server.start()
140+
141+
# Setup Prometheus server
142+
if PROMETHEUS_ENABLED_ON_PORT is not None:
143+
logging.info(
144+
"Starting Prometheus server on port %d", PROMETHEUS_ENABLED_ON_PORT
145+
)
146+
start_http_server(PROMETHEUS_ENABLED_ON_PORT)
147+
else:
148+
logging.info(
149+
"Not starting Prometheus server: PROMETHEUS_ENABLED_ON_PORT not set"
150+
)
133151
return jetstream_server
134152

135153

requirements.in

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ jax
66
jaxlib
77
numpy
88
portpicker
9+
prometheus-client
910
pytest
1011
seqio
1112
tiktoken
12-
blobfile
13+
blobfile

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,8 @@ pluggy==1.4.0
177177
# via pytest
178178
portpicker==1.6.0
179179
# via -r requirements.in
180+
prometheus-client==0.20.0
181+
# via -r requirements.in
180182
promise==2.3
181183
# via tfds-nightly
182184
protobuf==3.20.3

0 commit comments

Comments
 (0)