Skip to content

Commit af1b918

Browse files
authored
Free engine resource for the slot after finished one request decoding (#119)
* Add free resource function after finished one request decoding * fix lint error * fix pylint error
1 parent 1830342 commit af1b918

2 files changed

Lines changed: 13 additions & 0 deletions

File tree

jetstream/core/orchestrator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -784,6 +784,7 @@ def _detokenize_thread(self, idx: int):
784784
# Place the slot back on the free queue.
785785
my_live_requests[slot] = None
786786
my_slots.put(slot, block=False) # This should always have space.
787+
my_generate_engine.free_resource(slot)
787788
logging.info(
788789
"Detokenizing generate step %d took %.2fms",
789790
generate_timestep_added,

jetstream/engine/engine_api.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,18 @@ def insert(
187187
a [0, n) range of slots and converted internally.
188188
"""
189189

190+
def free_resource(
191+
self,
192+
slot: int, # pylint: disable=unused-argument
193+
) -> Any:
194+
"""Free cache and other decode resources for the slot.
195+
196+
This function is needed for advanced attention kernels like PageAttention.
197+
After finishing one request, the engine needs to free all used page block
198+
resources and reuse them for upcoming requests.
199+
"""
200+
return None
201+
190202
@abc.abstractmethod
191203
def load_params(self, *args, **kwargs) -> Params:
192204
"""Loads parameters.

0 commit comments

Comments
 (0)