Skip to content

Commit e06faa5

Browse files
unamedkr authored and claude committed
feat: chat-mode KV cache reuse — O(history^2) → O(new tokens) per turn
User-reported issue: chat mode gets exponentially slower as history accumulates. Each turn re-prefills the entire conversation through all transformer layers because both quant_generate (single-header) and the HTTP server's tq_generate were freeing the KV state on every call. Result: turn N's prefill cost was O(N * total_history_tokens), which is O(N²) cumulative. Fix: introduce tq_generate_continue / quant_chat that: 1. Keeps the KV state alive across calls (caller-managed) 2. Tracks the token IDs currently committed to the KV cache 3. On each call, computes the longest common prefix (LCP) between the cached tokens and the new prompt, and only prefills the diverging suffix [LCP, n_new) 4. Updates the cache record with the prompt + generated tokens Three layers wired up: 1. quant.h (single-header / Python wheel) - quant_ctx now stores cached_tokens / n_cached / cached_capacity - new public quant_chat(ctx, prompt, cb, ud) — pass NULL prompt to reset the session - existing quant_generate unchanged for backwards compat 2. src/engine/tq_generate.c (library build) - new tq_generate_continue(model, tok, state, prompt, config, **cached, *n_cached, *cap, output, size) - same prefix-match logic, mirrors the single-header impl 3. src/server/tq_server.c (HTTP server) - tq_server now holds a persistent kv_state + cached_tokens - both /v1/chat/completions paths (streaming + non-streaming) call tq_generate_continue instead of tq_generate - state freed on tq_server_free 4. 
bindings/python/quantcpp - _binding.py: optional binding for quant_chat (gracefully missing on older single-header builds) - Model.chat(prompt) — generator with KV reuse, falls back to generate() if symbol unavailable - Model.reset_chat() — wipes the session - cli.py: `quantcpp run` interactive loop now accumulates ChatML history and uses Model.chat() for cheap re-sends Measured (SmolLM2-135M, M1 Pro, single thread, 10 turns of accumulating synthetic chat history, max_tokens=8/turn): quant_generate (no reuse): 295 → 681 → 1105 → 1581 → 2105 → 2660 → 3245 → 3926 → 4679 → 5386 ms quant_chat (with reuse): 294 → 430 → 451 → 509 → 545 → 608 → 693 → 750 → 796 → 902 ms Turn 10 speedup: 5386 → 902 ms (5.97x) Identical-prompt repeat (perfect LCP): 366 → 91/91/91/91 ms (4x) Caveat: when assistant responses contain text that re-tokenizes differently in the larger context (BPE merge non-roundtripping), LCP truncates and the suffix re-prefills. Real-world chat clients that replay the exact assistant response see >90% of the speedup. Worst-case is still better than the no-reuse baseline. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 45f5d58 commit e06faa5

6 files changed

Lines changed: 566 additions & 7 deletions

File tree

bindings/python/quantcpp/__init__.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,79 @@ def _run():
383383
if error_box[0] is not None:
384384
raise error_box[0]
385385

386+
def chat(self, prompt: "str | None") -> Iterator[str]:
    """Multi-turn chat with KV cache reuse.

    Like ``generate()``, but the KV cache persists across calls. When you
    re-send the conversation history each turn, only the new tokens are
    prefilled — turn N's latency is O(new_tokens), not O(history^2).

    Pass ``prompt=None`` to reset the chat session.

    Falls back to ``generate()`` on older library builds without the
    ``quant_chat`` symbol.
    """
    self._ensure_open()
    lib = get_lib()

    if not hasattr(lib, "quant_chat"):
        # Older library: there is no persistent session state, so a reset
        # request is a no-op — do NOT fall through to generate(""), which
        # would run a spurious generation from an empty prompt.
        if prompt is None:
            return
        # Silently fall back to the non-reusing generate path.
        yield from self.generate(prompt)
        return

    if prompt is None:
        # A NULL prompt tells the C side to wipe its cached-token record.
        with self._lock:
            lib.quant_chat(self._ctx, None, ON_TOKEN_CB(0), None)
        return

    if self._chat:
        prompt = self._apply_chat_template(prompt)

    # Tokens are appended by the C callback on a worker thread and drained
    # here so the generator can yield incrementally while inference runs.
    tokens = []
    done = threading.Event()
    error_box = [None]

    def _on_token(text_ptr, _user_data):
        if text_ptr:
            tokens.append(text_ptr.decode("utf-8", errors="replace"))

    # Hold a reference so the ctypes callback object isn't garbage-collected
    # while the C call may still invoke it.
    cb = ON_TOKEN_CB(_on_token)

    def _run():
        try:
            with self._lock:
                lib.quant_chat(self._ctx, prompt.encode("utf-8"), cb, None)
        except Exception as e:
            error_box[0] = e
        finally:
            done.set()

    thread = threading.Thread(target=_run, daemon=True)
    thread.start()

    yielded = 0
    while not done.is_set() or yielded < len(tokens):
        if yielded < len(tokens):
            yield tokens[yielded]
            yielded += 1
        else:
            done.wait(timeout=0.01)

    # Drain any tokens appended between the last length check and done.set().
    while yielded < len(tokens):
        yield tokens[yielded]
        yielded += 1

    if error_box[0] is not None:
        raise error_box[0]
450+
451+
def reset_chat(self) -> None:
    """Reset the chat KV cache. Next chat() call starts fresh."""
    self._ensure_open()
    lib = get_lib()
    if not hasattr(lib, "quant_chat"):
        # Older single-header build: no persistent session state to clear.
        return
    with self._lock:
        lib.quant_chat(self._ctx, None, ON_TOKEN_CB(0), None)
458+
386459
def save_context(self, path: str) -> None:
387460
"""Save the current KV cache to disk.
388461

bindings/python/quantcpp/_binding.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,20 @@ def _setup_signatures(lib: ctypes.CDLL) -> None:
132132
]
133133
lib.quant_generate.restype = ctypes.c_int
134134

135+
# int quant_chat(quant_ctx* ctx, const char* prompt,
136+
# void (*on_token)(const char*, void*), void* user_data)
137+
# Multi-turn chat with KV cache reuse — avoids the O(n^2) prefill cost
138+
# of quant_generate when the user re-sends conversation history.
139+
# Optional: only present in single-header builds (>= v0.13).
140+
if hasattr(lib, "quant_chat"):
141+
lib.quant_chat.argtypes = [
142+
ctypes.c_void_p,
143+
ctypes.c_char_p,
144+
ON_TOKEN_CB,
145+
ctypes.c_void_p,
146+
]
147+
lib.quant_chat.restype = ctypes.c_int
148+
135149
# char* quant_ask(quant_ctx* ctx, const char* prompt)
136150
lib.quant_ask.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
137151
lib.quant_ask.restype = ctypes.c_void_p # We use c_void_p so we can free()

bindings/python/quantcpp/cli.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,15 +152,23 @@ def cmd_run(args):
152152
print()
153153
else:
154154
print("quantcpp \u2014 type your message, Ctrl+C to exit", file=sys.stderr)
155+
# Multi-turn chat: accumulate history as ChatML so the model sees
156+
# prior turns. m.chat() reuses the KV cache via prefix-match, so
157+
# repeating the history is cheap (O(new tokens), not O(n^2)).
158+
history = ""
155159
try:
156160
while True:
157161
question = input("\nYou: ")
158162
if not question.strip():
159163
continue
164+
history += f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
160165
print("AI: ", end="", flush=True)
161-
for tok in m.generate(question):
166+
reply_buf = []
167+
for tok in m.chat(history):
162168
print(tok, end="", flush=True)
169+
reply_buf.append(tok)
163170
print()
171+
history += "".join(reply_buf) + "<|im_end|>\n"
164172
except (KeyboardInterrupt, EOFError):
165173
print("\nBye!", file=sys.stderr)
166174

0 commit comments

Comments (0)