@@ -739,17 +739,25 @@ struct LLM::Impl {
                    std::vector<std::vector<unsigned short>> &kv_v,
                    int _precompute_len, int input_num_token)
     {
+        // Always start from the largest group by default, then pick the first group that fits.
+        _attr.prefill_grpid = (int)_attr.prefill_max_kv_cache_num_grp.size();
         for (size_t i = 0; i < _attr.prefill_max_kv_cache_num_grp.size(); i++)
         {
             if (_precompute_len + input_num_token <= _attr.prefill_max_kv_cache_num_grp[i]) { _attr.prefill_grpid = (int)i + 1; break; }
         }
         int kv_cache_num = _attr.prefill_max_kv_cache_num_grp[_attr.prefill_grpid - 1];
         ALOGI("prefill_grpid:%d kv_cache_num:%d precompute_len:%d input_num_token:%d", _attr.prefill_grpid, kv_cache_num, _precompute_len, input_num_token);
-        _attr.prefill_max_token_num = ALIGN_DOWN(_attr.prefill_max_token_num - _precompute_len, _attr.prefill_token_num);
-        ALOGI("current prefill_max_token_num:%d", _attr.prefill_max_token_num);
-        if (_precompute_len == 0) { ALOGI("first run"); return 0; }
+        // Remaining prefill budget should be derived from the model capacity, not accumulated across calls.
+        // Otherwise, a failed prefill (e.g. context overflow) can make it negative and break `/reset`.
+        const int max_cap = _attr.prefill_max_kv_cache_num_grp.empty() ? 0 : _attr.prefill_max_kv_cache_num_grp.back();
+        int remaining = max_cap - _precompute_len;
+        if (remaining < 0) remaining = 0;
+        remaining = ALIGN_DOWN(remaining, _attr.prefill_token_num);
+        _attr.prefill_max_token_num = remaining;
+        ALOGI("current prefill_max_token_num:%d", remaining);
         if (_precompute_len + input_num_token > kv_cache_num) { ALOGE("precompute_len(%d) + input_num_token(%d) > kv_cache_num(%d)", _precompute_len, input_num_token, kv_cache_num); return -1; }
-        if (input_num_token > _attr.prefill_max_token_num) { ALOGE("input_num_token(%d) > prefill_max_token_num(%d)", input_num_token, _attr.prefill_max_token_num); return -1; }
+        if (input_num_token > remaining) { ALOGE("input_num_token(%d) > prefill_max_token_num(%d)", input_num_token, remaining); return -1; }
+        if (_precompute_len == 0) { ALOGI("first run"); return 0; }
         if (!b_os_kvcache) return 0;
         if (kv_k.size() != kv_v.size() || (int)kv_k.size() != _attr.axmodel_num) { ALOGE("kv cache size mismatch"); return -1; }
         for (int i = 0; i < _attr.axmodel_num; i++)
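
The hunk above changes two things in `SetKVCache`: the prefill group now defaults to the largest group before the first-fit scan, and the remaining prefill budget is recomputed from the model's maximum capacity on every call instead of being decremented in place. The standalone sketch below models that logic under assumptions: the group sizes, the `ALIGN_DOWN` definition (round down to a multiple), and the helper names `pick_group` / `remaining_budget` are illustrative, not part of the library.

```cpp
// Minimal, standalone sketch of the group selection and prefill-budget logic above.
// The group sizes and the ALIGN_DOWN definition here are assumptions for illustration.
#include <cstdio>
#include <vector>

#define ALIGN_DOWN(x, a) (((x) / (a)) * (a))

// Pick the first KV-cache group that can hold precompute_len + input_num_token;
// default to the largest group (index grps.size()) when nothing smaller fits.
static int pick_group(const std::vector<int> &grps, int precompute_len, int input_num_token)
{
    int grpid = (int)grps.size();
    for (size_t i = 0; i < grps.size(); i++)
    {
        if (precompute_len + input_num_token <= grps[i]) { grpid = (int)i + 1; break; }
    }
    return grpid;
}

// Remaining prefill budget: model capacity minus what is already cached,
// clamped at zero and rounded down to a whole number of prefill chunks.
static int remaining_budget(const std::vector<int> &grps, int precompute_len, int prefill_token_num)
{
    const int max_cap = grps.empty() ? 0 : grps.back();
    int remaining = max_cap - precompute_len;
    if (remaining < 0) remaining = 0;
    return ALIGN_DOWN(remaining, prefill_token_num);
}

int main()
{
    const std::vector<int> grps = {128, 256, 512, 1024};            // hypothetical group sizes
    printf("grpid=%d\n", pick_group(grps, 300, 100));               // -> 3 (the 512 group fits)
    printf("remaining=%d\n", remaining_budget(grps, 300, 128));     // -> 640 (1024 - 300, rounded down)
    printf("remaining=%d\n", remaining_budget(grps, 2000, 128));    // -> 0 (overflowed, clamped)
    return 0;
}
```

Because the budget is recomputed from the largest group each call, a turn that overflows the context leaves it at 0 rather than at a negative value, which is what previously broke a subsequent `/reset`.
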
@@ -777,6 +785,11 @@ struct LLM::Impl {
     void ResetKVCache()
     {
         last_tokens_ids.clear(); k_caches.clear(); v_caches.clear(); precompute_len = 0;
+        _attr.prefill_grpid = (int)_attr.prefill_max_kv_cache_num_grp.size();
+        if (!_attr.prefill_max_kv_cache_num_grp.empty())
+        {
+            _attr.prefill_max_token_num = _attr.prefill_max_kv_cache_num_grp.back();
+        }
         for (int i = 0; i < _attr.axmodel_num; i++)
         {
             auto &lyr = llama_layers[i]; int devid = LLM_DEVID(lyr);
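
`ResetKVCache` now also restores the prefill group id and token budget to their defaults, so state left behind by a failed prefill does not leak into the next conversation. A minimal sketch of that invariant, with a hypothetical `Attr` struct and group sizes standing in for the real ones:

```cpp
// Sketch of the reset invariant: after a failed prefill the budget may be 0,
// but a reset must bring it back to the full model capacity (the largest group).
// The Attr struct and the group sizes below are assumptions for illustration.
#include <cassert>
#include <vector>

struct Attr
{
    std::vector<int> prefill_max_kv_cache_num_grp = {128, 256, 512, 1024}; // hypothetical
    int prefill_grpid = 4;
    int prefill_max_token_num = 1024;
};

static void reset_prefill_state(Attr &attr)
{
    attr.prefill_grpid = (int)attr.prefill_max_kv_cache_num_grp.size();
    if (!attr.prefill_max_kv_cache_num_grp.empty())
        attr.prefill_max_token_num = attr.prefill_max_kv_cache_num_grp.back();
}

int main()
{
    Attr attr;
    attr.prefill_max_token_num = 0; // state left behind by a context-overflow failure
    reset_prefill_state(attr);      // what ResetKVCache now does in addition to clearing caches
    assert(attr.prefill_max_token_num == 1024 && attr.prefill_grpid == 4);
    return 0;
}
```
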
@@ -1194,7 +1207,12 @@ struct LLM::Impl {
             if (!new_tokens.empty()) { precompute_len = (int)new_tokens.size() - 1; tokens_diff = {new_tokens.back()}; }
             else { ResetKVCache(); precompute_len = 0; }
         }
-        SetKVCache(k_caches, v_caches, precompute_len, (int)tokens_diff.size());
+        const int kv_ret = SetKVCache(k_caches, v_caches, precompute_len, (int)tokens_diff.size());
+        if (kv_ret != 0)
+        {
+            ALOGE("SetKVCache failed");
+            return history;
+        }
         std::vector<unsigned short> out_embed(tokens_diff.size() * _attr.tokens_embed_size);
         for (size_t i = 0; i < tokens_diff.size(); i++)
         {
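
Finally, the caller now checks the `SetKVCache` return code and returns the unchanged history instead of continuing into the embedding and prefill path with an inconsistent cache. A hedged sketch of that call pattern, with `set_kv_cache` and `run_turn` as hypothetical stand-ins rather than the library API:

```cpp
// Sketch of the error-propagation pattern above: bail out before touching the
// embedding/prefill path when the KV cache could not be set up.
#include <cstdio>
#include <string>

static int set_kv_cache(int precompute_len, int input_num_token, int kv_cache_num)
{
    if (precompute_len + input_num_token > kv_cache_num) { fprintf(stderr, "context overflow\n"); return -1; }
    return 0;
}

static std::string run_turn(const std::string &history, int precompute_len, int input_num_token)
{
    const int kv_ret = set_kv_cache(precompute_len, input_num_token, 1024);
    if (kv_ret != 0)
    {
        // Return the unchanged history instead of prefilling with an inconsistent cache.
        return history;
    }
    // ... embedding lookup and prefill would follow here ...
    return history + " <new tokens>";
}

int main()
{
    printf("%s\n", run_turn("hi", 0, 8).c_str());    // proceeds
    printf("%s\n", run_turn("hi", 1020, 8).c_str()); // rejected, history unchanged
    return 0;
}
```
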