
Commit 9e2c60e

Chris Warren-Smith committed
LLM: plugin module - initial commit
1 parent bbe0c2f commit 9e2c60e

4 files changed, 39 additions & 56 deletions


llama/llama-sb.cpp

Lines changed: 1 addition & 9 deletions
@@ -97,13 +97,9 @@ void Llama::configure_sampler(float temperature) {
   }
 }
 
-string Llama::generate(const string &prompt, int max_tokens, float temperature, bool echo, bool clear_cache) {
+string Llama::generate(const string &prompt, int max_tokens, float temperature) {
   string out;
 
-  if (clear_cache) {
-    // llama_kv_cache_clear(_ctx);
-  }
-
   // find the number of tokens in the prompt
   int n_prompt = -llama_tokenize(_vocab, prompt.c_str(), prompt.size(), nullptr, 0, true, true);
 
@@ -133,10 +129,6 @@ string Llama::generate(const string &prompt, int max_tokens, float temperature,
     batch = llama_batch_get_one(&decoder_start_token_id, 1);
   }
 
-  if (echo) {
-    out += prompt;
-  }
-
   for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + max_tokens;) {
     // evaluate the current batch with the transformer model
     if (llama_decode(_ctx, batch)) {
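
The hunk above ends at the head of the generation loop. For orientation, a loop of this shape typically completes as sketched below. This is modeled on llama.cpp's simple example, not on this commit's code: the _smpl sampler member and the 128-byte piece buffer are assumptions, while llama_sampler_sample, llama_vocab_is_eog, llama_token_to_piece and llama_batch_get_one are standard llama.cpp API calls.

    llama_token tok;
    for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + max_tokens;) {
      // evaluate the current batch with the transformer model
      if (llama_decode(_ctx, batch)) {
        _last_error = "llama_decode failed";
        break;
      }
      n_pos += batch.n_tokens;

      // sample the next token from the logits of the last decoded position
      tok = llama_sampler_sample(_smpl, _ctx, -1);
      if (llama_vocab_is_eog(_vocab, tok)) {
        break;  // end-of-generation token reached
      }

      // detokenize and accumulate the generated text
      char piece[128];
      int n = llama_token_to_piece(_vocab, tok, piece, sizeof(piece), 0, true);
      if (n > 0) {
        out.append(piece, n);
      }

      // the sampled token becomes the next single-token batch
      batch = llama_batch_get_one(&tok, 1);
    }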

llama/llama-sb.h

Lines changed: 1 addition & 5 deletions
@@ -19,11 +19,7 @@ struct Llama {
   void append_response(const string &response);
   const string build_chat_prompt(const string &user_msg);
   bool construct(string model_path, int n_ctx, bool disable_log);
-  string generate(const string &prompt,
-                  int max_tokens = 128,
-                  float temperature = 0.8f,
-                  bool echo = true,
-                  bool clear_cache = true);
+  string generate(const string &prompt, int max_tokens, float temperature);
   const char *last_error() { return _last_error.c_str(); }
   void reset();
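
With the default arguments removed from generate(), every caller now spells out max_tokens and temperature explicitly. A minimal usage sketch against this header; the model path and settings are hypothetical:

    #include <cstdio>
    #include "llama-sb.h"

    int main() {
      Llama llama;
      if (llama.construct("model.gguf", 2048, true)) {    // hypothetical model path
        string reply = llama.generate("Hello", 32, 0.8f); // prompt, max_tokens, temperature
        printf("%s\n", reply.c_str());
      } else {
        printf("error: %s\n", llama.last_error());
      }
      return 0;
    }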

llama/main.cpp

Lines changed: 7 additions & 9 deletions
@@ -62,14 +62,14 @@ static int cmd_llama_chat(var_s *self, int argc, slib_par_t *arg, var_s *retval)
   if (id != -1) {
     Llama &llama = g_map.at(id);
     auto prompt = get_param_str(argc, arg, 0, "");
-    int max_tokens = get_param_int(argc, arg, 0, 512);
-    var_num_t temperature = get_param_num(argc, arg, 0, 0);
+    int max_tokens = get_param_int(argc, arg, 1, 32);
+    var_num_t temperature = get_param_num(argc, arg, 2, 0.8f);
 
     // build accumulated prompt
     string updated_prompt = llama.build_chat_prompt(prompt);
 
     // run generation WITHOUT clearing cache
-    string response = llama.generate(updated_prompt, max_tokens, temperature, false, false);
+    string response = llama.generate(updated_prompt, max_tokens, temperature);
 
     // append assistant reply to history
     llama.append_response(response);
@@ -111,11 +111,9 @@ static int cmd_llama_generate(var_s *self, int argc, slib_par_t *arg, var_s *retval)
   if (id != -1) {
     Llama &llama = g_map.at(id);
     auto prompt = get_param_str(argc, arg, 0, "");
-    int max_tokens = get_param_int(argc, arg, 0, 512);
-    var_num_t temperature = get_param_num(argc, arg, 0, 0);
-
-    // run generation WITHOUT clearing cache
-    string response = llama.generate(prompt, max_tokens, temperature, false, true);
+    int max_tokens = get_param_int(argc, arg, 1, 32);
+    var_num_t temperature = get_param_num(argc, arg, 2, 0.8f);
+    string response = llama.generate(prompt, max_tokens, temperature);
     v_setstr(retval, response.c_str());
     result = 1;
   }
@@ -127,7 +125,7 @@ static int cmd_create_llama(int argc, slib_par_t *params, var_t *retval) {
   int result;
   auto model = expand_path(get_param_str(argc, params, 0, ""));
   int n_ctx = get_param_int(argc, params, 0, 2048);
-  int disable_log = get_param_int(argc, params, 0, 1);
+  int disable_log = get_param_int(argc, params, 1, 1);
   int id = ++g_nextId;
   Llama &llama = g_map[id];
   if (llama.construct(model, n_ctx, disable_log)) {
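
The substance of this change is the argument indices: max_tokens, temperature and disable_log previously all read parameter position 0, so they re-read the first argument (the prompt or model path) instead of their own slot. After the fix the positions line up as annotated below; the BASIC-side call shape in the comment is an assumption, not documented API:

    // assumed call shape: response = llm.generate(prompt, max_tokens, temperature)
    auto prompt           = get_param_str(argc, arg, 0, "");    // position 0: prompt text
    int max_tokens        = get_param_int(argc, arg, 1, 32);    // position 1, default 32
    var_num_t temperature = get_param_num(argc, arg, 2, 0.8f);  // position 2, default 0.8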

llama/test_main.cpp

Lines changed: 30 additions & 33 deletions
@@ -1,5 +1,4 @@
 #include "llama-sb.h"
-
 #include <cstdio>
 #include <cstring>
 
@@ -18,49 +17,47 @@ int main(int argc, char ** argv) {
   int n_predict = 32;
 
   // parse command line arguments
-  {
-    int i = 1;
-    for (; i < argc; i++) {
-      if (strcmp(argv[i], "-m") == 0) {
-        if (i + 1 < argc) {
-          model_path = argv[++i];
-        } else {
-          print_usage(argc, argv);
-          return 1;
-        }
-      } else if (strcmp(argv[i], "-n") == 0) {
-        if (i + 1 < argc) {
-          try {
-            n_predict = std::stoi(argv[++i]);
-          } catch (...) {
-            print_usage(argc, argv);
-            return 1;
-          }
-        } else {
+  int i = 1;
+  for (; i < argc; i++) {
+    if (strcmp(argv[i], "-m") == 0) {
+      if (i + 1 < argc) {
+        model_path = argv[++i];
+      } else {
+        print_usage(argc, argv);
+        return 1;
+      }
+    } else if (strcmp(argv[i], "-n") == 0) {
+      if (i + 1 < argc) {
+        try {
+          n_predict = std::stoi(argv[++i]);
+        } catch (...) {
           print_usage(argc, argv);
           return 1;
         }
       } else {
-        // prompt starts here
-        break;
+        print_usage(argc, argv);
+        return 1;
       }
+    } else {
+      // prompt starts here
+      break;
     }
-    if (model_path.empty()) {
-      print_usage(argc, argv);
-      return 1;
-    }
-    if (i < argc) {
-      prompt = argv[i++];
-      for (; i < argc; i++) {
-        prompt += " ";
-        prompt += argv[i];
-      }
+  }
+  if (model_path.empty()) {
+    print_usage(argc, argv);
+    return 1;
+  }
+  if (i < argc) {
+    prompt = argv[i++];
+    for (; i < argc; i++) {
+      prompt += " ";
+      prompt += argv[i];
     }
   }
 
   Llama llama;
   if (llama.construct(model_path, 1024, true)) {
-    string out = llama. generate(prompt, n_predict, 0.8f, true, true);
+    string out = llama. generate(prompt, n_predict, 0.8f);
     printf("\033[33m");
     printf(out.c_str());
     printf("\n\033[0m");
