Skip to content

Commit 9583757

Browse files
committed
[ml service] Support flexible tensors for compatibility with filters like llama.cpp
- Refactor tensor allocation logic to support flexible tensors - Enable compatibility with filters such as llama.cpp by allowing dynamic tensor management Signed-off-by: hyunil park <hyunil46.park@samsung.com>
1 parent b8c2944 commit 9583757

6 files changed

Lines changed: 227 additions & 25 deletions

File tree

c/include/nnstreamer-tizen-internal.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ typedef struct {
3737
char *models; /**< Comma separated neural network model files. */
3838
char *custom_option; /**< Custom option string for neural network framework. */
3939
char *fw_name; /**< The explicit framework name given by user */
40+
int invoke_dynamic; /**< True for supporting invoke with flexible output. */
41+
int invoke_async; /**< The sub-plugin must support asynchronous output to use this option. If set to TRUE, the sub-plugin can generate multiple outputs asynchronously per single input. Otherwise, only synchronous single-output is expected and async callback/handle are ignored. */
4042
} ml_single_preset;
4143

4244
/**

c/src/ml-api-common.c

Lines changed: 77 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -832,6 +832,26 @@ _ml_tensors_data_clone_no_alloc (const ml_tensors_data_s * data_src,
832832
return ML_ERROR_NONE;
833833
}
834834

835+
/**
836+
* @brief Allocates zero-initialized memory of the given size for the tensor at the specified index
837+
* in the tensor data structure, and sets the size value for that tensor.
838+
*/
839+
static int
840+
_ml_tensor_data_alloc (ml_tensors_data_s * data, int index, const size_t size)
841+
{
842+
if (!data || index < 0)
843+
_ml_error_report_return (ML_ERROR_INVALID_PARAMETER,
844+
"Invalid parameter: data pointer is null or index is out of range.");
845+
846+
data->tensors[index].size = size;
847+
data->tensors[index].data = g_malloc0 (size);
848+
if (data->tensors[index].data == NULL)
849+
_ml_error_report_return (ML_ERROR_OUT_OF_MEMORY,
850+
"Failed to allocate memory for tensor data.");
851+
852+
return ML_ERROR_NONE;
853+
}
854+
835855
/**
836856
* @brief Copies the tensor data frame.
837857
*/
@@ -840,7 +860,8 @@ ml_tensors_data_clone (const ml_tensors_data_h in, ml_tensors_data_h * out)
840860
{
841861
int status;
842862
unsigned int i;
843-
ml_tensors_data_s *_in, *_out;
863+
ml_tensors_data_s *_in, *_out = NULL;
864+
ml_tensors_info_s *_info = NULL;
844865

845866
check_feature_state (ML_FEATURE);
846867

@@ -862,12 +883,25 @@ ml_tensors_data_clone (const ml_tensors_data_h in, ml_tensors_data_h * out)
862883
}
863884

864885
_out = (ml_tensors_data_s *) (*out);
886+
_info = (ml_tensors_info_s *) _in->info;
887+
888+
if (_info->info.format == _NNS_TENSOR_FORMAT_FLEXIBLE) {
889+
for (i = 0; i < _in->num_tensors; i++) {
890+
status = _ml_tensor_data_alloc (_out, i, _in->tensors[i].size);
891+
if (status != ML_ERROR_NONE) {
892+
goto error;
893+
}
894+
}
895+
}
865896

866897
for (i = 0; i < _out->num_tensors; ++i) {
867898
memcpy (_out->tensors[i].data, _in->tensors[i].data, _in->tensors[i].size);
868899
}
869900

870901
error:
902+
if (status != ML_ERROR_NONE)
903+
_ml_tensors_data_destroy_internal (_out, TRUE);
904+
871905
G_UNLOCK_UNLESS_NOLOCK (*_in);
872906
return status;
873907
}
@@ -914,6 +948,7 @@ int
914948
ml_tensors_data_create (const ml_tensors_info_h info, ml_tensors_data_h * data)
915949
{
916950
gint status = ML_ERROR_STREAMS_PIPE;
951+
ml_tensors_info_s *_info = NULL;
917952
ml_tensors_data_s *_data = NULL;
918953
guint i;
919954
bool valid;
@@ -944,21 +979,29 @@ ml_tensors_data_create (const ml_tensors_info_h info, ml_tensors_data_h * data)
944979
status);
945980
}
946981

982+
_info = (ml_tensors_info_s *) info;
983+
if (_info->info.format == _NNS_TENSOR_FORMAT_FLEXIBLE) {
984+
_ml_logw
985+
("[ml_tensors_data_create] format is FLEXIBLE, skipping tensor memory allocation. "
986+
"Use ml_tensors_data_set_tensor_data() to update data buffer.");
987+
*data = _data;
988+
return ML_ERROR_NONE;
989+
}
990+
947991
for (i = 0; i < _data->num_tensors; i++) {
948-
_data->tensors[i].data = g_malloc0 (_data->tensors[i].size);
949-
if (_data->tensors[i].data == NULL) {
950-
goto failed_oom;
951-
}
992+
status = _ml_tensor_data_alloc (_data, i, _data->tensors[i].size);
993+
if (status != ML_ERROR_NONE)
994+
goto error;
952995
}
953996

954997
*data = _data;
955998
return ML_ERROR_NONE;
956999

957-
failed_oom:
958-
_ml_tensors_data_destroy_internal (_data, TRUE);
1000+
error:
1001+
if (status != ML_ERROR_NONE)
1002+
_ml_tensors_data_destroy_internal (_data, TRUE);
9591003

960-
_ml_error_report_return (ML_ERROR_OUT_OF_MEMORY,
961-
"Failed to allocate memory blocks for tensors data. Check if it's out-of-memory.");
1004+
return status;
9621005
}
9631006

9641007
/**
@@ -1009,6 +1052,8 @@ int
10091052
ml_tensors_data_set_tensor_data (ml_tensors_data_h data, unsigned int index,
10101053
const void *raw_data, const size_t data_size)
10111054
{
1055+
1056+
ml_tensors_info_s *_info = NULL;
10121057
ml_tensors_data_s *_data;
10131058
int status = ML_ERROR_NONE;
10141059

@@ -1033,6 +1078,29 @@ ml_tensors_data_set_tensor_data (ml_tensors_data_h data, unsigned int index,
10331078
goto report;
10341079
}
10351080

1081+
/**
1082+
* By default, the tensor format is _NNS_TENSOR_FORMAT_STATIC.
1083+
* In this case, memory allocation and the setting of _data->tensors[index].size
1084+
* are already handled in ml_tensors_data_create().
1085+
* So for the STATIC format, both the `size` and `data` pointer should already be valid here.
1086+
*
1087+
* For FLEXIBLE format, memory may not be allocated yet and will be handled here.
1088+
*/
1089+
_info = (ml_tensors_info_s *) _data->info;
1090+
if (_info && _info->info.format == _NNS_TENSOR_FORMAT_FLEXIBLE) {
1091+
if (_data->tensors[index].data != NULL) {
1092+
g_free (_data->tensors[index].data);
1093+
_data->tensors[index].data = NULL;
1094+
_data->tensors[index].size = 0;
1095+
}
1096+
_ml_logw
1097+
("Memory allocation was not performed in ml_tensor_data_create() when tensor format is _NNS_TENSOR_FORMAT_FLEXIBLE.");
1098+
status = _ml_tensor_data_alloc (_data, index, data_size);
1099+
if (status != ML_ERROR_NONE) {
1100+
goto report;
1101+
}
1102+
}
1103+
10361104
if (data_size <= 0 || _data->tensors[index].size < data_size) {
10371105
_ml_error_report
10381106
("The parameter, data_size (%zu), is invalid. It should be larger than 0 and not larger than the required size of tensors[index: %u] (%zu).",

c/src/ml-api-inference-single.c

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,9 @@ typedef struct
139139
gboolean invoking; /**< invoke running flag */
140140
ml_tensors_data_h in_tensors; /**< input tensor wrapper for processing */
141141
ml_tensors_data_h out_tensors; /**< output tensor wrapper for processing */
142-
gboolean is_flexible; /**< true if tensor filter handles flexible input/output */
143142

144143
GList *destroy_data_list; /**< data to be freed by filter */
144+
tensor_format format; /**< current format */
145145
} ml_single;
146146

147147
/**
@@ -781,11 +781,10 @@ ml_single_set_info_in_handle (ml_single_h single, gboolean is_input,
781781
ml_tensors_info_h info = NULL;
782782

783783
ml_single_get_gst_info (single_h, is_input, &gst_info);
784-
if (single_h->is_flexible) {
785-
gst_info.format = _NNS_TENSOR_FORMAT_FLEXIBLE;
786-
gst_info.num_tensors = 1U; /* TODO: Consider multiple input tensors filter */
784+
if (single_h->format == _NNS_TENSOR_FORMAT_FLEXIBLE) {
785+
gst_info.format = single_h->format;
786+
gst_info.num_tensors = 1U; /* TODO: Consider multiple input tensors filter */
787787
}
788-
789788
_ml_tensors_info_create_from_gst (&info, &gst_info);
790789

791790
gst_tensors_info_free (&gst_info);
@@ -854,7 +853,6 @@ ml_single_create_handle (ml_nnfw_type_e nnfw)
854853
single_h->output = NULL;
855854
single_h->destroy_data_list = NULL;
856855
single_h->invoking = FALSE;
857-
single_h->is_flexible = FALSE;
858856

859857
gst_tensors_info_init (&single_h->in_info);
860858
gst_tensors_info_init (&single_h->out_info);
@@ -955,7 +953,6 @@ ml_single_open_custom (ml_single_h * single, ml_single_preset * info)
955953
gchar **list_models;
956954
guint i, num_models;
957955
char *hw_name;
958-
gboolean invoke_dynamic = FALSE;
959956

960957
check_feature_state (ML_FEATURE_INFERENCE);
961958

@@ -1078,10 +1075,14 @@ ml_single_open_custom (ml_single_h * single, ml_single_preset * info)
10781075
fw_name = _ml_get_nnfw_subplugin_name (nnfw); /* retry for "auto" */
10791076
}
10801077
hw_name = _ml_nnfw_to_str_prop (hw);
1078+
10811079
g_object_set (filter_obj, "framework", fw_name, "accelerator", hw_name,
1082-
"model", converted_models, NULL);
1080+
"model", converted_models, "invoke-dynamic", info->invoke_dynamic, NULL);
10831081
g_free (hw_name);
10841082

1083+
if (info->invoke_dynamic)
1084+
single_h->format = _NNS_TENSOR_FORMAT_FLEXIBLE;
1085+
10851086
if (info->custom_option) {
10861087
g_object_set (filter_obj, "custom", info->custom_option, NULL);
10871088
}
@@ -1105,11 +1106,6 @@ ml_single_open_custom (ml_single_h * single, ml_single_preset * info)
11051106
*
11061107
*/
11071108

1108-
if (invoke_dynamic) {
1109-
single_h->is_flexible = TRUE;
1110-
g_object_set (filter_obj, "invoke-dynamic", TRUE, NULL);
1111-
}
1112-
11131109
if (nnfw == ML_NNFW_TYPE_NNTR_INF) {
11141110
if (!in_tensors_info || !out_tensors_info) {
11151111
if (!in_tensors_info) {
@@ -1230,6 +1226,14 @@ ml_single_open_with_option (ml_single_h * single, const ml_option_h option)
12301226
if (ML_ERROR_NONE == ml_option_get (option, "framework_name", &value) ||
12311227
ML_ERROR_NONE == ml_option_get (option, "framework", &value))
12321228
info.fw_name = (gchar *) value;
1229+
if (ML_ERROR_NONE == ml_option_get (option, "invoke_dynamic", &value)) {
1230+
if (strcasecmp ((gchar *) value, "TRUE") == 0)
1231+
info.invoke_dynamic = TRUE;
1232+
}
1233+
if (ML_ERROR_NONE == ml_option_get (option, "invoke_async", &value)) {
1234+
if (strcasecmp ((gchar *) value, "TRUE") == 0)
1235+
info.invoke_async = TRUE;
1236+
}
12331237

12341238
return ml_single_open_custom (single, &info);
12351239
}
@@ -1345,10 +1349,8 @@ _ml_single_invoke_validate_data (ml_single_h single,
13451349
"The %d-th input tensor is not valid. There is no valid dimension metadata for this tensor.",
13461350
i);
13471351

1348-
if (single_h->is_flexible) {
1349-
/* Skip data size check for flexible */
1352+
if (single_h->format == _NNS_TENSOR_FORMAT_FLEXIBLE)
13501353
continue;
1351-
}
13521354

13531355
raw_size = _model->tensors[i].size;
13541356
if (G_UNLIKELY (_data->tensors[i].size != raw_size))

c/src/ml-api-service-extension.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,23 @@ _ml_extension_conf_parse_single (ml_service_s * mls, JsonObject * single)
339339
ml_option_set (option, "custom", g_strdup (custom), g_free);
340340
}
341341

342+
if (json_object_has_member (single, "invoke_dynamic")) {
343+
const gchar *invoke_dynamic =
344+
json_object_get_string_member (single, "invoke_dynamic");
345+
346+
if (STR_IS_VALID (invoke_dynamic))
347+
ml_option_set (option, "invoke_dynamic", g_strdup (invoke_dynamic),
348+
g_free);
349+
}
350+
351+
if (json_object_has_member (single, "invoke_async")) {
352+
const gchar *invoke_async =
353+
json_object_get_string_member (single, "invoke_async");
354+
355+
if (STR_IS_VALID (invoke_async))
356+
ml_option_set (option, "invoke_async", g_strdup (invoke_async), g_free);
357+
}
358+
342359
error:
343360
if (status == ML_ERROR_NONE)
344361
status = ml_single_open_with_option (&ext->single, option);

tests/capi/unittest_capi_service_extension.cc

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,109 @@ _extension_test_imgclf (ml_service_h handle, gboolean is_pipeline)
390390
_free_test_data (tdata);
391391
}
392392

393+
/**
394+
* @brief Callback function for scenario test.
395+
*/
396+
static void
397+
_extension_test_llamacpp_cb (
398+
ml_service_event_e event, ml_information_h event_data, void *user_data)
399+
{
400+
extension_test_data_s *tdata = (extension_test_data_s *) user_data;
401+
ml_tensors_data_h data = NULL;
402+
void *_raw = NULL;
403+
size_t _size = 0;
404+
int status;
405+
406+
switch (event) {
407+
case ML_SERVICE_EVENT_NEW_DATA:
408+
ASSERT_TRUE (event_data != NULL);
409+
410+
status = ml_information_get (event_data, "data", &data);
411+
EXPECT_EQ (status, ML_ERROR_NONE);
412+
413+
status = ml_tensors_data_get_tensor_data (data, 0U, &_raw, &_size);
414+
EXPECT_EQ (status, ML_ERROR_NONE);
415+
416+
g_print ("%s", (char *) _raw);
417+
418+
if (tdata)
419+
tdata->received++;
420+
break;
421+
default:
422+
break;
423+
}
424+
}
425+
426+
/**
427+
* @brief Internal function to run test with ml-service extension handle.
428+
*/
429+
static inline void
430+
_extension_test_llamacpp (ml_service_h handle, gboolean is_pipeline)
431+
{
432+
extension_test_data_s *tdata;
433+
ml_tensors_info_h info;
434+
ml_tensors_data_h input;
435+
int status;
436+
437+
const gchar input_text[] = "Hello my name is";
438+
439+
tdata = _create_test_data (is_pipeline);
440+
ASSERT_TRUE (tdata != NULL);
441+
442+
status = ml_service_set_event_cb (handle, _extension_test_llamacpp_cb, tdata);
443+
EXPECT_EQ (status, ML_ERROR_NONE);
444+
445+
/* Create and push input data. */
446+
status = ml_service_get_input_information (handle, NULL, &info);
447+
EXPECT_EQ (status, ML_ERROR_NONE);
448+
449+
ml_tensors_data_create (info, &input);
450+
451+
ml_tensors_data_set_tensor_data (input, 0U, input_text, strlen (input_text));
452+
453+
status = ml_service_request (handle, NULL, input);
454+
EXPECT_EQ (status, ML_ERROR_NONE);
455+
456+
g_usleep (5000000U);
457+
EXPECT_GT (tdata->received, 0);
458+
459+
/* Clear callback before releasing tdata. */
460+
status = ml_service_set_event_cb (handle, NULL, NULL);
461+
EXPECT_EQ (status, ML_ERROR_NONE);
462+
463+
ml_tensors_info_destroy (info);
464+
ml_tensors_data_destroy (input);
465+
466+
_free_test_data (tdata);
467+
}
468+
469+
/**
470+
* @brief Usage of ml-service extension API.
471+
*/
472+
TEST (MLServiceExtension, scenarioConfigLlamacpp)
473+
{
474+
ml_service_h handle;
475+
int status;
476+
477+
g_autofree gchar *model_file = _get_model_path ("llama-2-7b-chat.Q2_K.gguf");
478+
if (!g_file_test (model_file, G_FILE_TEST_EXISTS)) {
479+
g_critical ("Skipping scenarioConfigLlamacpp test due to missing model file. "
480+
"Please download model file from https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF");
481+
return;
482+
}
483+
484+
g_autofree gchar *config = get_config_path ("config_single_llamacpp.conf");
485+
486+
status = ml_service_new (config, &handle);
487+
ASSERT_EQ (status, ML_ERROR_NONE);
488+
489+
_extension_test_llamacpp (handle, FALSE);
490+
491+
status = ml_service_destroy (handle);
492+
EXPECT_EQ (status, ML_ERROR_NONE);
493+
}
494+
495+
393496
/**
394497
* @brief Usage of ml-service extension API.
395498
*/
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"single" :
3+
{
4+
"framework" : "llamacpp",
5+
"model" : ["../tests/test_models/models/llama-2-7b-chat.Q2_K.gguf"],
6+
"custom" : "num_predict:32",
7+
"invoke_dynamic" : "true",
8+
"invoke_async" : "false"
9+
}
10+
}

0 commit comments

Comments
 (0)