Skip to content

Commit 10ebc37

Browse files
fix: fix partition service
1 parent 90c0a59 commit 10ebc37

5 files changed

Lines changed: 59 additions & 97 deletions

File tree

graphgen/bases/base_operator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ def store(self, results: list, meta_update: dict):
152152
self.kv_storage.index_done_callback()
153153

154154
@abstractmethod
155-
def process(self, batch: list) -> Tuple[Union[list, Iterable[list]], dict]:
155+
def process(self, batch: list) -> Tuple[Union[list, Iterable[dict]], dict]:
156156
"""
157157
Process the input batch and return the result.
158158
:param batch

graphgen/models/generator/vqa_generator.py

Lines changed: 38 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import re
23
from typing import Any
34

@@ -75,62 +76,46 @@ async def generate(
7576
nodes, _ = batch
7677
for node in nodes:
7778
node_data = node[1]
78-
if "image_data" in node_data and node_data["image_data"]:
79-
img_path = node_data["image_data"]["img_path"]
79+
if "metadata" in node_data and node_data["metadata"]:
80+
metadata = json.loads(node_data["metadata"])["metadata"]
81+
img_path = metadata.get("path", "")
8082
for qa in qa_pairs:
8183
qa["img_path"] = img_path
8284
return qa_pairs
8385

8486
@staticmethod
85-
def format_generation_results(
86-
result: list[dict], output_data_format: str
87-
) -> list[dict[str, Any]]:
87+
def format_generation_results(result: dict, output_data_format: str) -> dict:
88+
question = result.get("question", "")
89+
answer = result.get("answer", "")
90+
img_path = result.get("img_path", "")
8891
if output_data_format == "Alpaca":
89-
result = [
90-
{
91-
"instruction": v["question"],
92-
"input": "",
93-
"output": v["answer"],
94-
"image": v.get("img_path", ""),
95-
}
96-
for item in result
97-
for k, v in item.items()
98-
]
99-
elif output_data_format == "Sharegpt":
100-
result = [
101-
{
102-
"conversations": [
103-
{
104-
"from": "human",
105-
"value": [
106-
{"text": v["question"], "image": v.get("img_path", "")}
107-
],
108-
},
109-
{"from": "gpt", "value": [{"text": v["answer"]}]},
110-
]
111-
}
112-
for item in result
113-
for k, v in item.items()
114-
]
115-
elif output_data_format == "ChatML":
116-
result = [
117-
{
118-
"messages": [
119-
{
120-
"role": "user",
121-
"content": [
122-
{"text": v["question"], "image": v.get("img_path", "")}
123-
],
124-
},
125-
{
126-
"role": "assistant",
127-
"content": [{"type": "text", "text": v["answer"]}],
128-
},
129-
]
130-
}
131-
for item in result
132-
for k, v in item.items()
133-
]
134-
else:
135-
raise ValueError(f"Unknown output data format: {output_data_format}")
136-
return result
92+
return {
93+
"instruction": question,
94+
"input": "",
95+
"output": answer,
96+
"image": img_path,
97+
}
98+
if output_data_format == "Sharegpt":
99+
return {
100+
"conversations": [
101+
{
102+
"from": "human",
103+
"value": [{"text": question, "image": img_path}],
104+
},
105+
{"from": "gpt", "value": [{"text": answer}]},
106+
]
107+
}
108+
if output_data_format == "ChatML":
109+
return {
110+
"messages": [
111+
{
112+
"role": "user",
113+
"content": [{"text": question, "image": img_path}],
114+
},
115+
{
116+
"role": "assistant",
117+
"content": [{"type": "text", "text": answer}],
118+
},
119+
]
120+
}
121+
raise ValueError(f"Unknown output data format: {output_data_format}")

graphgen/models/kg_builder/light_rag_kg_builder.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import re
23
from collections import Counter, defaultdict
34
from typing import Dict, List, Tuple
@@ -130,15 +131,25 @@ async def merge_nodes(
130131
set([dp["source_id"] for dp in node_data] + source_ids)
131132
)
132133

133-
node_data = {
134+
node_data_dict = {
134135
"entity_type": entity_type,
135136
"entity_name": entity_name,
136137
"description": description,
137138
"source_id": source_id,
138139
"length": self.tokenizer.count_tokens(description),
139140
}
140-
kg_instance.upsert_node(entity_name, node_data=node_data)
141-
return node_data
141+
142+
if entity_type in ("IMAGE", "TABLE", "FORMULA"):
143+
metadata = next(
144+
(dp["metadata"] for dp in node_data if dp.get("metadata")), None
145+
)
146+
if metadata:
147+
node_data_dict["metadata"] = json.dumps(
148+
metadata, ensure_ascii=False, default=str
149+
)
150+
151+
kg_instance.upsert_node(entity_name, node_data=node_data_dict)
152+
return node_data_dict
142153

143154
async def merge_edges(
144155
self,

graphgen/models/kg_builder/mm_kg_builder.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ async def extract(
7070

7171
entity = await handle_single_entity_extraction(attributes, chunk_id)
7272
if entity is not None:
73+
if entity["entity_type"] == "IMAGE":
74+
entity["metadata"] = chunk.metadata
7375
nodes[entity["entity_name"]].append(entity)
7476
continue
7577

graphgen/operators/partition/partition_service.py

Lines changed: 4 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def __init__(
5555
else:
5656
raise ValueError(f"Unsupported partition method: {method}")
5757

58-
def process(self, batch: list) -> Tuple[Iterable[list], dict]:
58+
def process(self, batch: list) -> Tuple[Iterable[dict], dict]:
5959
# this operator does not consume any batch data
6060
# but for compatibility we keep the interface
6161
self.kg_instance.reload()
@@ -68,50 +68,14 @@ def generator():
6868
count = 0
6969
for community in communities:
7070
count += 1
71-
batch = self.partitioner.community2batch(community, g=self.kg_instance)
72-
# batch = self._attach_additional_data_to_node(batch)
71+
b = self.partitioner.community2batch(community, g=self.kg_instance)
7372

7473
result = {
75-
"nodes": batch[0],
76-
"edges": batch[1],
74+
"nodes": b[0],
75+
"edges": b[1],
7776
}
7877
result["_trace_id"] = self.get_trace_id(result)
7978
yield result
8079
logger.info("Total communities partitioned: %d", count)
8180

8281
return generator(), {}
83-
84-
# def _attach_additional_data_to_node(self, batch: tuple) -> tuple:
85-
# """
86-
# Attach additional data from chunk_storage to nodes in the batch.
87-
# :param batch: tuple of (nodes_data, edges_data)
88-
# :return: updated batch with additional data attached to nodes
89-
# """
90-
# nodes_data, edges_data = batch
91-
#
92-
# for node_id, node_data in nodes_data:
93-
# entity_type = (node_data.get("entity_type") or "").lower()
94-
# if not entity_type:
95-
# continue
96-
#
97-
# source_ids = [
98-
# sid.strip()
99-
# for sid in node_data.get("source_id", "").split("<SEP>")
100-
# if sid.strip()
101-
# ]
102-
#
103-
# # Handle images
104-
# if "image" in entity_type:
105-
# image_chunks = [
106-
# data
107-
# for sid in source_ids
108-
# if "image" in sid.lower()
109-
# and (data := self.chunk_storage.get_by_id(sid))
110-
# ]
111-
# if image_chunks:
112-
# # The generator expects a dictionary with an 'img_path' key, not a list of captions.
113-
# # We'll use the first image chunk found for this node.
114-
# node_data["image_data"] = json.loads(image_chunks[0]["content"])
115-
# logger.debug("Attached image data to node %s", node_id)
116-
#
117-
# return nodes_data, edges_data

0 commit comments

Comments (0)