
Commit 07fc078

refactor: refactor graphgen to integrate orchestration engine
1 parent 379ba46 commit 07fc078

5 files changed

Lines changed: 111 additions & 62 deletions


Lines changed: 23 additions & 22 deletions
@@ -1,22 +1,23 @@
-read:
-  input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: true
-  quiz_samples: 2 # number of quiz samples to generate
-  re_judge: false # whether to re-judge the existing quiz samples
-partition: # graph partition configuration
-  method: ece # ece is a custom partition method based on comprehension loss
-  method_params:
-    max_units_per_community: 20 # max nodes and edges per community
-    min_units_per_community: 5 # min nodes and edges per community
-    max_tokens_per_community: 10240 # max tokens per community
-    unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
-generate:
-  mode: aggregated # atomic, aggregated, multi_hop, cot, vqa
-  data_format: ChatML # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: insert
+    params:
+      input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+  - name: quiz_and_judge
+    params:
+      quiz_samples: 2 # number of quiz samples to generate
+      re_judge: false # whether to re-judge the existing quiz samples
+  - name: partition
+    deps: [insert, quiz_and_judge] # ece depends on both insert and quiz_and_judge steps
+    params:
+      method: ece # ece is a custom partition method based on comprehension loss
+      method_params:
+        max_units_per_community: 20 # max nodes and edges per community
+        min_units_per_community: 5 # min nodes and edges per community
+        max_tokens_per_community: 10240 # max tokens per community
+        unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
+  - name: generate
+    params:
+      method: aggregated # atomic, aggregated, multi_hop, cot, vqa
+      data_format: ChatML # Alpaca, Sharegpt, ChatML
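
The new schema replaces the per-step top-level sections (read, split, search, ...) with a single ordered pipeline list, where each stage carries a name, an optional deps list, and a params mapping. A minimal sketch of consuming this schema (the file name pipeline_demo.yaml is hypothetical, not part of the commit):

    import yaml

    # Load a config in the new pipeline schema and list its stages.
    with open("pipeline_demo.yaml", "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    for stage in config["pipeline"]:
        # Each stage: required "name", optional "deps" and "params".
        print(stage["name"], stage.get("deps", []), stage.get("params", {}))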

graphgen/engine.py

Lines changed: 41 additions & 5 deletions
@@ -3,8 +3,12 @@
 """
 
 import threading
+import traceback
+from functools import wraps
 from typing import Any, Callable, List
 
+from graphgen.utils import logger
+
 
 class Context(dict):
     _lock = threading.Lock()
@@ -25,9 +29,16 @@ def __init__(
         self.name, self.deps, self.func = name, deps, func
 
 
-def op(name: str, deps: List[str] = None):
-    def decorator(f: Callable[["OpNode", Context], Any]):
-        return OpNode(name, deps or [], f)
+def op(name: str, deps=None):
+    deps = deps or []
+
+    def decorator(func):
+        @wraps(func)
+        def _wrapper(*args, **kwargs):
+            return func(*args, **kwargs)
+
+        _wrapper.op_node = OpNode(name, deps, lambda self, ctx: func(self, **ctx))
+        return _wrapper
 
     return decorator
 
@@ -73,7 +84,8 @@ def _exec(n: str):
             try:
                 name2op[n].func(name2op[n], ctx)
             except Exception as e:  # pylint: disable=broad-except
-                exc[n] = e
+                logger.error("Operation %s failed: %s", n, e)
+                exc[n] = traceback.format_exc()
             done[n].set()
 
         ts = [threading.Thread(target=_exec, args=(n,), daemon=True) for n in topo]
@@ -82,4 +94,28 @@ def _exec(n: str):
         for t in ts:
             t.join()
         if exc:
-            raise RuntimeError(f"Some operations failed: {exc}")
+            raise RuntimeError(
+                "Some operations failed:\n"
+                + "\n".join(f"---- {op} ----\n{tb}" for op, tb in exc.items())
+            )
+
+
+def collect_ops(config: dict, graph_gen) -> List[OpNode]:
+    """
+    build operation nodes from yaml config
+    :param config
+    :param graph_gen
+    """
+    ops: List[OpNode] = []
+    for stage in config["pipeline"]:
+        name = stage["name"]
+        method = getattr(graph_gen, name)
+        op_node = method.op_node
+
+        # if there are runtime dependencies, override them
+        runtime_deps = stage.get("deps", op_node.deps)
+        op_node.deps = runtime_deps
+
+        op_node.func = lambda self, ctx, m=method, sc=stage: m(sc.get("params"))
+        ops.append(op_node)
+    return ops
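
Taken together: @op attaches an OpNode to the decorated function, the engine runs each node in its own thread once the done events of its deps are set, and collect_ops rebinds every node's func so the engine calls the matching graph_gen method with that stage's params. Note the m=method, sc=stage default arguments in the lambda: they freeze the loop variables per iteration and sidestep Python's late-binding closure pitfall. A toy sketch of the wiring, assuming the package is importable and that Engine(max_workers=...).run(ops, ctx) behaves as graphgen/run.py below uses it:

    from graphgen.engine import Context, Engine, collect_ops, op

    class Toy:
        @op("fetch", deps=[])
        def fetch(self, params=None):
            print("fetch:", params)

        @op("report", deps=["fetch"])  # waits for "fetch" to finish
        def report(self, params=None):
            print("report:", params)

    config = {"pipeline": [
        {"name": "fetch", "params": {"n": 1}},
        {"name": "report"},
    ]}
    toy = Toy()
    ops = collect_ops(config, toy)  # rebinds each op_node.func to the bound method
    Engine(max_workers=2).run(ops, Context(config=config, graph_gen=toy))
    # expected output: "fetch: {'n': 1}" then "report: None"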

graphgen/graphgen.py

Lines changed: 31 additions & 11 deletions
@@ -8,6 +8,7 @@
 from graphgen.bases import BaseLLMWrapper
 from graphgen.bases.base_storage import StorageNameSpace
 from graphgen.bases.datatypes import Chunk
+from graphgen.engine import op
 from graphgen.models import (
     JsonKVStorage,
     JsonListStorage,
@@ -69,6 +70,9 @@ def __init__(
         self.rephrase_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="rephrase"
         )
+        self.partition_storage: JsonListStorage = JsonListStorage(
+            self.working_dir, namespace="partition"
+        )
         self.qa_storage: JsonListStorage = JsonListStorage(
             os.path.join(self.working_dir, "data", "graphgen", f"{self.unique_id}"),
             namespace="qa",
@@ -77,13 +81,14 @@
         # webui
         self.progress_bar: gr.Progress = progress_bar
 
+    @op("insert", deps=[])
     @async_to_sync_method
-    async def insert(self, read_config: Dict, split_config: Dict):
+    async def insert(self, insert_config: Dict):
         """
         insert chunks into the graph
         """
         # Step 1: Read files
-        data = read_files(read_config["input_file"], self.working_dir)
+        data = read_files(insert_config["input_file"], self.working_dir)
         if len(data) == 0:
             logger.warning("No data to process")
             return
@@ -102,8 +107,8 @@ async def insert(self, read_config: Dict, split_config: Dict):
 
         inserting_chunks = await chunk_documents(
             new_docs,
-            split_config["chunk_size"],
-            split_config["chunk_overlap"],
+            insert_config["chunk_size"],
+            insert_config["chunk_overlap"],
             self.tokenizer_instance,
             self.progress_bar,
         )
@@ -148,6 +153,7 @@ async def _insert_done(self):
             tasks.append(cast(StorageNameSpace, storage_instance).index_done_callback())
         await asyncio.gather(*tasks)
 
+    @op("search", deps=["insert"])
     @async_to_sync_method
     async def search(self, search_config: Dict):
         logger.info(
@@ -183,13 +189,13 @@
         # TODO: fix insert after search
         await self.insert()
 
+    @op("quiz_and_judge", deps=["insert"])
     @async_to_sync_method
     async def quiz_and_judge(self, quiz_and_judge_config: Dict):
-        if quiz_and_judge_config is None or not quiz_and_judge_config.get(
-            "enabled", False
-        ):
-            logger.warning("Quiz and Judge is not used in this pipeline.")
-            return
+        logger.warning(
+            "Quiz and Judge operation needs trainee LLM client."
+            " Make sure to provide one."
+        )
         max_samples = quiz_and_judge_config["quiz_samples"]
         await quiz(
             self.synthesizer_llm_client,
@@ -222,15 +228,26 @@
         logger.info("Restarting synthesizer LLM client.")
         self.synthesizer_llm_client.restart()
 
+    @op("partition", deps=["insert"])
     @async_to_sync_method
-    async def generate(self, partition_config: Dict, generate_config: Dict):
-        # Step 1: partition the graph
+    async def partition(self, partition_config: Dict):
         batches = await partition_kg(
             self.graph_storage,
             self.chunks_storage,
             self.tokenizer_instance,
             partition_config,
         )
+        await self.partition_storage.upsert(batches)
+        return batches
+
+    @op("generate", deps=["insert", "partition"])
+    @async_to_sync_method
+    async def generate(self, generate_config: Dict):
+
+        batches = self.partition_storage.data
+        if not batches:
+            logger.warning("No partitions found for QA generation")
+            return
 
         # Step 2: generate QA pairs
         results = await generate_qas(
@@ -258,3 +275,6 @@ async def clear(self):
         await self.qa_storage.drop()
 
         logger.info("All caches are cleared")
+
+    # TODO: add data filtering step here in the future
+    # graph_gen.filter(filter_config=config["filter"])
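
Two details worth calling out. First, the old generate was split in two: partition now persists its batches through the new partition_storage, and generate reads partition_storage.data back, which is what allows the generate op to declare deps=["insert", "partition"]. Second, because @op's _wrapper simply forwards to the original function, the decorated methods stay directly callable outside the engine. A direct-call sketch under that assumption (constructor arguments and params are illustrative, mirroring the demo config above):

    from graphgen.graphgen import GraphGen

    gg = GraphGen(unique_id=1700000000, working_dir="cache")  # hypothetical values
    gg.insert({
        "input_file": "resources/input_examples/jsonl_demo.jsonl",
        "chunk_size": 1024,
        "chunk_overlap": 100,
    })
    batches = gg.partition({
        "method": "ece",
        "method_params": {
            "max_units_per_community": 20,
            "min_units_per_community": 5,
            "max_tokens_per_community": 10240,
            "unit_sampling": "max_loss",
        },
    })  # assumes async_to_sync_method returns the coroutine's result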

graphgen/operators/generate/generate_qas.py

Lines changed: 8 additions & 8 deletions
@@ -29,21 +29,21 @@ async def generate_qas(
     :param progress_bar
     :return: QA pairs
     """
-    mode = generation_config["mode"]
-    logger.info("[Generation] mode: %s, batches: %d", mode, len(batches))
+    method = generation_config["method"]
+    logger.info("[Generation] mode: %s, batches: %d", method, len(batches))
 
-    if mode == "atomic":
+    if method == "atomic":
         generator = AtomicGenerator(llm_client)
-    elif mode == "aggregated":
+    elif method == "aggregated":
         generator = AggregatedGenerator(llm_client)
-    elif mode == "multi_hop":
+    elif method == "multi_hop":
         generator = MultiHopGenerator(llm_client)
-    elif mode == "cot":
+    elif method == "cot":
         generator = CoTGenerator(llm_client)
-    elif mode in ["vqa"]:
+    elif method in ["vqa"]:
         generator = VQAGenerator(llm_client)
     else:
-        raise ValueError(f"Unsupported generation mode: {mode}")
+        raise ValueError(f"Unsupported generation mode: {method}")
 
     results = await run_concurrent(
         generator.generate,
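
The mode-to-method rename leaves the if/elif dispatch intact (note the log message still reads "mode: %s", a small leftover). The same mapping can be read at a glance as a lookup table; a drop-in sketch for the branch above, reusing the file's existing imports and locals rather than a standalone program:

    # Same method-to-generator mapping as the elif chain above.
    GENERATORS = {
        "atomic": AtomicGenerator,
        "aggregated": AggregatedGenerator,
        "multi_hop": MultiHopGenerator,
        "cot": CoTGenerator,
        "vqa": VQAGenerator,
    }

    try:
        generator = GENERATORS[method](llm_client)
    except KeyError as err:
        raise ValueError(f"Unsupported generation mode: {method}") from err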

graphgen/run.py

Lines changed: 8 additions & 16 deletions
@@ -6,6 +6,7 @@
 import yaml
 from dotenv import load_dotenv
 
+from graphgen.engine import Context, Engine, collect_ops
 from graphgen.graphgen import GraphGen
 from graphgen.utils import logger, set_logger
 
@@ -50,38 +51,29 @@ def main():
     with open(args.config_file, "r", encoding="utf-8") as f:
         config = yaml.load(f, Loader=yaml.FullLoader)
 
-    mode = config["generate"]["mode"]
     unique_id = int(time.time())
 
     output_path = os.path.join(working_dir, "data", "graphgen", f"{unique_id}")
     set_working_dir(output_path)
 
     set_logger(
-        os.path.join(output_path, f"{unique_id}_{mode}.log"),
+        os.path.join(output_path, f"{unique_id}.log"),
         if_stream=True,
     )
     logger.info(
         "GraphGen with unique ID %s logging to %s",
         unique_id,
-        os.path.join(working_dir, f"{unique_id}_{mode}.log"),
+        os.path.join(working_dir, f"{unique_id}.log"),
     )
 
     graph_gen = GraphGen(unique_id=unique_id, working_dir=working_dir)
 
-    graph_gen.insert(read_config=config["read"], split_config=config["split"])
+    # share context between different steps
+    ctx = Context(config=config, graph_gen=graph_gen)
+    ops = collect_ops(config, graph_gen)
 
-    graph_gen.search(search_config=config["search"])
-
-    if config.get("quiz_and_judge", {}).get("enabled"):
-        graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"])
-
-    # TODO: add data filtering step here in the future
-    # graph_gen.filter(filter_config=config["filter"])
-
-    graph_gen.generate(
-        partition_config=config["partition"],
-        generate_config=config["generate"],
-    )
+    # run operations
+    Engine(max_workers=config.get("max_workers", 4)).run(ops, ctx)
 
     save_config(os.path.join(output_path, "config.yaml"), config)
     logger.info("GraphGen completed successfully. Data saved to %s", output_path)
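
Net effect on the entry point: per-step toggles such as search.enabled and quiz_and_judge.enabled no longer gate execution; a step runs exactly when it appears in the config's pipeline list, with ordering derived from each op's deps. Concurrency is capped by an optional top-level max_workers key (defaulting to 4), and the log file name drops the mode suffix, since the generation method now lives inside the generate stage's params.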
