Skip to content

Commit 6089245

Browse files
fix: switch to new configs
1 parent 9b7499e commit 6089245

6 files changed

Lines changed: 76 additions & 82 deletions

File tree

graphgen/configs/aggregated_config.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@ pipeline:
44
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
55
chunk_size: 1024 # chunk size for text splitting
66
chunk_overlap: 100 # chunk overlap for text splitting
7+
78
- name: quiz_and_judge
89
params:
910
quiz_samples: 2 # number of quiz samples to generate
1011
re_judge: false # whether to re-judge the existing quiz samples
12+
1113
- name: partition
1214
deps: [insert, quiz_and_judge] # ece depends on both insert and quiz_and_judge steps
1315
params:
@@ -17,6 +19,7 @@ pipeline:
1719
min_units_per_community: 5 # min nodes and edges per community
1820
max_tokens_per_community: 10240 # max tokens per community
1921
unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
22+
2023
- name: generate
2124
params:
2225
method: aggregated # atomic, aggregated, multi_hop, cot, vqa
Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,15 @@
1-
read:
2-
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
3-
split:
4-
chunk_size: 1024 # chunk size for text splitting
5-
chunk_overlap: 100 # chunk overlap for text splitting
6-
search: # web search configuration
7-
enabled: false # whether to enable web search
8-
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9-
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
10-
enabled: true
11-
quiz_samples: 2 # number of quiz samples to generate
12-
re_judge: false # whether to re-judge the existing quiz samples
13-
partition: # graph partition configuration
14-
method: dfs # partition method, support: dfs, bfs, ece, leiden
15-
method_params:
16-
max_units_per_community: 1 # atomic partition, one node or edge per community
17-
generate:
18-
mode: atomic # atomic, aggregated, multi_hop, cot, vqa
19-
data_format: Alpaca # Alpaca, Sharegpt, ChatML
1+
pipeline:
2+
- name: insert
3+
params:
4+
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
5+
chunk_size: 1024 # chunk size for text splitting
6+
chunk_overlap: 100 # chunk overlap for text splitting
7+
- name: partition
8+
params:
9+
method: dfs # partition method, support: dfs, bfs, ece, leiden
10+
method_params:
11+
max_units_per_community: 1 # atomic partition, one node or edge per community
12+
- name: generate
13+
params:
14+
method: atomic # atomic, aggregated, multi_hop, cot, vqa
15+
data_format: Alpaca # Alpaca, Sharegpt, ChatML

graphgen/configs/cot_config.yaml

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
read:
2-
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
3-
split:
4-
chunk_size: 1024 # chunk size for text splitting
5-
chunk_overlap: 100 # chunk overlap for text splitting
6-
search: # web search configuration
7-
enabled: false # whether to enable web search
8-
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9-
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
10-
enabled: false
11-
partition: # graph partition configuration
12-
method: leiden # leiden is a partitioner detection algorithm
13-
method_params:
14-
max_size: 20 # Maximum size of communities
15-
use_lcc: false # whether to use the largest connected component
16-
random_seed: 42 # random seed for partitioning
17-
generate:
18-
mode: cot # atomic, aggregated, multi_hop, cot, vqa
19-
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
1+
pipeline:
2+
- name: insert
3+
params:
4+
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5+
chunk_size: 1024 # chunk size for text splitting
6+
chunk_overlap: 100 # chunk overlap for text splitting
7+
8+
- name: partition
9+
params:
10+
method: leiden # leiden is a community detection algorithm
11+
method_params:
12+
max_size: 20 # Maximum size of communities
13+
use_lcc: false # whether to use the largest connected component
14+
random_seed: 42 # random seed for partitioning
15+
16+
- name: generate
17+
params:
18+
method: cot # atomic, aggregated, multi_hop, cot, vqa
19+
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,20 @@
1-
read:
2-
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
3-
split:
4-
chunk_size: 1024 # chunk size for text splitting
5-
chunk_overlap: 100 # chunk overlap for text splitting
6-
search: # web search configuration
7-
enabled: false # whether to enable web search
8-
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9-
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
10-
enabled: false
11-
quiz_samples: 2 # number of quiz samples to generate
12-
re_judge: false # whether to re-judge the existing quiz samples
13-
partition: # graph partition configuration
14-
method: ece # ece is a custom partition method based on comprehension loss
15-
method_params:
16-
max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
17-
min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
18-
max_tokens_per_community: 10240 # max tokens per community
19-
unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
20-
generate:
21-
mode: multi_hop # atomic, aggregated, multi_hop, cot, vqa
22-
data_format: ChatML # Alpaca, Sharegpt, ChatML
1+
pipeline:
2+
- name: insert
3+
params:
4+
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5+
chunk_size: 1024 # chunk size for text splitting
6+
chunk_overlap: 100 # chunk overlap for text splitting
7+
8+
- name: partition
9+
params:
10+
method: ece # ece is a custom partition method based on comprehension loss
11+
method_params:
12+
max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
13+
min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
14+
max_tokens_per_community: 10240 # max tokens per community
15+
unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
16+
17+
- name: generate
18+
params:
19+
method: multi_hop # atomic, aggregated, multi_hop, cot, vqa
20+
data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/configs/vqa_config.yaml

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
1-
read:
2-
input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
3-
split:
4-
chunk_size: 1024 # chunk size for text splitting
5-
chunk_overlap: 100 # chunk overlap for text splitting
6-
search: # web search configuration
7-
enabled: false # whether to enable web search
8-
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9-
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
10-
enabled: false
11-
partition: # graph partition configuration
12-
method: anchor_bfs # partition method
13-
method_params:
14-
anchor_type: image # node type to select anchor nodes
15-
max_units_per_community: 10 # atomic partition, one node or edge per community
16-
generate:
17-
mode: vqa # atomic, aggregated, multi_hop, cot, vqa
18-
data_format: ChatML # Alpaca, Sharegpt, ChatML
1+
pipeline:
2+
- name: insert
3+
params:
4+
input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5+
chunk_size: 1024 # chunk size for text splitting
6+
chunk_overlap: 100 # chunk overlap for text splitting
7+
8+
- name: partition
9+
params:
10+
method: anchor_bfs # partition method
11+
method_params:
12+
anchor_type: image # node type to select anchor nodes
13+
max_units_per_community: 10 # max nodes and edges per community
14+
15+
- name: generate
16+
params:
17+
method: vqa # atomic, aggregated, multi_hop, cot, vqa
18+
data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/engine.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
from functools import wraps
88
from typing import Any, Callable, List
99

10-
from graphgen.utils import logger
11-
1210

1311
class Context(dict):
1412
_lock = threading.Lock()
@@ -83,8 +81,7 @@ def _exec(n: str):
8381
return
8482
try:
8583
name2op[n].func(name2op[n], ctx)
86-
except Exception as e: # pylint: disable=broad-except
87-
logger.error("Operation %s failed: %s", n, e)
84+
except Exception: # pylint: disable=broad-except
8885
exc[n] = traceback.format_exc()
8986
done[n].set()
9087

0 commit comments

Comments
 (0)