Skip to content

Commit 6089245

Browse files
fix: switch to new configs
1 parent 9b7499e commit 6089245

6 files changed

Lines changed: 76 additions & 82 deletions

File tree

graphgen/configs/aggregated_config.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@ pipeline:
44
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
55
chunk_size: 1024 # chunk size for text splitting
66
chunk_overlap: 100 # chunk overlap for text splitting
7+
78
- name: quiz_and_judge
89
params:
910
quiz_samples: 2 # number of quiz samples to generate
1011
re_judge: false # whether to re-judge the existing quiz samples
12+
1113
- name: partition
1214
deps: [insert, quiz_and_judge] # ece depends on both insert and quiz_and_judge steps
1315
params:
@@ -17,6 +19,7 @@ pipeline:
1719
min_units_per_community: 5 # min nodes and edges per community
1820
max_tokens_per_community: 10240 # max tokens per community
1921
unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
22+
2023
- name: generate
2124
params:
2225
method: aggregated # atomic, aggregated, multi_hop, cot, vqa
Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,15 @@
1-
read:
2-
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
3-
split:
4-
chunk_size: 1024 # chunk size for text splitting
5-
chunk_overlap: 100 # chunk overlap for text splitting
6-
search: # web search configuration
7-
enabled: false # whether to enable web search
8-
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9-
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
10-
enabled: true
11-
quiz_samples: 2 # number of quiz samples to generate
12-
re_judge: false # whether to re-judge the existing quiz samples
13-
partition: # graph partition configuration
14-
method: dfs # partition method, support: dfs, bfs, ece, leiden
15-
method_params:
16-
max_units_per_community: 1 # atomic partition, one node or edge per community
17-
generate:
18-
mode: atomic # atomic, aggregated, multi_hop, cot, vqa
19-
data_format: Alpaca # Alpaca, Sharegpt, ChatML
1+
pipeline:
2+
- name: insert
3+
params:
4+
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
5+
chunk_size: 1024 # chunk size for text splitting
6+
chunk_overlap: 100 # chunk overlap for text splitting
7+
- name: partition
8+
params:
9+
method: dfs # partition method, support: dfs, bfs, ece, leiden
10+
method_params:
11+
max_units_per_community: 1 # atomic partition, one node or edge per community
12+
- name: generate
13+
params:
14+
method: atomic # atomic, aggregated, multi_hop, cot, vqa
15+
data_format: Alpaca # Alpaca, Sharegpt, ChatML

graphgen/configs/cot_config.yaml

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
read:
2-
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
3-
split:
4-
chunk_size: 1024 # chunk size for text splitting
5-
chunk_overlap: 100 # chunk overlap for text splitting
6-
search: # web search configuration
7-
enabled: false # whether to enable web search
8-
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9-
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
10-
enabled: false
11-
partition: # graph partition configuration
12-
method: leiden # leiden is a partitioner detection algorithm
13-
method_params:
14-
max_size: 20 # Maximum size of communities
15-
use_lcc: false # whether to use the largest connected component
16-
random_seed: 42 # random seed for partitioning
17-
generate:
18-
mode: cot # atomic, aggregated, multi_hop, cot, vqa
19-
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
1+
pipeline:
2+
- name: insert
3+
params:
4+
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5+
chunk_size: 1024 # chunk size for text splitting
6+
chunk_overlap: 100 # chunk overlap for text splitting
7+
8+
- name: partition
9+
params:
10+
method: leiden # leiden is a community detection algorithm
11+
method_params:
12+
max_size: 20 # Maximum size of communities
13+
use_lcc: false # whether to use the largest connected component
14+
random_seed: 42 # random seed for partitioning
15+
16+
- name: generate
17+
params:
18+
method: cot # atomic, aggregated, multi_hop, cot, vqa
19+
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,20 @@
1-
read:
2-
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
3-
split:
4-
chunk_size: 1024 # chunk size for text splitting
5-
chunk_overlap: 100 # chunk overlap for text splitting
6-
search: # web search configuration
7-
enabled: false # whether to enable web search
8-
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9-
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
10-
enabled: false
11-
quiz_samples: 2 # number of quiz samples to generate
12-
re_judge: false # whether to re-judge the existing quiz samples
13-
partition: # graph partition configuration
14-
method: ece # ece is a custom partition method based on comprehension loss
15-
method_params:
16-
max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
17-
min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
18-
max_tokens_per_community: 10240 # max tokens per community
19-
unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
20-
generate:
21-
mode: multi_hop # atomic, aggregated, multi_hop, cot, vqa
22-
data_format: ChatML # Alpaca, Sharegpt, ChatML
1+
pipeline:
2+
- name: insert
3+
params:
4+
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5+
chunk_size: 1024 # chunk size for text splitting
6+
chunk_overlap: 100 # chunk overlap for text splitting
7+
8+
- name: partition
9+
params:
10+
method: ece # ece is a custom partition method based on comprehension loss
11+
method_params:
12+
max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
13+
min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
14+
max_tokens_per_community: 10240 # max tokens per community
15+
unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
16+
17+
- name: generate
18+
params:
19+
method: multi_hop # atomic, aggregated, multi_hop, cot, vqa
20+
data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/configs/vqa_config.yaml

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
1-
read:
2-
input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
3-
split:
4-
chunk_size: 1024 # chunk size for text splitting
5-
chunk_overlap: 100 # chunk overlap for text splitting
6-
search: # web search configuration
7-
enabled: false # whether to enable web search
8-
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9-
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
10-
enabled: false
11-
partition: # graph partition configuration
12-
method: anchor_bfs # partition method
13-
method_params:
14-
anchor_type: image # node type to select anchor nodes
15-
max_units_per_community: 10 # atomic partition, one node or edge per community
16-
generate:
17-
mode: vqa # atomic, aggregated, multi_hop, cot, vqa
18-
data_format: ChatML # Alpaca, Sharegpt, ChatML
1+
pipeline:
2+
- name: insert
3+
params:
4+
input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5+
chunk_size: 1024 # chunk size for text splitting
6+
chunk_overlap: 100 # chunk overlap for text splitting
7+
8+
- name: partition
9+
params:
10+
method: anchor_bfs # partition method
11+
method_params:
12+
anchor_type: image # node type to select anchor nodes
13+
max_units_per_community: 10 # max nodes and edges per community
14+
15+
- name: generate
16+
params:
17+
method: vqa # atomic, aggregated, multi_hop, cot, vqa
18+
data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/engine.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
from functools import wraps
88
from typing import Any, Callable, List
99

10-
from graphgen.utils import logger
11-
1210

1311
class Context(dict):
1412
_lock = threading.Lock()
@@ -83,8 +81,7 @@ def _exec(n: str):
8381
return
8482
try:
8583
name2op[n].func(name2op[n], ctx)
86-
except Exception as e: # pylint: disable=broad-except
87-
logger.error("Operation %s failed: %s", n, e)
84+
except Exception: # pylint: disable=broad-except
8885
exc[n] = traceback.format_exc()
8986
done[n].set()
9087

0 commit comments

Comments
 (0)