Skip to content

Commit 8a2a1fc

Browse files
feat: add _meta.json to record processed chunks
1 parent ae31db9 commit 8a2a1fc

11 files changed

Lines changed: 87 additions & 42 deletions

graphgen/configs/aggregated_config.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
11
pipeline:
2-
- name: insert
2+
- name: read
33
params:
44
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
55
chunk_size: 1024 # chunk size for text splitting
66
chunk_overlap: 100 # chunk overlap for text splitting
77

8+
- name: build_kg
9+
810
- name: quiz_and_judge
911
params:
1012
quiz_samples: 2 # number of quiz samples to generate
1113
re_judge: false # whether to re-judge the existing quiz samples
1214

1315
- name: partition
14-
deps: [insert, quiz_and_judge] # ece depends on both insert and quiz_and_judge steps
16+
deps: [quiz_and_judge] # ece depends on the quiz_and_judge step
1517
params:
1618
method: ece # ece is a custom partition method based on comprehension loss
1719
method_params:

graphgen/configs/atomic_config.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
pipeline:
2-
- name: insert
2+
- name: read
33
params:
44
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
55
chunk_size: 1024 # chunk size for text splitting
66
chunk_overlap: 100 # chunk overlap for text splitting
7+
8+
- name: build_kg
9+
710
- name: partition
811
params:
912
method: dfs # partition method, support: dfs, bfs, ece, leiden

graphgen/configs/cot_config.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
pipeline:
2-
- name: insert
2+
- name: read
33
params:
44
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
55
chunk_size: 1024 # chunk size for text splitting
66
chunk_overlap: 100 # chunk overlap for text splitting
77

8+
- name: build_kg
9+
810
- name: partition
911
params:
1012
method: leiden # leiden is a community detection algorithm

graphgen/configs/multi_hop_config.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
pipeline:
2-
- name: insert
2+
- name: read
33
params:
44
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
55
chunk_size: 1024 # chunk size for text splitting
66
chunk_overlap: 100 # chunk overlap for text splitting
77

8+
- name: build_kg
9+
810
- name: partition
911
params:
1012
method: ece # ece is a custom partition method based on comprehension loss

graphgen/configs/vqa_config.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
pipeline:
2-
- name: insert
2+
- name: read
33
params:
44
input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
55
chunk_size: 1024 # chunk size for text splitting
66
chunk_overlap: 100 # chunk overlap for text splitting
77

8+
- name: build_kg
9+
810
- name: partition
911
params:
1012
method: anchor_bfs # partition method

graphgen/engine.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,9 @@ def collect_ops(config: dict, graph_gen) -> List[OpNode]:
113113
runtime_deps = stage.get("deps", op_node.deps)
114114
op_node.deps = runtime_deps
115115

116-
op_node.func = lambda self, ctx, m=method, sc=stage: m(sc.get("params"))
116+
if "params" in stage:
117+
op_node.func = lambda self, ctx, m=method, sc=stage: m(sc.get("params", {}))
118+
else:
119+
op_node.func = lambda self, ctx, m=method: m()
117120
ops.append(op_node)
118121
return ops

graphgen/graphgen.py

Lines changed: 38 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,16 @@
1-
import asyncio
21
import os
32
import time
4-
from typing import Dict, cast
3+
from typing import Dict
54

65
import gradio as gr
76

87
from graphgen.bases import BaseLLMWrapper
9-
from graphgen.bases.base_storage import StorageNameSpace
108
from graphgen.bases.datatypes import Chunk
119
from graphgen.engine import op
1210
from graphgen.models import (
1311
JsonKVStorage,
1412
JsonListStorage,
13+
MetaJsonKVStorage,
1514
NetworkXStorage,
1615
OpenAIClient,
1716
Tokenizer,
@@ -55,6 +54,10 @@ def __init__(
5554
)
5655
self.trainee_llm_client: BaseLLMWrapper = trainee_llm_client
5756

57+
self.meta_storage: MetaJsonKVStorage = MetaJsonKVStorage(
58+
self.working_dir, namespace="_meta"
59+
)
60+
5861
self.full_docs_storage: JsonKVStorage = JsonKVStorage(
5962
self.working_dir, namespace="full_docs"
6063
)
@@ -81,14 +84,13 @@ def __init__(
8184
# webui
8285
self.progress_bar: gr.Progress = progress_bar
8386

84-
@op("insert", deps=[])
87+
@op("read", deps=[])
8588
@async_to_sync_method
86-
async def insert(self, insert_config: Dict):
89+
async def read(self, read_config: Dict):
8790
"""
88-
insert chunks into the graph
91+
read files from input sources
8992
"""
90-
# Step 1: Read files
91-
data = read_files(insert_config["input_file"], self.working_dir)
93+
data = read_files(read_config["input_file"], self.working_dir)
9294
if len(data) == 0:
9395
logger.warning("No data to process")
9496
return
@@ -107,8 +109,8 @@ async def insert(self, insert_config: Dict):
107109

108110
inserting_chunks = await chunk_documents(
109111
new_docs,
110-
insert_config["chunk_size"],
111-
insert_config["chunk_overlap"],
112+
read_config["chunk_size"],
113+
read_config["chunk_overlap"],
112114
self.tokenizer_instance,
113115
self.progress_bar,
114116
)
@@ -124,9 +126,25 @@ async def insert(self, insert_config: Dict):
124126
logger.warning("All chunks are already in the storage")
125127
return
126128

127-
logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks))
129+
await self.full_docs_storage.upsert(new_docs)
130+
await self.full_docs_storage.index_done_callback()
128131
await self.chunks_storage.upsert(inserting_chunks)
132+
await self.chunks_storage.index_done_callback()
133+
134+
@op("build_kg", deps=["read"])
135+
@async_to_sync_method
136+
async def build_kg(self):
137+
"""
138+
build knowledge graph from text chunks
139+
"""
140+
# Step 1: get new chunks according to meta and chunks storage
141+
inserting_chunks = await self.meta_storage.get_new_data(self.chunks_storage)
142+
if len(inserting_chunks) == 0:
143+
logger.warning("All chunks are already in the storage")
144+
return
129145

146+
logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks))
147+
# Step 2: build knowledge graph from new chunks
130148
_add_entities_and_relations = await build_kg(
131149
llm_client=self.synthesizer_llm_client,
132150
kg_instance=self.graph_storage,
@@ -137,23 +155,13 @@ async def insert(self, insert_config: Dict):
137155
logger.warning("No entities or relations extracted from text chunks")
138156
return
139157

140-
await self._insert_done()
158+
# Step 3: mark meta
159+
await self.meta_storage.mark_done(self.chunks_storage)
160+
await self.meta_storage.index_done_callback()
161+
141162
return _add_entities_and_relations
142163

143-
async def _insert_done(self):
144-
tasks = []
145-
for storage_instance in [
146-
self.full_docs_storage,
147-
self.chunks_storage,
148-
self.graph_storage,
149-
self.search_storage,
150-
]:
151-
if storage_instance is None:
152-
continue
153-
tasks.append(cast(StorageNameSpace, storage_instance).index_done_callback())
154-
await asyncio.gather(*tasks)
155-
156-
@op("search", deps=["insert"])
164+
@op("search", deps=["read"])
157165
@async_to_sync_method
158166
async def search(self, search_config: Dict):
159167
logger.info(
@@ -187,9 +195,9 @@ async def search(self, search_config: Dict):
187195
]
188196
)
189197
# TODO: fix insert after search
190-
await self.insert()
198+
# await self.insert()
191199

192-
@op("quiz_and_judge", deps=["insert"])
200+
@op("quiz_and_judge", deps=["build_kg"])
193201
@async_to_sync_method
194202
async def quiz_and_judge(self, quiz_and_judge_config: Dict):
195203
logger.warning(
@@ -228,7 +236,7 @@ async def quiz_and_judge(self, quiz_and_judge_config: Dict):
228236
logger.info("Restarting synthesizer LLM client.")
229237
self.synthesizer_llm_client.restart()
230238

231-
@op("partition", deps=["insert"])
239+
@op("partition", deps=["build_kg"])
232240
@async_to_sync_method
233241
async def partition(self, partition_config: Dict):
234242
batches = await partition_kg(
@@ -240,7 +248,7 @@ async def partition(self, partition_config: Dict):
240248
await self.partition_storage.upsert(batches)
241249
return batches
242250

243-
@op("generate", deps=["insert", "partition"])
251+
@op("generate", deps=["partition"])
244252
@async_to_sync_method
245253
async def generate(self, generate_config: Dict):
246254

graphgen/models/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,5 +30,5 @@
3030
from .search.web.bing_search import BingSearch
3131
from .search.web.google_search import GoogleSearch
3232
from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
33-
from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage
33+
from .storage import JsonKVStorage, JsonListStorage, MetaJsonKVStorage, NetworkXStorage
3434
from .tokenizer import Tokenizer
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
from .json_storage import JsonKVStorage, JsonListStorage
1+
from .json_storage import JsonKVStorage, JsonListStorage, MetaJsonKVStorage
22
from .networkx_storage import NetworkXStorage

graphgen/models/storage/json_storage.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,13 @@ async def filter_keys(self, data: list[str]) -> set[str]:
4444

4545
async def upsert(self, data: dict):
4646
left_data = {k: v for k, v in data.items() if k not in self._data}
47-
self._data.update(left_data)
47+
if left_data:
48+
self._data.update(left_data)
4849
return left_data
4950

5051
async def drop(self):
51-
self._data = {}
52+
if self._data:
53+
self._data.clear()
5254

5355

5456
@dataclass
@@ -87,3 +89,23 @@ async def upsert(self, data: list):
8789

8890
async def drop(self):
8991
self._data = []
92+
93+
94+
@dataclass
95+
class MetaJsonKVStorage(JsonKVStorage):
96+
def __post_init__(self):
97+
self._file_name = os.path.join(self.working_dir, f"{self.namespace}.json")
98+
self._data = load_json(self._file_name) or {}
99+
logger.info("Load KV %s with %d data", self.namespace, len(self._data))
100+
101+
async def get_new_data(self, storage_instance: "JsonKVStorage") -> dict:
102+
new_data = {}
103+
for k, v in storage_instance.data.items():
104+
if k not in self._data:
105+
new_data[k] = v
106+
return new_data
107+
108+
async def mark_done(self, storage_instance: "JsonKVStorage"):
109+
new_data = await self.get_new_data(storage_instance)
110+
if new_data:
111+
self._data.update(new_data)

0 commit comments

Comments
 (0)