Skip to content

Commit 3f79260

Browse files
feat: complete extract_info pipeline
1 parent 912508c commit 3f79260

10 files changed

Lines changed: 94 additions & 30 deletions

File tree

graphgen/configs/aggregated_config.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      chunk_size: 1024 # chunk size for text splitting
-      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
 
   - name: build_kg

graphgen/configs/atomic_config.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
       chunk_size: 1024 # chunk size for text splitting
       chunk_overlap: 100 # chunk overlap for text splitting
 

graphgen/configs/cot_config.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@ pipeline:
   - name: read
     params:
      input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      chunk_size: 1024 # chunk size for text splitting
-      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
 
   - name: build_kg

graphgen/configs/multi_hop_config.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
       chunk_size: 1024 # chunk size for text splitting
       chunk_overlap: 100 # chunk overlap for text splitting
 

graphgen/configs/schema_guided_config.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,12 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/extract_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      chunk_size: 20480 # chunk size for text splitting
-      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: chunk
+    params:
+      chunk_size: 20480
+      chunk_overlap: 2000
+      separators: []
 
   - name: extract
     params:

graphgen/configs/vqa_config.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      chunk_size: 1024 # chunk size for text splitting
-      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
 
   - name: build_kg

graphgen/graphgen.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ def __init__(
         self.search_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="search"
         )
+
         self.rephrase_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="rephrase"
         )
@@ -81,6 +82,10 @@ def __init__(
             os.path.join(self.working_dir, "data", "graphgen", f"{self.unique_id}"),
             namespace="qa",
         )
+        self.extract_storage: JsonKVStorage = JsonKVStorage(
+            os.path.join(self.working_dir, "data", "graphgen", f"{self.unique_id}"),
+            namespace="extraction",
+        )
 
         # webui
         self.progress_bar: gr.Progress = progress_bar
@@ -104,16 +109,30 @@ async def read(self, read_config: Dict):
         _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
         new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
 
+        if len(new_docs) == 0:
+            logger.warning("All documents are already in the storage")
+            return
+
+        await self.full_docs_storage.upsert(new_docs)
+        await self.full_docs_storage.index_done_callback()
+
+    @op("chunk", deps=["read"])
+    @async_to_sync_method
+    async def chunk(self, chunk_config: Dict):
+        """
+        chunk documents into smaller pieces from full_docs_storage if not already present
+        """
+
+        new_docs = await self.meta_storage.get_new_data(self.full_docs_storage)
         if len(new_docs) == 0:
             logger.warning("All documents are already in the storage")
             return
 
         inserting_chunks = await chunk_documents(
             new_docs,
-            read_config["chunk_size"],
-            read_config["chunk_overlap"],
             self.tokenizer_instance,
             self.progress_bar,
+            **chunk_config,
         )
 
         _add_chunk_keys = await self.chunks_storage.filter_keys(
@@ -127,12 +146,12 @@ async def read(self, read_config: Dict):
             logger.warning("All chunks are already in the storage")
             return
 
-        await self.full_docs_storage.upsert(new_docs)
-        await self.full_docs_storage.index_done_callback()
         await self.chunks_storage.upsert(inserting_chunks)
         await self.chunks_storage.index_done_callback()
+        await self.meta_storage.mark_done(self.full_docs_storage)
+        await self.meta_storage.index_done_callback()
 
-    @op("build_kg", deps=["read"])
+    @op("build_kg", deps=["chunk"])
     @async_to_sync_method
     async def build_kg(self):
         """
@@ -162,7 +181,7 @@ async def build_kg(self):
 
         return _add_entities_and_relations
 
-    @op("search", deps=["read"])
+    @op("search", deps=["chunk"])
     @async_to_sync_method
     async def search(self, search_config: Dict):
         logger.info(
@@ -249,7 +268,7 @@ async def partition(self, partition_config: Dict):
         await self.partition_storage.upsert(batches)
         return batches
 
-    @op("extract", deps=["read"])
+    @op("extract", deps=["chunk"])
    @async_to_sync_method
     async def extract(self, extract_config: Dict):
         logger.info("Extracting information from given chunks...")
@@ -263,7 +282,11 @@ async def extract(self, extract_config: Dict):
         if not results:
             logger.warning("No information extracted")
             return
-        print(results)
+
+        await self.extract_storage.upsert(results)
+        await self.extract_storage.index_done_callback()
+        await self.meta_storage.mark_done(self.chunks_storage)
+        await self.meta_storage.index_done_callback()
 
     @op("generate", deps=["partition"])
     @async_to_sync_method

graphgen/models/extractor/schema_guided_extractor.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
 import json
+from typing import Dict, List
 
 from graphgen.bases import BaseExtractor, BaseLLMWrapper
 from graphgen.templates import SCHEMA_GUIDED_EXTRACTION_PROMPT
-from graphgen.utils import compute_dict_hash, detect_main_language
+from graphgen.utils import compute_dict_hash, detect_main_language, logger
 
 
 class SchemaGuidedExtractor(BaseExtractor):
@@ -69,10 +70,32 @@ async def extract(self, chunk: dict) -> dict:
                 if key not in extracted_info:
                     extracted_info[key] = ""
             if any(extracted_info[key] == "" for key in self.required_keys):
+                logger.debug("Missing required keys in extraction: %s", extracted_info)
                 return {}
             main_keys_info = {key: extracted_info[key] for key in self.required_keys}
-            return {compute_dict_hash(main_keys_info): extracted_info}
+            logger.debug("Extracted info: %s", extracted_info)
+            return {compute_dict_hash(main_keys_info, prefix="extract"): extracted_info}
         except json.JSONDecodeError:
+            logger.error("Failed to parse extraction response: %s", response)
             return {}
 
-    # async def merge_extractions(self):
+    async def merge_extractions(
+        self, extraction_list: List[Dict[str, dict]]
+    ) -> Dict[str, dict]:
+        """
+        Merge multiple extraction results based on their hashes.
+        :param extraction_list: List of extraction results, each is a dict with hash as key and record as value.
+        :return: Merged extraction results.
+        """
+        merged: Dict[str, dict] = {}
+        for ext in extraction_list:
+            for h, rec in ext.items():
+                if h not in merged:
+                    merged[h] = rec.copy()
+                else:
+                    for k, v in rec.items():
+                        if k not in merged[h] or merged[h][k] == v:
+                            merged[h][k] = v
+                        else:
+                            merged[h][k] = f"{merged[h][k]}<SEP>{v}"
+        return merged

graphgen/operators/extract/extract_info.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
 import gradio as gr
 
 from graphgen.bases import BaseKVStorage, BaseLLMWrapper
-from graphgen.bases.datatypes import Chunk
 from graphgen.models.extractor import SchemaGuidedExtractor
 from graphgen.utils import logger, run_concurrent
 
@@ -34,7 +33,7 @@ async def extract_info(
 
     chunks = await chunk_storage.get_all()
     chunks = [{k: v} for k, v in chunks.items()]
-    logger.info(f"Start extracting information from {len(chunks)} chunks")
+    logger.info("Start extracting information from %d chunks", len(chunks))
 
     results = await run_concurrent(
         extractor.extract,
@@ -43,8 +42,6 @@ async def extract_info(
         unit="chunk",
         progress_bar=progress_bar,
     )
-    print(results)
 
-    # TODO: 对results合并,去重
-
-    return []
+    results = await extractor.merge_extractions(results)
+    return results

graphgen/operators/split/split_chunks.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,18 @@ def split_chunks(text: str, language: str = "en", **kwargs) -> list:
             f"Unsupported language: {language}. "
             f"Supported languages are: {list(_MAPPING.keys())}"
         )
-    splitter = _get_splitter(language, frozenset(kwargs.items()))
+    frozen_kwargs = frozenset(
+        (k, tuple(v) if isinstance(v, list) else v) for k, v in kwargs.items()
+    )
+    splitter = _get_splitter(language, frozen_kwargs)
     return splitter.split_text(text)
 
 
 async def chunk_documents(
     new_docs: dict,
-    chunk_size: int = 1024,
-    chunk_overlap: int = 100,
     tokenizer_instance: Tokenizer = None,
     progress_bar=None,
+    **kwargs,
 ) -> dict:
     inserting_chunks = {}
     cur_index = 1
@@ -51,11 +53,11 @@ async def chunk_documents(
         doc_type = doc.get("type")
         if doc_type == "text":
             doc_language = detect_main_language(doc["content"])
+
             text_chunks = split_chunks(
                 doc["content"],
                 language=doc_language,
-                chunk_size=chunk_size,
-                chunk_overlap=chunk_overlap,
+                **kwargs,
             )
 
             chunks = {

0 commit comments

Comments (0)