Skip to content

Commit 0cefddf

Browse files
wip: add extract_info
1 parent 011a1e5 commit 0cefddf

9 files changed

Lines changed: 85 additions & 57 deletions

File tree

graphgen/bases/base_storage.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ async def get_by_ids(
4545
) -> list[Union[T, None]]:
4646
raise NotImplementedError
4747

    async def get_all(self) -> dict[str, T]:
        """Return every stored record as an ``id -> value`` mapping.

        Abstract hook: concrete storage backends override this; the base
        implementation always raises.
        """
        raise NotImplementedError
4851
    async def filter_keys(self, data: list[str]) -> set[str]:
        """Return the subset of *data* keys that do not yet exist in storage."""
        raise NotImplementedError
Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1-
read:
2-
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
3-
split:
4-
chunk_size: 10240 # chunk size for text splitting
5-
chunk_overlap: 100 # chunk overlap for text splitting
6-
extract:
7-
method: schema_guided # extraction method, support: schema_guided
8-
schema_file: resources/schemas/legal_contract.json # schema file path for schema_guided method
1+
pipeline:
2+
- name: insert
3+
params:
4+
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5+
chunk_size: 10240 # chunk size for text splitting
6+
chunk_overlap: 100 # chunk overlap for text splitting
7+
8+
- name: extract
9+
params:
10+
method: schema_guided # extraction method, support: schema_guided
11+
schema_file: resources/schemas/legal_contract.json # schema file path for schema_guided method

graphgen/graphgen.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from graphgen.operators import (
2020
build_kg,
2121
chunk_documents,
22+
extract_info,
2223
generate_qas,
2324
init_llm,
2425
judge_statement,
@@ -240,6 +241,22 @@ async def partition(self, partition_config: Dict):
240241
await self.partition_storage.upsert(batches)
241242
return batches
242243

244+
@op("extract", deps=["insert"])
245+
@async_to_sync_method
246+
async def extract(self, extract_config: Dict):
247+
logger.info("Extracting information from given chunks...")
248+
249+
results = await extract_info(
250+
self.synthesizer_llm_client,
251+
self.chunks_storage,
252+
extract_config,
253+
progress_bar=self.progress_bar,
254+
)
255+
if not results:
256+
logger.warning("No information extracted")
257+
return
258+
print(results)
259+
243260
@op("generate", deps=["insert", "partition"])
244261
@async_to_sync_method
245262
async def generate(self, generate_config: Dict):

graphgen/models/extractor/schema_guided_extractor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,5 @@ def __init__(self, llm_client: BaseLLMWrapper, schema: dict):
3737
    def build_prompt(self, text: str) -> str:
        # TODO(wip): build the schema-guided extraction prompt for *text*.
        pass

    async def extract(self, chunk: dict) -> dict:
        # TODO(wip): prompt the LLM with the chunk and parse its answer.
        # NOTE(review): currently a stub — it prints the chunk and returns
        # None despite the ``-> dict`` annotation; confirm intended contract.
        print(chunk)

graphgen/models/storage/json_storage.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ async def get_by_ids(self, ids, fields=None) -> list:
3939
for id in ids
4040
]
4141

    async def get_all(self) -> dict[str, str]:
        # NOTE(review): returns the live internal dict, not a copy — callers
        # mutating the result mutate the storage. Confirm this aliasing is
        # intended before relying on it.
        return self._data
4245
async def filter_keys(self, data: list[str]) -> set[str]:
4346
return {s for s in data if s not in self._data}
4447

graphgen/operators/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from .build_kg import build_kg
2+
from .extract import extract_info
23
from .generate import generate_qas
34
from .init import init_llm
45
from .judge import judge_statement
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .extract_info import extract_info

graphgen/operators/extract/extract.py

Lines changed: 0 additions & 47 deletions
This file was deleted.
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
import json
from typing import List

import gradio as gr

from graphgen.bases import BaseKVStorage, BaseLLMWrapper
from graphgen.bases.datatypes import Chunk
from graphgen.models.extractor import SchemaGuidedExtractor
from graphgen.utils import logger, run_concurrent
9+
10+
11+
async def extract_info(
    llm_client: BaseLLMWrapper,
    chunk_storage: BaseKVStorage,
    extract_config: dict,
    progress_bar: gr.Progress = None,
):
    """Extract structured information from every stored chunk.

    :param llm_client: LLM client used by the extractor
    :param chunk_storage: KV storage holding the chunks to process
    :param extract_config: extraction settings; supports ``method``
        ("schema_guided") plus either an inline ``schema`` dict or a
        ``schema_file`` path to a JSON schema
    :param progress_bar: optional gradio progress bar
    :return: list of per-chunk extraction results (empty when no chunks)
    :raises ValueError: if ``method`` is not supported
    """
    method = extract_config.get("method")
    if method == "schema_guided":
        # The shipped pipeline config provides "schema_file", not an inline
        # "schema" — accept both so the config actually reaches the extractor.
        schema = extract_config.get("schema")
        if schema is None and extract_config.get("schema_file"):
            with open(extract_config["schema_file"], encoding="utf-8") as f:
                schema = json.load(f)
        extractor = SchemaGuidedExtractor(llm_client, schema)
    else:
        raise ValueError(f"Unsupported extraction method: {method}")

    chunks = await chunk_storage.get_all()
    # Wrap each (id, chunk) pair as a single-entry dict so the extractor
    # keeps the chunk id alongside its content.
    chunks = [{k: v} for k, v in chunks.items()]
    logger.info(f"Start extracting information from {len(chunks)} chunks")

    results = await run_concurrent(
        extractor.extract,
        chunks,
        desc="Extracting information",
        unit="chunk",
        progress_bar=progress_bar,
    )

    # Return the per-chunk results instead of discarding them: the previous
    # `return []` made the caller always log "No information extracted".
    # TODO: merge and deduplicate results across chunks.
    return results

0 commit comments

Comments
 (0)