wip

ChenZiHong-Gavin · ChenZiHong-Gavin · commit f89a32030265 · 2025-11-07T15:40:18.000+08:00
diff --git a/graphgen/bases/base_extractor.py b/graphgen/bases/base_extractor.py
@@ -14,7 +14,7 @@ def __init__(self, llm_client: BaseLLMWrapper):
         self.llm_client = llm_client
 
     @abstractmethod
-    def extract(self, text_or_documents: str) -> Any:
+    async def extract(self, chunk: dict) -> Any:  # coroutine; receives a chunk dict rather than raw text
         """Extract information from the given text"""
 
     @abstractmethod
diff --git a/graphgen/configs/schema_guided_config.yaml b/graphgen/configs/schema_guided_config.yaml
@@ -1,11 +1,11 @@
 pipeline:
-  - name: insert
+  - name: read
     params:
-      input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      chunk_size: 10240 # chunk size for text splitting
+      input_file: resources/input_examples/extract_demo.txt # input file path; supports json, jsonl, txt, pdf. See resources/input_examples for examples
+      chunk_size: 20480 # chunk size for text splitting
       chunk_overlap: 100 # chunk overlap for text splitting
 
   - name: extract
     params:
       method: schema_guided # extraction method, support: schema_guided
-      schema_file: resources/schemas/legal_contract.json # schema file path for schema_guided method
+      schema_file: graphgen/templates/extraction/schemas/legal_contract.json # schema file path for schema_guided method
diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py
@@ -249,7 +249,7 @@ async def partition(self, partition_config: Dict):
         await self.partition_storage.upsert(batches)
         return batches
 
-    @op("extract", deps=["insert"])
+    @op("extract", deps=["read"])
     @async_to_sync_method
     async def extract(self, extract_config: Dict):
         logger.info("Extracting information from given chunks...")
diff --git a/graphgen/models/extractor/schema_guided_extractor.py b/graphgen/models/extractor/schema_guided_extractor.py
@@ -1,4 +1,8 @@
+import json
+
 from graphgen.bases import BaseExtractor, BaseLLMWrapper
+from graphgen.templates import SCHEMA_GUIDED_EXTRACTION_PROMPT
+from graphgen.utils import compute_dict_hash, detect_main_language
 
 
 class SchemaGuidedExtractor(BaseExtractor):
@@ -33,9 +37,42 @@ class SchemaGuidedExtractor(BaseExtractor):
     def __init__(self, llm_client: BaseLLMWrapper, schema: dict):
         super().__init__(llm_client)
         self.schema = schema
+        # Without a non-empty "required" list, every declared property counts as required.
+        self.required_keys = self.schema.get("required") or list(
+            self.schema.get("properties", {}).keys()
+        )
 
     def build_prompt(self, text: str) -> str:
-        pass
+        # One bullet line per schema property; a missing description gets a placeholder.
+        schema_explanation = "".join(
+            f'- "{field}": {details.get("description", "No description provided.")}\n'
+            for field, details in self.schema.get("properties", {}).items()
+        )
+
+        # The template is looked up by the value detect_main_language returns for the text.
+        lang = detect_main_language(text)
+        return SCHEMA_GUIDED_EXTRACTION_PROMPT[lang].format(
+            field=self.schema.get("name", "the document"),
+            schema_explanation=schema_explanation,
+            examples="",
+            text=text,
+        )
 
     async def extract(self, chunk: dict) -> dict:
-        print(chunk)
+        """Extract schema fields from chunk["text"]; return {hash: fields} or {} on failure."""
+        prompt = self.build_prompt(chunk.get("text", ""))
+        response = await self.llm_client.generate_answer(prompt)
+        try:
+            extracted_info = json.loads(response)
+        except json.JSONDecodeError:
+            return {}
+        # Non-object JSON (list, string, number, null) cannot carry the schema keys.
+        if not isinstance(extracted_info, dict):
+            return {}
+        # A required key that is missing or empty invalidates the whole extraction.
+        if any(extracted_info.get(key, "") == "" for key in self.required_keys):
+            return {}
+        main_keys_info = {key: extracted_info[key] for key in self.required_keys}
+        return {compute_dict_hash(main_keys_info): extracted_info}
+
+    # async def merge_extractions(self):
diff --git a/graphgen/operators/extract/extract_info.py b/graphgen/operators/extract/extract_info.py
@@ -1,4 +1,4 @@
-from typing import List
+import json
 
 import gradio as gr
 
@@ -25,7 +25,9 @@ async def extract_info(
 
     method = extract_config.get("method")
     if method == "schema_guided":
-        schema = extract_config.get("schema")
+        schema_file = extract_config.get("schema_file")
+        with open(schema_file, "r", encoding="utf-8") as f:
+            schema = json.load(f)
         extractor = SchemaGuidedExtractor(llm_client, schema)
     else:
         raise ValueError(f"Unsupported extraction method: {method}")
@@ -41,6 +43,7 @@ async def extract_info(
         unit="chunk",
         progress_bar=progress_bar,
     )
+    print(results)
 
     # TODO: 对results合并，去重
 
diff --git a/graphgen/templates/extraction/schema_guided_extraction.py b/graphgen/templates/extraction/schema_guided_extraction.py
@@ -11,14 +11,21 @@
 - Consider the context of the entire document when determining relevance.
 - Do not be verbose, only respond with the correct format and information.
 - Some docs may have multiple relevant excerpts -- include all that apply.
-- Some questions may have no relevant excerpts -- just return ["N/A"].
+- Some questions may have no relevant excerpts -- just return "".
 - Do not include additional JSON keys beyond the ones listed here.
 - Do not include the same key multiple times in the JSON.
 - Use English for your response.
 
 Expected JSON keys and explanation of what they are:
 {schema_explanation}
 
+Expected format:
+{{
+    "key1": "value1",
+    "key2": "value2",
+    ...
+}}
+
 {examples}
 
 Document to extract from:
@@ -37,14 +44,21 @@
 - 在确定相关性时，考虑整份文件的上下文。
 - 不要冗长，只需以正确的格式和信息进行回应。
 - 有些文件可能有多个相关摘录——请包含所有适用的内容。
-- 有些问题可能没有相关摘录——只需返回["N/A"]。
+- 有些问题可能没有相关摘录——只需返回""。
 - 不要在JSON中包含除列出的键之外的其他键。
 - 不要多次包含同一个键。
 - 使用中文回答。
 
 预期的JSON键及其说明：
 {schema_explanation}
 
+预期格式：
+{{
+    "key1": "value1",
+    "key2": "value2",
+    ...
+}}
+
 {examples}
 要提取的文件：
 {text}
diff --git a/graphgen/utils/__init__.py b/graphgen/utils/__init__.py
@@ -9,7 +9,12 @@
     split_string_by_multi_markers,
     write_json,
 )
-from .hash import compute_args_hash, compute_content_hash, compute_mm_hash
+from .hash import (
+    compute_args_hash,
+    compute_content_hash,
+    compute_dict_hash,
+    compute_mm_hash,
+)
 from .help_nltk import NLTKHelper
 from .log import logger, parse_log, set_logger
 from .loop import create_event_loop
diff --git a/graphgen/utils/hash.py b/graphgen/utils/hash.py
@@ -21,3 +21,8 @@ def compute_mm_hash(item, prefix: str = ""):
     else:
         content = str(item)
     return prefix + md5(content.encode()).hexdigest()
+
+
+def compute_dict_hash(d: dict, prefix: str = ""):
+    """Return prefix + MD5 hex digest of the str() of the dict's sorted (key, value) items."""
+    return prefix + md5(str(tuple(sorted(d.items()))).encode()).hexdigest()