Skip to content

Commit 9de5b2f

Browse files
wip: add schema_guided_extractor
1 parent 33e1b27 commit 9de5b2f

8 files changed

Lines changed: 171 additions & 0 deletions

File tree

graphgen/bases/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from .base_extractor import BaseExtractor
12
from .base_generator import BaseGenerator
23
from .base_kg_builder import BaseKGBuilder
34
from .base_llm_wrapper import BaseLLMWrapper

graphgen/bases/base_extractor.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from abc import ABC, abstractmethod
2+
from typing import Any
3+
4+
from graphgen.bases.base_llm_wrapper import BaseLLMWrapper
5+
6+
7+
class BaseExtractor(ABC):
    """
    Abstract base class for extracting information from given text.

    Concrete subclasses must implement both ``build_prompt`` and ``extract``;
    the class cannot be instantiated until they do.
    """

    def __init__(self, llm_client: BaseLLMWrapper):
        # Store the LLM client on the instance.
        self.llm_client = llm_client

    @abstractmethod
    def build_prompt(self, text: str) -> str:
        """Build the LLM prompt for the given text.

        :param text: input text the prompt is built from
        :return: the prompt string
        """

    @abstractmethod
    def extract(self, text_or_documents: str) -> Any:
        """Extract information from the given text.

        :param text_or_documents: input to extract from
        :return: the extracted information; its shape is defined by the subclass
        """
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .schema_guided_extractor import SchemaGuidedExtractor
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# TODO: text2json
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
from graphgen.bases import BaseExtractor, BaseLLMWrapper
2+
3+
4+
class SchemaGuidedExtractor(BaseExtractor):
    """
    Use JSON/YAML Schema or Pydantic Model to guide the LLM to extract
    structured information from text.

    NOTE: work in progress. ``build_prompt`` and ``extract`` are stubs that
    currently return ``None``; the schema is only stored on the instance.

    Usage example:
        schema = {
            "type": "object",
            "name": "legal contract",
            "description": "A legal contract for leasing property.",
            "properties": {
                "end_date": {"type": "string", "description": "The end date of the lease."},
                "leased_space": {"type": "string", "description": "Description of the space that is being leased."},
                "lessee": {"type": "string", "description": "The lessee's name (and possibly address)."},
                "lessor": {"type": "string", "description": "The lessor's name (and possibly address)."},
                "signing_date": {"type": "string", "description": "The date the contract was signed."},
                "start_date": {"type": "string", "description": "The start date of the lease."},
                "term_of_payment": {"type": "string", "description": "Description of the payment terms."},
                "designated_use": {"type": "string",
                    "description": "Description of the designated use of the property being leased."},
                "extension_period": {"type": "string",
                    "description": "Description of the extension options for the lease."},
                "expiration_date_of_lease": {"type": "string", "description": "The expiration date of the lease."}
            },
            "required": ["lessee", "lessor", "start_date", "end_date"]
        }
        extractor = SchemaGuidedExtractor(llm_client, schema)
        result = extractor.extract(text)

    """

    def __init__(self, llm_client: BaseLLMWrapper, schema: dict):
        """
        :param llm_client: LLM client, forwarded to ``BaseExtractor.__init__``
        :param schema: schema describing the fields to extract; stored as-is,
            no validation is performed here
        """
        super().__init__(llm_client)
        self.schema = schema

    def build_prompt(self, text: str) -> str:
        """Build the extraction prompt for ``text``.

        Not implemented yet: the body is empty, so this returns ``None``.
        """

    def extract(self, text_or_documents: str) -> dict:
        """Extract structured information from ``text_or_documents``.

        Not implemented yet: the body is empty, so this returns ``None``.
        """

graphgen/operators/extract/__init__.py

Whitespace-only changes.
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
from typing import List
2+
3+
import gradio as gr
4+
5+
from graphgen.bases import BaseLLMWrapper
6+
from graphgen.bases.datatypes import Chunk
7+
from graphgen.models.extractor import SchemaGuidedExtractor
8+
from graphgen.utils import logger, run_concurrent
9+
10+
11+
async def extract(
    llm_client: BaseLLMWrapper,
    chunks: List[Chunk],
    generation_config: dict,
    progress_bar: gr.Progress = None,
):
    """
    Extract information from chunks.

    NOTE: work in progress. The extractor is constructed but never run, so
    for a supported method this currently always returns an empty list.

    :param llm_client: LLM client handed to the extractor
    :param chunks: chunks to extract from; for now only their count is logged
    :param generation_config: config dict; reads ``"method"`` and, for the
        ``"schema_guided"`` method, ``"schema"``
    :param progress_bar: currently unused; only referenced by the disabled
        concurrent run below
    :return: extracted information (currently always ``[]``)
    :raises ValueError: if ``method`` is anything other than ``"schema_guided"``
    """
    method = generation_config.get("method")
    # Guard clause: "schema_guided" is the only supported method so far.
    if method != "schema_guided":
        raise ValueError(f"Unsupported extraction method: {method}")

    schema = generation_config.get("schema")
    extractor = SchemaGuidedExtractor(llm_client, schema)
    # Debug output goes through the logger instead of printing to stdout.
    logger.debug("[Extraction] extractor: %s", extractor)

    logger.info("[Extraction] method: %s, chunks: %d", method, len(chunks))

    # results = await run_concurrent(
    #     extractor.extract,
    #     [chunk.content for chunk in chunks],
    #     desc="Extracting information",
    #     unit="chunk",
    #     progress_bar=progress_bar,
    # )
    #
    # # TODO: merge and deduplicate results
    # return results

    return []
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
{
2+
"type": "object",
3+
"name": "legal contract",
4+
"description": "A legal contract for leasing property.",
5+
"properties": {
6+
"end_date": {
7+
"type": "array",
8+
"items": {"type": "string"},
9+
"description": "The end date of the lease."
10+
},
11+
"leased_space": {
12+
"type": "array",
13+
"items": {"type": "string"},
14+
"description": "Description of the space that is being leased."
15+
},
16+
"lessee": {
17+
"type": "array",
18+
"items": {"type": "string"},
19+
"description": "The lessee's name (and possibly address)."
20+
},
21+
"lessor": {
22+
"type": "array",
23+
"items": {"type": "string"},
24+
"description": "The lessor's name (and possibly address)."
25+
},
26+
"signing_date": {
27+
"type": "array",
28+
"items": {"type": "string"},
29+
"description": "The date the contract was signed."
30+
},
31+
"start_date": {
32+
"type": "array",
33+
"items": {"type": "string"},
34+
"description": "The start date of the lease."
35+
},
36+
"term_of_payment": {
37+
"type": "array",
38+
"items": {"type": "string"},
39+
"description": "Description of the payment terms."
40+
},
41+
"designated_use": {
42+
"type": "array",
43+
"items": {"type": "string"},
44+
"description": "Designated use of the property being leased."
45+
},
46+
"extension_period": {
47+
"type": "array",
48+
"items": {"type": "string"},
49+
"description": "Description of the extension options for the lease."
50+
},
51+
"expiration_date_of_lease": {
52+
"type": "array",
53+
"items": {"type": "string"},
54+
"description": "The expiration date of the lease."
55+
}
56+
},
57+
"required": ["lessee", "lessor", "start_date", "end_date"]
58+
}

0 commit comments

Comments
 (0)