@@ -71,6 +71,7 @@ def __init__(
7171 self .search_storage : JsonKVStorage = JsonKVStorage (
7272 self .working_dir , namespace = "search"
7373 )
74+
7475 self .rephrase_storage : JsonKVStorage = JsonKVStorage (
7576 self .working_dir , namespace = "rephrase"
7677 )
@@ -81,6 +82,10 @@ def __init__(
8182 os .path .join (self .working_dir , "data" , "graphgen" , f"{ self .unique_id } " ),
8283 namespace = "qa" ,
8384 )
85+ self .extract_storage : JsonKVStorage = JsonKVStorage (
86+ os .path .join (self .working_dir , "data" , "graphgen" , f"{ self .unique_id } " ),
87+ namespace = "extraction" ,
88+ )
8489
8590 # webui
8691 self .progress_bar : gr .Progress = progress_bar
@@ -104,16 +109,30 @@ async def read(self, read_config: Dict):
104109 _add_doc_keys = await self .full_docs_storage .filter_keys (list (new_docs .keys ()))
105110 new_docs = {k : v for k , v in new_docs .items () if k in _add_doc_keys }
106111
112+ if len (new_docs ) == 0 :
113+ logger .warning ("All documents are already in the storage" )
114+ return
115+
116+ await self .full_docs_storage .upsert (new_docs )
117+ await self .full_docs_storage .index_done_callback ()
118+
119+ @op ("chunk" , deps = ["read" ])
120+ @async_to_sync_method
121+ async def chunk (self , chunk_config : Dict ):
122+ """
123+ chunk documents into smaller pieces from full_docs_storage if not already present
124+ """
125+
126+ new_docs = await self .meta_storage .get_new_data (self .full_docs_storage )
107127 if len (new_docs ) == 0 :
108128 logger .warning ("All documents are already in the storage" )
109129 return
110130
111131 inserting_chunks = await chunk_documents (
112132 new_docs ,
113- read_config ["chunk_size" ],
114- read_config ["chunk_overlap" ],
115133 self .tokenizer_instance ,
116134 self .progress_bar ,
135+ ** chunk_config ,
117136 )
118137
119138 _add_chunk_keys = await self .chunks_storage .filter_keys (
@@ -127,12 +146,12 @@ async def read(self, read_config: Dict):
127146 logger .warning ("All chunks are already in the storage" )
128147 return
129148
130- await self .full_docs_storage .upsert (new_docs )
131- await self .full_docs_storage .index_done_callback ()
132149 await self .chunks_storage .upsert (inserting_chunks )
133150 await self .chunks_storage .index_done_callback ()
151+ await self .meta_storage .mark_done (self .full_docs_storage )
152+ await self .meta_storage .index_done_callback ()
134153
135- @op ("build_kg" , deps = ["read " ])
154+ @op ("build_kg" , deps = ["chunk " ])
136155 @async_to_sync_method
137156 async def build_kg (self ):
138157 """
@@ -162,7 +181,7 @@ async def build_kg(self):
162181
163182 return _add_entities_and_relations
164183
165- @op ("search" , deps = ["read " ])
184+ @op ("search" , deps = ["chunk " ])
166185 @async_to_sync_method
167186 async def search (self , search_config : Dict ):
168187 logger .info (
@@ -249,7 +268,7 @@ async def partition(self, partition_config: Dict):
249268 await self .partition_storage .upsert (batches )
250269 return batches
251270
252- @op ("extract" , deps = ["read " ])
271+ @op ("extract" , deps = ["chunk " ])
253272 @async_to_sync_method
254273 async def extract (self , extract_config : Dict ):
255274 logger .info ("Extracting information from given chunks..." )
@@ -263,7 +282,11 @@ async def extract(self, extract_config: Dict):
263282 if not results :
264283 logger .warning ("No information extracted" )
265284 return
266- print (results )
285+
286+ await self .extract_storage .upsert (results )
287+ await self .extract_storage .index_done_callback ()
288+ await self .meta_storage .mark_done (self .chunks_storage )
289+ await self .meta_storage .index_done_callback ()
267290
268291 @op ("generate" , deps = ["partition" ])
269292 @async_to_sync_method
0 commit comments