refactor: refactor quiz to accommodate Ray Data engine

ChenZiHong-Gavin · ChenZiHong-Gavin · commit ee0639dbc069 · 2025-12-05T12:29:30.000+08:00
diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py
@@ -1,9 +1,19 @@
-from .build_kg import build_kg
+from .build_kg import BuildKGService
+from .chunk import ChunkService
 from .extract import extract_info
 from .generate import generate_qas
-from .init import init_llm
 from .partition import partition_kg
-from .quiz_and_judge import judge_statement, quiz
+from .quiz import QuizService
 from .read import read
 from .search import search_all
-from .split import chunk_documents
+
+operators = {
+    "read": read,
+    "chunk": ChunkService,
+    "build_kg": BuildKGService,
+    "quiz": QuizService,
+    "extract_info": extract_info,
+    "search_all": search_all,
+    "partition_kg": partition_kg,
+    "generate_qas": generate_qas,
+}
diff --git a/graphgen/operators/build_kg/build_kg_service.py b/graphgen/operators/build_kg/build_kg_service.py
@@ -24,7 +24,7 @@ def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:
 
         # consume the chunks and build kg
         self.build_kg(docs)
-        return pd.DataFrame()
+        return pd.DataFrame([{"status": "kg_building_completed"}])
 
     def build_kg(self, chunks: List[Chunk]) -> None:
         """
diff --git a/graphgen/operators/evaluate/__init__.py b/graphgen/operators/evaluate/__init__.py
diff --git a/graphgen/operators/judge/__init__.py b/graphgen/operators/judge/__init__.py
diff --git a/graphgen/operators/quiz/__init__.py b/graphgen/operators/quiz/__init__.py
@@ -0,0 +1 @@
+from .quiz import QuizService
diff --git a/graphgen/operators/quiz/quiz.py b/graphgen/operators/quiz/quiz.py
@@ -1,93 +1,107 @@
-from collections import defaultdict
-
-import gradio as gr
-
-from graphgen.bases import BaseLLMWrapper
-from graphgen.models import JsonKVStorage, NetworkXStorage, QuizGenerator
-from graphgen.utils import logger, run_concurrent
-
-
-async def quiz(
-    synth_llm_client: BaseLLMWrapper,
-    graph_storage: NetworkXStorage,
-    rephrase_storage: JsonKVStorage,
-    max_samples: int = 1,
-    progress_bar: gr.Progress = None,
-) -> JsonKVStorage:
-    """
-    Get all edges and quiz them using QuizGenerator.
-
-    :param synth_llm_client: generate statements
-    :param graph_storage: graph storage instance
-    :param rephrase_storage: rephrase storage instance
-    :param max_samples: max samples for each edge
-    :param progress_bar
-    :return:
-    """
-
-    generator = QuizGenerator(synth_llm_client)
-
-    async def _process_single_quiz(item: tuple[str, str, str]):
-        description, template_type, gt = item
-        try:
-            # if rephrase_storage exists already, directly get it
-            descriptions = rephrase_storage.get_by_id(description)
-            if descriptions:
-                return None
-
-            prompt = generator.build_prompt_for_description(description, template_type)
-            new_description = await synth_llm_client.generate_answer(
-                prompt, temperature=1
-            )
-            rephrased_text = generator.parse_rephrased_text(new_description)
-            return {description: [(rephrased_text, gt)]}
-
-        except Exception as e:  # pylint: disable=broad-except
-            logger.error("Error when quizzing description %s: %s", description, e)
+from collections.abc import Iterable
+
+import pandas as pd
+
+from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseLLMWrapper
+from graphgen.common import init_llm, init_storage
+from graphgen.models import QuizGenerator
+from graphgen.utils import compute_content_hash, logger, run_concurrent
+
+
+class QuizService:
+    def __init__(self, working_dir: str = "cache", quiz_samples: int = 1):
+        self.quiz_samples = quiz_samples
+        self.llm_client: BaseLLMWrapper = init_llm("synthesizer")
+        self.graph_storage: BaseGraphStorage = init_storage(
+            backend="networkx", working_dir=working_dir, namespace="graph"
+        )
+        # { _description_id: { "description": str, "quizzes": List[Tuple[str, str]] } }
+        self.quiz_storage: BaseKVStorage = init_storage(
+            backend="json_kv", working_dir=working_dir, namespace="quiz"
+        )
+        self.generator = QuizGenerator(self.llm_client)
+
+        self.concurrency_limit = 20
+
+    def __call__(self, batch: pd.DataFrame) -> Iterable[pd.DataFrame]:
+        # this operator does not consume any batch data
+        # but for compatibility we keep the interface
+        _ = batch.to_dict(orient="records")
+
+        yield from self.quiz()
+
+    async def _process_single_quiz(self, item: str) -> dict | None:
+        # skip descriptions that already have a quiz stored in quiz_storage
+        _description_id = compute_content_hash(item)
+        if self.quiz_storage.get_by_id(_description_id):
             return None
 
-    edges = graph_storage.get_all_edges()
-    nodes = graph_storage.get_all_nodes()
-
-    results = defaultdict(list)
-    items = []
-    for edge in edges:
-        edge_data = edge[2]
-        description = edge_data["description"]
-
-        results[description] = [(description, "yes")]
-
-        for i in range(max_samples):
+        tasks = []
+        for i in range(self.quiz_samples):
             if i > 0:
-                items.append((description, "TEMPLATE", "yes"))
-            items.append((description, "ANTI_TEMPLATE", "no"))
-
-    for node in nodes:
-        node_data = node[1]
-        description = node_data["description"]
+                tasks.append((item, "TEMPLATE", "yes"))
+            tasks.append((item, "ANTI_TEMPLATE", "no"))
+        try:
+            quizzes = []
+            for description, template_type, gt in tasks:
+                prompt = self.generator.build_prompt_for_description(
+                    description, template_type
+                )
+                new_description = await self.llm_client.generate_answer(
+                    prompt, temperature=1
+                )
+                rephrased_text = self.generator.parse_rephrased_text(new_description)
+                quizzes.append((rephrased_text, gt))
+            return {
+                "_description_id": _description_id,
+                "description": item,
+                "quizzes": quizzes,
+            }
+        except Exception as e:
+            logger.error("Error when quizzing description %s: %s", item, e)
+            return None
 
-        results[description] = [(description, "yes")]
+    def quiz(self) -> Iterable[pd.DataFrame]:
+        """
+        Get all nodes and edges and quiz their descriptions using QuizGenerator.
+        """
+        edges = self.graph_storage.get_all_edges()
+        nodes = self.graph_storage.get_all_nodes()
+
+        items = []
+
+        for edge in edges:
+            edge_data = edge[2]
+            description = edge_data["description"]
+            items.append(description)
+
+        for node in nodes:
+            node_data = node[1]
+            description = node_data["description"]
+            items.append(description)
+
+        logger.info("Total descriptions to quiz: %d", len(items))
+
+        for i in range(0, len(items), self.concurrency_limit):
+            batch_items = items[i : i + self.concurrency_limit]
+            batch_results = run_concurrent(
+                self._process_single_quiz,
+                batch_items,
+                desc=f"Quizzing descriptions ({i} / {i + len(batch_items)})",
+                unit="description",
+            )
 
-        for i in range(max_samples):
-            if i > 0:
-                items.append((description, "TEMPLATE", "yes"))
-            items.append((description, "ANTI_TEMPLATE", "no"))
-
-    quiz_results = await run_concurrent(
-        _process_single_quiz,
-        items,
-        desc="Quizzing descriptions",
-        unit="description",
-        progress_bar=progress_bar,
-    )
-
-    for new_result in quiz_results:
-        if new_result:
-            for key, value in new_result.items():
-                results[key].extend(value)
-
-    for key, value in results.items():
-        results[key] = list(set(value))
-        rephrase_storage.upsert({key: results[key]})
-
-    return rephrase_storage
+            final_results = []
+            for new_result in batch_results:
+                if new_result:
+                    self.quiz_storage.upsert(
+                        {
+                            new_result["_description_id"]: {
+                                "description": new_result["description"],
+                                "quizzes": new_result["quizzes"],
+                            }
+                        }
+                    )
+                    final_results.append(new_result)
+            self.quiz_storage.index_done_callback()
+            yield pd.DataFrame(final_results)
diff --git a/graphgen/utils/run_concurrent.py b/graphgen/utils/run_concurrent.py
@@ -4,6 +4,7 @@
 from tqdm.asyncio import tqdm as tqdm_async
 
 from graphgen.utils.log import logger
+
 from .loop import create_event_loop
 
 T = TypeVar("T")
@@ -27,7 +28,7 @@ async def _run_all():
             try:
                 result = await future
                 results.append(result)
-            except Exception as e:  # pylint: disable=broad-except
+            except Exception as e:
                 logger.exception("Task failed: %s", e)
                 results.append(e)