refactor: extract helpers, fix hash encoding, reduce cognitive complexity

polaz · polaz · commit 8953ce5e4575 · 2026-04-09T20:37:53.000+03:00
- graph.py: extract _upsert_node(), _create_edge(), _link_document_to_entities()
  from add_graph_documents() — reduces cognitive complexity from 23 to ~8
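- graph.py: _stable_document_id() now uses json.dumps(sort_keys=True) instead
  of string concatenation with "|"/"=" delimiters — fixes delimiter ambiguity
  when content or metadata values contain those characters, and makes nested
  metadata hash deterministically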
- base.py: _parse_edge_types_from_schema() refactored to use iter() with two
  for-loops instead of a boolean flag — reduces cognitive complexity from 16
  to 14
- base.py: get_triplets() docstring documents that relation_names is required
diff --git a/langchain-coordinode/langchain_coordinode/graph.py b/langchain-coordinode/langchain_coordinode/graph.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import hashlib
+import json
 import re
 from typing import Any
 
@@ -111,63 +112,70 @@ def add_graph_documents(
                 ``MENTIONS`` edges (also unconditional ``CREATE``).
         """
         for doc in graph_documents:
-            # ── Upsert nodes ──────────────────────────────────────────────
             for node in doc.nodes:
-                label = _cypher_ident(node.type or "Entity")
-                props = dict(node.properties or {})
-                # Always enforce node.id as the merge key; incoming
-                # properties["name"] must not drift from the MERGE predicate.
-                props["name"] = node.id
-                self._client.cypher(
-                    f"MERGE (n:{label} {{name: $name}}) SET n += $props",
-                    params={"name": node.id, "props": props},
-                )
-
-            # ── Create relationships ──────────────────────────────────────
+                self._upsert_node(node)
             for rel in doc.relationships:
-                src_label = _cypher_ident(rel.source.type or "Entity")
-                dst_label = _cypher_ident(rel.target.type or "Entity")
-                rel_type = _cypher_ident(rel.type)
-                props = dict(rel.properties or {})
-                # CoordiNode does not support MERGE for edges or WHERE NOT
-                # (pattern) guards — use unconditional CREATE.  SET r += $props
-                # is skipped when props is empty because SET r += {} is not
-                # supported by all server versions.
-                if props:
-                    self._client.cypher(
-                        f"MATCH (src:{src_label} {{name: $src}}) "
-                        f"MATCH (dst:{dst_label} {{name: $dst}}) "
-                        f"CREATE (src)-[r:{rel_type}]->(dst) SET r += $props",
-                        params={"src": rel.source.id, "dst": rel.target.id, "props": props},
-                    )
-                else:
-                    self._client.cypher(
-                        f"MATCH (src:{src_label} {{name: $src}}) "
-                        f"MATCH (dst:{dst_label} {{name: $dst}}) "
-                        f"CREATE (src)-[r:{rel_type}]->(dst)",
-                        params={"src": rel.source.id, "dst": rel.target.id},
-                    )
-
-            # ── Optionally link source document ───────────────────────────
+                self._create_edge(rel)
             if include_source and doc.source:
-                src_id = getattr(doc.source, "id", None) or _stable_document_id(doc.source)
-                self._client.cypher(
-                    "MERGE (d:__Document__ {id: $id}) SET d.page_content = $text",
-                    params={"id": src_id, "text": doc.source.page_content or ""},
-                )
-                for node in doc.nodes:
-                    label = _cypher_ident(node.type or "Entity")
-                    self._client.cypher(
-                        f"MATCH (d:__Document__ {{id: $doc_id}}) "
-                        f"MATCH (n:{label} {{name: $name}}) "
-                        f"CREATE (d)-[:MENTIONS]->(n)",
-                        params={"doc_id": src_id, "name": node.id},
-                    )
+                self._link_document_to_entities(doc)
 
         # Invalidate cached schema so next access reflects new data
         self._schema = None
         self._structured_schema = None
 
+    def _upsert_node(self, node: Any) -> None:
+        """Upsert a single node by ``id`` via MERGE."""
+        label = _cypher_ident(node.type or "Entity")
+        props = dict(node.properties or {})
+        # Always enforce node.id as the merge key; incoming
+        # properties["name"] must not drift from the MERGE predicate.
+        props["name"] = node.id
+        self._client.cypher(
+            f"MERGE (n:{label} {{name: $name}}) SET n += $props",
+            params={"name": node.id, "props": props},
+        )
+
+    def _create_edge(self, rel: Any) -> None:
+        """Create a relationship via unconditional CREATE.
+
+        CoordiNode does not support MERGE for edge patterns.  Re-ingesting the
+        same relationship will create a duplicate edge.  SET r += $props is
+        skipped when props is empty because SET r += {} is not supported by all
+        server versions.
+        """
+        src_label = _cypher_ident(rel.source.type or "Entity")
+        dst_label = _cypher_ident(rel.target.type or "Entity")
+        rel_type = _cypher_ident(rel.type)
+        props = dict(rel.properties or {})
+        if props:
+            self._client.cypher(
+                f"MATCH (src:{src_label} {{name: $src}}) "
+                f"MATCH (dst:{dst_label} {{name: $dst}}) "
+                f"CREATE (src)-[r:{rel_type}]->(dst) SET r += $props",
+                params={"src": rel.source.id, "dst": rel.target.id, "props": props},
+            )
+        else:
+            self._client.cypher(
+                f"MATCH (src:{src_label} {{name: $src}}) "
+                f"MATCH (dst:{dst_label} {{name: $dst}}) "
+                f"CREATE (src)-[r:{rel_type}]->(dst)",
+                params={"src": rel.source.id, "dst": rel.target.id},
+            )
+
+    def _link_document_to_entities(self, doc: Any) -> None:
+        """Upsert a ``__Document__`` node and CREATE ``MENTIONS`` edges to all entities."""
+        src_id = getattr(doc.source, "id", None) or _stable_document_id(doc.source)
+        self._client.cypher(
+            "MERGE (d:__Document__ {id: $id}) SET d.page_content = $text",
+            params={"id": src_id, "text": doc.source.page_content or ""},
+        )
+        for node in doc.nodes:
+            label = _cypher_ident(node.type or "Entity")
+            self._client.cypher(
+                f"MATCH (d:__Document__ {{id: $doc_id}}) MATCH (n:{label} {{name: $name}}) CREATE (d)-[:MENTIONS]->(n)",
+                params={"doc_id": src_id, "name": node.id},
+            )
+
     def query(
         self,
         query: str,
@@ -213,8 +221,15 @@ def _stable_document_id(source: Any) -> str:
     """
     content = getattr(source, "page_content", "") or ""
     metadata = getattr(source, "metadata", {}) or {}
-    stable = content + "|" + "|".join(f"{k}={v}" for k, v in sorted(metadata.items()))
-    return hashlib.sha256(stable.encode()).hexdigest()[:32]
+    # Use canonical JSON encoding to avoid delimiter ambiguity and ensure
+    # determinism for nested/non-scalar metadata values.
+    canonical = json.dumps(
+        {"content": content, "metadata": metadata},
+        sort_keys=True,
+        separators=(",", ":"),
+        ensure_ascii=False,
+    )
+    return hashlib.sha256(canonical.encode()).hexdigest()[:32]
 
 
 def _cypher_ident(name: str) -> str:
diff --git a/llama-index-coordinode/llama_index/graph_stores/coordinode/base.py b/llama-index-coordinode/llama_index/graph_stores/coordinode/base.py
@@ -117,7 +117,13 @@ def get_triplets(
         properties: dict[str, Any] | None = None,
         ids: list[str] | None = None,
     ) -> list[list[LabelledNode]]:
-        """Retrieve triplets (subject, predicate, object) as node triples."""
+        """Retrieve triplets (subject, predicate, object) as node triples.
+
+        Note:
+            ``relation_names`` is **required**.  CoordiNode does not support
+            untyped wildcard ``[r]`` relationship patterns — they silently return
+            no rows.  Omitting ``relation_names`` raises ``NotImplementedError``.
+        """
         conditions: list[str] = []
         params: dict[str, Any] = {}
 
@@ -377,17 +383,21 @@ def _parse_edge_types_from_schema(schema_text: str) -> list[str]:
     Parses the "Edge types:" section produced by ``get_schema_text()``.
     """
     edge_types: list[str] = []
-    in_edges = False
-    for line in schema_text.splitlines():
+    lines = iter(schema_text.splitlines())
+
+    # Advance to the "Edge types:" header.
+    for line in lines:
+        if line.strip().lower().startswith("edge types"):
+            break
+
+    # Collect bullet items until the first blank line.
+    for line in lines:
         stripped = line.strip()
-        if stripped.lower().startswith("edge types"):
-            in_edges = True
-            continue
-        if in_edges:
-            if not stripped:
-                break
-            if stripped.startswith("-") or stripped.startswith("*"):
-                name = stripped.lstrip("-* ").split("(")[0].strip()
-                if name:
-                    edge_types.append(name)
+        if not stripped:
+            break
+        if stripped.startswith(("-", "*")):
+            name = stripped.lstrip("-* ").split("(")[0].strip()
+            if name:
+                edge_types.append(name)
+
     return edge_types