Merge pull request #27 from structured-world/feat/#26-similarity-search-upsert-idempotency

polaz · web-flow · commit f0ad60333ea2 · 2026-04-12T16:25:37.000+03:00
feat: similarity_search() for LangChain + upsert_relations() idempotency test
diff --git a/langchain-coordinode/langchain_coordinode/graph.py b/langchain-coordinode/langchain_coordinode/graph.py
@@ -5,6 +5,7 @@
 import hashlib
 import json
 import re
+from collections.abc import Sequence
 from typing import Any
 
 from langchain_community.graphs.graph_store import GraphStore
@@ -194,6 +195,45 @@ def query(
         # cypher() returns List[Dict[str, Any]] directly — column name → value.
         return self._client.cypher(query, params=params or {})
 
+    def similarity_search(
+        self,
+        query_vector: Sequence[float],
+        k: int = 5,
+        label: str = "Chunk",
+        property: str = "embedding",
+    ) -> list[dict[str, Any]]:
+        """Return the nearest nodes to ``query_vector`` as plain dicts.
+
+        Thin wrapper over ``CoordinodeClient.vector_search()``: each hit is
+        flattened into a dict with keys ``id`` (the hit's ``node.id``), ``node``
+        (the hit's ``node.properties``) and ``distance`` (documented as cosine
+        distance, where lower means more similar).
+
+        Args:
+            query_vector: Embedding vector to search for.
+            k: Maximum number of results, forwarded to the client as ``top_k``.
+            label: Node label to search (default ``"Chunk"``).
+            property: Embedding property name (default ``"embedding"``).
+
+        Returns:
+            Result dicts ordered by ascending distance; ``[]`` for an empty
+            ``query_vector`` (the client is not called in that case).
+        """
+        # Emptiness is tested via len(), never truthiness: array-like inputs
+        # such as numpy.ndarray raise ValueError in a boolean context.
+        if len(query_vector) == 0:
+            return []
+        hits = list(
+            self._client.vector_search(
+                label=label, property=property, vector=query_vector, top_k=k
+            )
+        )
+        hits.sort(key=lambda hit: hit.distance)
+        return [
+            {"id": hit.node.id, "node": hit.node.properties, "distance": hit.distance}
+            for hit in hits
+        ]
+
     # ── Lifecycle ─────────────────────────────────────────────────────────
 
     def close(self) -> None:
diff --git a/tests/integration/adapters/test_langchain.py b/tests/integration/adapters/test_langchain.py
@@ -133,6 +133,48 @@ def test_add_graph_documents_idempotent(graph, unique_tag):
     assert result[0]["cnt"] == 1
 
 
+# ── similarity_search ─────────────────────────────────────────────────────────
+
+
+def test_similarity_search_returns_results(graph, unique_tag):
+    """Check that similarity_search() finds a freshly seeded :LCSim node.
+
+    The node's embedding is derived from ``unique_tag``; searching with that same
+    vector must yield dicts with ``id``/``node``/``distance`` keys, include the
+    seeded node, and be ordered by non-decreasing distance.
+    """
+    # Build the embedding from the tag's bytes so it is unique to this test and
+    # does not collide with other :LCSim nodes in the shared DB.
+    tag_bytes = list(bytes.fromhex(unique_tag))
+    embedding = [float(tag_bytes[pos % len(tag_bytes)]) / 255.0 for pos in range(16)]
+    node_key = f"lcsim-{unique_tag}"
+
+    try:
+        created = graph.query(
+            "CREATE (n:LCSim {id: $id, embedding: $vec}) RETURN n AS nid",
+            params={"id": node_key, "vec": embedding},
+        )
+        # Upstream note: ``RETURN n`` yields CoordiNode's internal integer node ID,
+        # the same value similarity_search() reports as "id" — compare by equality.
+        expected_id = created[0]["nid"]
+
+        hits = graph.similarity_search(embedding, k=5, label="LCSim", property="embedding")
+        assert isinstance(hits, list)
+        assert len(hits) >= 1
+        assert all(key in hit for hit in hits for key in ("id", "node", "distance"))
+        assert any(hit["id"] == expected_id for hit in hits)
+        distances = [hit["distance"] for hit in hits]
+        assert all(nearer <= farther for nearer, farther in zip(distances, distances[1:]))
+    finally:
+        graph.query("MATCH (n:LCSim {id: $id}) DELETE n", params={"id": node_key})
+
+
+def test_similarity_search_empty_vector_returns_empty(graph):
+    """An empty query vector must yield an empty result list without raising."""
+    # Only ``k`` is passed explicitly; label and property keep their defaults.
+    assert graph.similarity_search([], k=5) == []
+
+
 def test_schema_refreshes_after_add(graph, unique_tag):
     """structured_schema is invalidated and re-fetched after add_graph_documents."""
     graph._schema = None  # force refresh
diff --git a/tests/integration/adapters/test_llama_index.py b/tests/integration/adapters/test_llama_index.py
@@ -72,6 +72,23 @@ def test_upsert_nodes_idempotent(store, tag):
     assert len(found) == 1
 
 
+def test_upsert_relations_idempotent(store, tag):
+    """A relation upserted twice must exist as exactly one edge afterwards."""
+    source = EntityNode(label="LIIdempRel", name=f"IdempSrc-{tag}")
+    target = EntityNode(label="LIIdempRel", name=f"IdempDst-{tag}")
+    store.upsert_nodes([source, target])
+
+    edge = Relation(label="LI_IDEMP_REL", source_id=source.id, target_id=target.id)
+    for _ in range(2):  # the repeated call must not create a duplicate edge
+        store.upsert_relations([edge])
+
+    edges = store.structured_query(
+        "MATCH (a {id: $src})-[r:LI_IDEMP_REL]->(b {id: $dst}) RETURN count(r) AS cnt",
+        param_map={"src": source.id, "dst": target.id},
+    )
+    assert edges[0]["cnt"] == 1, f"expected exactly 1 edge after double upsert, got: {edges}"
+
+
 def test_get_by_id(store, tag):
     node = EntityNode(label="LIGetById", name=f"ById-{tag}")
     node_id = node.id