Skip to content

Commit f0ad603

Browse files
authored
Merge pull request #27 from structured-world/feat/#26-similarity-search-upsert-idempotency
feat: similarity_search() for LangChain + upsert_relations() idempotency test
2 parents 1101ac8 + ab3559e commit f0ad603

3 files changed

Lines changed: 99 additions & 0 deletions

File tree

langchain-coordinode/langchain_coordinode/graph.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import hashlib
66
import json
77
import re
8+
from collections.abc import Sequence
89
from typing import Any
910

1011
from langchain_community.graphs.graph_store import GraphStore
@@ -194,6 +195,45 @@ def query(
194195
# cypher() returns List[Dict[str, Any]] directly — column name → value.
195196
return self._client.cypher(query, params=params or {})
196197

198+
def similarity_search(
    self,
    query_vector: Sequence[float],
    k: int = 5,
    label: str = "Chunk",
    property: str = "embedding",
) -> list[dict[str, Any]]:
    """Return the nodes whose ``property`` vector lies nearest to ``query_vector``.

    Thin wrapper over ``CoordinodeClient.vector_search()``. Every hit is
    flattened into a plain dict with three keys:

    * ``id`` - internal integer node ID,
    * ``node`` - the node's properties,
    * ``distance`` - cosine distance (smaller means more similar).

    Args:
        query_vector: Embedding vector to look up.
        k: Upper bound on the number of hits, forwarded as ``top_k``.
        label: Node label to search (default ``"Chunk"``).
        property: Name of the embedding property (default ``"embedding"``).

    Returns:
        Result dicts ordered from smallest to largest distance, or an empty
        list when ``query_vector`` has no elements (the client is not
        called in that case).
    """
    # Compare the length explicitly rather than relying on truthiness:
    # numpy.ndarray (and similar Sequence types) raise
    # ValueError("The truth value of an array is ambiguous") in a boolean
    # context, whereas len() is defined for every sequence type.
    if len(query_vector) == 0:
        return []

    hits = list(
        self._client.vector_search(
            label=label,
            property=property,
            vector=query_vector,
            top_k=k,
        )
    )
    # In-place sort is stable, so equally distant hits keep the order in
    # which the client returned them.
    hits.sort(key=lambda hit: hit.distance)

    return [
        {
            "id": hit.node.id,
            "node": hit.node.properties,
            "distance": hit.distance,
        }
        for hit in hits
    ]
236+
197237
# ── Lifecycle ─────────────────────────────────────────────────────────
198238

199239
def close(self) -> None:

tests/integration/adapters/test_langchain.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,48 @@ def test_add_graph_documents_idempotent(graph, unique_tag):
133133
assert result[0]["cnt"] == 1
134134

135135

136+
# ── similarity_search ─────────────────────────────────────────────────────────
137+
138+
139+
def test_similarity_search_returns_results(graph, unique_tag):
    """Check the shape and ordering of similarity_search() results.

    A single :LCSim node carrying a known embedding is created, then the same
    vector is used as the query. The seeded node has to be among the top-k
    hits, every hit must expose ``id``/``node``/``distance``, and distances
    must be non-decreasing.
    """
    # Build a 16-dimensional embedding out of the tag's bytes (the llama-index
    # test uses the same trick) so it does not collide with other :LCSim nodes
    # in the shared DB.
    tag_bytes = bytes.fromhex(unique_tag)
    vec = [float(tag_bytes[pos % len(tag_bytes)]) / 255.0 for pos in range(16)]

    try:
        created = graph.query(
            "CREATE (n:LCSim {id: $id, embedding: $vec}) RETURN n AS nid",
            params={"id": f"lcsim-{unique_tag}", "vec": vec},
        )
        # graph.query() wraps CoordinodeClient.cypher(), which hands back raw
        # dict values. In CoordiNode, CREATE ... RETURN n yields the internal
        # integer node ID itself (NOT a node object), and similarity_search()
        # reports that same integer as r.node.id, so plain equality is the
        # right comparison.
        expected_id = created[0]["nid"]

        results = graph.similarity_search(vec, k=5, label="LCSim", property="embedding")

        assert isinstance(results, list)
        assert len(results) >= 1
        assert all(
            all(key in hit for key in ("id", "node", "distance")) for hit in results
        )
        assert any(hit["id"] == expected_id for hit in results)

        # Each distance must be <= its successor (ascending order).
        distances = [hit["distance"] for hit in results]
        assert all(near <= far for near, far in zip(distances, distances[1:]))
    finally:
        # Always remove the seeded node, even when an assertion failed.
        graph.query("MATCH (n:LCSim {id: $id}) DELETE n", params={"id": f"lcsim-{unique_tag}"})
170+
171+
172+
def test_similarity_search_empty_vector_returns_empty(graph):
    """An empty query vector must yield an empty list rather than an error."""
    assert graph.similarity_search([], k=5) == []
176+
177+
136178
def test_schema_refreshes_after_add(graph, unique_tag):
137179
"""structured_schema is invalidated and re-fetched after add_graph_documents."""
138180
graph._schema = None # force refresh

tests/integration/adapters/test_llama_index.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,23 @@ def test_upsert_nodes_idempotent(store, tag):
7272
assert len(found) == 1
7373

7474

75+
def test_upsert_relations_idempotent(store, tag):
    """Writing one relation twice must leave exactly one edge (idempotent MERGE)."""
    source = EntityNode(label="LIIdempRel", name=f"IdempSrc-{tag}")
    target = EntityNode(label="LIIdempRel", name=f"IdempDst-{tag}")
    store.upsert_nodes([source, target])

    edge = Relation(label="LI_IDEMP_REL", source_id=source.id, target_id=target.id)
    # Upsert the identical relation twice; the second pass must not add a
    # duplicate edge.
    for _ in range(2):
        store.upsert_relations([edge])

    count_query = "MATCH (a {id: $src})-[r:LI_IDEMP_REL]->(b {id: $dst}) RETURN count(r) AS cnt"
    rows = store.structured_query(
        count_query,
        param_map={"src": source.id, "dst": target.id},
    )
    assert rows[0]["cnt"] == 1, f"expected exactly 1 edge after double upsert, got: {rows}"
90+
91+
7592
def test_get_by_id(store, tag):
7693
node = EntityNode(label="LIGetById", name=f"ById-{tag}")
7794
node_id = node.id

0 commit comments

Comments
 (0)