Skip to content

Commit f0e1ff3

Browse files
committed
fix(langchain): deduplicate relationship triples after _first_label normalization
RETURN DISTINCT operates on raw label lists, but min()-based _first_label() can collapse different multi-label combos (e.g. ['Employee','Person'] and ['Person','Employee']) into the same (start, type, end) triple. Use a set for deduplication after normalization so each relationship triple appears at most once. Also make the vector_query() integration test embedding unique per test tag (derived from the tag's hex bytes) and increase similarity_top_k to 5 to prevent flaky results in shared integration DBs where another :Chunk may share the same vector.
1 parent 543816f commit f0e1ff3

2 files changed

Lines changed: 21 additions & 9 deletions

File tree

langchain-coordinode/langchain_coordinode/graph.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -81,14 +81,20 @@ def refresh_schema(self) -> None:
8181
"MATCH (a)-[r]->(b) RETURN DISTINCT labels(a) AS src_labels, type(r) AS rel, labels(b) AS dst_labels"
8282
)
8383
if rows:
84+
# Deduplicate after _first_label() normalization: RETURN DISTINCT operates on
85+
# raw label lists, but _first_label(min()) can collapse different multi-label
86+
# combinations to the same (start, type, end) triple (e.g. ['Employee','Person']
87+
# and ['Person','Employee'] both min-normalize to 'Employee'). Use a set to
88+
# ensure each relationship triple appears at most once.
89+
triples: set[tuple[str, str, str]] = set()
90+
for row in rows:
91+
start = _first_label(row.get("src_labels"))
92+
end = _first_label(row.get("dst_labels"))
93+
rel = row.get("rel")
94+
if start and rel and end:
95+
triples.add((start, rel, end))
8496
structured["relationships"] = [
85-
{
86-
"start": _first_label(row.get("src_labels")),
87-
"type": row["rel"],
88-
"end": _first_label(row.get("dst_labels")),
89-
}
90-
for row in rows
91-
if _first_label(row.get("src_labels")) and row.get("rel") and _first_label(row.get("dst_labels"))
97+
{"start": start, "type": rel, "end": end} for start, rel, end in sorted(triples)
9298
]
9399
self._structured_schema = structured
94100

tests/integration/adapters/test_llama_index.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,11 @@ def test_vector_query_returns_results(store, tag):
162162
vector_query() without filters defaults to label="Chunk", so the seed node must use
163163
that label to be found by the underlying vector_search() call.
164164
"""
165-
vec = [float(i) / 16 for i in range(16)]
165+
# Derive a unique embedding from the test tag so that no other :Chunk in the shared
166+
# integration DB can have the same or closer vector, preventing flaky top-k results.
167+
# tag is uuid4().hex[:8] → 8 hex chars → 4 bytes of entropy.
168+
seed = list(bytes.fromhex(tag))
169+
vec = [float(seed[i % len(seed)]) / 255.0 for i in range(16)]
166170
# Seeding is inside the try block so that the finally cleanup always runs even if
167171
# the CREATE succeeds but extracting seeded_internal_id raises (e.g., unexpected
168172
# response format). vector_query() defaults label to "Chunk" when no
@@ -183,7 +187,9 @@ def test_vector_query_returns_results(store, tag):
183187
params={"id": f"vec-{tag}", "text": "test chunk", "vec": vec},
184188
)
185189
seeded_internal_id = str(seed_rows[0]["nid"])
186-
query = VectorStoreQuery(query_embedding=vec, similarity_top_k=1)
190+
# top_k=5: even if other :Chunk nodes exist with similar vectors, the unique
191+
# tag-based embedding ensures ours is among the closest results.
192+
query = VectorStoreQuery(query_embedding=vec, similarity_top_k=5)
187193
nodes, scores = store.vector_query(query)
188194

189195
assert isinstance(nodes, list)

0 commit comments

Comments (0)