Skip to content

Commit f0e1ff3

Browse files
committed
fix(langchain): deduplicate relationship triples after _first_label normalization
RETURN DISTINCT operates on raw label lists, but min()-based _first_label() can collapse different multi-label combos (e.g. ['Employee','Person'] and ['Person','Employee']) into the same (start, type, end) triple. Use a set for deduplication after normalization so each relationship triple appears at most once. Also make the vector_query() integration test embedding unique per test tag (derived from the tag's hex bytes) and increase similarity_top_k to 5 to prevent flaky results in shared integration DBs where another :Chunk may share the same vector.
1 parent 543816f commit f0e1ff3

2 files changed

Lines changed: 21 additions & 9 deletions

File tree

langchain-coordinode/langchain_coordinode/graph.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -81,14 +81,20 @@ def refresh_schema(self) -> None:
8181
"MATCH (a)-[r]->(b) RETURN DISTINCT labels(a) AS src_labels, type(r) AS rel, labels(b) AS dst_labels"
8282
)
8383
if rows:
84+
# Deduplicate after _first_label() normalization: RETURN DISTINCT operates on
85+
# raw label lists, but _first_label(min()) can collapse different multi-label
86+
# combinations to the same (start, type, end) triple (e.g. ['Employee','Person']
87+
# and ['Person','Employee'] both min-normalize to 'Employee'). Use a set to
88+
# ensure each relationship triple appears at most once.
89+
triples: set[tuple[str, str, str]] = set()
90+
for row in rows:
91+
start = _first_label(row.get("src_labels"))
92+
end = _first_label(row.get("dst_labels"))
93+
rel = row.get("rel")
94+
if start and rel and end:
95+
triples.add((start, rel, end))
8496
structured["relationships"] = [
85-
{
86-
"start": _first_label(row.get("src_labels")),
87-
"type": row["rel"],
88-
"end": _first_label(row.get("dst_labels")),
89-
}
90-
for row in rows
91-
if _first_label(row.get("src_labels")) and row.get("rel") and _first_label(row.get("dst_labels"))
97+
{"start": start, "type": rel, "end": end} for start, rel, end in sorted(triples)
9298
]
9399
self._structured_schema = structured
94100

tests/integration/adapters/test_llama_index.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,11 @@ def test_vector_query_returns_results(store, tag):
162162
vector_query() without filters defaults to label="Chunk", so the seed node must use
163163
that label to be found by the underlying vector_search() call.
164164
"""
165-
vec = [float(i) / 16 for i in range(16)]
165+
# Derive a unique embedding from the test tag so that no other :Chunk in the shared
166+
# integration DB can have the same or closer vector, preventing flaky top-k results.
167+
# tag is uuid4().hex[:8] → 8 hex chars → 4 bytes of entropy.
168+
seed = list(bytes.fromhex(tag))
169+
vec = [float(seed[i % len(seed)]) / 255.0 for i in range(16)]
166170
# Seeding is inside the try block so that the finally cleanup always runs even if
167171
# the CREATE succeeds but extracting seeded_internal_id raises (e.g., unexpected
168172
# response format). vector_query() defaults label to "Chunk" when no
@@ -183,7 +187,9 @@ def test_vector_query_returns_results(store, tag):
183187
params={"id": f"vec-{tag}", "text": "test chunk", "vec": vec},
184188
)
185189
seeded_internal_id = str(seed_rows[0]["nid"])
186-
query = VectorStoreQuery(query_embedding=vec, similarity_top_k=1)
190+
# top_k=5: even if other :Chunk nodes exist with similar vectors, the unique
191+
# tag-based embedding ensures ours is among the closest results.
192+
query = VectorStoreQuery(query_embedding=vec, similarity_top_k=5)
187193
nodes, scores = store.vector_query(query)
188194

189195
assert isinstance(nodes, list)

0 commit comments

Comments (0)