@@ -72,19 +72,29 @@ def refresh_schema(self) -> None:
7272 structured = _parse_schema (text )
7373 # Augment with relationship triples (start_label, type, end_label) via
7474 # Cypher — get_schema_text() only lists edge types without direction.
75- # CoordiNode: wildcard [r] returns no results; build typed pattern from
76- # the rel_props keys returned by _parse_schema().
77- rel_types = list (structured .get ("rel_props" , {}).keys ())
78- if rel_types :
79- rel_filter = "|" .join (_cypher_ident (t ) for t in rel_types )
80- rows = self ._client .cypher (
81- f"MATCH (a)-[r:{ rel_filter } ]->(b) "
82- "RETURN DISTINCT a.__label__ AS src, r.__type__ AS rel, b.__label__ AS dst"
83- )
75+ # No LIMIT here intentionally: RETURN DISTINCT already collapses all edges
76+ # to unique (src_label, rel_type, dst_label) combinations, so the result
77+ # is bounded by the number of distinct relationship type triples, not by
78+ # total edge count. Adding a LIMIT would silently drop relationship types
79+ # that happen to appear beyond the limit, producing an incomplete schema.
80+ rows = self ._client .cypher (
81+ "MATCH (a)-[r]->(b) RETURN DISTINCT labels(a) AS src_labels, type(r) AS rel, labels(b) AS dst_labels"
82+ )
83+ if rows :
84+ # Deduplicate after _first_label() normalization: RETURN DISTINCT operates on
85+ # raw label lists, but _first_label(min()) can collapse different multi-label
86+ # combinations to the same (start, type, end) triple (e.g. ['Employee','Person']
87+ # and ['Person','Employee'] both min-normalize to 'Employee'). Use a set to
88+ # ensure each relationship triple appears at most once.
89+ triples : set [tuple [str , str , str ]] = set ()
90+ for row in rows :
91+ start = _first_label (row .get ("src_labels" ))
92+ end = _first_label (row .get ("dst_labels" ))
93+ rel = row .get ("rel" )
94+ if start and rel and end :
95+ triples .add ((start , rel , end ))
8496 structured ["relationships" ] = [
85- {"start" : row ["src" ], "type" : row ["rel" ], "end" : row ["dst" ]}
86- for row in rows
87- if row .get ("src" ) and row .get ("rel" ) and row .get ("dst" )
97+ {"start" : start , "type" : rel , "end" : end } for start , rel , end in sorted (triples )
8898 ]
8999 self ._structured_schema = structured
90100
@@ -95,18 +105,14 @@ def add_graph_documents(
95105 ) -> None :
96106 """Store nodes and relationships extracted from ``GraphDocument`` objects.
97107
98- Nodes are upserted by ``id`` (used as the ``name`` property) via
99- ``MERGE``, so repeated calls are safe for nodes.
100-
101- Relationships are created with unconditional ``CREATE`` because
102- CoordiNode does not yet support ``MERGE`` for edge patterns. Re-ingesting
103- the same ``GraphDocument`` will therefore produce duplicate edges.
108+ Both nodes and relationships are upserted via ``MERGE``, so repeated
109+ calls with the same data are idempotent.
104110
105111 Args:
106112 graph_documents: List of ``langchain_community.graphs.graph_document.GraphDocument``.
107113 include_source: If ``True``, also store the source ``Document`` as a
108114 ``__Document__`` node linked to every extracted entity via
109- ``MENTIONS`` edges (also unconditional ``CREATE``) .
115+ ``MENTIONS`` edges.
110116 """
111117 for doc in graph_documents :
112118 for node in doc .nodes :
@@ -133,12 +139,10 @@ def _upsert_node(self, node: Any) -> None:
133139 )
134140
135141 def _create_edge (self , rel : Any ) -> None :
136- """Create a relationship via unconditional CREATE .
142+ """Upsert a relationship via MERGE (idempotent) .
137143
138- CoordiNode does not support MERGE for edge patterns. Re-ingesting the
139- same relationship will create a duplicate edge. SET r += $props is
140- skipped when props is empty because SET r += {} is not supported by all
141- server versions.
144+ SET r += $props is skipped when props is empty because
145+ SET r += {} is not supported by all server versions.
142146 """
143147 src_label = _cypher_ident (rel .source .type or "Entity" )
144148 dst_label = _cypher_ident (rel .target .type or "Entity" )
@@ -148,19 +152,19 @@ def _create_edge(self, rel: Any) -> None:
148152 self ._client .cypher (
149153 f"MATCH (src:{ src_label } {{name: $src}}) "
150154 f"MATCH (dst:{ dst_label } {{name: $dst}}) "
151- f"CREATE (src)-[r:{ rel_type } ]->(dst) SET r += $props" ,
155+ f"MERGE (src)-[r:{ rel_type } ]->(dst) SET r += $props" ,
152156 params = {"src" : rel .source .id , "dst" : rel .target .id , "props" : props },
153157 )
154158 else :
155159 self ._client .cypher (
156160 f"MATCH (src:{ src_label } {{name: $src}}) "
157161 f"MATCH (dst:{ dst_label } {{name: $dst}}) "
158- f"CREATE (src)-[r:{ rel_type } ]->(dst)" ,
162+ f"MERGE (src)-[r:{ rel_type } ]->(dst)" ,
159163 params = {"src" : rel .source .id , "dst" : rel .target .id },
160164 )
161165
162166 def _link_document_to_entities (self , doc : Any ) -> None :
163- """Upsert a ``__Document__`` node and CREATE ``MENTIONS`` edges to all entities."""
167+ """Upsert a ``__Document__`` node and MERGE ``MENTIONS`` edges to all entities."""
164168 src_id = getattr (doc .source , "id" , None ) or _stable_document_id (doc .source )
165169 self ._client .cypher (
166170 "MERGE (d:__Document__ {id: $id}) SET d.page_content = $text" ,
@@ -169,7 +173,7 @@ def _link_document_to_entities(self, doc: Any) -> None:
169173 for node in doc .nodes :
170174 label = _cypher_ident (node .type or "Entity" )
171175 self ._client .cypher (
172- f"MATCH (d:__Document__ {{id: $doc_id}}) MATCH (n:{ label } {{name: $name}}) CREATE (d)-[:MENTIONS]->(n)" ,
176+ f"MATCH (d:__Document__ {{id: $doc_id}}) MATCH (n:{ label } {{name: $name}}) MERGE (d)-[:MENTIONS]->(n)" ,
173177 params = {"doc_id" : src_id , "name" : node .id },
174178 )
175179
@@ -211,10 +215,7 @@ def _stable_document_id(source: Any) -> str:
211215
212216 Combines ``page_content`` and sorted ``metadata`` items so the same
213217 document produces the same ``__Document__`` node ID across different
214- Python processes. This makes document-node creation stable when
215- ``include_source=True`` is used, but does not make re-ingest fully
216- idempotent because ``MENTIONS`` edges are not deduplicated until edge
217- ``MERGE``/dedup support is added to CoordiNode.
218+ Python processes.
218219 """
219220 content = getattr (source , "page_content" , "" ) or ""
220221 metadata = getattr (source , "metadata" , {}) or {}
@@ -232,6 +233,20 @@ def _stable_document_id(source: Any) -> str:
232233 return hashlib .sha256 (canonical .encode ()).hexdigest ()[:32 ]
233234
234235
236+ def _first_label (labels : Any ) -> str | None :
237+ """Extract a stable label from a labels() result (list of strings).
238+
239+ openCypher does not guarantee a stable ordering for labels(), so using
240+ labels[0] would produce nondeterministic schema entries across calls.
241+ We return the lexicographically smallest label as a deterministic rule.
242+ """
243+ if isinstance (labels , list ) and labels :
244+ return str (min (labels ))
245+ if isinstance (labels , str ):
246+ return labels
247+ return None
248+
249+
235250def _cypher_ident (name : str ) -> str :
236251 """Escape a label/type name for use as a Cypher identifier."""
237252 # ASCII-only word characters: letter/digit/underscore, not starting with digit.
0 commit comments