|
3 | 3 | from __future__ import annotations |
4 | 4 |
|
5 | 5 | import hashlib |
| 6 | +import json |
6 | 7 | import re |
7 | 8 | from typing import Any |
8 | 9 |
|
@@ -111,63 +112,70 @@ def add_graph_documents( |
111 | 112 | ``MENTIONS`` edges (also unconditional ``CREATE``). |
112 | 113 | """ |
113 | 114 | for doc in graph_documents: |
114 | | - # ── Upsert nodes ────────────────────────────────────────────── |
115 | 115 | for node in doc.nodes: |
116 | | - label = _cypher_ident(node.type or "Entity") |
117 | | - props = dict(node.properties or {}) |
118 | | - # Always enforce node.id as the merge key; incoming |
119 | | - # properties["name"] must not drift from the MERGE predicate. |
120 | | - props["name"] = node.id |
121 | | - self._client.cypher( |
122 | | - f"MERGE (n:{label} {{name: $name}}) SET n += $props", |
123 | | - params={"name": node.id, "props": props}, |
124 | | - ) |
125 | | - |
126 | | - # ── Create relationships ────────────────────────────────────── |
| 116 | + self._upsert_node(node) |
127 | 117 | for rel in doc.relationships: |
128 | | - src_label = _cypher_ident(rel.source.type or "Entity") |
129 | | - dst_label = _cypher_ident(rel.target.type or "Entity") |
130 | | - rel_type = _cypher_ident(rel.type) |
131 | | - props = dict(rel.properties or {}) |
132 | | - # CoordiNode does not support MERGE for edges or WHERE NOT |
133 | | - # (pattern) guards — use unconditional CREATE. SET r += $props |
134 | | - # is skipped when props is empty because SET r += {} is not |
135 | | - # supported by all server versions. |
136 | | - if props: |
137 | | - self._client.cypher( |
138 | | - f"MATCH (src:{src_label} {{name: $src}}) " |
139 | | - f"MATCH (dst:{dst_label} {{name: $dst}}) " |
140 | | - f"CREATE (src)-[r:{rel_type}]->(dst) SET r += $props", |
141 | | - params={"src": rel.source.id, "dst": rel.target.id, "props": props}, |
142 | | - ) |
143 | | - else: |
144 | | - self._client.cypher( |
145 | | - f"MATCH (src:{src_label} {{name: $src}}) " |
146 | | - f"MATCH (dst:{dst_label} {{name: $dst}}) " |
147 | | - f"CREATE (src)-[r:{rel_type}]->(dst)", |
148 | | - params={"src": rel.source.id, "dst": rel.target.id}, |
149 | | - ) |
150 | | - |
151 | | - # ── Optionally link source document ─────────────────────────── |
| 118 | + self._create_edge(rel) |
152 | 119 | if include_source and doc.source: |
153 | | - src_id = getattr(doc.source, "id", None) or _stable_document_id(doc.source) |
154 | | - self._client.cypher( |
155 | | - "MERGE (d:__Document__ {id: $id}) SET d.page_content = $text", |
156 | | - params={"id": src_id, "text": doc.source.page_content or ""}, |
157 | | - ) |
158 | | - for node in doc.nodes: |
159 | | - label = _cypher_ident(node.type or "Entity") |
160 | | - self._client.cypher( |
161 | | - f"MATCH (d:__Document__ {{id: $doc_id}}) " |
162 | | - f"MATCH (n:{label} {{name: $name}}) " |
163 | | - f"CREATE (d)-[:MENTIONS]->(n)", |
164 | | - params={"doc_id": src_id, "name": node.id}, |
165 | | - ) |
| 120 | + self._link_document_to_entities(doc) |
166 | 121 |
|
167 | 122 | # Invalidate cached schema so next access reflects new data |
168 | 123 | self._schema = None |
169 | 124 | self._structured_schema = None |
170 | 125 |
|
| 126 | + def _upsert_node(self, node: Any) -> None: |
| 127 | + """Upsert a single node by ``id`` via MERGE.""" |
| 128 | + label = _cypher_ident(node.type or "Entity") |
| 129 | + props = dict(node.properties or {}) |
| 130 | + # Always enforce node.id as the merge key; incoming |
| 131 | + # properties["name"] must not drift from the MERGE predicate. |
| 132 | + props["name"] = node.id |
| 133 | + self._client.cypher( |
| 134 | + f"MERGE (n:{label} {{name: $name}}) SET n += $props", |
| 135 | + params={"name": node.id, "props": props}, |
| 136 | + ) |
| 137 | + |
| 138 | + def _create_edge(self, rel: Any) -> None: |
| 139 | + """Create a relationship via unconditional CREATE. |
| 140 | +
|
| 141 | + CoordiNode does not support MERGE for edge patterns. Re-ingesting the |
| 142 | + same relationship will create a duplicate edge. SET r += $props is |
| 143 | + skipped when props is empty because SET r += {} is not supported by all |
| 144 | + server versions. |
| 145 | + """ |
| 146 | + src_label = _cypher_ident(rel.source.type or "Entity") |
| 147 | + dst_label = _cypher_ident(rel.target.type or "Entity") |
| 148 | + rel_type = _cypher_ident(rel.type) |
| 149 | + props = dict(rel.properties or {}) |
| 150 | + if props: |
| 151 | + self._client.cypher( |
| 152 | + f"MATCH (src:{src_label} {{name: $src}}) " |
| 153 | + f"MATCH (dst:{dst_label} {{name: $dst}}) " |
| 154 | + f"CREATE (src)-[r:{rel_type}]->(dst) SET r += $props", |
| 155 | + params={"src": rel.source.id, "dst": rel.target.id, "props": props}, |
| 156 | + ) |
| 157 | + else: |
| 158 | + self._client.cypher( |
| 159 | + f"MATCH (src:{src_label} {{name: $src}}) " |
| 160 | + f"MATCH (dst:{dst_label} {{name: $dst}}) " |
| 161 | + f"CREATE (src)-[r:{rel_type}]->(dst)", |
| 162 | + params={"src": rel.source.id, "dst": rel.target.id}, |
| 163 | + ) |
| 164 | + |
| 165 | + def _link_document_to_entities(self, doc: Any) -> None: |
| 166 | + """Upsert a ``__Document__`` node and CREATE ``MENTIONS`` edges to all entities.""" |
| 167 | + src_id = getattr(doc.source, "id", None) or _stable_document_id(doc.source) |
| 168 | + self._client.cypher( |
| 169 | + "MERGE (d:__Document__ {id: $id}) SET d.page_content = $text", |
| 170 | + params={"id": src_id, "text": doc.source.page_content or ""}, |
| 171 | + ) |
| 172 | + for node in doc.nodes: |
| 173 | + label = _cypher_ident(node.type or "Entity") |
| 174 | + self._client.cypher( |
| 175 | + f"MATCH (d:__Document__ {{id: $doc_id}}) MATCH (n:{label} {{name: $name}}) CREATE (d)-[:MENTIONS]->(n)", |
| 176 | + params={"doc_id": src_id, "name": node.id}, |
| 177 | + ) |
| 178 | + |
171 | 179 | def query( |
172 | 180 | self, |
173 | 181 | query: str, |
@@ -213,8 +221,15 @@ def _stable_document_id(source: Any) -> str: |
213 | 221 | """ |
214 | 222 | content = getattr(source, "page_content", "") or "" |
215 | 223 | metadata = getattr(source, "metadata", {}) or {} |
216 | | - stable = content + "|" + "|".join(f"{k}={v}" for k, v in sorted(metadata.items())) |
217 | | - return hashlib.sha256(stable.encode()).hexdigest()[:32] |
| 224 | + # Use canonical JSON encoding to avoid delimiter ambiguity and ensure |
| 225 | + # determinism for nested/non-scalar metadata values. |
| 226 | + canonical = json.dumps( |
| 227 | + {"content": content, "metadata": metadata}, |
| 228 | + sort_keys=True, |
| 229 | + separators=(",", ":"), |
| 230 | + ensure_ascii=False, |
| 231 | + ) |
| 232 | + return hashlib.sha256(canonical.encode()).hexdigest()[:32] |
218 | 233 |
|
219 | 234 |
|
220 | 235 | def _cypher_ident(name: str) -> str: |
|
0 commit comments