Skip to content

Commit c3a1e1c

Browse files
feat: add rdf_reader
1 parent 9c3f073 commit c3a1e1c

2 files changed

Lines changed: 51 additions & 0 deletions

File tree

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from typing import Any, Dict, List
2+
3+
import rdflib
4+
from rdflib import Literal
5+
from rdflib.util import guess_format
6+
7+
from graphgen.bases.base_reader import BaseReader
8+
9+
10+
class RDFReader(BaseReader):
    """
    Reader for RDF files that groups triples by subject into document dictionaries.

    Each distinct subject in the parsed graph becomes one document with:

    - ``"id"``: the subject rendered as a string,
    - ``self.text_column``: the subject's literal object values joined by
      single spaces,
    - ``"properties"``: a mapping from predicate string to the list of all
      object values (literal or not) rendered as strings.
    """

    def read(self, file_path: str) -> List[Dict[str, Any]]:
        """
        Parse ``file_path`` as RDF and return one document per subject.

        The serialization format is guessed from ``file_path`` with
        ``rdflib.util.guess_format``. Subjects are processed in sorted order
        of their string form so the order of the resulting documents is
        reproducible across runs.

        :param file_path: location of the RDF file to load.
        :return: the result of ``self.filter`` applied to the built documents.
        :raises ValueError: if the file cannot be parsed, if any subject has
            no non-blank literal text, or if the graph contains no subjects.
        """
        graph = rdflib.Graph()
        fmt = guess_format(file_path)
        try:
            graph.parse(file_path, format=fmt)
        except Exception as e:
            # Surface every parser failure as a single, predictable error type.
            raise ValueError(f"Cannot parse RDF file {file_path}: {e}") from e

        docs: List[Dict[str, Any]] = []
        text_col = self.text_column

        # Iterating a bare set would order subjects by their (per-process
        # randomized) string hashes; sort so repeated runs on the same file
        # yield the documents in the same order.
        for subj in sorted(set(graph.subjects()), key=str):
            literals: List[str] = []
            props: Dict[str, List[str]] = {}
            for _, pred, obj in graph.triples((subj, None, None)):
                obj_str = str(obj)
                # Only literal objects contribute to the text column;
                # every object is recorded under its predicate.
                if isinstance(obj, Literal):
                    literals.append(obj_str)
                props.setdefault(str(pred), []).append(obj_str)

            text = " ".join(literals).strip()
            if not text:
                raise ValueError(
                    f"Subject {subj} has no literal values; "
                    f"missing '{text_col}' for text column."
                )

            docs.append({"id": str(subj), text_col: text, "properties": props})

        if not docs:
            raise ValueError("RDF file contains no valid documents.")

        # NOTE(review): ``filter`` and ``text_column`` are not defined in this
        # class; presumably they are provided by BaseReader — confirm.
        return self.filter(docs)

requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ leidenalg
2424
igraph
2525
python-louvain
2626

27+
# KG
28+
rdflib
29+
2730
# Bioinformatics
2831
biopython
2932

0 commit comments

Comments
 (0)