Skip to content

Commit e21a193

Browse files
committed
curie contraction
1 parent 74c847e commit e21a193

7 files changed

Lines changed: 1663 additions & 24 deletions

File tree

src/rdf_sql_bulkloader/cli.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import logging
44

55
from rdf_sql_bulkloader import __version__
6-
from rdf_sql_bulkloader.main import demo
76

87
__all__ = [
98
"main",

src/rdf_sql_bulkloader/loaders/bulkloader.py

Lines changed: 102 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,19 @@
11
import re
22
from abc import ABC
3+
from collections import defaultdict
34
from dataclasses import dataclass
45
from pathlib import Path
5-
from typing import Union, Tuple, Iterator, Mapping
6+
from typing import Union, Tuple, Iterator, Mapping, Optional
67

8+
import curies
79
import lightrdf
10+
from curies import Converter
811

912
re_untyped_literal = re.compile(r'^"(.*)"$')
10-
re_typed_literal = re.compile(r'^"(.*)"^^<([\S^"]+)>$')
13+
re_typed_literal = re.compile(r'^"(.*)"\^\^<([\S^"]+)>$')
1114
re_lang_literal = re.compile(r'^"(.*)"@(\w+)$')
15+
re_blank_node = re.compile(r'^riog(\d+)$')
16+
1217

1318
DDL = """
1419
CREATE TABLE statement (
@@ -35,37 +40,120 @@
3540
STATEMENT = Tuple[SUBJECT, PREDICATE, OBJECT_URI, OBJECT_VALUE, OBJECT_DATATYPE, OBJECT_LANG]
3641
PREFIX_MAP = Mapping[PREFIX, URI]
3742

43+
44+
def _parse_literal(o: str) -> Tuple[OBJECT_VALUE, OBJECT_DATATYPE, OBJECT_LANG]:
    """
    Split an encoded literal into its value, datatype and language parts.

    The patterns are tried in order: typed (``"..."^^<datatype>``),
    language-tagged (``"..."@lang``), then plain (``"..."``).

    See https://github.com/ozekik/lightrdf/issues/12

    :param o: the literal, still wrapped in its surrounding double quotes
    :return: ``(value, datatype, language)``; the parts that do not apply
        to the matched form are ``None``
    :raises ValueError: if ``o`` matches none of the three literal patterns
    """
    typed = re_typed_literal.match(o)
    if typed:
        return typed.group(1), typed.group(2), None

    tagged = re_lang_literal.match(o)
    if tagged:
        return tagged.group(1), None, tagged.group(2)

    plain = re_untyped_literal.match(o)
    if plain:
        return plain.group(1), None, None

    raise ValueError(f"Cannot parse {o}")
71+
72+
def _parse_literal_as_value(o: str) -> str:
    """
    Return only the value part of an encoded literal.

    :param o: the literal, as accepted by ``_parse_literal``
    :return: the literal's value; datatype and language are discarded
    """
    value, _datatype, _lang = _parse_literal(o)
    return value
74+
3875
@dataclass
class BulkLoader(ABC):
    """
    Base class for all bulk loaders.

    A loader reads the triples of an RDF document and turns each one into a
    ``STATEMENT`` tuple. When a prefix map is available, URIs are contracted
    to CURIEs on the way out.
    """
    # Location of the RDF document. Note that ``statements`` and ``bulkload``
    # take their own ``path`` argument rather than reading this attribute.
    path: str
    # prefix -> namespace; may be extended by SHACL prefix declarations
    # discovered while reading a document (see ``statements``).
    prefix_map: Optional[PREFIX_MAP] = None
    # Built from ``prefix_map``; ``None`` means "do not contract URIs".
    converter: Optional[Converter] = None
    # Deliberately left unannotated, as in the original: it is a plain class
    # attribute, not a dataclass field, so it is not an ``__init__`` argument.
    index_statements = False

    def __post_init__(self):
        """Build the CURIE converter when a non-empty prefix map was supplied."""
        if self.prefix_map:
            self.converter = Converter.from_prefix_map(self.prefix_map)

    def bulkload(self, path: str):
        """
        Load the document at ``path`` into the backing store.

        Concrete loaders must override this.

        :param path: location of the RDF document to load
        :raises NotImplementedError: always, in this base class
        """
        # ``NotImplemented`` is a constant, not an exception; raising it
        # produces a confusing ``TypeError`` instead of the intended signal.
        raise NotImplementedError

    def contract_uri(self, uri: Optional[URI]) -> Optional[str]:
        """
        Contract a URI to a CURIE where possible.

        :param uri: the URI to contract, or ``None``
        :return: ``None`` if ``uri`` is ``None``; the CURIE if a converter is
            set and it yields a non-empty result; otherwise ``uri`` unchanged
        """
        if uri is None:
            return None
        if self.converter:
            # Fall back to the full URI when no prefix applies.
            return self.converter.compress(uri) or uri
        return uri

    def statements(self, path: Union[Path, str]) -> Iterator[STATEMENT]:
        """
        Iterate over the statements of an RDF document.

        The document is scanned twice. The first pass collects SHACL prefix
        declarations (and, if ``index_statements`` is set, the parts of
        reified statements / OWL axiom annotations). The second pass yields
        one tuple per triple, with URIs contracted via ``contract_uri``.

        Side effect: when the document declares prefixes, ``self.prefix_map``
        is replaced by a new dict that also contains them, and
        ``self.converter`` is rebuilt from it. Prefixes already present in
        ``self.prefix_map`` take precedence over those in the document.

        :param path: location of the RDF document
        :return: iterator of ``(subject, predicate, object_uri, object_value,
            object_datatype, object_lang)`` tuples
        :raises ValueError: if a literal cannot be parsed
        """
        doc = lightrdf.RDFDocument(str(path))

        # Predicate -> slot index in the [subject, predicate, object] record
        # kept for each reification node.
        reification_slots = {
            "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject": 0,
            "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate": 1,
            "http://www.w3.org/1999/02/22-rdf-syntax-ns#object": 2,
            "http://www.w3.org/2002/07/owl#annotatedSource": 0,
            "http://www.w3.org/2002/07/owl#annotatedProperty": 1,
            "http://www.w3.org/2002/07/owl#annotatedTarget": 2,
        }

        # First pass pre-processing:
        # index shacl prefixes and reified statements.
        # Lists are used as a proxy for mutable tuples here;
        # this may not be the cleanest way but it should hopefully be fast.
        prefix_node_map = defaultdict(lambda: [None, None])
        # Three slots are required: the object/target is stored at index 2.
        statement_node_map = defaultdict(lambda: [None, None, None])
        for s, p, o in doc.search_triples(None, None, None):
            if p == 'http://www.w3.org/ns/shacl#prefix':
                prefix_node_map[s][0] = _parse_literal_as_value(o)
            elif p == 'http://www.w3.org/ns/shacl#namespace':
                prefix_node_map[s][1] = _parse_literal_as_value(o)
            elif self.index_statements:
                # this is optional as it may be inefficient to do at the python level
                slot = reification_slots.get(p)
                if slot is not None:
                    statement_node_map[s][slot] = o
        # NOTE(review): statement_node_map is populated but not consumed
        # anywhere in this method - presumably reserved for later use.

        if prefix_node_map:
            # Copy instead of mutating in place: the caller's mapping may be
            # shared or read-only (it is only typed as a Mapping).
            merged = dict(self.prefix_map) if self.prefix_map else {}
            for prefix, namespace in prefix_node_map.values():
                # A node carrying only one of sh:prefix / sh:namespace is an
                # incomplete declaration and cannot be used.
                if prefix is None or namespace is None:
                    continue
                merged.setdefault(prefix, namespace)
            self.prefix_map = merged
            self.converter = Converter.from_prefix_map(merged)

        # this code could be reduced if https://github.com/ozekik/lightrdf/issues/12 is implemented
        for t in doc.search_triples(None, None, None):
            # NOTE(review): blank nodes are only given the "_:" prefix in
            # object position; subjects go through contract_uri as-is.
            # Confirm whether that asymmetry is intended.
            s = self.contract_uri(t[0])
            p = self.contract_uri(t[1])
            o = t[2]
            o_uri = None
            o_value = None
            o_datatype = None
            o_lang = None
            if o.startswith('"'):
                o_value, o_datatype, o_lang = _parse_literal(o)
            elif re_blank_node.match(o):
                # Objects matching the blank-node pattern get a "_:" prefix.
                o_uri = f"_:{o}"
            else:
                o_uri = self.contract_uri(o)
            yield s, p, o_uri, o_value, o_datatype, o_lang

    def ddl(self) -> str:
        """
        Return the SQL DDL used to create the target schema.

        :return: the module-level ``DDL`` string
        """
        return DDL

src/rdf_sql_bulkloader/loaders/sqlite3_bulkloader.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import sqlite3
2+
from dataclasses import dataclass
23
from typing import Any
34

45
from rdf_sql_bulkloader.loaders.bulkloader import BulkLoader
56

67
COLS = ["subject", "predicate", "object", "value", "datatype", "language"]
78

9+
10+
@dataclass
811
class SqliteBulkloader(BulkLoader):
912
connection: Any = None
1013

@@ -18,4 +21,4 @@ def bulkload(self, path: str):
1821
print(len(tuples))
1922
colstr = ",".join(COLS)
2023
qs = ",".join(["?" for _ in COLS])
21-
con.executemany(f"insert into statement({colstr}) values ({qs})", tuples)
24+
con.executemany(f"insert into statement({colstr}) values ({qs})", tuples)

src/rdf_sql_bulkloader/main.py

Lines changed: 0 additions & 4 deletions
This file was deleted.

tests/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
import os
22
from pathlib import Path
33

4+
TEST_PREFIX_MAP = {
5+
"GO": "http://purl.obolibrary.org/obo/GO_"
6+
}
7+
48
ROOT = os.path.abspath(os.path.dirname(__file__))
59
INPUT_DIR = Path(ROOT) / "input"
610
OUTPUT_DIR = Path(ROOT) / "output"

0 commit comments

Comments
 (0)