|
1 | 1 | import re |
2 | 2 | from abc import ABC |
| 3 | +from collections import defaultdict |
3 | 4 | from dataclasses import dataclass |
4 | 5 | from pathlib import Path |
5 | | -from typing import Union, Tuple, Iterator, Mapping |
| 6 | +from typing import Union, Tuple, Iterator, Mapping, Optional |
6 | 7 |
|
| 8 | +import curies |
7 | 9 | import lightrdf |
| 10 | +from curies import Converter |
8 | 11 |
|
# Patterns for the string-encoded RDF terms handled in this module.
# re.DOTALL lets "." span a raw line break inside a literal value.
re_untyped_literal = re.compile(r'^"(.*)"$', re.DOTALL)
# Datatype IRI: one or more characters that are neither whitespace nor a
# double quote. The previous class [\S^"] negated nothing (it accepted any
# non-whitespace character, quotes included).
re_typed_literal = re.compile(r'^"(.*)"\^\^<([^\s"]+)>$', re.DOTALL)
# Language tags may contain hyphen-separated subtags (e.g. en-GB), which a
# bare \w+ cannot match.
re_lang_literal = re.compile(r'^"(.*)"@(\w+(?:-\w+)*)$', re.DOTALL)
# Matches terms of the form riog<digits>, which this module treats as blank nodes.
re_blank_node = re.compile(r'^riog(\d+)$')
| 16 | + |
12 | 17 |
|
13 | 18 | DDL = """ |
14 | 19 | CREATE TABLE statement ( |
|
35 | 40 | STATEMENT = Tuple[SUBJECT, PREDICATE, OBJECT_URI, OBJECT_VALUE, OBJECT_DATATYPE, OBJECT_LANG] |
36 | 41 | PREFIX_MAP = Mapping[PREFIX, URI] |
37 | 42 |
|
| 43 | + |
def _parse_literal(o: str) -> Tuple[OBJECT_VALUE, OBJECT_DATATYPE, OBJECT_LANG]:
    """
    Split a string-encoded literal into its value, datatype and language.

    The typed pattern is tried first, then the language-tagged pattern,
    then the plain quoted pattern.

    See https://github.com/ozekik/lightrdf/issues/12

    :param o: the encoded literal, including its surrounding double quotes
    :return: ``(value, datatype, lang)``; parts that do not apply are None
    :raises ValueError: if ``o`` matches none of the three literal patterns
    """
    typed = re_typed_literal.match(o)
    if typed:
        return typed.group(1), typed.group(2), None

    tagged = re_lang_literal.match(o)
    if tagged:
        return tagged.group(1), None, tagged.group(2)

    plain = re_untyped_literal.match(o)
    if not plain:
        raise ValueError(f"Cannot parse {o}")
    return plain.group(1), None, None
| 71 | + |
def _parse_literal_as_value(o: str) -> str:
    """Return only the value component of an encoded literal, as parsed by ``_parse_literal``."""
    value, _datatype, _lang = _parse_literal(o)
    return value
| 74 | + |
@dataclass
class BulkLoader(ABC):
    """
    Base class for all bulk loaders
    """
    path: str
    # Both of these default to None, so they are annotated as Optional.
    prefix_map: Optional[PREFIX_MAP] = None
    # Rebuilt from prefix_map in __post_init__ whenever a prefix map is supplied.
    converter: Optional[Converter] = None
    # Un-annotated, so this is a plain class attribute rather than a dataclass
    # field: it is not a constructor argument and must be set on the class or
    # on an instance after construction.
    index_statements = False
| 84 | + |
| 85 | + def __post_init__(self): |
| 86 | + if self.prefix_map: |
| 87 | + self.converter = Converter.from_prefix_map(self.prefix_map) |
42 | 88 |
|
43 | 89 | def bulkload(self, path: str): |
44 | 90 | raise NotImplemented |
45 | 91 |
|
| 92 | + def contract_uri(self, uri: Optional[URI]) -> Optional[str]: |
| 93 | + if uri is None: |
| 94 | + return None |
| 95 | + elif self.converter: |
| 96 | + curie = self.converter.compress(uri) |
| 97 | + if curie: |
| 98 | + return curie |
| 99 | + else: |
| 100 | + return uri |
| 101 | + else: |
| 102 | + return uri |
| 103 | + |
46 | 104 | def statements(self, path: Union[Path, str]) -> Iterator[STATEMENT]: |
47 | 105 | doc = lightrdf.RDFDocument(str(path)) |
| 106 | + |
| 107 | + # First pass pre-processing |
| 108 | + # index shacl prefixes and reified statements |
| 109 | + # note we use lists as a proxy for mutable tuples here; |
| 110 | + # this may not be the cleanest way but it should hopefully be fast |
| 111 | + prefix_node_map = defaultdict(lambda: [None, None]) |
| 112 | + statement_node_map = defaultdict(lambda: [None, None]) |
| 113 | + for s, p, o in doc.search_triples(None, None, None): |
| 114 | + if p == 'http://www.w3.org/ns/shacl#prefix': |
| 115 | + prefix_node_map[s][0] = _parse_literal_as_value(o) |
| 116 | + elif p == 'http://www.w3.org/ns/shacl#namespace': |
| 117 | + prefix_node_map[s][1] = _parse_literal_as_value(o) |
| 118 | + elif self.index_statements: |
| 119 | + # this is optional as it may be inefficient to do at the python level |
| 120 | + if p == "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject": |
| 121 | + statement_node_map[s][0] = o |
| 122 | + elif p == "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate": |
| 123 | + statement_node_map[s][1] = o |
| 124 | + elif p == "http://www.w3.org/1999/02/22-rdf-syntax-ns#object": |
| 125 | + statement_node_map[s][2] = o |
| 126 | + elif p == "http://www.w3.org/2002/07/owl#annotatedSource": |
| 127 | + statement_node_map[s][0] = o |
| 128 | + elif p == "http://www.w3.org/2002/07/owl#annotatedProperty": |
| 129 | + statement_node_map[s][1] = o |
| 130 | + elif p == "http://www.w3.org/2002/07/owl#annotatedTarget": |
| 131 | + statement_node_map[s][2] = o |
| 132 | + if prefix_node_map: |
| 133 | + if self.prefix_map is None: |
| 134 | + self.prefix_map = {} |
| 135 | + for [p, ns] in prefix_node_map.values(): |
| 136 | + if p not in self.prefix_map: |
| 137 | + self.prefix_map[p] = ns |
| 138 | + self.converter = Converter.from_prefix_map(self.prefix_map) |
| 139 | + |
48 | 140 | # this code could be reduced if https://github.com/ozekik/lightrdf/issues/12 is implemented |
49 | 141 | for t in doc.search_triples(None, None, None): |
| 142 | + s = self.contract_uri(t[0]) |
| 143 | + p = self.contract_uri(t[1]) |
50 | 144 | o = t[2] |
51 | 145 | o_uri = None |
52 | 146 | o_datatype = None |
53 | 147 | o_lang = None |
54 | 148 | if o.startswith('"'): |
55 | | - o_value = o |
56 | | - result = re_typed_literal.match(o) |
57 | | - if result: |
58 | | - o_value = result.group(1) |
59 | | - o_datatype = result.group(2) |
60 | | - else: |
61 | | - result = re_lang_literal.match(o) |
62 | | - if result: |
63 | | - o_value = result.group(1) |
64 | | - o_lang = result.group(2) |
| 149 | + o_value, o_datatype, o_lang = _parse_literal(o) |
65 | 150 | else: |
66 | 151 | o_value = None |
67 | | - o_uri = o |
68 | | - yield t[0], t[1], o_uri, o_value, o_datatype, o_lang |
| 152 | + if re_blank_node.match(o): |
| 153 | + o_uri = f"_:{o}" |
| 154 | + else: |
| 155 | + o_uri = self.contract_uri(o) |
| 156 | + yield s, p, o_uri, o_value, o_datatype, o_lang |
69 | 157 |
|
70 | 158 | def ddl(self) -> str: |
71 | 159 | return DDL |
0 commit comments