INCATools
diff --git a/‎src/rdf_sql_bulkloader/cli.py‎
Lines changed: 0 additions & 1 deletion b/‎src/rdf_sql_bulkloader/cli.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/rdf_sql_bulkloader/loaders/bulkloader.py‎
Lines changed: 102 additions & 14 deletions b/‎src/rdf_sql_bulkloader/loaders/bulkloader.py‎
Lines changed: 102 additions & 14 deletions
diff --git a/‎src/rdf_sql_bulkloader/loaders/sqlite3_bulkloader.py‎
Lines changed: 4 additions & 1 deletion b/‎src/rdf_sql_bulkloader/loaders/sqlite3_bulkloader.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎src/rdf_sql_bulkloader/main.py‎
Lines changed: 0 additions & 4 deletions b/‎src/rdf_sql_bulkloader/main.py‎
Lines changed: 0 additions & 4 deletions
diff --git a/‎tests/__init__.py‎
Lines changed: 4 additions & 0 deletions b/‎tests/__init__.py‎
Lines changed: 4 additions & 0 deletions
@@ -3,7 +3,6 @@
 import logging
 
 from rdf_sql_bulkloader import __version__
-from rdf_sql_bulkloader.main import demo
 
 __all__ = [
     "main",
 
@@ -1,14 +1,19 @@
 import re
 from abc import ABC
+from collections import defaultdict
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Union, Tuple, Iterator, Mapping
+from typing import Union, Tuple, Iterator, Mapping, Optional
 
+import curies
 import lightrdf
+from curies import Converter
 
 re_untyped_literal = re.compile(r'^"(.*)"$')
-re_typed_literal = re.compile(r'^"(.*)"^^<([\S^"]+)>$')
+re_typed_literal = re.compile(r'^"(.*)"\^\^<([\S^"]+)>$')
-re_lang_literal = re.compile(r'^"(.*)"@(\w+)$')
+# language tags may carry hyphenated subtags (e.g. en-GB), so accept "-" as well as word characters;
+# otherwise such literals match none of the patterns and literal parsing fails
+re_lang_literal = re.compile(r'^"(.*)"@([\w-]+)$')
+re_blank_node = re.compile(r'^riog(\d+)$')
+
 
 DDL = """
 CREATE TABLE statement (
@@ -35,37 +40,120 @@
 STATEMENT = Tuple[SUBJECT, PREDICATE, OBJECT_URI, OBJECT_VALUE, OBJECT_DATATYPE, OBJECT_LANG]
 PREFIX_MAP = Mapping[PREFIX, URI]
 
+
+def _parse_literal(o: str) -> Tuple[OBJECT_VALUE, OBJECT_DATATYPE, OBJECT_LANG]:
+    """
+    Split an encoded literal into its value, datatype and language parts.
+
+    The typed form is tried first, then the language-tagged form, then the plain quoted form.
+
+    See https://github.com/ozekik/lightrdf/issues/12
+    :param o: the literal in its encoded string form
+    :return: (value, datatype, language); datatype and language are None when absent
+    :raises ValueError: if ``o`` matches none of the three literal patterns
+    """
+    typed = re_typed_literal.match(o)
+    if typed:
+        return typed.group(1), typed.group(2), None
+    tagged = re_lang_literal.match(o)
+    if tagged:
+        return tagged.group(1), None, tagged.group(2)
+    plain = re_untyped_literal.match(o)
+    if not plain:
+        raise ValueError(f"Cannot parse {o}")
+    return plain.group(1), None, None
+
+def _parse_literal_as_value(o: str) -> str:
+    """
+    Return only the value part of an encoded literal, discarding datatype and language.
+
+    :param o: the literal in its encoded string form
+    :return: the literal's value
+    """
+    value, _datatype, _lang = _parse_literal(o)
+    return value
+
 @dataclass
 class BulkLoader(ABC):
+    """
+    Base class for all bulk loaders
+    """
     path: str
     prefix_map: PREFIX_MAP = None
+    converter: Converter = None
+    index_statements = False
+    
+    def __post_init__(self):
+        """
+        Build the CURIE converter from ``prefix_map`` when a non-empty one was supplied.
+        """
+        if not self.prefix_map:
+            return
+        self.converter = Converter.from_prefix_map(self.prefix_map)
 
     def bulkload(self, path: str):
-        raise NotImplemented
+        """
+        Load the RDF document at ``path``; concrete loaders must override this.
+
+        :param path: location of the RDF file to load
+        :raises NotImplementedError: always, in this base class
+        """
+        # NotImplemented is a comparison sentinel, not an exception; raising it gives a TypeError
+        raise NotImplementedError
 
+    def contract_uri(self, uri: Optional[URI]) -> Optional[str]:
+        """
+        Shorten a URI to a CURIE when the converter can compress it.
+
+        :param uri: URI to contract; None is passed through
+        :return: the CURIE when compression yields one, otherwise ``uri`` unchanged
+        """
+        if uri is None:
+            return None
+        if not self.converter:
+            return uri
+        # fall back to the full URI when compress() yields nothing
+        return self.converter.compress(uri) or uri
+
     def statements(self, path: Union[Path, str]) -> Iterator[STATEMENT]:
+        """
+        Yield one statement row per triple in the RDF document at ``path``.
+
+        The document is scanned twice. The first pass collects SHACL prefix
+        declarations (and, when ``index_statements`` is set, reification and
+        OWL axiom annotation triples); any prefixes found are merged into
+        ``prefix_map`` without overriding existing entries, and the converter
+        is rebuilt. The second pass yields the rows, contracting subject,
+        predicate and object URIs via ``contract_uri``.
+
+        :param path: location of the RDF file
+        :return: iterator of (subject, predicate, object_uri, value, datatype, language)
+        :raises ValueError: if a literal cannot be parsed
+        """
         doc = lightrdf.RDFDocument(str(path))
+
+        # First pass pre-processing
+        # index shacl prefixes and reified statements
+        # note we use lists as a proxy for mutable tuples here;
+        # this may not be the cleanest way but it should hopefully be fast
+        # prefix_node_map slots: [prefix, namespace]
+        prefix_node_map = defaultdict(lambda: [None, None])
+        # statement_node_map slots: [subject, predicate, object]; three slots are
+        # required because index 2 is assigned below (collected here, not read in this method)
+        statement_node_map = defaultdict(lambda: [None, None, None])
+        for s, p, o in doc.search_triples(None, None, None):
+            if p == 'http://www.w3.org/ns/shacl#prefix':
+                prefix_node_map[s][0] = _parse_literal_as_value(o)
+            elif p == 'http://www.w3.org/ns/shacl#namespace':
+                prefix_node_map[s][1] = _parse_literal_as_value(o)
+            elif self.index_statements:
+                # this is optional as it may be inefficient to do at the python level
+                if p == "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject":
+                    statement_node_map[s][0] = o
+                elif p == "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate":
+                    statement_node_map[s][1] = o
+                elif p == "http://www.w3.org/1999/02/22-rdf-syntax-ns#object":
+                    statement_node_map[s][2] = o
+                elif p == "http://www.w3.org/2002/07/owl#annotatedSource":
+                    statement_node_map[s][0] = o
+                elif p == "http://www.w3.org/2002/07/owl#annotatedProperty":
+                    statement_node_map[s][1] = o
+                elif p == "http://www.w3.org/2002/07/owl#annotatedTarget":
+                    statement_node_map[s][2] = o
+        if prefix_node_map:
+            if self.prefix_map is None:
+                self.prefix_map = {}
+            for prefix, ns in prefix_node_map.values():
+                # a node carrying only one of sh:prefix / sh:namespace is incomplete;
+                # skip it rather than registering None as a prefix or namespace
+                if prefix is None or ns is None:
+                    continue
+                # prefixes supplied by the caller take precedence over those in the document
+                if prefix not in self.prefix_map:
+                    self.prefix_map[prefix] = ns
+            self.converter = Converter.from_prefix_map(self.prefix_map)
+
         # this code could be reduced if https://github.com/ozekik/lightrdf/issues/12 is implemented
         for t in doc.search_triples(None, None, None):
+            s = self.contract_uri(t[0])
+            p = self.contract_uri(t[1])
             o = t[2]
             o_uri = None
             o_datatype = None
             o_lang = None
             if o.startswith('"'):
-                o_value = o
-                result = re_typed_literal.match(o)
-                if result:
-                    o_value = result.group(1)
-                    o_datatype = result.group(2)
-                else:
-                    result = re_lang_literal.match(o)
-                    if result:
-                        o_value = result.group(1)
-                        o_lang = result.group(2)
+                o_value, o_datatype, o_lang = _parse_literal(o)
             else:
                 o_value = None
-                o_uri = o
-            yield t[0], t[1], o_uri, o_value, o_datatype, o_lang
+                if re_blank_node.match(o):
+                    o_uri = f"_:{o}"
+                else:
+                    o_uri = self.contract_uri(o)
+            yield s, p, o_uri, o_value, o_datatype, o_lang
 
     def ddl(self) -> str:
+        """
+        Return the module-level DDL string, which creates the ``statement`` table.
+        """
         return DDL
@@ -1,10 +1,13 @@
 import sqlite3
+from dataclasses import dataclass
 from typing import Any
 
 from rdf_sql_bulkloader.loaders.bulkloader import BulkLoader
 
 COLS = ["subject", "predicate", "object", "value", "datatype", "language"]
 
+
+@dataclass
 class SqliteBulkloader(BulkLoader):
     connection: Any = None
 
@@ -18,4 +21,4 @@ def bulkload(self, path: str):
         print(len(tuples))
         colstr = ",".join(COLS)
         qs = ",".join(["?" for _ in COLS])
-        con.executemany(f"insert into statement({colstr}) values ({qs})", tuples)
+        con.executemany(f"insert into statement({colstr}) values ({qs})", tuples)
@@ -1,6 +1,10 @@
 import os
 from pathlib import Path
 
+TEST_PREFIX_MAP = {
+    "GO": "http://purl.obolibrary.org/obo/GO_"
+}
+
 ROOT = os.path.abspath(os.path.dirname(__file__))
 INPUT_DIR = Path(ROOT) / "input"
 OUTPUT_DIR = Path(ROOT) / "output"