ChEB-AI · sfluegel05 · May 6, 2026 · May 6, 2026 · May 6, 2026 · May 7, 2026
diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
@@ -526,7 +526,7 @@ class ChEBIFromList(_ChEBIDataExtractor):
 
     """
 
-    READER = dr.ChemDataReader
+    READER = dr.StaticSMILESReader
 
     def __init__(
         self,
@@ -572,7 +572,7 @@ class ChEBIOverX(_ChEBIDataExtractor):
         THRESHOLD (None): The threshold for selecting classes.
     """
 
-    READER: dr.ChemDataReader = dr.ChemDataReader
+    READER = dr.StaticSMILESReader
 
     @property
     def _name(self) -> str:
@@ -791,11 +791,8 @@ class ChEBIOver100Fingerprints(ChEBIOverXFingerprints, ChEBIOver100):
 
 
 if __name__ == "__main__":
-    dataset = ChEBIOver50Partial(
-        chebi_version=247,
-        subset="3_STAR",
-        top_class_id="36700",
-        external_data_ratio=0.5,
+    dataset = ChEBIOver50(
+        chebi_version=251,
     )
     dataset.prepare_data()
     dataset.setup()
diff --git a/chebai/preprocessing/reader.py b/chebai/preprocessing/reader.py
@@ -257,6 +257,52 @@ def _back_to_smiles(self, smiles_encoded):
         return smiles_decoded
 
 
+class StaticSMILESReader(DataReader):
+    """
+    Data reader for SMILES tokens with a static token set. Atoms are split into 5 components: isotope, element, charge, hydrogens, stereo.
+    New tokens are not added to the token file, and unknown tokens are mapped to a special index.
+    """
+
+    COLLATOR = RaggedCollator
+
+    def __init__(self, *args, **kwargs) -> None:
+        from chebai.preprocessing.smiles_tokenizer import BasicSmilesTokenizer
+
+        super().__init__(*args, **kwargs)
+        self.tokenizer = BasicSmilesTokenizer()
+
+    @classmethod
+    def name(cls) -> str:
+        """Returns the name of the data reader."""
+        return "static_smiles"
+
+    def _read_data(self, raw_data: str | Chem.Mol) -> Optional[List[int]]:
+        """Canonicalize the input with RDKit and tokenize it; None on failure."""
+        try:
+            is_text = isinstance(raw_data, str)
+            mol = Chem.MolFromSmiles(raw_data.strip()) if is_text else raw_data
+            if mol is None:
+                # MolFromSmiles returns None instead of raising on a bad SMILES.
+                raise ValueError("RDKit could not parse the input into a molecule")
+        except ValueError as e:
+            print(f"could not process {raw_data}")
+            print(f"\tError: {e}")
+            return None
+
+        try:
+            smiles = Chem.MolToSmiles(mol, canonical=True)
+        except Exception as e:
+            print(f"RDKit failed to canonicalize the SMILES: {raw_data}")
+            print(f"\t{e}")
+            return None
+        try:
+            return self.tokenizer.encode(smiles)
+        except Exception as e:
+            print(f"could not tokenize {raw_data}")
+            print(f"\tError: {e}")
+            return None
+
+
 class DeepChemDataReader(ChemDataReader):
     """
     Data reader for chemical data using DeepSMILES tokens.