Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions chebai/preprocessing/datasets/chebi.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,7 @@ class ChEBIFromList(_ChEBIDataExtractor):

"""

READER = dr.ChemDataReader
READER = dr.StaticSMILESReader

def __init__(
self,
Expand Down Expand Up @@ -572,7 +572,7 @@ class ChEBIOverX(_ChEBIDataExtractor):
THRESHOLD (None): The threshold for selecting classes.
"""

READER: dr.ChemDataReader = dr.ChemDataReader
READER = dr.StaticSMILESReader

@property
def _name(self) -> str:
Expand Down Expand Up @@ -791,11 +791,8 @@ class ChEBIOver100Fingerprints(ChEBIOverXFingerprints, ChEBIOver100):


if __name__ == "__main__":
dataset = ChEBIOver50Partial(
chebi_version=247,
subset="3_STAR",
top_class_id="36700",
external_data_ratio=0.5,
dataset = ChEBIOver50(
chebi_version=251,
)
dataset.prepare_data()
dataset.setup()
46 changes: 46 additions & 0 deletions chebai/preprocessing/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,52 @@ def _back_to_smiles(self, smiles_encoded):
return smiles_decoded


class StaticSMILESReader(DataReader):
    """
    Data reader for SMILES tokens with a static token set. Atoms are split into 5 components: isotope, element, charge, hydrogens, stereo.
    New tokens are not added to the token file, and unknown tokens are mapped to a special index.

    Each input is converted to an RDKit molecule (if it is not one already),
    written back out as canonical SMILES, and then encoded with
    ``BasicSmilesTokenizer``.
    """

    COLLATOR = RaggedCollator

    def __init__(self, *args, **kwargs) -> None:
        """
        Initialise the base reader and create the tokenizer.

        All positional and keyword arguments are forwarded unchanged to the
        parent class initialiser.
        """
        # Function-scope import: the tokenizer module is only loaded when this
        # reader is actually instantiated.
        from chebai.preprocessing.smiles_tokenizer import BasicSmilesTokenizer

        super().__init__(*args, **kwargs)
        self.tokenizer = BasicSmilesTokenizer()

    @classmethod
    def name(cls) -> str:
        """Returns the name of the data reader."""
        return "static_smiles"

    def _read_data(self, raw_data: str | Chem.Mol) -> Optional[List[int]]:
        """
        Tokenize raw SMILES data using BasicSmilesTokenizer with static vocabulary.

        Args:
            raw_data: A SMILES string (surrounding whitespace is stripped) or
                an RDKit molecule.

        Returns:
            The value returned by ``self.tokenizer.encode`` for the canonical
            SMILES, or ``None`` if parsing, canonicalisation or tokenisation
            fails. Failures are reported via ``print`` and never raised.
        """
        # Step 1: obtain an RDKit molecule.
        try:
            if isinstance(raw_data, str):
                mol = Chem.MolFromSmiles(raw_data.strip())
            else:
                mol = raw_data
        except ValueError as e:
            print(f"could not process {raw_data}")
            print(f"\tError: {e}")
            return None

        # RDKit reports an unparseable SMILES by returning None rather than by
        # raising, so the failure has to be detected explicitly. Without this
        # guard the None would only fail later inside MolToSmiles and be
        # reported as a canonicalisation problem.
        if mol is None:
            print(f"could not process {raw_data}")
            print("\tError: RDKit could not parse the input into a molecule")
            return None

        # Step 2: canonicalise, so equivalent inputs yield the same token sequence.
        try:
            smiles = Chem.MolToSmiles(mol, canonical=True)
        except Exception as e:
            print(f"RDKit failed to canonicalize the SMILES: {raw_data}")
            print(f"\t{e}")
            return None

        # Step 3: encode the canonical SMILES with the static vocabulary.
        try:
            return self.tokenizer.encode(smiles)
        except Exception as e:
            print(f"could not tokenize {raw_data}")
            print(f"\tError: {e}")
            return None


class DeepChemDataReader(ChemDataReader):
"""
Data reader for chemical data using DeepSMILES tokens.
Expand Down
Loading
Loading