@baoilleach
Created December 28, 2023 19:12
Code to tokenize a SMILES string
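
The file below compares three hand-written tokenizers (tokenize_v1, tokenize_v2, tokenize_v3) against a regex-based splitter taken from IBM Research's rxn-chemutils, timing each over the first million SMILES of a ChEMBL 20 file (orig/chembl_20.smi), as set up in the __main__ block at the end. All four return the same token list; the doctests check that "Cl11%11%(111)C[C@@H](Br)I" splits into ['Cl', '1', '1', '%11', '%(111)', 'C', '[C@@H]', '(', 'Br', ')', 'I'].
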
import re
import time
import itertools
import doctest
ITERATIONS = 1000000
# From IBM Research's Rxn4Chemistry:
# https://github.com/rxn4chemistry/rxn-chemutils/blob/main/src/rxn/chemutils/tokenization.py
SMILES_TOKENIZER_PATTERN = r"(\%\([0-9]{3}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\||\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
SMILES_REGEX = re.compile(SMILES_TOKENIZER_PATTERN)
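
# re.findall returns non-overlapping matches left to right, so for a SMILES
# covered by the pattern the matches are exactly the tokens, in order.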
def split_on_regexp(smi):
    """
    >>> split_on_regexp("Cl11%11%(111)C[C@@H](Br)I")
    ['Cl', '1', '1', '%11', '%(111)', 'C', '[C@@H]', '(', 'Br', ')', 'I']
    """
    return SMILES_REGEX.findall(smi)
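
# Characters needing special handling in the hand-rolled tokenizer below:
# 'l'/'r' are the second letters of Cl/Br, '[' opens a bracket atom, and
# '%' starts a %nn or %(nnn) ring closure. Anything else is a one-character token.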
chars = set('[lr%')

def tokenize_v3(smi):
    """
    >>> tokenize_v3("Cl11%11%(111)C[C@@H](Br)I")
    ['Cl', '1', '1', '%11', '%(111)', 'C', '[C@@H]', '(', 'Br', ')', 'I']
    """
    tokens = []
    i = 0
    N = len(smi)
    while i < N:
        x = smi[i]
        if x not in chars:
            tokens.append(x)
            i += 1
        else:
            if x == 'l':
                tokens[-1] = 'Cl'
                i += 1
            elif x == 'r':
                tokens[-1] = 'Br'
                i += 1
            elif x == '[':
                j = i + 1
                while smi[j] != ']':
                    j += 1
                tokens.append(smi[i:j+1])
                i += j - i + 1
            else:  # x == '%'
                if smi[i+1] == '(':
                    j = i
                    while smi[j] != ')':
                        j += 1
                    tokens.append(smi[i:j+1])
                    i += j - i + 1
                else:
                    tokens.append(smi[i:i+3])
                    i += 3
    return tokens
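
# Variant that dispatches on every character with a single if/elif chain; the
# shared i += 1 at the end of the loop body advances past the current
# character, so each branch only adds any extra advance it needs.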
def tokenize_v2(smi):
    """
    >>> tokenize_v2("Cl11%11%(111)C[C@@H](Br)I")
    ['Cl', '1', '1', '%11', '%(111)', 'C', '[C@@H]', '(', 'Br', ')', 'I']
    """
    tokens = []
    i = 0
    N = len(smi)
    while i < N:
        x = smi[i]
        if x == 'l':
            tokens[-1] = 'Cl'
        elif x == 'r':
            tokens[-1] = 'Br'
        elif x == '[':
            j = i + 1
            while smi[j] != ']':
                j += 1
            tokens.append(smi[i:j+1])
            i += j - i
        elif x == '%':
            if smi[i+1] == '(':
                j = i
                while smi[j] != ')':
                    j += 1
                tokens.append(smi[i:j+1])
                i += j - i
            else:
                tokens.append(smi[i:i+3])
                i += 2
        else:
            tokens.append(x)
        i += 1
    return tokens
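
# Variant that looks ahead from 'C'/'B' to detect 'Cl'/'Br' up front, rather
# than patching the previous token when the second letter is seen.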
def tokenize_v1(smi):
    """
    >>> tokenize_v1("Cl11%11%(111)C[C@@H](Br)I")
    ['Cl', '1', '1', '%11', '%(111)', 'C', '[C@@H]', '(', 'Br', ')', 'I']
    """
    tokens = []
    i = 0
    N = len(smi)
    while i < N:
        x = smi[i]
        if x == 'C' and i+1 < N and smi[i+1] == 'l':
            tokens.append("Cl")
            i += 1
        elif x == 'B' and i+1 < N and smi[i+1] == 'r':
            tokens.append("Br")
            i += 1
        elif x == '[':
            j = i + 1
            while smi[j] != ']':
                j += 1
            tokens.append(smi[i:j+1])
            i += j - i
        elif x == '%':
            if smi[i+1] == '(':
                j = i
                while smi[j] != ')':
                    j += 1
                tokens.append(smi[i:j+1])
                i += j - i
            else:
                tokens.append(smi[i:i+3])
                i += 2
        else:
            tokens.append(x)
        i += 1
    return tokens
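
# Yield the SMILES (the first whitespace-separated field) from each line of a .smi file.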
def read_smiles(fname):
    with open(fname) as inp:
        for line in inp:
            yield line.split()[0]

if __name__ == "__main__":
    doctest.testmod()

    tokenizers = [tokenize_v1, tokenize_v2, tokenize_v3, split_on_regexp]
    tokenizers.reverse()

    fname = "orig/chembl_20.smi"
    miter = read_smiles(fname)
    smiles = list(itertools.islice(miter, ITERATIONS))

    for tokenizer in tokenizers:
        t = time.time()
        for smi in smiles:
            tokens = tokenizer(smi)
        print(time.time() - t)
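
For reference, a minimal sketch of calling one of the tokenizers on its own; the SMILES here (chlorobenzene) is an arbitrary example and not part of the gist, and any of the four functions above can be swapped in:

tokens = tokenize_v3("c1ccccc1Cl")
print(tokens)  # ['c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'Cl']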