Created
December 28, 2023 19:12
-
-
Save baoilleach/c7b5fe80f7abc5afbe0f822c6ffa4ff5 to your computer and use it in GitHub Desktop.
Code to tokenize a SMILES string
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import doctest
import itertools
import re
import sys
import time
# Upper bound on the number of SMILES strings taken from the input file
# for the timing run in the ``__main__`` section.
ITERATIONS = 1000000

# From IBM Research's Rxn4Chemistry:
# https://github.com/rxn4chemistry/rxn-chemutils/blob/main/src/rxn/chemutils/tokenization.py
#
# A single alternation wrapped in one capturing group, so ``findall``
# returns each matched token as a plain string.  Alternatives are tried
# left to right at each position.
SMILES_TOKENIZER_PATTERN = r"(\%\([0-9]{3}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\||\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"

# Compiled once at import time so the benchmark loop does not pay for
# pattern compilation or cache lookup on every call.
SMILES_REGEX = re.compile(SMILES_TOKENIZER_PATTERN)
def split_on_regexp(smi):
    """Tokenize a SMILES string using the precompiled ``SMILES_REGEX``.

    Returns the list of non-overlapping matches in left-to-right order.
    Characters that match none of the pattern's alternatives are skipped
    by ``findall`` rather than reported.

    >>> split_on_regexp("Cl11%11%(111)C[C@@H](Br)I")
    ['Cl', '1', '1', '%11', '%(111)', 'C', '[C@@H]', '(', 'Br', ')', 'I']
    """
    return SMILES_REGEX.findall(smi)
# Characters that cannot be emitted as a one-character token on their own:
# '[' opens a bracket atom, 'l'/'r' complete 'Cl'/'Br', '%' starts a
# multi-digit ring closure.  Everything else takes the fast path.
chars = set('[lr%')


def tokenize_v3(smi):
    """Tokenize a SMILES string with a single set-membership fast path.

    Any character not in ``chars`` is appended as a one-character token.
    The remaining cases are:

    * ``l`` / ``r``: replace the previous token with ``'Cl'`` / ``'Br'``
    * ``[``: consume up to and including the matching ``]``
    * ``%(``: consume up to and including the next ``)``
    * ``%`` otherwise: consume ``%`` plus the next two characters

    The input is assumed to be well-formed.  An ``l`` or ``r`` overwrites
    the previous token without checking what it was (IndexError if there
    is none), and an unterminated ``[`` or ``%(`` or a trailing ``%``
    indexes past the end of the string (IndexError).

    >>> tokenize_v3("Cl11%11%(111)C[C@@H](Br)I")
    ['Cl', '1', '1', '%11', '%(111)', 'C', '[C@@H]', '(', 'Br', ')', 'I']
    """
    tokens = []
    i = 0
    N = len(smi)
    while i < N:
        x = smi[i]
        if x not in chars:
            # Fast path: the character is a complete token by itself.
            tokens.append(x)
            i += 1
        elif x == 'l':
            # The preceding 'C' was already emitted; upgrade it in place.
            tokens[-1] = 'Cl'
            i += 1
        elif x == 'r':
            tokens[-1] = 'Br'
            i += 1
        elif x == '[':
            # Bracket atom: scan forward to the closing ']'.
            j = i + 1
            while smi[j] != ']':
                j += 1
            tokens.append(smi[i:j + 1])
            i = j + 1
        elif smi[i + 1] == '(':
            # x == '%': parenthesised ring closure such as '%(111)'.
            j = i
            while smi[j] != ')':
                j += 1
            tokens.append(smi[i:j + 1])
            i = j + 1
        else:
            # x == '%': two-digit ring closure such as '%11'.
            tokens.append(smi[i:i + 3])
            i += 3
    return tokens
def tokenize_v2(smi):
    """Tokenize a SMILES string, patching 'Cl'/'Br' after the fact.

    Walks the string one character at a time.  ``l`` and ``r`` replace
    the previously emitted token with ``'Cl'`` / ``'Br'``; ``[`` consumes
    through the matching ``]``; ``%(`` consumes through the next ``)``;
    any other ``%`` consumes itself plus two more characters; everything
    else is a one-character token.

    The input is assumed to be well-formed.  An ``l`` or ``r`` overwrites
    the previous token without checking what it was (IndexError if there
    is none), and an unterminated ``[`` or ``%(`` or a trailing ``%``
    indexes past the end of the string (IndexError).

    >>> tokenize_v2("Cl11%11%(111)C[C@@H](Br)I")
    ['Cl', '1', '1', '%11', '%(111)', 'C', '[C@@H]', '(', 'Br', ')', 'I']
    """
    tokens = []
    i = 0
    N = len(smi)
    while i < N:
        x = smi[i]
        if x == 'l':
            # The preceding 'C' was already emitted; upgrade it in place.
            tokens[-1] = 'Cl'
        elif x == 'r':
            tokens[-1] = 'Br'
        elif x == '[':
            # Bracket atom: scan forward to the closing ']'.
            j = i + 1
            while smi[j] != ']':
                j += 1
            tokens.append(smi[i:j + 1])
            i = j  # the shared increment below steps past ']'
        elif x == '%':
            if smi[i + 1] == '(':
                # Parenthesised ring closure such as '%(111)'.
                j = i
                while smi[j] != ')':
                    j += 1
                tokens.append(smi[i:j + 1])
                i = j  # the shared increment below steps past ')'
            else:
                # Two-digit ring closure such as '%11'.
                tokens.append(smi[i:i + 3])
                i += 2
        else:
            tokens.append(x)
        i += 1
    return tokens
def tokenize_v1(smi):
    """Tokenize a SMILES string using one-character lookahead for Cl/Br.

    Walks the string one character at a time.  ``C`` followed by ``l``
    and ``B`` followed by ``r`` are emitted as the two-character tokens
    ``'Cl'`` / ``'Br'``; ``[`` consumes through the matching ``]``;
    ``%(`` consumes through the next ``)``; any other ``%`` consumes
    itself plus two more characters; everything else is a one-character
    token.

    The input is assumed to be well-formed: an unterminated ``[`` or
    ``%(`` or a trailing ``%`` indexes past the end of the string
    (IndexError).

    >>> tokenize_v1("Cl11%11%(111)C[C@@H](Br)I")
    ['Cl', '1', '1', '%11', '%(111)', 'C', '[C@@H]', '(', 'Br', ')', 'I']
    """
    tokens = []
    i = 0
    N = len(smi)
    while i < N:
        x = smi[i]
        if x == 'C' and i + 1 < N and smi[i + 1] == 'l':
            tokens.append("Cl")
            i += 1  # skip the 'l'; the shared increment skips the 'C'
        elif x == 'B' and i + 1 < N and smi[i + 1] == 'r':
            tokens.append("Br")
            i += 1
        elif x == '[':
            # Bracket atom: scan forward to the closing ']'.
            j = i + 1
            while smi[j] != ']':
                j += 1
            tokens.append(smi[i:j + 1])
            i = j  # the shared increment below steps past ']'
        elif x == '%':
            if smi[i + 1] == '(':
                # Parenthesised ring closure such as '%(111)'.
                j = i
                while smi[j] != ')':
                    j += 1
                tokens.append(smi[i:j + 1])
                i = j  # the shared increment below steps past ')'
            else:
                # Two-digit ring closure such as '%11'.
                tokens.append(smi[i:i + 3])
                i += 2
        else:
            tokens.append(x)
        i += 1
    return tokens
def read_smiles(fname):
    """Yield the first whitespace-separated field of each line in *fname*.

    The file is read lazily, one line at a time.  Blank or
    whitespace-only lines have no first field and are skipped instead of
    raising IndexError.
    """
    with open(fname) as inp:
        for line in inp:
            fields = line.split()
            if fields:
                yield fields[0]
if __name__ == "__main__":
    # Run the docstring examples first; failures are printed before any
    # timing output.
    doctest.testmod()

    tokenizers = [tokenize_v1, tokenize_v2, tokenize_v3, split_on_regexp]
    tokenizers.reverse()  # timed in order: regex, v3, v2, v1

    # An optional first positional argument overrides the default input
    # file.  Dash-prefixed arguments are ignored here because
    # doctest.testmod() looks for "-v" in sys.argv.
    positional = [arg for arg in sys.argv[1:] if not arg.startswith("-")]
    fname = positional[0] if positional else "orig/chembl_20.smi"

    # Load the SMILES up front so file I/O is excluded from every timing.
    smiles = list(itertools.islice(read_smiles(fname), ITERATIONS))

    for tokenizer in tokenizers:
        # perf_counter is the clock intended for measuring short intervals.
        start = time.perf_counter()
        for smi in smiles:
            tokenizer(smi)
        print(time.perf_counter() - start)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment