Created
March 8, 2019 04:03
-
-
Save ravila4/302b9d450a212c095d6bd6e8f55bd4e9 to your computer and use it in GitHub Desktop.
Python script that parses a DrugBank XML database dump and extracts logP values
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
import xmltodict
# Extract experimental vs. calculated logP values (plus SMILES) for every
# drug in a DrugBank XML dump and write them to a CSV file.

# Output column order; must match the tuples built by extract_logp_rows().
COLUMNS = ["DrugBankID", "SMILES", "expLogP", "calcLogP"]


def _as_list(node):
    """Return *node* as a list.

    xmltodict yields a bare item for an element that occurs once and a list
    when it repeats; a missing/empty element is ``None``. Normalising here
    lets callers always iterate.
    """
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


def _primary_id(id_node):
    """Return the primary DrugBank ID text from a ``drugbank-id`` node.

    The node may be a plain string, a dict with ``#text`` (element that has
    attributes), or a list of either. The entry flagged ``@primary == "true"``
    wins; otherwise the first entry is used. Returns ``None`` if there is none.
    """
    entries = _as_list(id_node)
    chosen = next(
        (e for e in entries
         if isinstance(e, dict) and e.get("@primary") == "true"),
        entries[0] if entries else None,
    )
    return chosen.get("#text") if isinstance(chosen, dict) else chosen


def _properties(section):
    """Map property ``kind`` -> ``value`` for one ``*-properties`` section.

    Returns an empty dict when the section is absent or empty. If a kind
    occurs more than once, the last occurrence wins.
    """
    if not isinstance(section, dict):
        return {}
    found = {}
    for entry in _as_list(section.get("property")):
        if isinstance(entry, dict) and "kind" in entry:
            found[entry["kind"]] = entry.get("value")
    return found


def extract_logp_rows(drugs):
    """Collect ``(id, smiles, experimental_logp, calculated_logp)`` tuples.

    Args:
        drugs: The parsed ``doc['drugbank']['drug']`` value: a list of drug
            dicts, a single drug dict, or ``None``.

    Returns:
        A list of 4-tuples, one per drug that has a DrugBank ID, an
        experimental logP, a calculated SMILES and a calculated logP.
        Drugs missing any of these are skipped, so no value is ever carried
        over from a previously processed drug.
    """
    rows = []
    for drug in _as_list(drugs):
        if not isinstance(drug, dict):
            continue
        exp_logp = _properties(drug.get("experimental-properties")).get("logP")
        if exp_logp is None:
            # No measured value: nothing to compare against.
            continue
        calculated = _properties(drug.get("calculated-properties"))
        row = (
            _primary_id(drug.get("drugbank-id")),
            calculated.get("SMILES"),
            exp_logp,
            calculated.get("logP"),
        )
        if any(value is None for value in row):
            continue
        rows.append(row)
    return rows


def main(xml_path="full_database.xml", csv_path="logp_values.csv"):
    """Parse the DrugBank dump at *xml_path* and write logP rows to *csv_path*."""
    # Binary mode lets the XML parser honour the document's own encoding
    # declaration instead of the platform default text encoding.
    with open(xml_path, "rb") as db:
        doc = xmltodict.parse(db)
    rows = extract_logp_rows(doc["drugbank"]["drug"])
    # Passing columns= keeps the header correct even when no rows were found.
    pd.DataFrame(rows, columns=COLUMNS).to_csv(csv_path, index=False)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment