Skip to content

Instantly share code, notes, and snippets.

@lomereiter
Created May 12, 2017 14:30
Show Gist options
  • Save lomereiter/956cf91f816b332366fe354a9ade4a51 to your computer and use it in GitHub Desktop.
Save lomereiter/956cf91f816b332366fe354a9ade4a51 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ChEBI / 2016\n",
"Number of completely invalid molecular formulae (not corresponding to ANY molecule): 1700\n",
"Number of molecular formulae we can fix without re-processing (at least one molecule in the database): 1077\n",
"Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group): 2777\n",
"HMDB / 2016\n",
"Number of completely invalid molecular formulae (not corresponding to ANY molecule): 0\n",
"Number of molecular formulae we can fix without re-processing (at least one molecule in the database): 0\n",
"Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group): 0\n",
"LIPID_MAPS / 2016\n",
"Number of completely invalid molecular formulae (not corresponding to ANY molecule): 1\n",
"Number of molecular formulae we can fix without re-processing (at least one molecule in the database): 0\n",
"Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group): 1\n",
"SwissLipids / 2016\n",
"Number of completely invalid molecular formulae (not corresponding to ANY molecule): 0\n",
"Number of molecular formulae we can fix without re-processing (at least one molecule in the database): 0\n",
"Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group): 0\n"
]
}
],
"source": [
"import requests\n",
"\n",
"server = 'http://annotate.metasp.eu:5000/v1/'\n",
"# (connect, read) timeouts in seconds, so an unreachable server cannot hang the kernel forever.\n",
"REQUEST_TIMEOUT = (10, 600)\n",
"\n",
"\n",
"def fetch_data(path):\n",
"    \"\"\"GET `server + path` and return the 'data' field of the JSON reply.\n",
"\n",
"    Raises requests.HTTPError on a 4xx/5xx reply instead of failing later\n",
"    with an opaque JSON decoding error or KeyError.\n",
"    \"\"\"\n",
"    response = requests.get(server + path, timeout=REQUEST_TIMEOUT)\n",
"    response.raise_for_status()\n",
"    return response.json()['data']\n",
"\n",
"\n",
"def has_valid_inchi(molecule):\n",
"    \"\"\"Return True when the record's 'inchi' value starts with 'InChI'.\n",
"\n",
"    A missing or null 'inchi' is treated as invalid rather than raising.\n",
"    \"\"\"\n",
"    return (molecule.get('inchi') or '').startswith('InChI')\n",
"\n",
"\n",
"# Database id -> set of formulae that have no record with a valid InChI.\n",
"wtf = {}\n",
"\n",
"for db in fetch_data('databases'):\n",
"    dbId = db['id']\n",
"    print('{} / {}'.format(db['name'], db['version']))\n",
"    molecules = fetch_data('databases/{}/molecules?limit=1000000'.format(dbId))\n",
"\n",
"    # Records whose 'inchi' is not an InChI string, and the formulae they carry.\n",
"    failures = [m for m in molecules if not has_valid_inchi(m)]\n",
"    failure_mf = {m['sf'] for m in failures}\n",
"    all_mf = {m['sf'] for m in molecules}\n",
"    valid_mf = {m['sf'] for m in molecules if has_valid_inchi(m)}\n",
"\n",
"    # Computed once and reused for the report below.\n",
"    wtf[dbId] = all_mf - valid_mf\n",
"\n",
"    print(\"Number of completely invalid molecular formulae (not corresponding to ANY molecule):\", len(wtf[dbId]))\n",
"    print(\"Number of molecular formulae we can fix without re-processing (at least one molecule in the database):\", len(valid_mf & failure_mf))\n",
"    print(\"Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group):\", len(failure_mf))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1700"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(wtf[2])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#!pip install git+https://github.com/metaspace2020/[email protected] -U"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from sm_annotation_utils import sm_annotation_utils\n",
"sm = sm_annotation_utils.SMInstance()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"chebi_annot = sm._gqclient.getAnnotations({'database': 'ChEBI'})"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"384280"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(chebi_annot)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Columns are listed explicitly so the frame keeps its schema even when no annotations come back;\n",
"# otherwise an empty result yields a column-less frame and later df['fdr'] lookups raise KeyError.\n",
"ANNOTATION_COLUMNS = ['mf', 'adduct', 'ds', 'msm', 'fdr']\n",
"\n",
"# One row per annotation; None entries in the result list are skipped.\n",
"df = pd.DataFrame.from_records(\n",
"    [{\n",
"        'mf': a['sumFormula'],\n",
"        'adduct': a['adduct'],\n",
"        'ds': a['dataset']['name'],\n",
"        'msm': a['msmScore'],\n",
"        'fdr': a['fdrLevel']\n",
"    } for a in chebi_annot if a is not None],\n",
"    columns=ANNOTATION_COLUMNS\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"@ FDR = 0.05\n",
"total annotations: 12920\n",
"non-molecular annotations: 1359 (10.5%)\n",
"@ FDR = 0.1\n",
"total annotations: 44885\n",
"non-molecular annotations: 6406 (14.3%)\n",
"@ FDR = 0.2\n",
"total annotations: 116102\n",
"non-molecular annotations: 15139 (13.0%)\n",
"@ FDR = 0.5\n",
"total annotations: 384245\n",
"non-molecular annotations: 38799 (10.1%)\n"
]
}
],
"source": [
"# Key under which the ChEBI formulae were stored in `wtf`. Presumably 2 is the ChEBI database id\n",
"# (len(wtf[2]) == 1700 matches the ChEBI count reported earlier) -- verify against the server.\n",
"CHEBI_DB_ID = 2\n",
"# Looked up once instead of on every iteration.\n",
"nonmolecular_mf = wtf[CHEBI_DB_ID]\n",
"\n",
"# For each FDR cutoff, report how many annotations hit a formula with no valid molecule behind it.\n",
"for cutoff in [0.05, 0.1, 0.2, 0.5]:\n",
"    print(\"@ FDR =\", cutoff)\n",
"    subdf = df[df['fdr'] <= cutoff]\n",
"    total = len(subdf)\n",
"    print(\"total annotations:\", total)\n",
"    nonmol = subdf['mf'].isin(nonmolecular_mf).sum()\n",
"    # Guard the ratio: an empty subset would otherwise print nan% and emit a RuntimeWarning.\n",
"    share = nonmol / total * 100 if total else 0.0\n",
"    print(\"non-molecular annotations:\", nonmol, '({:.1f}%)'.format(share))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment