lomereiter · May 12, 2017 14:30
diff --git a/non-molecules.ipynb b/non-molecules.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ChEBI / 2016\n",
      "Number of completely invalid molecular formulae (not corresponding to ANY molecule): 1700\n",
      "Number of molecular formulae we can fix without re-processing (at least one molecule in the database): 1077\n",
      "Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group): 2777\n",
      "HMDB / 2016\n",
      "Number of completely invalid molecular formulae (not corresponding to ANY molecule): 0\n",
      "Number of molecular formulae we can fix without re-processing (at least one molecule in the database): 0\n",
      "Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group): 0\n",
      "LIPID_MAPS / 2016\n",
      "Number of completely invalid molecular formulae (not corresponding to ANY molecule): 1\n",
      "Number of molecular formulae we can fix without re-processing (at least one molecule in the database): 0\n",
      "Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group): 1\n",
      "SwissLipids / 2016\n",
      "Number of completely invalid molecular formulae (not corresponding to ANY molecule): 0\n",
      "Number of molecular formulae we can fix without re-processing (at least one molecule in the database): 0\n",
      "Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group): 0\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "\n",
    "server = 'http://annotate.metasp.eu:5000/v1/'\n",
    "\n",
    "wtf = {}\n",
    "\n",
    "for db in requests.get(server + 'databases').json()['data']:\n",
    "    dbId = db['id']\n",
    "    print('{} / {}'.format(db['name'], db['version']))\n",
    "    molecules = requests.get(server + 'databases/{}/molecules?limit=1000000'.format(dbId)).json()['data']\n",
    "    \n",
    "    failures = [m for m in molecules if not m['inchi'].startswith('InChI')]\n",
    "    failure_mf = {m['sf'] for m in failures}\n",
    "    all_mf = {m['sf'] for m in molecules}\n",
    "    valid_mf = {m['sf'] for m in molecules if m['inchi'].startswith('InChI')}\n",
    "    \n",
    "    wtf[dbId] = all_mf.difference(valid_mf)\n",
    "\n",
    "    print(\"Number of completely invalid molecular formulae (not corresponding to ANY molecule):\", len(all_mf.difference(valid_mf)))\n",
    "    print(\"Number of molecular formulae we can fix without re-processing (at least one molecule in the database):\", len(valid_mf.intersection(failure_mf)))\n",
    "    print(\"Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group):\", len(failure_mf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1700"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(wtf[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install git+https://github.com/metaspace2020/[email protected] -U"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sm_annotation_utils import sm_annotation_utils\n",
    "sm = sm_annotation_utils.SMInstance()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "chebi_annot = sm._gqclient.getAnnotations({'database': 'ChEBI'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "384280"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(chebi_annot)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame.from_records(\n",
    "    [{\n",
    "        'mf': a['sumFormula'], \n",
    "        'adduct': a['adduct'], \n",
    "        'ds': a['dataset']['name'], \n",
    "        'msm': a['msmScore'],\n",
    "        'fdr': a['fdrLevel']\n",
    "    } for a in chebi_annot if a is not None]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "@ FDR = 0.05\n",
      "total annotations: 12920\n",
      "non-molecular annotations: 1359 (10.5%)\n",
      "@ FDR = 0.1\n",
      "total annotations: 44885\n",
      "non-molecular annotations: 6406 (14.3%)\n",
      "@ FDR = 0.2\n",
      "total annotations: 116102\n",
      "non-molecular annotations: 15139 (13.0%)\n",
      "@ FDR = 0.5\n",
      "total annotations: 384245\n",
      "non-molecular annotations: 38799 (10.1%)\n"
     ]
    }
   ],
   "source": [
    "for cutoff in [0.05, 0.1, 0.2, 0.5]:\n",
    "    print(\"@ FDR =\", cutoff)\n",
    "    subdf = df[df['fdr'] <= cutoff]\n",
    "    total = len(subdf)\n",
    "    print(\"total annotations:\", len(subdf))\n",
    "    nonmol = subdf['mf'].isin(wtf[2]).sum()\n",
    "    print(\"non-molecular annotations:\", nonmol, '({:.1f}%)'.format(nonmol / total * 100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"ChEBI / 2016\n",
	"Number of completely invalid molecular formulae (not corresponding to ANY molecule): 1700\n",
	"Number of molecular formulae we can fix without re-processing (at least one molecule in the database): 1077\n",
	"Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group): 2777\n",
	"HMDB / 2016\n",
	"Number of completely invalid molecular formulae (not corresponding to ANY molecule): 0\n",
	"Number of molecular formulae we can fix without re-processing (at least one molecule in the database): 0\n",
	"Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group): 0\n",
	"LIPID_MAPS / 2016\n",
	"Number of completely invalid molecular formulae (not corresponding to ANY molecule): 1\n",
	"Number of molecular formulae we can fix without re-processing (at least one molecule in the database): 0\n",
	"Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group): 1\n",
	"SwissLipids / 2016\n",
	"Number of completely invalid molecular formulae (not corresponding to ANY molecule): 0\n",
	"Number of molecular formulae we can fix without re-processing (at least one molecule in the database): 0\n",
	"Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group): 0\n"
	]
	}
	],
	"source": [
	"import requests\n",
	"\n",
	"server = 'http://annotate.metasp.eu:5000/v1/'\n",
	"\n",
	"wtf = {}\n",
	"\n",
	"for db in requests.get(server + 'databases').json()['data']:\n",
	" dbId = db['id']\n",
	" print('{} / {}'.format(db['name'], db['version']))\n",
	" molecules = requests.get(server + 'databases/{}/molecules?limit=1000000'.format(dbId)).json()['data']\n",
	" \n",
	" failures = [m for m in molecules if not m['inchi'].startswith('InChI')]\n",
	" failure_mf = {m['sf'] for m in failures}\n",
	" all_mf = {m['sf'] for m in molecules}\n",
	" valid_mf = {m['sf'] for m in molecules if m['inchi'].startswith('InChI')}\n",
	" \n",
	" wtf[dbId] = all_mf.difference(valid_mf)\n",
	"\n",
	" print(\"Number of completely invalid molecular formulae (not corresponding to ANY molecule):\", len(all_mf.difference(valid_mf)))\n",
	" print(\"Number of molecular formulae we can fix without re-processing (at least one molecule in the database):\", len(valid_mf.intersection(failure_mf)))\n",
	" print(\"Number of molecular formulae that need to be fixed (at least one 'molecule' is in fact a group):\", len(failure_mf))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1700"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(wtf[2])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"#!pip install git+https://github.com/metaspace2020/[email protected] -U"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"from sm_annotation_utils import sm_annotation_utils\n",
	"sm = sm_annotation_utils.SMInstance()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [],
	"source": [
	"chebi_annot = sm._gqclient.getAnnotations({'database': 'ChEBI'})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"384280"
	]
	},
	"execution_count": 18,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(chebi_annot)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {},
	"outputs": [],
	"source": [
	"df = pd.DataFrame.from_records(\n",
	" [{\n",
	" 'mf': a['sumFormula'], \n",
	" 'adduct': a['adduct'], \n",
	" 'ds': a['dataset']['name'], \n",
	" 'msm': a['msmScore'],\n",
	" 'fdr': a['fdrLevel']\n",
	" } for a in chebi_annot if a is not None]\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"@ FDR = 0.05\n",
	"total annotations: 12920\n",
	"non-molecular annotations: 1359 (10.5%)\n",
	"@ FDR = 0.1\n",
	"total annotations: 44885\n",
	"non-molecular annotations: 6406 (14.3%)\n",
	"@ FDR = 0.2\n",
	"total annotations: 116102\n",
	"non-molecular annotations: 15139 (13.0%)\n",
	"@ FDR = 0.5\n",
	"total annotations: 384245\n",
	"non-molecular annotations: 38799 (10.1%)\n"
	]
	}
	],
	"source": [
	"for cutoff in [0.05, 0.1, 0.2, 0.5]:\n",
	" print(\"@ FDR =\", cutoff)\n",
	" subdf = df[df['fdr'] <= cutoff]\n",
	" total = len(subdf)\n",
	" print(\"total annotations:\", len(subdf))\n",
	" nonmol = subdf['mf'].isin(wtf[2]).sum()\n",
	" print(\"non-molecular annotations:\", nonmol, '({:.1f}%)'.format(nonmol / total * 100))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}