Instantly share code, notes, and snippets.
Last active
January 11, 2021 22:41
-
Star
0
(0)
You must be signed in to star a gist -
Fork
1
(1)
You must be signed in to fork a gist
-
Save macieksk/a139d451e8de3b71225d884671b45915 to your computer and use it in GitHub Desktop.
Download authors and their affiliations from PubMed articles using the R packages RISmed, rentrez, and XML
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#Download authors and their affiliations from PubMed articles\n", | |
"#\n", | |
"#Author: Maciek Sykulski ([email protected]), partially based on solution by Tim Read\n", | |
"#\n", | |
"#Uses IRkernel in Jupyter Notebook https://github.com/IRkernel/IRkernel\n", | |
"#" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"8" | |
], | |
"text/latex": [ | |
"8" | |
], | |
"text/markdown": [ | |
"8" | |
], | |
"text/plain": [ | |
"[1] 8" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#install.packages(\"RISmed\",repos='http://cran.us.r-project.org')\n", | |
"library(\"RISmed\")\n", | |
"\n", | |
"res <- EUtilsSummary('Sykulski M[author]', type='esearch', db='pubmed', mindate='2011', maxdate='2016')\n", | |
"\n", | |
"QueryCount(res)\n", | |
"#fetch <- EUtilsGet(res)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 234, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#attributes(res)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<ol class=list-inline>\n", | |
"\t<li>'XMLInternalDocument'</li>\n", | |
"\t<li>'XMLAbstractDocument'</li>\n", | |
"</ol>\n" | |
], | |
"text/latex": [ | |
"\\begin{enumerate*}\n", | |
"\\item 'XMLInternalDocument'\n", | |
"\\item 'XMLAbstractDocument'\n", | |
"\\end{enumerate*}\n" | |
], | |
"text/markdown": [ | |
"1. 'XMLInternalDocument'\n", | |
"2. 'XMLAbstractDocument'\n", | |
"\n", | |
"\n" | |
], | |
"text/plain": [ | |
"[1] \"XMLInternalDocument\" \"XMLAbstractDocument\"" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#install.packages(\"rentrez\",repos='http://cran.us.r-project.org')\n", | |
"library(rentrez)\n", | |
"\n", | |
"your.ids <- attributes(res)$PMID #c(\"26386083\",\"26273372\",\"26066373\",\"25837167\",\"25466451\",\"25013473\")\n", | |
"# Use rentrez to fetch the PubMed records for these PMIDs as a parsed XML document\n", | |
"fetch.pubmed <- entrez_fetch(db = \"pubmed\", id = your.ids,\n", | |
" rettype = \"xml\", parsed = TRUE)\n", | |
"class(fetch.pubmed)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<ol>\n", | |
"\t<li>'Gene Expression Profile of the Clinically Aggressive Micropapillary Variant of Bladder Cancer.'</li>\n", | |
"\t<li>'Spaced seeds improve k-mer-based metagenomic classification.'</li>\n", | |
"\t<li>'Application of array comparative genomic hybridization in 256 patients with developmental delay or intellectual disability.'</li>\n", | |
"\t<li>'Multiple samples aCGH analysis for rare CNVs detection.'</li>\n", | |
"\t<li>'Functional performance of aCGH design for clinical cytogenetics.'</li>\n", | |
"\t<li>'Assessment of the role of copy-number variants in 150 patients with congenital heart defects.'</li>\n", | |
"\t<li>'Application of custom-designed oligonucleotide array CGH in 145 patients with autistic spectrum disorders.'</li>\n", | |
"\t<li>'Application of array comparative genomic hybridization in 102 patients with epilepsy and additional neurodevelopmental disorders.'</li>\n", | |
"</ol>\n" | |
], | |
"text/latex": [ | |
"\\begin{enumerate}\n", | |
"\\item 'Gene Expression Profile of the Clinically Aggressive Micropapillary Variant of Bladder Cancer.'\n", | |
"\\item 'Spaced seeds improve k-mer-based metagenomic classification.'\n", | |
"\\item 'Application of array comparative genomic hybridization in 256 patients with developmental delay or intellectual disability.'\n", | |
"\\item 'Multiple samples aCGH analysis for rare CNVs detection.'\n", | |
"\\item 'Functional performance of aCGH design for clinical cytogenetics.'\n", | |
"\\item 'Assessment of the role of copy-number variants in 150 patients with congenital heart defects.'\n", | |
"\\item 'Application of custom-designed oligonucleotide array CGH in 145 patients with autistic spectrum disorders.'\n", | |
"\\item 'Application of array comparative genomic hybridization in 102 patients with epilepsy and additional neurodevelopmental disorders.'\n", | |
"\\end{enumerate}\n" | |
], | |
"text/markdown": [ | |
"1. 'Gene Expression Profile of the Clinically Aggressive Micropapillary Variant of Bladder Cancer.'\n", | |
"2. 'Spaced seeds improve k-mer-based metagenomic classification.'\n", | |
"3. 'Application of array comparative genomic hybridization in 256 patients with developmental delay or intellectual disability.'\n", | |
"4. 'Multiple samples aCGH analysis for rare CNVs detection.'\n", | |
"5. 'Functional performance of aCGH design for clinical cytogenetics.'\n", | |
"6. 'Assessment of the role of copy-number variants in 150 patients with congenital heart defects.'\n", | |
"7. 'Application of custom-designed oligonucleotide array CGH in 145 patients with autistic spectrum disorders.'\n", | |
"8. 'Application of array comparative genomic hybridization in 102 patients with epilepsy and additional neurodevelopmental disorders.'\n", | |
"\n", | |
"\n" | |
], | |
"text/plain": [ | |
"[[1]]\n", | |
"[1] \"Gene Expression Profile of the Clinically Aggressive Micropapillary Variant of Bladder Cancer.\"\n", | |
"\n", | |
"[[2]]\n", | |
"[1] \"Spaced seeds improve k-mer-based metagenomic classification.\"\n", | |
"\n", | |
"[[3]]\n", | |
"[1] \"Application of array comparative genomic hybridization in 256 patients with developmental delay or intellectual disability.\"\n", | |
"\n", | |
"[[4]]\n", | |
"[1] \"Multiple samples aCGH analysis for rare CNVs detection.\"\n", | |
"\n", | |
"[[5]]\n", | |
"[1] \"Functional performance of aCGH design for clinical cytogenetics.\"\n", | |
"\n", | |
"[[6]]\n", | |
"[1] \"Assessment of the role of copy-number variants in 150 patients with congenital heart defects.\"\n", | |
"\n", | |
"[[7]]\n", | |
"[1] \"Application of custom-designed oligonucleotide array CGH in 145 patients with autistic spectrum disorders.\"\n", | |
"\n", | |
"[[8]]\n", | |
"[1] \"Application of array comparative genomic hybridization in 102 patients with epilepsy and additional neurodevelopmental disorders.\"\n" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"#install.packages(\"XML\",repos='http://cran.us.r-project.org')\n", | |
"library(XML)\n", | |
"xpathApply(fetch.pubmed, '//PubmedArticle//MedlineCitation//Article//ArticleTitle',xmlValue)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Extract each author's name and first listed affiliation for the fetched PMIDs.\n", | |
"affil = xpathApply(fetch.pubmed, '//PubmedArticle//MedlineCitation//Article//AuthorList//Author',\n", | |
" function(x)xmlChildren(x))\n", | |
" \n", | |
"getAuthorAffil<-function(x)cbind(paste(xmlValue(x$ForeName),\n", | |
" xmlValue(x$LastName)),\n", | |
" ifelse(is.null(x[[\"AffiliationInfo\"]]),NA,\n", | |
" xmlValue(xmlChildren(x[[\"AffiliationInfo\"]])$Affiliation))\n", | |
" ) \n", | |
"affil<-do.call(rbind,lapply(affil,function(x)as.data.frame(getAuthorAffil(x),stringsAsFactors=FALSE)))\n", | |
"colnames(affil)<-c(\"name\",\"affil\") " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<ol class=list-inline>\n", | |
"\t<li>25</li>\n", | |
"\t<li>2</li>\n", | |
"</ol>\n" | |
], | |
"text/latex": [ | |
"\\begin{enumerate*}\n", | |
"\\item 25\n", | |
"\\item 2\n", | |
"\\end{enumerate*}\n" | |
], | |
"text/markdown": [ | |
"1. 25\n", | |
"2. 2\n", | |
"\n", | |
"\n" | |
], | |
"text/plain": [ | |
"[1] 25 2" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<table>\n", | |
"<thead><tr><th></th><th scope=col>name</th><th scope=col>affil</th></tr></thead>\n", | |
"<tbody>\n", | |
"\t<tr><th scope=row>1</th><td>Anna Gambin </td><td>1. Institute of Informatics, University of Warsaw, Warsaw, Poland.</td></tr>\n", | |
"\t<tr><th scope=row>2</th><td>Arlene Siefker-Radtke </td><td>1. Department of Genitourinary Medical Oncology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.</td></tr>\n", | |
"\t<tr><th scope=row>3</th><td>Ashish Madhav Kamat </td><td>1. Department of Urology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.</td></tr>\n", | |
"\t<tr><th scope=row>4</th><td>Barbara Wiśniowiecka-Kowalnik </td><td>1. Department of Medical Genetics, Institute of Mother and Child, Warsaw, Poland.</td></tr>\n", | |
"\t<tr><th scope=row>5</th><td>Bogdan Czerniak </td><td>1. Department of Pathology, University of Texas MD Anderson Cancer Center, Houston, TX, USA. Electronic address: [email protected].</td></tr>\n", | |
"\t<tr><th scope=row>6</th><td>Charles Chuanhai Guo </td><td>1. Department of Pathology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.</td></tr>\n", | |
"</tbody>\n", | |
"</table>\n" | |
], | |
"text/latex": [ | |
"\\begin{tabular}{r|ll}\n", | |
" & name & affil\\\\\n", | |
"\\hline\n", | |
"\t1 & Anna Gambin & 1. Institute of Informatics, University of Warsaw, Warsaw, Poland.\\\\\n", | |
"\t2 & Arlene Siefker-Radtke & 1. Department of Genitourinary Medical Oncology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.\\\\\n", | |
"\t3 & Ashish Madhav Kamat & 1. Department of Urology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.\\\\\n", | |
"\t4 & Barbara Wiśniowiecka-Kowalnik & 1. Department of Medical Genetics, Institute of Mother and Child, Warsaw, Poland.\\\\\n", | |
"\t5 & Bogdan Czerniak & 1. Department of Pathology, University of Texas MD Anderson Cancer Center, Houston, TX, USA. Electronic address: [email protected].\\\\\n", | |
"\t6 & Charles Chuanhai Guo & 1. Department of Pathology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.\\\\\n", | |
"\\end{tabular}\n" | |
], | |
"text/plain": [ | |
" name\n", | |
"1 Anna Gambin\n", | |
"2 Arlene Siefker-Radtke\n", | |
"3 Ashish Madhav Kamat\n", | |
"4 Barbara Wiśniowiecka-Kowalnik\n", | |
"5 Bogdan Czerniak\n", | |
"6 Charles Chuanhai Guo\n", | |
" affil\n", | |
"1 1. Institute of Informatics, University of Warsaw, Warsaw, Poland.\n", | |
"2 1. Department of Genitourinary Medical Oncology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.\n", | |
"3 1. Department of Urology, University of Texas MD Anderson Cancer Center, Houston, TX, USA.\n", | |
"4 1. Department of Medical Genetics, Institute of Mother and Child, Warsaw, Poland.\n", | |
"5 1. Department of Pathology, University of Texas MD Anderson Cancer Center, Houston, TX, USA. Electronic address: [email protected].\n", | |
"6 1. Department of Pathology, University of Texas MD Anderson Cancer Center, Houston, TX, USA." | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"affil.agg<-aggregate(affil~name,data=affil,\n", | |
" FUN=function(x)paste(paste(1:length(unique(x)),unique(x),sep=\". \"),collapse=\" | \")) \n", | |
"dim(affil.agg)\n", | |
"head(affil.agg) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"Download: <a href=affil_out/coauthors_with_affil.tab>affil_out/coauthors_with_affil.tab</a>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"system(\"mkdir -p affil_out\")\n", | |
"fname<-\"affil_out/coauthors_with_affil.tab\"\n", | |
"write.table(affil.agg,file=fname,col.names=TRUE,row.names=TRUE,quote=FALSE,sep=\"\\t\")\n", | |
"library(IRdisplay)\n", | |
"IRdisplay::display_html(paste(\"Download: <a href=\",fname,\">\",fname,\"</a>\",sep=\"\"))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "R", | |
"language": "R", | |
"name": "ir" | |
}, | |
"language_info": { | |
"codemirror_mode": "r", | |
"file_extension": ".r", | |
"mimetype": "text/x-r-source", | |
"name": "R", | |
"pygments_lexer": "r", | |
"version": "3.2.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment