Created
March 21, 2017 17:29
-
-
Save seandavi/749950a18237573cbdfed6d215f260cd to your computer and use it in GitHub Desktop.
NCI Thesaurus flat file to BigQuery using bigrquery and rstats
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the NCI Thesaurus flat file, load it into R, upload it to a
# Google BigQuery table, and open that table through dplyr.
#
# Note that this will need to be edited to address different Google
# project details.

library(downloader)
library(readr)
library(bigrquery)
library(dplyr)

# Download and unpack ----

zipname <- "Thesaurus.zip"

# Change the URL to get the version you like.
# mode = "wb" is forwarded to download.file() so the zip archive is always
# written in binary mode, regardless of platform defaults.
download(
  "http://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Thesaurus_17.02d.FLAT.zip",
  destfile = zipname,
  mode = "wb"
)
unzip(zipname)

# Read the flat file ----

cnames <- c(
  "code", "concept_name", "parents", "synonyms",
  "definition", "display_name", "concept_status", "semantic_type"
)

# The flat file ships without a header row, so the column names must be
# supplied at read time. Reading with the default `col_names = TRUE` and
# renaming afterwards would silently consume the first concept record as
# the header.
# All eight fields are text; reading them as character avoids type guessing
# from the first rows mis-typing a sparsely populated column.
# NOTE(review): assumes the chosen Thesaurus release has exactly these eight
# columns -- confirm against the release's ReadMe when changing the URL.
thes <- read_tsv(
  "Thesaurus.txt",
  col_names = cnames,
  col_types = cols(.default = col_character())
)

# Upload to BigQuery ----

# You'll need to change these to match your own Google projects.
project <- "isb-cgc-04-0020"
dataset <- "cancer_data_warehouse"
tbl_name <- "ncit"
billing_proj <- "isb-cgc-04-0020"

# Start the upload job and block until it has finished.
job <- insert_upload_job(
  project, dataset, tbl_name,
  values = thes,
  billing = billing_proj
)
wait_for(job)

# Use the uploaded table through dplyr ----

db <- src_bigquery(project, dataset, billing_proj)
ncit <- tbl(src = db, tbl_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment