Skip to content

Instantly share code, notes, and snippets.

@seandavi
Created March 21, 2017 17:29
Show Gist options
  • Save seandavi/749950a18237573cbdfed6d215f260cd to your computer and use it in GitHub Desktop.
Save seandavi/749950a18237573cbdfed6d215f260cd to your computer and use it in GitHub Desktop.
NCI Thesaurus flat file to BigQuery using bigrquery and R
# Download an NCI Thesaurus flat-file release and read it into a data frame.
# Note that this script will need to be edited to address different
# Google project details before it is run.
library(downloader)
library(readr)

zipname <- "Thesaurus.zip"
# Change the URL to get the version you like.
# mode = "wb" is forwarded to download.file() so the zip archive is written
# in binary mode; text mode can corrupt it on Windows.
download(
  "http://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Thesaurus_17.02d.FLAT.zip",
  destfile = zipname,
  mode = "wb"
)
unzip(zipname)

# Column names for the eight tab-separated fields of the flat file.
cnames <- c(
  "code", "concept_name", "parents", "synonyms",
  "definition", "display_name", "concept_status", "semantic_type"
)

# The flat file ships without a header line (see the NCI Thesaurus ReadMe).
# With read_tsv()'s default col_names = TRUE the first concept would be
# consumed as column names and then lost when the names are overwritten,
# so the names are supplied at read time instead.
# Every column is read as text so that type guessing on sparse columns
# cannot change the resulting schema between releases.
# NOTE(review): if free-text fields contain unbalanced double quotes,
# consider adding quote = "" -- confirm against the actual file.
thes <- read_tsv(
  "Thesaurus.txt",
  col_names = cnames,
  col_types = cols(.default = col_character())
)
# Upload the `thes` data frame to a BigQuery table and block until the
# upload job has finished.
library(bigrquery)

# You'll need to change these to match your own Google projects.
project <- "isb-cgc-04-0020"
dataset <- "cancer_data_warehouse"
tbl_name <- "ncit"
billing_proj <- "isb-cgc-04-0020"

# Submit the upload job, billed to `billing_proj`.
job <- insert_upload_job(
  project,
  dataset,
  tbl_name,
  values = thes,
  billing = billing_proj
)
# Wait for the submitted job to complete before continuing.
wait_for(job)
# Using the uploaded table: open the dataset as a dplyr source and create
# a table reference that can be queried with dplyr verbs.
library(dplyr)

db <- src_bigquery(project, dataset, billing_proj)
# Reuse tbl_name rather than repeating the literal table name, so that
# changing the upload target also changes the table queried here.
ncit <- tbl(db, tbl_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment