Created
November 22, 2019 17:43
-
-
Save RandomCriticalAnalysis/bbefcd3e0536c07cb9256843e3de18a1 to your computer and use it in GitHub Desktop.
Helper functions to make using OECD package easier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(OECD) | |
# Global, in-memory cache of OECD data structures, keyed by dataset id.
# Crude, but far faster than asking OECD.stat for the same structure again.
# If the object already exists (e.g. this file is sourced a second time) it is
# left untouched and only a warning is raised; delete it manually to clear it.
if (exists("oecd_cached_structures")) {
  warning("oecd_cached_structures already set. You can delete the object if you wish to clear....")
} else {
  oecd_cached_structures <- list()
}
# | |
# Retrieve the cached data structure for an OECD dataset, fetching it and
# storing it in the cache first when it is not there yet.
#
# Arguments:
#   OECDds      dataset id; used as the key into the global
#               oecd_cached_structures list
#   autoLookup  when TRUE (default) a cache miss triggers
#               fetch_and_cache_structure(); when FALSE a miss is an error
#
# Returns the cache entry stored under OECDds.
# Stops when the global cache object does not exist, or on a cache miss with
# autoLookup disabled.
fetch_structure_from_cache <- function(OECDds, autoLookup = TRUE) {
  if (!exists("oecd_cached_structures")) {
    stop("oecd_cached_structures should already be defined globally, outside this scope")
  }

  dsLookupObj <- oecd_cached_structures[[OECDds]]
  if (!is.null(dsLookupObj)) {
    return(dsLookupObj)
  }

  if (!autoLookup) {
    stop(sprintf("Dataset %s does not exist in cache and autolookup disabled.", OECDds))
  }

  # fetch_and_cache_structure() writes its result into the global cache, so
  # the entry has to be read again afterwards.
  fetch_and_cache_structure(OECDds)
  oecd_cached_structures[[OECDds]]
}
# | |
# OECD::get_dataset() returns columns holding IDs rather than readable labels.
# The IDs allow quick coding if you remember them, but sometimes it is nice to
# have the labels next to them. This function returns a copy of the data frame
# with a "<column>_label" column added for each ID column that is described in
# the dataset's cached structure.
#
# Arguments:
#   originalDS          data frame whose ID columns should be labelled
#   OECDds              dataset id used to look up the data structure
#   enumerated_columns  optional character vector; when non-empty only these
#                       columns receive a label column
#   autoLookup          forwarded to fetch_structure_from_cache()
#
# Returns the labelled data frame. Rows without a matching label are kept
# (all.x = TRUE). NOTE: merge() is called with its defaults otherwise, so the
# row and column order of the result can differ from originalDS.
add_dataset_labels <- function(originalDS, OECDds, enumerated_columns = NULL,
                               autoLookup = TRUE) {
  # Previously this called fetch_from_cache(), which is not defined anywhere
  # in this file, and never passed autoLookup on.
  dsLookupObj <- fetch_structure_from_cache(OECDds, autoLookup)

  dataset_names <- colnames(originalDS)
  restrict_columns <- length(enumerated_columns) > 0
  labeledDS <- originalDS

  for (label in names(dsLookupObj)) {
    # structure variables that are not columns of the data are ignored
    if (!label %in% dataset_names) next
    if (restrict_columns && !label %in% enumerated_columns) next

    cds <- dsLookupObj[[label]]$label_df
    # variables without an id/label table cannot be merged; skip them
    if (NCOL(cds) < 2L) next

    # rename column 'label' to '<variable>_label'
    names(cds)[2] <- paste0(label, "_label")
    # keep all original rows, even if an ID has no match for some reason
    labeledDS <- merge(labeledDS, cds, by.x = label, by.y = "id", all.x = TRUE)
  }

  labeledDS
}
# Convenience wrapper: turn a named filter list into the filter built by
# generate_oecd_filter(), download the dataset with get_dataset(), and add the
# "<column>_label" columns via add_dataset_labels().
#
# Arguments:
#   OECDds              dataset id
#   filterObj           named list handed to generate_oecd_filter()
#   start_time          handed to get_dataset() as its next positional argument
#   end_time            handed to get_dataset() after start_time
#   enumerated_columns  handed to add_dataset_labels()
#
# Returns the labelled data frame.
auto_fetch_oecd <- function(OECDds, filterObj, start_time = NULL,
                            end_time = NULL, enumerated_columns = NULL) {
  positional_filter <- generate_oecd_filter(OECDds, filterObj)
  raw_data <- get_dataset(OECDds, filter = positional_filter, start_time, end_time)
  add_dataset_labels(raw_data, OECDds, enumerated_columns)
}
# Fetch the data structure of a dataset from OECD.stat and save a lookup
# object for it in the global oecd_cached_structures list.
#
# Arguments:
#   OECDds  dataset id passed to OECD::get_data_structure(); also the cache key
#
# The cache entry is a list with one element per row of the structure's
# VAR_DESC table, named by the variable id. Each element holds:
#   index         row position of the variable in VAR_DESC
#   description   the variable's description
#   valid_ids     the id column of label_df
#   valid_labels  the label column of label_df
#   label_df      data frame with columns id and label (zero rows for
#                 OBS_VALUE and for variables without an entry in the structure)
#
# Progress is reported with cat(). Returns the first 50 rows of VAR_DESC.
# Stops when the global cache object does not exist.
fetch_and_cache_structure <- function(OECDds) {
  if (!exists("oecd_cached_structures")) {
    stop("oecd_cached_structures should already be defined globally, outside this scope")
  }

  cat(sprintf("Fetching %s from OECDstat: ", OECDds))
  oecd_struct <- OECD::get_data_structure(OECDds)
  cat("done\n")

  variable_list <- oecd_struct$VAR_DESC # a data frame
  dsInfo <- list()

  # seq_len() gives zero iterations for an empty VAR_DESC; 1:0 would give two.
  for (i in seq_len(NROW(variable_list))) {
    rec <- variable_list[i, ]
    id <- as.character(rec$id)
    cat(sprintf(" Adding id '%s' at index %i\n", id, i))

    code_list <- if (id == "OBS_VALUE") NULL else oecd_struct[[id]]
    if (is.null(code_list)) {
      # Typed, zero-row columns so that id and label always exist.
      label_df <- data.frame(id = character(), label = character(),
                             stringsAsFactors = FALSE)
    } else {
      label_df <- as.data.frame(code_list)
      names(label_df) <- c("id", "label")
    }

    # iteratively adding to a list is slow in R, but tolerable with small n
    dsInfo[[id]] <- list(
      index = i,
      description = rec$description,
      valid_ids = label_df$id,
      valid_labels = label_df$label,
      label_df = label_df
    )
  }

  # ensure this gets assigned to global scope
  oecd_cached_structures[[OECDds]] <<- dsInfo

  cat("Done adding records!\n\nConcise list of variables (parameters):\n")
  head(variable_list, 50)
}
# Show and return a summary of the parameters (variables) of a dataset.
#
# Arguments:
#   OECDds      dataset id
#   showLong    FALSE (default): one row per variable with all valid ids
#               pasted into a single 'values' column.
#               TRUE: long format in the tidyverse sense, one row per
#               variable/id pair with columns label_id and label_name.
#   autoLookup  forwarded to fetch_structure_from_cache()
#   autoView    when TRUE (default) the result is opened with View(); this is
#               skipped in non-interactive sessions
#
# Returns the summary data frame.
show_filter_options <- function(OECDds, showLong = FALSE, autoLookup = TRUE,
                                autoView = TRUE) {
  dsLookupObj <- fetch_structure_from_cache(OECDds, autoLookup)

  # Build one small data frame per variable and bind them once at the end
  # instead of growing the result inside the loop.
  per_variable <- lapply(names(dsLookupObj), function(id) {
    obj <- dsLookupObj[[id]]
    if (showLong) {
      label_pairs <- obj$label_df
      npairs <- NROW(label_pairs)
      data.frame(
        id = rep(id, npairs),
        description = rep(obj$description, npairs),
        index = rep(obj$index, npairs),
        label_id = label_pairs$id,
        label_name = label_pairs$label,
        stringsAsFactors = FALSE
      )
    } else {
      data.frame(
        id = id,
        description = obj$description,
        index = obj$index,
        values = paste(obj$valid_ids, collapse = ", "),
        stringsAsFactors = FALSE
      )
    }
  })

  # Leading empty data.frame() keeps the same starting point as before.
  df <- bind_rows(data.frame(), per_variable)

  if (autoView && interactive()) View(df)
  return(df)
}
# | |
# The OECD filter expects unnamed filter parameters ordered according to the
# data structure. This function takes a named list, matches each name to a
# variable of the cached data structure and places its values at that
# variable's index in an unnamed list. Optionally the values are checked for
# existence among the variable's valid ids.
#
# Arguments:
#   OECDds          dataset id
#   filterPairs     named list: name = variable id (any capitalisation),
#                   value = vector of ids to filter on
#   validateFields  when TRUE (default) stop if a value is not a valid id
#   autoLookup      forwarded to fetch_structure_from_cache()
#
# Returns the unnamed list; positions that were not specified but lie before
# a specified one are NULL. The list is also printed with str().
# Stops when filterPairs is not a named list, when a name matches no variable,
# or (with validateFields) when a value is not among the valid ids.
generate_oecd_filter <- function(OECDds, filterPairs, validateFields = TRUE,
                                 autoLookup = TRUE) {
  if (!is.list(filterPairs) || is.null(names(filterPairs))) {
    stop("filterPairs expected to be a named list object")
  }

  dsLookupObj <- fetch_structure_from_cache(OECDds, autoLookup)
  unnamedList <- list()

  for (item in names(filterPairs)) {
    searchedIDs <- filterPairs[[item]]
    variableID <- str_to_upper(item) # allow incorrect caps
    varInfo <- dsLookupObj[[variableID]]

    if (is.null(varInfo)) {
      stop(sprintf(
        "Could not find parameter corresponding to %s\nvalid fields: %s",
        variableID,
        paste(names(dsLookupObj), collapse = ", ")
      ))
    }

    if (validateFields) {
      validIDs <- varInfo$valid_ids
      missingIDs <- searchedIDs[!(searchedIDs %in% validIDs)]
      if (length(missingIDs) != 0) {
        stop(sprintf("Field %s does not seem to contain IDs: %s\n\nValid IDs are: %s\n",
          variableID,
          paste(missingIDs, collapse = ", "),
          paste(validIDs, collapse = ", ")
        ))
      }
    }

    # position in the output follows the variable's index in the structure
    unnamedList[[varInfo$index]] <- searchedIDs
  }

  str(unnamedList)
  unnamedList
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment