Skip to content

Instantly share code, notes, and snippets.

@RandomCriticalAnalysis
Created November 22, 2019 17:43
Show Gist options
  • Save RandomCriticalAnalysis/bbefcd3e0536c07cb9256843e3de18a1 to your computer and use it in GitHub Desktop.
Save RandomCriticalAnalysis/bbefcd3e0536c07cb9256843e3de18a1 to your computer and use it in GitHub Desktop.
# Helper functions that make the OECD package easier to use
library(tidyverse)
library(OECD)
# In-memory cache of OECD data structures, held in a global variable.
# A crude approach, but far quicker than querying OECD.stat on every call.
# If the object already exists it is left untouched and a warning is raised,
# so re-running this file does not discard what was cached earlier.
if (exists("oecd_cached_structures")) {
  warning("oecd_cached_structures already set. You can delete the object if you wish to clear....")
} else {
  oecd_cached_structures <- list()
}
#
# Return the cached structure for dataset `OECDds`.
# On a cache miss: when autoLookup is true the structure is fetched via
# fetch_and_cache_structure() and then read back from the cache; otherwise
# the function stops with an error.
#
fetch_structure_from_cache <- function(OECDds,autoLookup=T) {
  cache_missing <- !exists("oecd_cached_structures")
  if (cache_missing) {
    stop("oecd_cached_structures should already be defined globally, outside this scope")
  }

  cached <- oecd_cached_structures[[OECDds]]
  if (!is.null(cached)) {
    return(cached)
  }

  if (!autoLookup) {
    stop(sprintf("Dataset %s does not exist in cache and autolookup disabled.",OECDds))
  }

  # cache miss: populate the global cache, then hand back the new entry
  fetch_and_cache_structure(OECDds)
  return(oecd_cached_structures[[OECDds]])
}
#
# OECD::get_dataset() returns columns holding IDs rather than labels.
# IDs allow quick coding if you remember them, but sometimes it is nice to
# have the labels alongside. This function returns a copy of the passed data
# frame with one extra column per ID variable, named <VARIABLE>_label.
#
# Arguments:
#   originalDS          data frame whose column names are variable IDs
#   OECDds              dataset id used to look up the cached data structure
#   enumerated_columns  optional vector of variable IDs; when non-empty, only
#                       these variables receive a label column
#   autoLookup          forwarded to fetch_structure_from_cache(): whether a
#                       missing structure may be fetched on demand
#
# Returns the labelled data frame. Rows whose ID has no matching label are
# kept (label is NA). Note that merge() sorts rows by the key and moves the
# key column to the front, so row/column order can differ from originalDS.
add_dataset_labels <- function(originalDS,OECDds, enumerated_columns = NULL,autoLookup=TRUE ) {
  dsLookupObj <- fetch_structure_from_cache(OECDds, autoLookup)
  dataset_names <- colnames(originalDS)
  restrict_columns <- length(enumerated_columns) > 0
  labeledDS <- originalDS

  for (label in names(dsLookupObj)) {
    # structure variables that are not columns of the dataset are ignored
    if (!label %in% dataset_names) next
    if (restrict_columns && !label %in% enumerated_columns) next

    cds <- dsLookupObj[[label]]$label_df
    # variables without an id/label table (e.g. OBS_VALUE) cannot be labelled
    if (is.null(cds) || NCOL(cds) < 2L) next

    # replace column name 'label' with variable name and append '_label'
    names(cds)[2] <- paste0(label, "_label")
    # all.x: return all original values, i.e. even if an ID does not match
    labeledDS <- merge(labeledDS, cds, by.x = label, by.y = "id", all.x = TRUE)
  }
  labeledDS
}
# Convenience wrapper: turn the named filter `filterObj` into the positional
# filter via generate_oecd_filter(), download the dataset with get_dataset(),
# and append label columns with add_dataset_labels().
auto_fetch_oecd <- function(OECDds,filterObj,start_time=NULL,end_time=NULL,enumerated_columns=NULL) {
  positional_filter <- generate_oecd_filter(OECDds, filterObj)
  raw_data <- get_dataset(OECDds, filter = positional_filter, start_time, end_time)
  labelled_data <- add_dataset_labels(raw_data, OECDds, enumerated_columns)
  labelled_data
}
# Fetch the data structure of `OECDds` from OECD.stat and save a per-variable
# summary into the global variable `oecd_cached_structures`.
#
# One entry is stored per row of the structure's VAR_DESC table, keyed by the
# variable id, with the fields:
#   index         row position of the variable in VAR_DESC
#   description   the row's description value
#   valid_ids     the id column of the variable's code list
#   valid_labels  the label column of the variable's code list
#   label_df      the code list as a data frame with columns id / label
# OBS_VALUE, and any variable for which the structure carries no two-column
# code list, gets an empty label_df (and therefore NULL ids/labels).
#
# Progress is printed with cat(). Returns the first 50 rows of VAR_DESC.
# Stops if `oecd_cached_structures` has not been created beforehand.
fetch_and_cache_structure <- function(OECDds) {
  if (!exists("oecd_cached_structures")) {
    stop("oecd_cached_structures should already be defined globally, outside this scope")
  }
  cat(sprintf("Fetching %s from OECDstat: ",OECDds))
  oecd_struct <- OECD::get_data_structure(OECDds)
  cat("done\n")

  variable_list <- oecd_struct$VAR_DESC # a data frame
  dsInfo <- list()
  # seq_len() gives zero iterations for an empty VAR_DESC (1:0 would give two)
  for (i in seq_len(NROW(variable_list))) {
    rec <- variable_list[i, ]
    id <- as.character(rec$id)
    cat(sprintf(" Adding id '%s' at index %i\n",id,i))

    code_list <- if (id == "OBS_VALUE") NULL else oecd_struct[[id]]
    if (is.null(code_list) || NCOL(code_list) < 2L) {
      # no usable code list for this variable
      label_df <- data.frame(id=c(),label=c())
    } else {
      label_df <- as.data.frame(code_list)
      names(label_df)[1:2] <- c("id", "label")
    }

    # iteratively adding to a list is slow in R, but tolerable with small n
    dsInfo[[id]] <- list(
      index = i,
      description = rec$description,
      valid_ids = label_df$id,
      valid_labels = label_df$label,
      label_df = label_df
    )
  }

  # ensure this gets assigned to global scope
  oecd_cached_structures[[OECDds]] <<- dsInfo
  cat("Done adding records!\n\nConcise list of variables (parameters):\n")
  head(variable_list,50)
}
# Build, optionally display, and return a summary of the filter parameters
# available for dataset `OECDds`.
#
# Arguments:
#   OECDds      dataset id whose cached structure is summarised
#   showLong    FALSE (default): compact/terse, one row per variable with all
#               valid ids collapsed into a comma-separated `values` column;
#               TRUE: long in the tidyverse sense, one row per (variable, id)
#               pair with `label_id` / `label_name` columns
#   autoLookup  forwarded to fetch_structure_from_cache(): whether a missing
#               structure may be fetched on demand
#   autoView    when TRUE the result is also passed to View()
#
# Returns the summary data frame.
show_filter_options <- function(OECDds,showLong=FALSE,autoLookup=TRUE,autoView=TRUE) {
  dsLookupObj <- fetch_structure_from_cache(OECDds, autoLookup)

  # one small data frame per variable, combined in a single bind_rows() call
  per_variable <- lapply(names(dsLookupObj), function(id) {
    obj <- dsLookupObj[[id]]
    if (showLong) {
      label_pairs <- obj$label_df
      npairs <- NROW(label_pairs)
      data.frame(
        id = rep(id, npairs),
        description = rep(obj$description, npairs),
        index = rep(obj$index, npairs),
        label_id = label_pairs$id,
        label_name = label_pairs$label,
        stringsAsFactors = FALSE
      )
    } else {
      data.frame(
        id = id,
        description = obj$description,
        index = obj$index,
        values = paste(obj$valid_ids, collapse = ', '),
        stringsAsFactors = FALSE
      )
    }
  })

  # the leading empty data.frame keeps the result a plain data.frame even
  # when the structure has no variables
  df <- bind_rows(data.frame(), per_variable)
  if (autoView) View(df)
  df
}
#
# The OECD filter expects unnamed filter parameters ordered according to the
# data structure. This function places each named entry of `filterPairs` at
# the position its variable has in the data structure and, optionally, checks
# that the requested ids exist for that variable.
#
# Arguments:
#   OECDds          dataset id whose cached structure defines the ordering
#   filterPairs     named list: variable id -> vector of ids to select
#                   (names are upper-cased, so incorrect caps are allowed)
#   validateFields  when TRUE, stop if any requested id is not a valid id of
#                   its variable
#   autoLookup      forwarded to fetch_structure_from_cache(): whether a
#                   missing structure may be fetched on demand
#
# Returns an unnamed list indexed by variable position; positions that were
# not mentioned in `filterPairs` and lie before the highest used index are
# NULL. The list is also printed with str().
# Stops when `filterPairs` is not a named list, when a name matches no
# variable, or (with validateFields) when an id is unknown.
generate_oecd_filter <- function(OECDds,filterPairs,validateFields=TRUE,autoLookup=TRUE) {
  if (!is.list(filterPairs) || is.null(names(filterPairs))) {
    stop("filterPairs expected to be a named list object")
  }
  unnamedList <- list()
  dsLookupObj <- fetch_structure_from_cache(OECDds, autoLookup)

  for (item in names(filterPairs)) {
    searchedIDs <- filterPairs[[item]]
    variableID <- str_to_upper(item) # allow incorrect caps
    varInfo <- dsLookupObj[[variableID]]
    if (is.null(varInfo)) {
      varList <- names(dsLookupObj)
      stop(sprintf("Could not find parameter corresponding to %s\nvalid fields: %s",
                   variableID, paste(varList, collapse = ', ')))
    }

    if (validateFields) {
      validIDs <- varInfo$valid_ids
      missingIDs <- searchedIDs[!(searchedIDs %in% validIDs)]
      if (length(missingIDs) != 0) {
        stop(sprintf("Field %s does not seem to contain IDs: %s\n\nValid IDs are: %s\n",
                     variableID,
                     paste(missingIDs, collapse = ', '),
                     paste(validIDs, collapse = ', ')
        ))
      }
    }

    # position in the output follows the variable's index in the structure
    unnamedList[[varInfo$index]] <- searchedIDs
  }
  str(unnamedList)
  unnamedList
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment