Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save anonymous/0b11c3c9b3d8a45983a1 to your computer and use it in GitHub Desktop.
Save anonymous/0b11c3c9b3d8a45983a1 to your computer and use it in GitHub Desktop.
##--------------------------------------
## libraries
##--------------------------------------
library(rjson);
library(RCurl);
##--------------------------------------
## get data
##--------------------------------------
URL <- 'https://fm.formularynavigator.com/jsonFiles/publish/11/47/providers.json';
jsonRList <- fromJSON(getURL(URL)); ## recursive list representing the original JSON data
##--------------------------------------
## helper functions
##--------------------------------------
## apply a function to a set of nodes at the same depth level in a recursive list structure
levelApply <- function(
nodes, ## the root node of the list (recursive calls pass deeper nodes as they drill down into the list)
keyList, ## another list, expected to hold a sequence of keys (component names, integer indexes, or NULL for all) specifying which nodes to select at each depth level
func=identity, ## a function to run separately on each node once keyList has been exhausted
..., ## further arguments passed to func()
joinFunc=NULL ## optional function for joining the return values of func() at each successive depth, as the stack is unwound. An alternative is calling unlist() on the result, but careful not to lose the top-level index association
) {
if (length(keyList) == 0L) {
ret <- if (is.null(nodes)) NULL else func(nodes,...)
} else if (is.null(keyList[[1L]]) || length(keyList[[1L]]) != 1L) {
ret <- lapply(if (is.null(keyList[[1L]])) nodes else nodes[keyList[[1L]]],levelApply,keyList[-1L],func,...,joinFunc=joinFunc);
if (!is.null(joinFunc))
ret <- do.call(joinFunc,ret);
} else {
ret <- levelApply(nodes[[keyList[[1L]]]],keyList[-1L],func,...,joinFunc=joinFunc);
}; ## end if
ret;
}; ## end if
## these two wrappers automatically attempt to simplify the results of func() to a vector or matrix/data.frame, respectively
levelApplyToVec <- function(...) levelApply(...,joinFunc=c);
levelApplyToFrame <- function(...) levelApply(...,joinFunc=rbind); ## can return matrix or data.frame, depending on ret
## for the given node selection key union, retrieve a data.frame of logicals representing the unique combinations of keys possessed by the selected nodes, possibly with a count
keyCombos <- function(node,keyList,allKeys) `rownames<-`(setNames(unique(as.data.frame(levelApplyToFrame(node,keyList,function(h) allKeys%in%names(h)))),allKeys),NULL);
keyCombosWithCount <- function(node,keyList,allKeys) { ks <- keyCombos(node,keyList,allKeys); ks$.count <- unname(apply(ks,1,function(combo) sum(levelApplyToVec(node,keyList,function(h) identical(sort(names(ks)[combo]),sort(names(h))))))); ks; };
## return a simple two-component list with type (list, namedlist, or atomic vector type) and len for non-namedlist types; tlStr() returns a nice stringified form of said list
tl <- function(e) { if (is.null(e)) return(NULL); ret <- typeof(e); if (ret == 'list' && !is.null(names(e))) ret <- list(type='namedlist') else ret <- list(type=ret,len=length(e)); ret; };
tlStr <- function(e) { if (is.null(e)) return(NA); ret <- tl(e); if (is.null(ret$len)) ret <- ret$type else ret <- paste0(ret$type,'[',ret$len,']'); ret; };
## stringification functions for display
mkcsv <- function(v) paste0(collapse=',',v);
keyListToStr <- function(keyList) paste0(collapse='','/',sapply(keyList,function(key) if (is.null(key)) '*' else paste0(collapse=',',key)));
## return a data.frame giving a comma-separated list of the unique types possessed by the selected nodes; useful for learning about the structure of the data
keyTypes <- function(node,keyList,allKeys) data.frame(key=allKeys,tl=sapply(allKeys,function(key) mkcsv(unique(na.omit(levelApplyToVec(node,c(keyList,key),tlStr))))),row.names=NULL);
## useful for testing; can call npiToFrame() to show the row with a specified npi value, in a nice vertical form
rowToFrame <- function(dfrow) data.frame(column=names(dfrow),value=c(as.matrix(dfrow)));
getNPIRow <- function(df,npi) which(df$npi == npi);
npiToFrame <- function(df,npi) rowToFrame(df[getNPIRow(df,npi),]);
##--------------------------------------
## data examination
##--------------------------------------
## type of object -- plain unnamed list => array, length 3256
levelApplyToVec(jsonRList,list(),tlStr);
## [1] "list[3256]"
## unique types of main array elements => all named lists => hashes
unique(levelApplyToVec(jsonRList,list(NULL),tlStr));
## [1] "namedlist"
## get the union of keys among all hashes
allKeys <- unique(levelApplyToVec(jsonRList,list(NULL),names)); allKeys;
## [1] "npi" "type" "facility_name" "facility_type" "addresses" "plans" "last_updated_on" "name" "speciality" "accepting" "languages" "gender"
## get the unique pattern of keys among all hashes, and how often each occurs => shows there are inconsistent key sets among the top-level hashes
keyCombosWithCount(jsonRList,list(NULL),allKeys);
## npi type facility_name facility_type addresses plans last_updated_on name speciality accepting languages gender .count
## 1 TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE 279
## 2 TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE 2973
## 3 TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE 4
## for each key, get the unique set of types it takes on among all hashes, ignoring hashes where the key is omitted => some scalar strings, some multi-string, addresses is a variable-length list, plans is length-9 list, and name is a hash
keyTypes(jsonRList,list(NULL),allKeys);
## key tl
## 1 npi character[1]
## 2 type character[1]
## 3 facility_name character[1]
## 4 facility_type character[1],character[2],character[3]
## 5 addresses list[1],list[2],list[3],list[6],list[5],list[7],list[4],list[8],list[9],list[13],list[12]
## 6 plans list[9]
## 7 last_updated_on character[1]
## 8 name namedlist
## 9 speciality character[1],character[2],character[3],character[4]
## 10 accepting character[1]
## 11 languages character[2],character[3],character[4],character[6],character[5]
## 12 gender character[1]
## must look deeper into addresses array, plans array, and name hash; we'll have to flatten them
## ==== addresses =====
## note: the addresses key is always present under main array elements
## unique types of address elements across all hashes => all named lists, thus nested hashes
unique(levelApplyToVec(jsonRList,list(NULL,'addresses',NULL),tlStr));
## [1] "namedlist"
## union of keys among all address element hashes
allAddressKeys <- unique(levelApplyToVec(jsonRList,list(NULL,'addresses',NULL),names)); allAddressKeys;
## [1] "address" "city" "state" "zip" "phone" "address_2"
## pattern of keys among address elements => only address_2 varies, similar frequency with it as without it
keyCombosWithCount(jsonRList,list(NULL,'addresses',NULL),allAddressKeys);
## address city state zip phone address_2 .count
## 1 TRUE TRUE TRUE TRUE TRUE FALSE 1898
## 2 TRUE TRUE TRUE TRUE TRUE TRUE 2575
## for each address element key, get the unique set of types it takes on among all hashes, ignoring hashes where the key (only address_2 in this case) is omitted => all scalar strings
keyTypes(jsonRList,list(NULL,'addresses',NULL),allAddressKeys);
## key tl
## 1 address character[1]
## 2 city character[1]
## 3 state character[1]
## 4 zip character[1]
## 5 phone character[1]
## 6 address_2 character[1]
## ==== plans =====
## note: the plans key is always present under main array elements
## unique types of plan elements across all hashes => all named lists, thus nested hashes
unique(levelApplyToVec(jsonRList,list(NULL,'plans',NULL),tlStr));
## [1] "namedlist"
## union of keys among all plan element hashes
allPlanKeys <- unique(levelApplyToVec(jsonRList,list(NULL,'plans',NULL),names)); allPlanKeys;
## [1] "plan_id_type" "plan_id" "network_tier"
## pattern of keys among plan elements => good, all plan elements have all 3 keys, perfectly consistent
keyCombosWithCount(jsonRList,list(NULL,'plans',NULL),allPlanKeys);
## plan_id_type plan_id network_tier .count
## 1 TRUE TRUE TRUE 29304
## for each plan element key, get the unique set of types it takes on among all hashes (note: no plan keys are ever omitted, so don't have to worry about that) => all scalar strings
keyTypes(jsonRList,list(NULL,'plans',NULL),allPlanKeys);
## key tl
## 1 plan_id_type character[1]
## 2 plan_id character[1]
## 3 network_tier character[1]
## ==== name =====
## note: the name key is *not* always present under main array elements
## union of keys among all name hashes
allNameKeys <- unique(levelApplyToVec(jsonRList,list(NULL,'name'),names)); allNameKeys;
## [1] "first" "middle" "last"
## pattern of keys among name elements => sometimes middle is missing, relatively infrequently
keyCombosWithCount(jsonRList,list(NULL,'name'),allNameKeys);
## first middle last .count
## 1 TRUE TRUE TRUE 2679
## 2 TRUE FALSE TRUE 298
## for each name element key, get the unique set of types it takes on among all hashes, ignoring hashes where the key (only middle in this case) is omitted => all scalar strings
keyTypes(jsonRList,list(NULL,'name'),allNameKeys);
## key tl
## 1 first character[1]
## 2 middle character[1]
## 3 last character[1]
##--------------------------------------
## solution
##--------------------------------------
## recursively traverse the list structure, building up a column at each leaf node
extractLevelColumns <- function(
nodes, ## current level node selection
..., ## additional arguments to data.frame()
keyList=list(), ## current key path under main list
sep=NULL, ## optional string separator on which to join multi-element vectors; if NULL, will leave as separate columns
mkname=function(keyList,maxLen) paste0(collapse='.',if (is.null(sep) && maxLen == 1L) keyList[-length(keyList)] else keyList) ## name builder from current keyList and character vector max length across node level; default to dot-separated keys, and remove last index component for scalars
) {
cat(sprintf('extractLevelColumns(): %s\n',keyListToStr(keyList)));
if (length(nodes) == 0L) return(list()); ## handle corner case of empty main list
tlList <- lapply(nodes,tl);
typeList <- do.call(c,lapply(tlList,`[[`,'type'));
if (length(unique(typeList)) != 1L) stop(sprintf('error: inconsistent types (%s) at %s.',mkcsv(typeList),keyListToStr(keyList)));
type <- typeList[1L];
if (type == 'namedlist') { ## hash; recurse
allKeys <- unique(do.call(c,lapply(nodes,names)));
ret <- do.call(c,lapply(allKeys,function(key) extractLevelColumns(lapply(nodes,`[[`,key),...,keyList=c(keyList,key),sep=sep,mkname=mkname)));
} else if (type == 'list') { ## array; recurse
lenList <- do.call(c,lapply(tlList,`[[`,'len'));
maxLen <- max(lenList,na.rm=T);
allIndexes <- seq_len(maxLen);
ret <- do.call(c,lapply(allIndexes,function(index) extractLevelColumns(lapply(nodes,function(node) if (length(node) < index) NULL else node[[index]]),...,keyList=c(keyList,index),sep=sep,mkname=mkname))); ## must be careful to translate out-of-bounds to NULL; happens automatically with string keys, but not with integer indexes
} else if (type%in%c('raw','logical','integer','double','complex','character')) { ## atomic leaf node; build column
lenList <- do.call(c,lapply(tlList,`[[`,'len'));
maxLen <- max(lenList,na.rm=T);
if (is.null(sep)) {
ret <- lapply(seq_len(maxLen),function(i) setNames(data.frame(sapply(nodes,function(node) if (length(node) < i) NA else node[[i]]),...),mkname(c(keyList,i),maxLen)));
} else {
## keep original type if maxLen is 1, IOW don't stringify
ret <- list(setNames(data.frame(sapply(nodes,function(node) if (length(node) == 0L) NA else if (maxLen == 1L) node else paste(collapse=sep,node)),...),mkname(keyList,maxLen)));
}; ## end if
} else stop(sprintf('error: unsupported type %s at %s.',type,keyListToStr(keyList)));
if (is.null(ret)) ret <- list(); ## handle corner case of exclusively empty sublists
ret;
}; ## end extractLevelColumns()
## simple interface function
flattenList <- function(mainList,...) do.call(cbind,extractLevelColumns(mainList,...));
## actually run it
system.time({ df <- flattenList(jsonRList); });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment