jeffeaton · December 16, 2022 05:04
diff --git a/epp-xml-anc-update.R b/epp-xml-anc-update.R
 library(xml2)
 library(epp)

 eppd_overwrite_matrix <- function (xm, mat) {

  if (!xml_attr(xm, "class") %in% c("[D", "[I"))
    stop("Tried to invoke .parse_matrix() on array node not of class '[D' or '[I'.")
  
  m_rows <- as.integer(xml_attr(xm, "length"))
  stopifnot(nrow(mat) == m_rows)
              
  rows <- xml_children(xm)
  idx <- as.integer(xml_attr(rows, "index")) + 1L
  if ( !all(idx == seq_len(m_rows)) ) {
    stop("Missing row indices. Function needs development to handle this case.")
  }

  rows <- xml_find_first(rows, "array")

  for(i in idx) {
    eppd_overwrite_array(rows[[i]], mat[i, ])
  }

  invisible()
 }

 eppd_overwrite_array <- function(x, arr_new) {
  
  a_length <- as.integer(xml_attr(x, "length"))
  a_mode <- switch(xml_attr(x, "class"),
                   int = "integer",
                   double = "numeric",
                   boolean = "logical",
                   java.lang.String = "character")
  
  arr <- vector(a_mode, a_length)
  elem <- xml_children(x)
  idx <- as.integer(xml_attr(elem, "index")) + 1L

  ## `idx` contains indices of non-zero values in the existing array
  ## If there are any 0 values that are changed to non-zero values
  ## in the new array, a new XML node needs to be added to record
  ## the value
  
  if ( length(arr_new[-idx]) > 0 &&
         any(arr_new[-idx] != 0.0) ) {
    new_idx <- seq_along(arr_new)[-idx]
    new_idx <- new_idx[arr_new[new_idx] != 0.0]

    for(ii in new_idx) {
      ## Copy first node -> insert in new location
      xml_add_child(x, elem[[1]], .where = new_idx-1, .copy = TRUE)
      new_node <- xml_children(x)[[new_idx]]
      xml_attr(new_node, "index") <- new_idx-1  # 0-based index in XML
    }
    xml_attr(x, "length") <- as.character(xml_length(x))

    idx <- sort(c(idx, new_idx))
  }

  ## Write new values into XML array
  elem <- xml_children(x)
  xml_text(elem) <- as.character(arr_new)[idx]
  
  invisible()
 }


 #' Update ANC testing data in EPP XML
 #' 
 #' @param pjnz path to base PJNZ file
 #' @param pjnz_new path to new PJNZ file
 #' @param eppd_new new EPP data set; constructed by `epp::read_epp_data()`
 #'
 #' @details
 #'
 #' @examples
 #' 
 #' ## Step 1: Read base EPP data
 #' pjnz <- "~/Downloads/mwi-xml-test.pjnz"
 #' eppd <- read_epp_data(pjnz)
 #' 
 #' ## Step 2: Update the EPP ANC data
 #' ##   * Do not change array structure
 #' 
 #' eppd_new <- eppd
 #' 
 #' eppd_new[["Southern Region"]]$ancrtsite.prev <- 1.1 * eppd[["Southern Region"]]$ancrtsite.prev
 #' eppd_new[["Southern Region"]]$ancrtsite.n <- floor(0.85 * eppd[["Southern Region"]]$ancrtsite.n)
 #' 
 #' eppd_new[["Southern Region"]]$ancrtcens$prev <- 1.1 * eppd[["Southern Region"]]$ancrtcens$prev
 #' eppd_new[["Southern Region"]]$ancrtcens$n <- floor(0.85 * eppd[["Southern Region"]]$ancrtcens$n)
 #' 
 #' pjnz_new <- "~/Downloads/mwi-xml-test-copy.pjnz"
 #' 
 #' ## Step 3: Write a new PJNZ file with updated ANC data
 #' ##   * Requires the base file
 #' 
 #' epp_xml_update_anc("~/Downloads/mwi-xml-test.pjnz",
 #'                    "~/Downloads/mwi-xml_updated-anc.pjnz",
 #'                     eppd_new)
 #' 
 epp_xml_update_anc <- function(pjnz, pjnz_new, eppd_new) {

  pjnz <- normalizePath(pjnz, mustWork = TRUE)
  pjnz_new <- normalizePath(pjnz_new, mustWork = FALSE)
  
  if (pjnz == pjnz_new) {
    stop("pjnz is same as pjnz_new. Specify new file name for edited PJNZ file.")
  }

  xmlfile <- grep(".xml", unzip(pjnz, list = TRUE)$Name, value = TRUE)

  if (!length(xmlfile)) {
    warning(paste0("No EPP .xml file found for ", basename(pjnz)))
    return(NULL)
  }
  con <- unz(pjnz, xmlfile)
  epp.xml <- read_xml(con)

  r <- xml_children(xml_child(epp.xml))
  names(r) <- xml_attr(r, "property")
  
  obj <- xml_find_all(r, ".//object")

  projsets <- obj[which(xml_attr(obj, "class") == "epp2011.core.sets.ProjectionSet")]

  for(eppSet in projsets){

    projset_id <- as.integer(gsub("[^0-9]", "", xml_attr(eppSet, "id")))

    eppSet <- xml_children(eppSet)
    names(eppSet) <- xml_attr(eppSet, "property")

    eppName <- xml_text(eppSet[["name"]])
    stopifnot(eppName %in% names(eppd_new))

    ##  ANC data  ##

    if(exists("siteNames", eppSet)) {
      siteNames <- epp:::.parse_array(xml_find_first(eppSet[["siteNames"]], "array"))
      nsites <- length(siteNames)

      ## ANC site used
      anc.used <- epp:::.parse_array(xml_find_first(eppSet[["siteSelected"]], "array"))

      ## ANC prevalence

      anc.prev_new <- eppd_new[[eppName]]$anc.prev
      anc.prev_new <- 100 * anc.prev_new
      anc.prev_new[is.na(anc.prev_new)] <- -1

      stopifnot(rownames(anc.prev_new) == siteNames)

      eppd_overwrite_matrix(xml_find_first(eppSet[["survData"]], "array"), anc.prev_new)
      

      ## ANC sample sizes
      anc.n_new <- eppd_new[[eppName]]$anc.n
      anc.n_new[is.na(anc.n_new)] <- -1

      stopifnot(rownames(anc.n_new) == siteNames)

      eppd_overwrite_matrix(xml_find_first(eppSet[["survSampleSizes"]], "array"), anc.n_new)

      
      ## ANC-RT site level
      
      if(length(eppSet[["dataInputMode"]]) &&
           length(xml_find_first(eppSet[["dataInputMode"]], ".//string")))
        input_mode <- xml_text(xml_find_first(eppSet[["dataInputMode"]], ".//string"))
      
      if(length(eppSet[["PMTCTData"]]) && input_mode == "ANC") {

        ancrtsite.prev_new <- eppd_new[[eppName]]$ancrtsite.prev
        ancrtsite.prev_new <- 100 * ancrtsite.prev_new
        ancrtsite.prev_new[is.na(ancrtsite.prev_new)] <- -1

        stopifnot(rownames(ancrtsite.prev_new) == siteNames)
        
        eppd_overwrite_matrix(xml_find_first(eppSet[["PMTCTData"]], "array"), ancrtsite.prev_new)

        ancrtsite.n_new <- eppd_new[[eppName]]$ancrtsite.n
        ancrtsite.n_new[is.na(ancrtsite.n_new)] <- -1
        
        stopifnot(rownames(ancrtsite.n_new) == siteNames)
        
        eppd_overwrite_matrix(xml_find_first(eppSet[["PMTCTSiteSampleSizes"]], "array"), ancrtsite.n_new)

      }
    }

    ## ANC-RT census level
    if (length(eppSet[["censusPMTCTSurvData"]]) && input_mode == "ANC") {

      ## ANC-RT census prevalence
      ancrtcens_prev_node <- xml_find_first(eppSet[["censusPMTCTSurvData"]], "array")
      ancrtcens_len <- as.integer(xml_attr(ancrtcens_prev_node, "length"))
      ancrtcens.prev_new <- rep(-1, ancrtcens_len)

      idx_obs <- eppd_new[[eppName]]$ancrtcens$year - 1985 + 1
      ancrtcens.prev_new[idx_obs] <- 100 * eppd_new[[eppName]]$ancrtcens$prev

      eppd_overwrite_array(ancrtcens_prev_node, ancrtcens.prev_new)
        
      ## ANC-RT census sample size
      ancrtcens_n_node <- xml_find_first(eppSet[["censusPMTCTSampleSizes"]], "array")
      ancrtcens_len <- as.integer(xml_attr(ancrtcens_n_node, "length"))
      ancrtcens.n_new <- rep(-1, ancrtcens_len)

      idx_obs <- eppd_new[[eppName]]$ancrtcens$year - 1985 + 1
      ancrtcens.n_new[idx_obs] <- eppd_new[[eppName]]$ancrtcens$n

      eppd_overwrite_array(ancrtcens_n_node, ancrtcens.n_new)

    }

  }

  ## Save new PJNZ file and copy in new EPP XML
  file.copy(pjnz, pjnz_new)

  wd <- getwd()
  on.exit(setwd(wd))
  
  tmpd <- tempfile()
  dir.create(tmpd)
  setwd(tmpd)
  
  if (!dir.exists(dirname(xmlfile))) {
    dir.create(dirname(xmlfile), recursive = TRUE)
  }
  write_xml(epp.xml, xmlfile)
  on.exit(file.remove(xmlfile))
  utils::zip(pjnz_new, xmlfile)

  pjnz_new
 }
	library(xml2)
	library(epp)

	eppd_overwrite_matrix <- function (xm, mat) {

	if (!xml_attr(xm, "class") %in% c("[D", "[I"))
	stop("Tried to invoke .parse_matrix() on array node not of class '[D' or '[I'.")

	m_rows <- as.integer(xml_attr(xm, "length"))
	stopifnot(nrow(mat) == m_rows)

	rows <- xml_children(xm)
	idx <- as.integer(xml_attr(rows, "index")) + 1L
	if ( !all(idx == seq_len(m_rows)) ) {
	stop("Missing row indices. Function needs development to handle this case.")
	}

	rows <- xml_find_first(rows, "array")

	for(i in idx) {
	eppd_overwrite_array(rows[[i]], mat[i, ])
	}

	invisible()
	}

	eppd_overwrite_array <- function(x, arr_new) {

	a_length <- as.integer(xml_attr(x, "length"))
	a_mode <- switch(xml_attr(x, "class"),
	int = "integer",
	double = "numeric",
	boolean = "logical",
	java.lang.String = "character")

	arr <- vector(a_mode, a_length)
	elem <- xml_children(x)
	idx <- as.integer(xml_attr(elem, "index")) + 1L

	## `idx` contains indices of non-zero values in the existing array
	## If there are any 0 values that are changed to non-zero values
	## in the new array, a new XML node needs to be added to record
	## the value

	if ( length(arr_new[-idx]) > 0 &&
	any(arr_new[-idx] != 0.0) ) {
	new_idx <- seq_along(arr_new)[-idx]
	new_idx <- new_idx[arr_new[new_idx] != 0.0]

	for(ii in new_idx) {
	## Copy first node -> insert in new location
	xml_add_child(x, elem[[1]], .where = new_idx-1, .copy = TRUE)
	new_node <- xml_children(x)[[new_idx]]
	xml_attr(new_node, "index") <- new_idx-1 # 0-based index in XML
	}
	xml_attr(x, "length") <- as.character(xml_length(x))

	idx <- sort(c(idx, new_idx))
	}

	## Write new values into XML array
	elem <- xml_children(x)
	xml_text(elem) <- as.character(arr_new)[idx]

	invisible()
	}


	#' Update ANC testing data in EPP XML
	#'
	#' @param pjnz path to base PJNZ file
	#' @param pjnz_new path to new PJNZ file
	#' @param eppd_new new EPP data set; constructed by `epp::read_epp_data()`
	#'
	#' @details
	#'
	#' @examples
	#'
	#' ## Step 1: Read base EPP data
	#' pjnz <- "~/Downloads/mwi-xml-test.pjnz"
	#' eppd <- read_epp_data(pjnz)
	#'
	#' ## Step 2: Update the EPP ANC data
	#' ## * Do not change array structure
	#'
	#' eppd_new <- eppd
	#'
	#' eppd_new[["Southern Region"]]$ancrtsite.prev <- 1.1 * eppd[["Southern Region"]]$ancrtsite.prev
	#' eppd_new[["Southern Region"]]$ancrtsite.n <- floor(0.85 * eppd[["Southern Region"]]$ancrtsite.n)
	#'
	#' eppd_new[["Southern Region"]]$ancrtcens$prev <- 1.1 * eppd[["Southern Region"]]$ancrtcens$prev
	#' eppd_new[["Southern Region"]]$ancrtcens$n <- floor(0.85 * eppd[["Southern Region"]]$ancrtcens$n)
	#'
	#' pjnz_new <- "~/Downloads/mwi-xml-test-copy.pjnz"
	#'
	#' ## Step 3: Write a new PJNZ file with updated ANC data
	#' ## * Requires the base file
	#'
	#' epp_xml_update_anc("~/Downloads/mwi-xml-test.pjnz",
	#' "~/Downloads/mwi-xml_updated-anc.pjnz",
	#' eppd_new)
	#'
	epp_xml_update_anc <- function(pjnz, pjnz_new, eppd_new) {

	pjnz <- normalizePath(pjnz, mustWork = TRUE)
	pjnz_new <- normalizePath(pjnz_new, mustWork = FALSE)

	if (pjnz == pjnz_new) {
	stop("pjnz is same as pjnz_new. Specify new file name for edited PJNZ file.")
	}

	xmlfile <- grep(".xml", unzip(pjnz, list = TRUE)$Name, value = TRUE)

	if (!length(xmlfile)) {
	warning(paste0("No EPP .xml file found for ", basename(pjnz)))
	return(NULL)
	}
	con <- unz(pjnz, xmlfile)
	epp.xml <- read_xml(con)

	r <- xml_children(xml_child(epp.xml))
	names(r) <- xml_attr(r, "property")

	obj <- xml_find_all(r, ".//object")

	projsets <- obj[which(xml_attr(obj, "class") == "epp2011.core.sets.ProjectionSet")]

	for(eppSet in projsets){

	projset_id <- as.integer(gsub("[^0-9]", "", xml_attr(eppSet, "id")))

	eppSet <- xml_children(eppSet)
	names(eppSet) <- xml_attr(eppSet, "property")

	eppName <- xml_text(eppSet[["name"]])
	stopifnot(eppName %in% names(eppd_new))

	## ANC data ##

	if(exists("siteNames", eppSet)) {
	siteNames <- epp:::.parse_array(xml_find_first(eppSet[["siteNames"]], "array"))
	nsites <- length(siteNames)

	## ANC site used
	anc.used <- epp:::.parse_array(xml_find_first(eppSet[["siteSelected"]], "array"))

	## ANC prevalence

	anc.prev_new <- eppd_new[[eppName]]$anc.prev
	anc.prev_new <- 100 * anc.prev_new
	anc.prev_new[is.na(anc.prev_new)] <- -1

	stopifnot(rownames(anc.prev_new) == siteNames)

	eppd_overwrite_matrix(xml_find_first(eppSet[["survData"]], "array"), anc.prev_new)


	## ANC sample sizes
	anc.n_new <- eppd_new[[eppName]]$anc.n
	anc.n_new[is.na(anc.n_new)] <- -1

	stopifnot(rownames(anc.n_new) == siteNames)

	eppd_overwrite_matrix(xml_find_first(eppSet[["survSampleSizes"]], "array"), anc.n_new)


	## ANC-RT site level

	if(length(eppSet[["dataInputMode"]]) &&
	length(xml_find_first(eppSet[["dataInputMode"]], ".//string")))
	input_mode <- xml_text(xml_find_first(eppSet[["dataInputMode"]], ".//string"))

	if(length(eppSet[["PMTCTData"]]) && input_mode == "ANC") {

	ancrtsite.prev_new <- eppd_new[[eppName]]$ancrtsite.prev
	ancrtsite.prev_new <- 100 * ancrtsite.prev_new
	ancrtsite.prev_new[is.na(ancrtsite.prev_new)] <- -1

	stopifnot(rownames(ancrtsite.prev_new) == siteNames)

	eppd_overwrite_matrix(xml_find_first(eppSet[["PMTCTData"]], "array"), ancrtsite.prev_new)

	ancrtsite.n_new <- eppd_new[[eppName]]$ancrtsite.n
	ancrtsite.n_new[is.na(ancrtsite.n_new)] <- -1

	stopifnot(rownames(ancrtsite.n_new) == siteNames)

	eppd_overwrite_matrix(xml_find_first(eppSet[["PMTCTSiteSampleSizes"]], "array"), ancrtsite.n_new)

	}
	}

	## ANC-RT census level
	if (length(eppSet[["censusPMTCTSurvData"]]) && input_mode == "ANC") {

	## ANC-RT census prevalence
	ancrtcens_prev_node <- xml_find_first(eppSet[["censusPMTCTSurvData"]], "array")
	ancrtcens_len <- as.integer(xml_attr(ancrtcens_prev_node, "length"))
	ancrtcens.prev_new <- rep(-1, ancrtcens_len)

	idx_obs <- eppd_new[[eppName]]$ancrtcens$year - 1985 + 1
	ancrtcens.prev_new[idx_obs] <- 100 * eppd_new[[eppName]]$ancrtcens$prev

	eppd_overwrite_array(ancrtcens_prev_node, ancrtcens.prev_new)

	## ANC-RT census sample size
	ancrtcens_n_node <- xml_find_first(eppSet[["censusPMTCTSampleSizes"]], "array")
	ancrtcens_len <- as.integer(xml_attr(ancrtcens_n_node, "length"))
	ancrtcens.n_new <- rep(-1, ancrtcens_len)

	idx_obs <- eppd_new[[eppName]]$ancrtcens$year - 1985 + 1
	ancrtcens.n_new[idx_obs] <- eppd_new[[eppName]]$ancrtcens$n

	eppd_overwrite_array(ancrtcens_n_node, ancrtcens.n_new)

	}

	}

	## Save new PJNZ file and copy in new EPP XML
	file.copy(pjnz, pjnz_new)

	wd <- getwd()
	on.exit(setwd(wd))

	tmpd <- tempfile()
	dir.create(tmpd)
	setwd(tmpd)

	if (!dir.exists(dirname(xmlfile))) {
	dir.create(dirname(xmlfile), recursive = TRUE)
	}
	write_xml(epp.xml, xmlfile)
	on.exit(file.remove(xmlfile))
	utils::zip(pjnz_new, xmlfile)

	pjnz_new
	}