Skip to content

Instantly share code, notes, and snippets.

@richfitz
Last active May 24, 2021 14:33
Show Gist options
  • Save richfitz/901fbd987ffa1ac42be0 to your computer and use it in GitHub Desktop.
Save richfitz/901fbd987ffa1ac42be0 to your computer and use it in GitHub Desktop.
parse_col <- function(x) {
d <- strsplit(x, NULL)
f <- function(x) {
sum(match(x, LETTERS) * 26^(seq_along(x) - 1L))
}
vapply(d, f, numeric(1))
}
inspect_xlsx <- function(path, sheet) {
tmp <- tempfile()
file <- sprintf("xl/worksheets/sheet%d.xml", 1L)
sheet_path <- utils::unzip(path, file, exdir=tmp)
on.exit(file.remove(sheet_path))
xml <- xml2::read_xml(sheet_path)
stopifnot(xml2::xml_name(xml) == "worksheet")
## https://github.com/hadley/xml2/issues/28
ns <- xml2::xml_ns(xml)
sheetData <- xml2::xml_find_one(xml, "d1:sheetData", ns)
rows <- xml2::xml_find_all(sheetData, "d1:row", xml2::xml_ns(xml))
f_row <- function(x) {
lapply(xml2::xml_find_all(x, "d1:c", ns), f_cell)
}
f_cell <- function(y) {
## TODO: should check/filter for cells holding "" too?
##
## The 't' attribute is type. Possible entries include:
## n: numeric
## s: string
##
## The "type" contains "v" for a value and "f" for a formula
## list(attrs=xml2::xml_attrs(y),
## type=xml2::xml_name(xml2::xml_contents(y)))
list(ref=xml2::xml_attr(y, "r"),
s=xml2::xml_attr(y, "s"),
t=xml2::xml_attr(y, "t"),
is_formula=is_formula(y))
}
is_formula <- function(x) {
!is.null(tryCatch(xml2::xml_find_one(x, "d1:f", ns),
error=function(x) NULL))
}
dat <- unlist(lapply(rows, f_row), FALSE)
dat <- data.frame(ref=vapply(dat, "[[", character(1), "ref"),
s=vapply(dat, "[[", character(1), "s"),
t=vapply(dat, "[[", character(1), "t"),
is_formula=vapply(dat, "[[", logical(1), "is_formula"),
stringsAsFactors=FALSE)
re <- "^([[:alpha:]]+)([0-9]+)$"
dat$col_alpha <- sub(re, "\\1", dat$ref)
dat$col <- parse_col(dat$col_alpha)
dat$row <- as.integer(sub(re, "\\2", dat$ref))
dat
}
show_cells <- function(x) {
m <- matrix(".", max(dat$row), max(dat$col))
m[cbind(dat$row, dat$col)] <- ifelse(dat$is_formula, "=", "o")
message(paste(apply(m, 1, paste, collapse=""), collapse="\n"))
}
dat <- inspect_xlsx("terrible.xlsx", 1L)
show_cells(dat)
## ........
## ........
## ........
## ........
## ........
## ........
## ........
## .....ooo
## .....===
## .ooo....
## .ooo....
## .ooo....
@richfitz
Copy link
Author

In the little diagram above, dots are blank, "o" are data and "=" are formula cells. It should be pretty easy to design heuristics that create sensible ranges to extract from the file.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment