Last active
October 20, 2023 13:34
-
-
Save teunbrand/28f989168d8d801cdd5777f167e63dfd to your computer and use it in GitHub Desktop.
First attempt at 2D run-length encoding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
rle_2d <- function(matrix) { | |
# If we have an empty matrix, we return 0-row data.frame | |
if (prod(dim(matrix)) < 1) { | |
ans <- data.frame( | |
col.start = integer(), | |
col.end = integer(), | |
row.start = integer(), | |
row.end = integer(), | |
value = as.vector(matrix) | |
) | |
return(ans) | |
} | |
# Simplified case when there is only one column | |
if (ncol(matrix) == 1) { | |
rle <- rle(matrix[, 1]) | |
end <- cumsum(rle$lengths) | |
ans <- data.frame( | |
col.start = 1L, | |
col.end = 1L, | |
row.start = end - rle$lengths + 1, | |
row.end = end, | |
value = rle$values | |
) | |
return(ans) | |
} | |
# We first run-length encode every row of the matrix | |
runs <- lapply(seq_len(nrow(matrix)), function(row) { | |
rle <- rle(matrix[row,]) | |
end <- cumsum(rle$lengths) | |
data.frame( | |
col.start = end - rle$lengths + 1, | |
col.end = end, | |
row.start = row, | |
row.end = row, | |
value = rle$values | |
) | |
}) | |
# Simplified case when there is only one row | |
if (nrow(matrix) == 1) { | |
return(runs[[1]]) | |
} | |
# Looping over every row that isn't the last row | |
for (row in seq_len(length(runs) - 1)) { | |
current <- runs[[row]] | |
next_rows <- (row + 1):length(runs) | |
# For every run in current row | |
for (run in seq_len(nrow(current))) { | |
# Try to find match in subsequent rows | |
for (following in next_rows) { | |
follow <- runs[[following]] | |
# Does any value in the next row match? (Cheap test) | |
value_match <- follow$value == current$value[run] | |
if (!any(value_match)) { | |
# No value match for this run, restart for next run in same row | |
break | |
} | |
# Do also the positions match? (More expensive) | |
matched <- which( | |
follow$col.start == current$col.start[run] & | |
follow$col.end == current$col.end[run] & | |
value_match | |
) | |
if (length(matched) == 0) { | |
# No position match for this run, restart for next run in same row | |
break | |
} | |
# Merge match into current run | |
current$row.end[run] <- follow$row.end[matched[1]] | |
runs[[following]] <- follow[-matched[1], , drop = FALSE] | |
} | |
} | |
# Update row when finished | |
runs[[row]] <- current | |
} | |
do.call(rbind, runs) | |
} | |
# Version 2.0 inspired by Tim Taylor's hash table approach | |
rle_2d2 <- function(m, byrow = FALSE) { | |
n <- length(m) | |
if (n == 0L) { | |
ans <- data.frame( | |
col_start = integer(), | |
col_end = integer(), | |
row_start = integer(), | |
row_end = integer(), | |
value = as.vector(matrix) | |
) | |
return(ans) | |
} | |
if (isTRUE(byrow)) { | |
m <- t(m) | |
} | |
dim <- dim(m) | |
# Treat matrix content as levels, so we can deal with NAs | |
levels <- unique(m) | |
m <- matrix(match(m, levels), nrow(m), ncol(m)) | |
# Simplified case when m has only a single row | |
if (dim[1] == 1L) { | |
rle <- rle(as.vector(m)) | |
ends <- cumsum(rle$lengths) | |
ans <- data.frame( | |
col_start = ends - rle$lengths + 1, | |
col_end = ends, | |
row_start = 1L, | |
row_end = 1L, | |
value = levels[rle$values] | |
) | |
} | |
# Run length encoding by column | |
# 'By column' just means adding columns as change points | |
ends <- c(which(m[-1] != m[-n] | (row(m) == nrow(m))[-n]), n) | |
lengths <- diff(c(0L, ends)) | |
values <- m[ends] | |
starts <- ends - lengths + 1L | |
# Simplified case when m has only a single column | |
if (dim[2] == 1L) { | |
ans <- data.frame( | |
col_start = 1L, | |
col_end = 1L, | |
row_start = starts, | |
row_end = ends, | |
value = levels[values] | |
) | |
} | |
# Translate to indices | |
row_start <- arrayInd(starts, dim)[, 1] | |
row_end <- row_start + lengths - 1L | |
col_start <- col_end <- arrayInd(ends, dim)[, 2] | |
# Initialise hash table no longer than number of runs | |
# Inspiration for using hash tables for this problem taken from TimTaylor: | |
# https://fosstodon.org/@_TimTaylor/111266682218212785 | |
htab <- hashtab(size = length(values)) | |
for (i in seq_along(values)) { | |
# Lookup if there has been a similar column | |
key <- c(row_start[i], row_end[i], values[i]) | |
hsh <- gethash(htab, key) | |
if (!is.null(hsh) && col_start[i] == col_end[hsh] + 1L) { | |
# Matches run in previous column, merge by updating column end and | |
# deleting current run (NA value will be filtered out later) | |
col_end[hsh] <- col_start[i] | |
values[i] <- NA_integer_ | |
} else { | |
# Add run-index to the table | |
sethash(htab, key, i) | |
} | |
} | |
# For small matrices, this is the expensive step | |
ans <- data.frame( | |
col_start = col_start, | |
col_end = col_end, | |
row_start = row_start, | |
row_end = row_end, | |
value = levels[values] | |
)[!is.na(values), , drop = FALSE] | |
# Simply rename columns | |
if (isTRUE(byrow)) { | |
names(ans) <- c("row_start", "row_end", "col_start", "col_end", "value") | |
} | |
ans | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment