Skip to content

Instantly share code, notes, and snippets.

@drammock
Last active April 26, 2020 17:46
Show Gist options
  • Save drammock/e324cdd6f8f401e548c00f783ca8c836 to your computer and use it in GitHub Desktop.
Save drammock/e324cdd6f8f401e548c00f783ca8c836 to your computer and use it in GitHub Desktop.
Work-in-progress assessment of the dimensionality-reduction results
#!/usr/bin/env Rscript
library(dplyr, warn.conflicts=FALSE)
# Load the PHOIBLE data; only the feature column names are needed from it.
# NOTE(review): this path is specific to the original author's machine, and
# the .RData file is presumably what defines `phoible` -- confirm.
data_dir <- "~/Documents/academics/research/phoible/dev/data"
load(file.path(data_dir, "phoible.RData"))

# Names of the feature columns: every column from `tone` through `click`.
feature_columns <- phoible %>%
  select(tone:click) %>%
  colnames()

# List the solution files. The dot is escaped and the pattern anchored so that
# only names ending in ".csv" match; a bare "csv$" would also accept a file
# called e.g. "notes_csv".
results_dir <- file.path("results", "dimredux-solutions")
filenames <- list.files(results_dir, pattern = "\\.csv$", full.names = TRUE)

# Housekeeping: IDs of inventories whose solution file holds an error message.
# record_errors() appends to this vector.
errors <- c()
#' Extract the integer inventory ID from a solution file path.
#'
#' The ID is everything in the file's base name before the first ".", so
#' "results/dimredux-solutions/42.csv" gives 42L.
#'
#' @param filename Character vector of file paths.
#' @return Integer vector with one ID per path; `NA` where the leading part of
#'   the base name cannot be read as an integer.
id_from_filename <- function(filename) {
  # drop everything from the first dot onward, keeping only the leading stem;
  # sub() is vectorised, so several paths can be converted in one call
  stem <- sub("[.].*$", "", basename(filename))
  as.integer(stem)
}
#' Note an inventory whose solution file contains an error message.
#'
#' A file counts as an error when its first cell (row 1, column 1) starts with
#' "ERROR". In that case `inventory_id` is appended to the script-level
#' `errors` vector.
#'
#' @param inventory_id ID of the inventory the solutions belong to.
#' @param solutions Data frame as read from a solution file.
#' @return `TRUE` if the file was recorded as an error, otherwise `FALSE`.
record_errors <- function(inventory_id, solutions) {
  # as.character() lets a factor cell be tested too; isTRUE() turns a missing
  # first cell (NA) into "not an error" instead of making `if` fail
  first_cell <- as.character(solutions[1, 1])
  is_error <- isTRUE(startsWith(first_cell, "ERROR"))
  if (is_error) {
    # deliberate side effect: failures are collected in the enclosing scope
    errors <<- c(errors, inventory_id)
  }
  is_error
}
#' Read one solution file into a table of logical feature-membership columns.
#'
#' Each row of the CSV is treated as one solution, i.e. a set of feature names.
#' The result has one row per solution, one logical column per entry of the
#' script-level `feature_columns` (TRUE when that feature appears in the row),
#' plus an `InventoryID` column derived from the file name.
#'
#' @param filename Path to a solution CSV (read without a header row).
#' @return A data frame as described above, or `NULL` when `record_errors()`
#'   flags the file as an error.
load_solutions <- function(filename) {
  inventory_id <- id_from_filename(filename)
  solutions <- read.csv(filename, header = FALSE, stringsAsFactors = FALSE)

  # record_errors() also logs the inventory ID; reuse its verdict rather than
  # re-testing the first cell a second time
  if (record_errors(inventory_id, solutions)) {
    return(NULL)
  }

  # spread the features into boolean columns: for every solution (row), mark
  # which of the known features it mentions
  solution_matrix <- as.matrix(solutions)
  n_solutions <- nrow(solution_matrix)
  membership <- lapply(
    seq_len(n_solutions),
    function(i) feature_columns %in% solution_matrix[i, ]
  )
  solutions_df <- as.data.frame(
    matrix(
      as.logical(unlist(membership)),
      nrow = n_solutions,
      ncol = length(feature_columns),
      byrow = TRUE,
      dimnames = list(paste0("V", seq_len(n_solutions)), feature_columns)
    )
  )

  # record the inventory ID in a column
  solutions_df$InventoryID <- inventory_id
  solutions_df
}
# Load every solution file and stack the per-inventory tables. Files holding an
# error yield NULL from load_solutions(), which rbind() skips. The result is
# grouped by inventory for the per-inventory summaries.
all_solutions <- filenames %>%
  lapply(load_solutions) %>%
  do.call(rbind, .) %>%
  group_by(InventoryID)

# Make sure we didn't lose any: every file must be accounted for, either as an
# inventory with solutions or as a recorded error. assertthat::are_equal() only
# returns TRUE/FALSE and never halts, so the check is enforced explicitly.
n_valid_solutions <- n_distinct(all_solutions$InventoryID)
if (length(filenames) != n_valid_solutions + length(errors)) {
  stop(
    sprintf(
      "%d solution files listed, but %d valid + %d errors accounted for",
      length(filenames), n_valid_solutions, length(errors)
    ),
    call. = FALSE
  )
}
# How many solutions were found for each inventory, largest count first.
# `all_solutions` is grouped by InventoryID, so tally() counts rows per group.
n_solutions_per_inventory <- tally(
  all_solutions,
  name = "n_solutions",
  sort = TRUE
)

# Across all inventories: in how many solutions does each feature occur?
# Grouping is dropped and the ID column removed so only feature columns are
# summed; the result is a named vector sorted from most to least frequent.
feature_occurrence_in_solutions_across_all_inventories <-
  all_solutions %>%
  ungroup() %>%
  select(-InventoryID) %>%
  summarise_all(sum) %>%
  unlist() %>%
  sort(decreasing = TRUE)

# Per inventory: in how many of that inventory's solutions does each feature
# occur? The grouping on InventoryID yields one summary row per inventory.
feature_occurrence_in_solutions_by_inventory <- summarise_all(
  all_solutions,
  sum
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment