Last active
April 26, 2020 17:46
-
-
Save drammock/e324cdd6f8f401e548c00f783ca8c836 to your computer and use it in GitHub Desktop.
WIP assessment of the dim reduction results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript

# Set-up for the assessment of the dimensionality-reduction results: load the
# phoible data (used here only for its feature column names) and enumerate the
# solution files to analyse.

library(dplyr, warn.conflicts=FALSE)

# load phoible data (to get feat. col. names)
# NOTE(review): phoible.RData presumably defines the `phoible` object used
# below -- confirm
"~/Documents/academics/research/phoible/dev/data" -> data_dir
load(file.path(data_dir, "phoible.RData"))

# get feature column names (every column from `tone` through `click`)
phoible %>%
  select(tone:click) %>%
  colnames() ->
  feature_columns

# list solution files; the dot is escaped so that only names with a literal
# ".csv" extension match (a bare "csv$" would also match e.g. "notes_csv")
file.path("results", "dimredux-solutions") -> results_dir
list.files(results_dir, pattern="\\.csv$", full.names=TRUE) -> filenames

# housekeeping: IDs of inventories whose solution file starts with "ERROR";
# record_errors() appends to this vector
c() -> errors
# Extract the inventory ID encoded in a solution file's name: the part of the
# base name that precedes the first ".", converted to an integer
# (e.g. "results/dimredux-solutions/42.csv" gives 42L).
id_from_filename <- function(filename) {
  name_parts <- unlist(strsplit(basename(filename), split=".", fixed=TRUE))
  as.integer(name_parts[1])
}
# Check whether a solutions table is really an error report and, if so, log it.
#
# inventory_id: ID of the inventory the table was read for.
# solutions:    data frame as read from a solution file.
#
# Returns TRUE when the first cell starts with "ERROR" (and, as a side effect,
# appends `inventory_id` to the `errors` vector in the enclosing/global
# environment); returns FALSE otherwise.
record_errors <- function(inventory_id, solutions) {
  # startsWith() rejects non-character input, so coerce first: the cell may be
  # a factor (read.csv() defaults under R < 4.0) or a non-string value
  first_cell <- as.character(solutions[1, 1])
  # isTRUE() treats a missing (NA) or empty result as "not an error" instead
  # of letting if() fail on it
  if (isTRUE(startsWith(first_cell, "ERROR"))) {
    errors <<- c(errors, inventory_id)
    return(TRUE)
  }
  FALSE
}
# Read one solution file and convert it to a table of booleans.
#
# filename: path to a headerless CSV file whose base name starts with the
#           inventory ID; each row is treated as one solution, i.e. a list of
#           feature names.
#
# Returns NULL when the file is an error report (first cell starts with
# "ERROR"; the inventory ID is then logged by record_errors()). Otherwise
# returns a data frame with one row per CSV row, one logical column per entry
# of `feature_columns` (TRUE when that feature appears in the row), plus an
# `InventoryID` column.
load_solutions <- function(filename) {
  id_from_filename(filename) -> inventory_id
  # stringsAsFactors=FALSE keeps the cells as character under R < 4.0 as well
  # NOTE(review): read.csv() sizes the table from the first five lines only;
  # if later rows can list more features than those, confirm nothing wraps
  read.csv(filename, header=FALSE, stringsAsFactors=FALSE) -> solutions
  # record_errors() both logs the inventory ID and reports the outcome, so use
  # its result rather than repeating the "ERROR" test here
  record_errors(inventory_id, solutions) -> is_error
  if (is_error) return(NULL)
  # spread the features into boolean columns: apply() yields one column per
  # solution (rows = features), which is then transposed so that rows are
  # solutions and columns are features
  apply(solutions, 1, function(feats) feature_columns %in% feats) %>%
    as.data.frame(row.names=feature_columns) %>%
    t() %>%
    as.data.frame() ->
    solutions_df
  # record the inventory ID in a column
  inventory_id -> solutions_df$InventoryID
  solutions_df
}
# load every solution file and stack the per-file tables into one data frame,
# grouped by inventory (load_solutions() returns NULL for error reports)
filenames %>%
  lapply(load_solutions) %>%
  do.call(rbind, .) %>%
  group_by(InventoryID) ->
  all_solutions

# make sure we didn't lose any: every file must be either a valid solution set
# or a recorded error. are_equal() on its own merely returns TRUE/FALSE, so it
# is wrapped in assert_that() to actually halt the script on a mismatch.
n_distinct(all_solutions$InventoryID) -> n_valid_solutions
assertthat::assert_that(
  assertthat::are_equal(length(filenames), n_valid_solutions + length(errors)),
  msg="number of solution files != valid inventories + recorded errors"
)

# count 'em up: number of solutions (rows) per inventory, largest first
all_solutions %>%
  tally(name="n_solutions", sort=TRUE) ->
  n_solutions_per_inventory

# which features occur in the most solutions?
all_solutions %>%
  ungroup() %>%
  select(-InventoryID) %>%
  summarise_all(sum) %>%
  unlist() %>%
  sort(decreasing=TRUE) ->
  feature_occurrence_in_solutions_across_all_inventories

# which features occur in the most solutions (per inventory)?
all_solutions %>%
  summarise_all(sum) ->
  feature_occurrence_in_solutions_by_inventory
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment