Created
January 25, 2017 06:24
-
-
Save fauxneticien/85ae670fae28989af682779a466c3f96 to your computer and use it in GitHub Desktop.
Split a media file associated with an ELAN file according to some tier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
# Split a media file associated with an ELAN file according to some tier | |
# | |
# Dependencies: ffmpeg, tidyverse (R Package) | |
# | |
# Usage: Rscript split_media_by_eaf_tier.R eaf_file tier_name
# Example: Rscript split_media_by_eaf_tier.R example.eaf tier_A
# Load packages, install 'tidyverse' (it includes both xml2 and stringr in it) | |
require(tidyverse) | |
require(xml2) | |
require(stringr) | |
# Read the two positional command-line arguments:
#   1. eaf_file  - path to the ELAN .eaf file
#   2. tier_name - TIER_ID of the tier whose annotations define the splits
# Indexing past the end of `args` yields NA, which is checked further down.
args <- commandArgs(trailingOnly = TRUE)
eaf_file <- args[1]
tier_name <- args[2]

# For testing within R, uncomment as necessary
# setwd("/_sandboxes/BFoley/SoundFilePlusElanPairingsForAppDict.Individualwords")
# eaf_file <- "KayIndivWords20170111a.eaf"
# tier_name <- "text@sp"

# Fail early with a usage message instead of building a path ending in "/NA".
# The check sits after the testing overrides so interactive use still works.
if (is.na(eaf_file) || is.na(tier_name)) {
  stop("Usage: Rscript split_media_by_eaf_tier.R eaf_file tier_name",
       call. = FALSE)
}

# Make the path absolute. Only relative paths are anchored at the working
# directory; absolute paths (POSIX "/..." or Windows "C:/...") are kept as-is.
eaf_file <- path.expand(eaf_file)
if (!grepl("^(/|[A-Za-z]:[/\\\\])", eaf_file)) {
  eaf_file <- paste0(getwd(), "/", eaf_file)
}
if (!file.exists(eaf_file)) {
  stop("ELAN file not found: ", eaf_file, call. = FALSE)
}
# ELAN data is stored as XML, so read in eaf data using read_xml
eaf_dat <- read_xml(eaf_file)

# Look up the media file linked from the first MEDIA_DESCRIPTOR element.
# xml_attr() returns NA when the element or the attribute is absent.
med_rel_url <- xml_attr(
  xml_find_first(eaf_dat, "//MEDIA_DESCRIPTOR"),
  "RELATIVE_MEDIA_URL"
)
if (is.na(med_rel_url)) {
  stop("No MEDIA_DESCRIPTOR with a RELATIVE_MEDIA_URL found in ", eaf_file,
       call. = FALSE)
}

# RELATIVE_MEDIA_URL is relative to the .eaf file, so resolve it against the
# .eaf file's directory rather than the working directory. When the .eaf file
# sits in the working directory this yields the same path as before.
med_file <- file.path(dirname(eaf_file), med_rel_url)
# Configure output variables

# Output clips reuse the media file's extension (including the leading dot).
# The dot is escaped so it only matches a literal ".", and digits/uppercase
# are allowed so extensions such as ".mp3", ".mp4" or ".WAV" are recognised.
out_ext <- str_extract(med_file, "\\.[A-Za-z0-9]+$")
if (is.na(out_ext)) {
  stop("Could not determine a file extension for media file: ", med_file,
       call. = FALSE)
}

# Clips are written to a 'split' folder inside the working directory
if (!dir.exists("split")) {
  dir.create("split")
}
out_path <- paste0(getwd(), "/split/")
# Time codes are stored separately from the annotations: each TIME_SLOT
# element carries its values as XML attributes. Collect them into one data
# frame (one row per time slot, one column per attribute) so they can be
# joined to the annotation data later.
time_slot_nodes <- xml_find_all(eaf_dat, "//TIME_SLOT")

eaf_timecodes <- map_df(time_slot_nodes, function(slot) {
  slot_attrs <- as.list(unlist(xml_attrs(slot)))
  data.frame(slot_attrs, stringsAsFactors = FALSE)
})
# Get the time-aligned annotations from the tier named by 'tier_name'
annotation_nodes <- xml_find_all(
  eaf_dat,
  paste0("TIER[@TIER_ID='",
         tier_name,
         "']/ANNOTATION/ALIGNABLE_ANNOTATION")
)

# Without this guard an unknown or empty tier produces a column-less data
# frame and the joins below fail with an unhelpful error.
if (length(annotation_nodes) == 0) {
  stop("No ALIGNABLE_ANNOTATION elements found on tier '", tier_name, "'",
       call. = FALSE)
}

# One row per annotation: the ids of its start/end time slots and the text of
# its first child element. The time slot ids are then resolved to time values
# by joining against the time slot table twice, once per boundary, with the
# join key stated explicitly.
annotations_df <- annotation_nodes %>%
  map_df(
    ~ data.frame(
      stringsAsFactors = FALSE,
      start_tid = xml_attr(., "TIME_SLOT_REF1"),
      end_tid = xml_attr(., "TIME_SLOT_REF2"),
      annotation = xml_child(.) %>% xml_text()
    )
  ) %>%
  left_join(
    rename(eaf_timecodes, start_tid = TIME_SLOT_ID, start_time = TIME_VALUE),
    by = "start_tid"
  ) %>%
  left_join(
    rename(eaf_timecodes, end_tid = TIME_SLOT_ID, end_time = TIME_VALUE),
    by = "end_tid"
  )
# Group by annotation text and append -1, -2, ... disambiguators to texts
# that occur more than once, so repeated utterances get distinct file names.
# Then derive what is needed for the ffmpeg calls:
#   start_time / end_time - time values divided by 1000
#   dur_secs              - end_time minus start_time
#   out_file              - shell-quoted output path for the clip
annotations_df <- annotations_df %>%
  group_by(annotation) %>%
  mutate(dis_n = row_number(),
         max_n = n()) %>%
  ungroup() %>%
  mutate(
    annotation = ifelse(max_n > 1, paste0(annotation, "-", dis_n), annotation),
    start_time = as.integer(start_time) / 1000,
    end_time = as.integer(end_time) / 1000,
    dur_secs = end_time - start_time,
    # shQuote() escapes embedded quotes, so annotations containing an
    # apostrophe no longer break the shell command.
    out_file = shQuote(paste0(out_path, annotation, out_ext))
  ) %>%
  select(start_time, dur_secs, annotation, out_file)
# Prepare for command-line execution: one ffmpeg command per annotation.
#   -y            overwrite existing output files without asking
#   -ss / -t      start offset and duration taken from annotations_df
#   -acodec copy  copy the audio stream without re-encoding
# The input path is shell-quoted so media paths containing spaces or other
# shell metacharacters are passed to ffmpeg as a single argument.
cmds_list <- paste0("ffmpeg -y",
                    " -i ", shQuote(med_file),
                    " -ss ", annotations_df$start_time,
                    " -t ", annotations_df$dur_secs,
                    " -acodec copy ",
                    annotations_df$out_file)
# Run every command through system() and collect the returned exit statuses
exec_results <- vapply(cmds_list, system, integer(1), USE.NAMES = FALSE)

# Select failed commands with a logical mask. Indexing cmds_list by the exit
# statuses themselves would pick the wrong elements (or none at all).
failed <- exec_results != 0
if (any(failed)) {
  # immediate. = TRUE emits the warning now rather than deferring it, since
  # the script quits right after.
  warning("The following commands returned non-zero exits: ",
          call. = FALSE, immediate. = TRUE)
  # Values inside braces are not auto-printed, so write the failed commands
  # to stderr explicitly.
  message(paste(cmds_list[failed], collapse = "\n"))
  quit(status = 1)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment