Skip to content

Instantly share code, notes, and snippets.

@fauxneticien
Created January 25, 2017 06:24
Show Gist options
  • Save fauxneticien/85ae670fae28989af682779a466c3f96 to your computer and use it in GitHub Desktop.
Save fauxneticien/85ae670fae28989af682779a466c3f96 to your computer and use it in GitHub Desktop.
Split a media file associated with an ELAN file according to some tier
#!/usr/bin/env Rscript
# Split a media file associated with an ELAN file according to some tier
#
# Dependencies: ffmpeg, tidyverse (R Package)
#
# Usage: Rscript split_media_by_eaf_tier.R eaf_file tier_name
# Example: Rscript split_media_by_eaf_tier.R example.eaf tier_A
# Load packages. Installing 'tidyverse' also installs xml2 and stringr.
require(tidyverse)
require(xml2)
require(stringr)
# Read the two positional arguments: the .eaf file and the tier to split on.
# A missing argument comes back as NA; it is rejected further down, after the
# optional interactive overrides have had a chance to supply values.
args <- commandArgs(trailingOnly = TRUE)
eaf_file <- args[1]
tier_name <- args[2]
# For testing within R, uncomment as necessary
# setwd("/_sandboxes/BFoley/SoundFilePlusElanPairingsForAppDict.Individualwords")
# eaf_file <- "KayIndivWords20170111a.eaf"
# tier_name <- "text@sp"
if (is.na(eaf_file) || is.na(tier_name)) {
  stop(
    "Usage: Rscript split_media_by_eaf_tier.R eaf_file tier_name",
    call. = FALSE
  )
}
if (!file.exists(eaf_file)) {
  stop("Cannot find ELAN file: ", eaf_file, call. = FALSE)
}
# Make the path absolute. Unlike pasting getwd() in front of it, this leaves
# a path that is already absolute intact.
eaf_file <- normalizePath(eaf_file, winslash = "/", mustWork = TRUE)
# ELAN data is stored as XML, so read in eaf data using read_xml
eaf_dat <- read_xml(eaf_file)
# Look up the media file linked from the .eaf file. xml_attr() yields NA
# when the descriptor or the attribute is absent, so fail early instead of
# building a path that ends in "NA".
media_url <- xml_find_first(eaf_dat, "//MEDIA_DESCRIPTOR") %>%
  xml_attr("RELATIVE_MEDIA_URL")
if (is.na(media_url)) {
  stop(
    "No MEDIA_DESCRIPTOR with a RELATIVE_MEDIA_URL found in ", eaf_file,
    call. = FALSE
  )
}
# RELATIVE_MEDIA_URL is relative to the .eaf file, so resolve it against the
# .eaf file's directory rather than the current working directory.
med_file <- file.path(dirname(eaf_file), media_url)
# Configure output variables
# Clips reuse the media file's extension. The dot is escaped so it only
# matches a literal ".", and digits/uppercase are allowed so that names such
# as "clip.mp3", "clip.mp4" or "CLIP.WAV" are recognised.
out_ext <- str_extract(med_file, "\\.[A-Za-z0-9]+$")
if (is.na(out_ext)) {
  stop(
    "Cannot determine the file extension of media file: ", med_file,
    call. = FALSE
  )
}
# Clips are written to a 'split' folder inside the working directory
if (!dir.exists("split")) {
  dir.create("split")
}
out_path <- paste0(getwd(), "/split/")
# Time codes are stored separately from the annotations. Grab them first,
# then join them to the annotation data later.
# xml_attr() is vectorised over the node set and returns NA where an
# attribute is absent, so both columns always exist (even when no slot
# carries a TIME_VALUE, or there are no slots at all).
time_slots <- xml_find_all(eaf_dat, "//TIME_SLOT")
eaf_timecodes <- data.frame(
  TIME_SLOT_ID = xml_attr(time_slots, "TIME_SLOT_ID"),
  TIME_VALUE = xml_attr(time_slots, "TIME_VALUE"),
  stringsAsFactors = FALSE
)
# Get annotations from 'tier_name' defined above
tier_xpath <- paste0(
  "TIER[@TIER_ID='", tier_name, "']/ANNOTATION/ALIGNABLE_ANNOTATION"
)
annotation_nodes <- xml_find_all(eaf_dat, tier_xpath)
# Fail with a clear message instead of an obscure join error further down
if (length(annotation_nodes) == 0) {
  stop(
    "No alignable annotations found on tier '", tier_name, "'",
    call. = FALSE
  )
}
# One row per annotation: the two time-slot references plus the text of the
# annotation's first child element. The slot references are then resolved to
# time values, with explicit join keys.
annotations_df <- data.frame(
  stringsAsFactors = FALSE,
  start_tid = xml_attr(annotation_nodes, "TIME_SLOT_REF1"),
  end_tid = xml_attr(annotation_nodes, "TIME_SLOT_REF2"),
  annotation = xml_text(xml_find_first(annotation_nodes, "./*[1]"))
) %>%
  left_join(
    rename(eaf_timecodes, start_tid = TIME_SLOT_ID, start_time = TIME_VALUE),
    by = "start_tid"
  ) %>%
  left_join(
    rename(eaf_timecodes, end_tid = TIME_SLOT_ID, end_time = TIME_VALUE),
    by = "end_tid"
  )
# Group by annotations and assign -1, -2, ... disambiguators
# just in case the same utterance occurs multiple times.
# Time values are converted from milliseconds to seconds, and each clip's
# output path is shell-quoted with shQuote() so that annotations containing
# apostrophes or other shell metacharacters cannot break the ffmpeg command.
annotations_df <- annotations_df %>%
  group_by(annotation) %>%
  mutate(
    dis_n = row_number(),
    max_n = n()
  ) %>%
  ungroup() %>%
  mutate(
    annotation = ifelse(max_n > 1, paste0(annotation, "-", dis_n), annotation),
    start_time = as.integer(start_time) / 1000,
    end_time = as.integer(end_time) / 1000,
    dur_secs = end_time - start_time,
    out_file = shQuote(paste0(out_path, annotation, out_ext))
  ) %>%
  select(start_time, dur_secs, annotation, out_file)
# Prepare for command-line execution: one ffmpeg call per annotation, which
# copies the audio stream from start_time for dur_secs seconds into out_file.
# The input path is shell-quoted so media paths containing spaces or other
# shell metacharacters are passed to ffmpeg as a single argument (out_file
# already arrives quoted).
cmds_list <- paste0(
  "ffmpeg -y",
  " -i ", shQuote(med_file),
  " -ss ", annotations_df$start_time,
  " -t ", annotations_df$dur_secs,
  " -acodec copy ",
  annotations_df$out_file
)
# Run every command through system() and collect the exit statuses
exec_results <- lapply(cmds_list, system)
# Select failed commands with a logical mask. Indexing cmds_list by the raw
# exit codes would pick commands by position (code 1 -> first command)
# rather than the ones that actually failed.
failed <- unlist(exec_results) != 0
if (any(failed)) {
  # message() writes to stderr straight away, so the report is shown before
  # the script quits with a non-zero status.
  message("The following commands returned non-zero exits: ")
  message(paste(cmds_list[failed], collapse = "\n"))
  quit(status = 1)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment