fauxneticien · January 25, 2017 06:24
diff --git a/split_media_by_eaf_tier.R b/split_media_by_eaf_tier.R
 #!/usr/bin/env Rscript

 # Split a media file associated with an ELAN file according to some tier
 #
 # Dependencies: ffmpeg, tidyverse (R Package)
 #
 # Usage:   Rscript split_media_by_eaf_tier.R eaf_file tier_name
 # Example: Rscript split_media_by_eaf_tier.R example.eaf tier_A

 # Load packages (installing 'tidyverse' also installs xml2 and stringr)
 require(tidyverse)
 require(xml2)
 require(stringr)

 # ---- Inputs and output configuration ----
 #
 # Reads the two positional command-line arguments (EAF file, tier name),
 # parses the EAF, locates the media file it references, and prepares the
 # 'split' output directory in the current working directory.
 #
 # Defines for later steps: eaf_file, tier_name, eaf_dat, med_file,
 # out_ext (including the leading dot) and out_path (with trailing slash).
 args <- commandArgs(trailingOnly = TRUE)

 # args[i] is NA when the argument was not supplied
 eaf_file  <- args[1]
 tier_name <- args[2]

 # For testing within R, uncomment as necessary
 # setwd("/_sandboxes/BFoley/SoundFilePlusElanPairingsForAppDict.Individualwords")
 # eaf_file  <- "KayIndivWords20170111a.eaf"
 # tier_name <- "text@sp"

 # Fail early with a usage hint rather than trying to open a file named "NA".
 # Checked after the testing overrides above so they remain usable.
 if (is.na(eaf_file) || is.na(tier_name)) {
   stop("Usage: Rscript split_media_by_eaf_tier.R eaf_file tier_name",
        call. = FALSE)
 }

 # Resolve to an absolute path; works for both relative and absolute input
 # and errors immediately if the file does not exist.
 eaf_file <- normalizePath(eaf_file, winslash = "/", mustWork = TRUE)

 # ELAN data is stored as XML, so read in eaf data using read_xml
 eaf_dat <- read_xml(eaf_file)

 # The media URL is stored relative to the .eaf file, so resolve it against
 # the directory containing the .eaf file.
 med_url <- xml_find_first(eaf_dat, "//MEDIA_DESCRIPTOR") %>%
   xml_attr("RELATIVE_MEDIA_URL")

 if (is.na(med_url) || !nzchar(med_url)) {
   stop("No RELATIVE_MEDIA_URL found in MEDIA_DESCRIPTOR of ", eaf_file,
        call. = FALSE)
 }

 med_file <- file.path(dirname(eaf_file), med_url)

 # Configure output variables.
 # The dot is escaped so it only matches a literal '.', and digits/uppercase
 # are accepted so extensions such as .mp3, .mp4 or .WAV are recognised.
 out_ext <- str_extract(med_file, "\\.[A-Za-z0-9]+$")

 if (is.na(out_ext)) {
   stop("Could not determine a file extension for media file: ", med_file,
        call. = FALSE)
 }

 if (!dir.exists("split")) { dir.create("split") }
 out_path <- paste0(getwd(), "/split/")

 # ELAN keeps time codes in TIME_SLOT elements, apart from the annotations
 # that refer to them. Collect every slot's attributes into one data frame
 # (one row per slot) so they can be joined onto the annotations below.
 slot_nodes <- xml_find_all(eaf_dat, "//TIME_SLOT")

 eaf_timecodes <- map_df(slot_nodes, function(slot) {
   data.frame(as.list(unlist(xml_attrs(slot))), stringsAsFactors = FALSE)
 })

 # Select the alignable annotations that belong to the requested tier
 tier_xpath <- paste0("TIER[@TIER_ID='",
                      tier_name,
                      "']/ANNOTATION/ALIGNABLE_ANNOTATION")
 tier_nodes <- xml_find_all(eaf_dat, tier_xpath)

 # One row per annotation: the two time-slot references plus the text of
 # the annotation's first child element
 annotations_df <- map_df(tier_nodes, function(node) {
   data.frame(
     stringsAsFactors = FALSE,
     start_tid  = xml_attr(node, "TIME_SLOT_REF1"),
     end_tid    = xml_attr(node, "TIME_SLOT_REF2"),
     annotation = xml_text(xml_child(node))
   )
 })

 # Look up the start and end time values through the slot ids
 start_codes <- rename(eaf_timecodes,
                       start_tid = TIME_SLOT_ID, start_time = TIME_VALUE)
 end_codes   <- rename(eaf_timecodes,
                       end_tid = TIME_SLOT_ID, end_time = TIME_VALUE)

 annotations_df <- left_join(annotations_df, start_codes)
 annotations_df <- left_join(annotations_df, end_codes)

 # Derive the clip timings and output file name for every annotation.
 #
 # - Annotations that occur more than once get -1, -2, ... suffixes so that
 #   their output files do not overwrite each other.
 # - Time values are divided by 1000 to give seconds for ffmpeg.
 # - out_file is shell-quoted with shQuote(), so annotation text containing
 #   apostrophes or other shell metacharacters cannot break (or inject
 #   into) the ffmpeg command line built from it.
 annotations_df <- annotations_df %>%
   # A "/" in the annotation would be read as a directory separator
   mutate(annotation = str_replace_all(annotation, fixed("/"), "_")) %>%
   group_by(annotation) %>%
   mutate(dis_n = seq_len(n()),
          max_n = n()) %>%
   ungroup() %>%
   mutate(annotation = ifelse(max_n > 1,
                              paste0(annotation, "-", dis_n),
                              annotation),
          start_time = as.integer(start_time) / 1000,
          end_time   = as.integer(end_time) / 1000,
          dur_secs   = end_time - start_time,
          out_file   = shQuote(paste0(out_path, annotation, out_ext))) %>%
   select(start_time, dur_secs, annotation, out_file)

 # Build one ffmpeg command per annotation: seek to the start time, copy
 # dur_secs seconds of audio without re-encoding, and write it to out_file.
 # The input path is shell-quoted so paths containing spaces survive;
 # out_file is already quoted where annotations_df is built.
 cmds_list <- paste0("ffmpeg -y",
                     " -i ", shQuote(med_file),
                     " -ss ", annotations_df$start_time,
                     " -t ", annotations_df$dur_secs,
                     " -acodec copy ",
                     annotations_df$out_file)

 # Run every command and collect its exit status (0 means success)
 exec_results <- vapply(cmds_list,
                        function(cmd) as.integer(system(cmd)),
                        integer(1),
                        USE.NAMES = FALSE)

 # Select failures with a logical mask; indexing by the exit codes
 # themselves would pick arbitrary (or no) commands.
 failed <- is.na(exec_results) | exec_results != 0L

 if (any(failed)) {
   # message() writes to stderr immediately, so the report is visible
   # before the script exits
   message("The following commands returned non-zero exits: ")
   message(paste(cmds_list[failed], collapse = "\n"))
   quit(status = 1)
 }
	#!/usr/bin/env Rscript

	# Split a media file associated with an ELAN file according to some tier
	#
	# Dependencies: ffmpeg, tidyverse (R Package)
	#
	# Usage: Rscript split_media_by_eaf_tier.R eaf_file tier_name
	# Example: Rscript split_media_by_eaf_tier.R example.eaf tier_A

	# Load packages (installing 'tidyverse' also installs xml2 and stringr)
	require(tidyverse)
	require(xml2)
	require(stringr)

	# ---- Inputs and output configuration ----
	#
	# Reads the two positional command-line arguments (EAF file, tier name),
	# parses the EAF, locates the media file it references, and prepares the
	# 'split' output directory in the current working directory.
	#
	# Defines for later steps: eaf_file, tier_name, eaf_dat, med_file,
	# out_ext (including the leading dot) and out_path (with trailing slash).
	args <- commandArgs(trailingOnly = TRUE)

	# args[i] is NA when the argument was not supplied
	eaf_file <- args[1]
	tier_name <- args[2]

	# For testing within R, uncomment as necessary
	# setwd("/_sandboxes/BFoley/SoundFilePlusElanPairingsForAppDict.Individualwords")
	# eaf_file <- "KayIndivWords20170111a.eaf"
	# tier_name <- "text@sp"

	# Fail early with a usage hint rather than trying to open a file named "NA".
	# Checked after the testing overrides above so they remain usable.
	if (is.na(eaf_file) || is.na(tier_name)) {
	  stop("Usage: Rscript split_media_by_eaf_tier.R eaf_file tier_name",
	       call. = FALSE)
	}

	# Resolve to an absolute path; works for both relative and absolute input
	# and errors immediately if the file does not exist.
	eaf_file <- normalizePath(eaf_file, winslash = "/", mustWork = TRUE)

	# ELAN data is stored as XML, so read in eaf data using read_xml
	eaf_dat <- read_xml(eaf_file)

	# The media URL is stored relative to the .eaf file, so resolve it against
	# the directory containing the .eaf file.
	med_url <- xml_find_first(eaf_dat, "//MEDIA_DESCRIPTOR") %>%
	  xml_attr("RELATIVE_MEDIA_URL")

	if (is.na(med_url) || !nzchar(med_url)) {
	  stop("No RELATIVE_MEDIA_URL found in MEDIA_DESCRIPTOR of ", eaf_file,
	       call. = FALSE)
	}

	med_file <- file.path(dirname(eaf_file), med_url)

	# Configure output variables.
	# The dot is escaped so it only matches a literal '.', and digits/uppercase
	# are accepted so extensions such as .mp3, .mp4 or .WAV are recognised.
	out_ext <- str_extract(med_file, "\\.[A-Za-z0-9]+$")

	if (is.na(out_ext)) {
	  stop("Could not determine a file extension for media file: ", med_file,
	       call. = FALSE)
	}

	if (!dir.exists("split")) { dir.create("split") }
	out_path <- paste0(getwd(), "/split/")

	# ELAN keeps time codes in TIME_SLOT elements, apart from the annotations
	# that refer to them. Collect every slot's attributes into one data frame
	# (one row per slot) so they can be joined onto the annotations below.
	slot_nodes <- xml_find_all(eaf_dat, "//TIME_SLOT")

	eaf_timecodes <- map_df(slot_nodes, function(slot) {
	  data.frame(as.list(unlist(xml_attrs(slot))), stringsAsFactors = FALSE)
	})

	# Select the alignable annotations that belong to the requested tier
	tier_xpath <- paste0("TIER[@TIER_ID='",
	                     tier_name,
	                     "']/ANNOTATION/ALIGNABLE_ANNOTATION")
	tier_nodes <- xml_find_all(eaf_dat, tier_xpath)

	# One row per annotation: the two time-slot references plus the text of
	# the annotation's first child element
	annotations_df <- map_df(tier_nodes, function(node) {
	  data.frame(
	    stringsAsFactors = FALSE,
	    start_tid = xml_attr(node, "TIME_SLOT_REF1"),
	    end_tid = xml_attr(node, "TIME_SLOT_REF2"),
	    annotation = xml_text(xml_child(node))
	  )
	})

	# Look up the start and end time values through the slot ids
	start_codes <- rename(eaf_timecodes,
	                      start_tid = TIME_SLOT_ID, start_time = TIME_VALUE)
	end_codes <- rename(eaf_timecodes,
	                    end_tid = TIME_SLOT_ID, end_time = TIME_VALUE)

	annotations_df <- left_join(annotations_df, start_codes)
	annotations_df <- left_join(annotations_df, end_codes)

	# Derive the clip timings and output file name for every annotation.
	#
	# - Annotations that occur more than once get -1, -2, ... suffixes so that
	#   their output files do not overwrite each other.
	# - Time values are divided by 1000 to give seconds for ffmpeg.
	# - out_file is shell-quoted with shQuote(), so annotation text containing
	#   apostrophes or other shell metacharacters cannot break (or inject
	#   into) the ffmpeg command line built from it.
	annotations_df <- annotations_df %>%
	  # A "/" in the annotation would be read as a directory separator
	  mutate(annotation = str_replace_all(annotation, fixed("/"), "_")) %>%
	  group_by(annotation) %>%
	  mutate(dis_n = seq_len(n()),
	         max_n = n()) %>%
	  ungroup() %>%
	  mutate(annotation = ifelse(max_n > 1,
	                             paste0(annotation, "-", dis_n),
	                             annotation),
	         start_time = as.integer(start_time) / 1000,
	         end_time = as.integer(end_time) / 1000,
	         dur_secs = end_time - start_time,
	         out_file = shQuote(paste0(out_path, annotation, out_ext))) %>%
	  select(start_time, dur_secs, annotation, out_file)

	# Build one ffmpeg command per annotation: seek to the start time, copy
	# dur_secs seconds of audio without re-encoding, and write it to out_file.
	# The input path is shell-quoted so paths containing spaces survive;
	# out_file is already quoted where annotations_df is built.
	cmds_list <- paste0("ffmpeg -y",
	                    " -i ", shQuote(med_file),
	                    " -ss ", annotations_df$start_time,
	                    " -t ", annotations_df$dur_secs,
	                    " -acodec copy ",
	                    annotations_df$out_file)

	# Run every command and collect its exit status (0 means success)
	exec_results <- vapply(cmds_list,
	                       function(cmd) as.integer(system(cmd)),
	                       integer(1),
	                       USE.NAMES = FALSE)

	# Select failures with a logical mask; indexing by the exit codes
	# themselves would pick arbitrary (or no) commands.
	failed <- is.na(exec_results) | exec_results != 0L

	if (any(failed)) {
	  # message() writes to stderr immediately, so the report is visible
	  # before the script exits
	  message("The following commands returned non-zero exits: ")
	  message(paste(cmds_list[failed], collapse = "\n"))
	  quit(status = 1)
	}