Skip to content

Instantly share code, notes, and snippets.

Last active September 23, 2015 14:50
Show Gist options
  • Save pietrop/b3229885ff73c0bf941e to your computer and use it in GitHub Desktop.
Save pietrop/b3229885ff73c0bf941e to your computer and use it in GitHub Desktop.
Convert sbv to CSV and Plain Text
Converts SBV caption files generated by YouTube captioning to CSV and plain-text formats.
Run from the terminal as: $ ruby sbv_to_csv.rb Captions.sbv
Converts an SBV file from YouTube into a CSV file, also converting the timecodes with the timecode gem, e.g. from 0:09:07.730 to 00:09:07:18.
require 'csv'
require 'timecode'
# One caption entry from an .sbv file: its sequence number, raw in/out
# timecode strings and the caption text.
class Line
  attr_accessor :n, :tc_in, :tc_out, :text, :tc_in_converted, :tc_out_converted

  # @param tc_in [String] raw start timecode as it appears in the .sbv file
  # @param tc_out [String] raw end timecode as it appears in the .sbv file
  # @param text [String] caption text
  # @param n [Integer] sequence number of the caption
  def initialize(tc_in, tc_out, text, n)
    @tc_in = tc_in
    @tc_out = tc_out
    @text = text
    @n = n
  end

  # Parses an .sbv-style timecode at 25 fps.
  #
  # A "0" is prepended before parsing, so "0:09:07.730" is handed to
  # Timecode.parse as "00:09:07.730".
  #
  # @param tc [String, nil] raw timecode string
  # @return [Timecode, nil] the parsed timecode, or nil when +tc+ is nil
  def tc_convert(tc)
    return if tc.nil?

    Timecode.parse("0#{tc}", 25)
  end
end
# Parses an .sbv-style timecode at 25 fps.
#
# A "0" is prepended before parsing, so "0:09:07.730" is handed to
# Timecode.parse as "00:09:07.730".
#
# @param tc [String, nil] raw timecode string
# @return [Timecode, nil] the parsed timecode, or nil when +tc+ is nil
def tc_convert(tc)
  return if tc.nil?

  Timecode.parse("0#{tc}", 25)
end
# Parses a timecode string at 25 fps exactly as given, with no "0" prepended.
#
# @param tc [String, nil] timecode string, e.g. "10:51:57:00"
# @return [Timecode, nil] the parsed timecode, or nil when +tc+ is nil
def tc_convert_no_zero(tc)
  return if tc.nil?

  Timecode.parse(tc, 25)
end
# Main script: reads a YouTube .sbv caption file, writes a CSV with converted
# timecodes and a plain-text report, and prints a preview and totals to the
# terminal.
#
# Usage: ruby sbv_to_csv.rb [Captions.sbv]
# When no file is given, "Glass_english.sbv" is used.

# Prints +question+ and returns the answer typed on the terminal, or
# +default+ when the answer is blank or stdin is closed.
#
# Reads from $stdin explicitly: a bare Kernel#gets reads from the files named
# in ARGV when any are given, which would consume the .sbv file instead of
# the user's keyboard input.
def ask(question, default)
  print question
  answer = $stdin.gets.to_s.strip
  answer.empty? ? default : answer
end

reel      = ask("what's the reel name in the metadata?", "")
tc_meta   = ask("what's TC start for this file in the metadata? \n ie 00:49:04:20 ", "10:51:57:00")
clip_name = ask("what's filename of the clip? ie CC0027_01.MOV", "NA")

filename = ARGV.first || "Glass_english.sbv"
# Output files are written to the current directory, named after the input.
basename     = File.basename(filename, ".*")
csvfilename  = "#{basename}.csv"
textfilename = "#{basename}.text"
rule         = "-" * 90

# File.read closes the handle; a leading BOM and invalid bytes are dropped so
# the first timecode and the regexp split below are not disturbed.
raw = File.read(filename, encoding: "bom|utf-8").scrub
puts raw.inspect

# One element per caption: a timecode line followed by its text lines.
# Line endings are normalised so both LF and CRLF files are split correctly.
blocks = raw.gsub("\r\n", "\n").split(/\n{2,}/).map(&:strip).reject(&:empty?)
puts "*" * 10
puts blocks.size
puts "*" * 10
abort "No caption blocks found in #{filename}" if blocks.empty?

all_the_text = [] # caption texts, for the word count
tc_count     = [] # [tc_in, tc_out] raw string pairs, for the duration totals
preview_rows = [] # preview lines repeated in the plain-text report
meta_offset  = tc_convert_no_zero(tc_meta) # parsed once; constant for every row
n = 0

CSV.open(csvfilename, "wb") do |csv|
  # Header row with default column names.
  csv << ["N", "Time Code In", "Time Code Out", "tc_meta", "tc_in_meta", "tc_out_meta",
          "Transcribed Speech", "Speaker Name", "Comments", "Tags"]

  blocks.each do |block|
    # First line is "in,out"; every remaining line is caption text.
    tc, *text_lines = block.split("\n")
    tc_in, tc_out = tc.split(",").map(&:strip)
    unless tc_in && tc_out
      warn "Skipping malformed caption block: #{block.inspect}"
      next
    end

    n += 1
    text = text_lines.join(" ")
    all_the_text << text
    tc_count << [tc_in, tc_out]

    # Line does the reformatting, e.g. 0:09:07.730 -> 00:09:07:18.
    ln = Line.new(tc_in, tc_out, text, n)
    tc_in_converted  = ln.tc_convert(ln.tc_in)
    tc_out_converted = ln.tc_convert(ln.tc_out)

    csv << [n, tc_in_converted, tc_out_converted, tc_meta.upcase,
            tc_in_converted + meta_offset, tc_out_converted + meta_offset, text]

    # Preview in the terminal, and keep a copy for the text report.
    puts "#{n}\t| #{tc_in_converted} | #{tc_out_converted} | #{text}\n"
    preview_rows << "#{n}|#{tc_in_converted}|#{tc_out_converted}|\t#{text}\n"
  end
end
abort "No valid caption blocks found in #{filename}" if tc_count.empty?

puts rule
puts "CSV file saved: \n\t#{Dir.pwd}\n\t#{csvfilename}"
puts rule

# Joined with a space so the last word of one caption and the first word of
# the next are not counted as a single word.
word_count = all_the_text.join(" ").split.size
puts "Total word Count:\t#{word_count} Words"

# Sum of the caption durations (out - in); this is transcribed time only,
# not the length of the video.
transcribed_total_time = tc_count.reduce(Timecode.parse("00:00:00:00", 25)) do |sum, (tc_in, tc_out)|
  sum + (tc_convert(tc_out) - tc_convert(tc_in))
end

tc_start = tc_convert(tc_count.first[0])
tc_end   = tc_convert(tc_count.last[1])
# NOTE(review): kept as in the original (last out + first in). If the intent is
# the span covered by the captions this should probably be tc_end - tc_start — confirm.
tc_total = tc_end + tc_start
difference = tc_total - transcribed_total_time

# NOTE(review): the "Frame count:" labels below were never given a value in the
# original (empty interpolation); they are left blank here.
File.open(textfilename, "wb") do |file|
  file.write "Transcription file names:\n\t #{textfilename}\t Sbv Filename : #{filename} \t CSV Filename #{csvfilename} \n"
  file.write "Video file clip name: #{clip_name}\n"
  file.write "Card Metadata Reel Name: #{reel}\t Video timecode Metadata Start: #{tc_meta}\n"
  file.write "#{rule}\n"
  file.write preview_rows.join
  file.write "#{rule}\n"
  file.write "CSV file saved: \n\t#{Dir.pwd}\n\t#{csvfilename}\n"
  file.write "#{rule}\n"
  file.write "Total word Count:\t#{word_count} Words\n"
  file.write "FPS: \t\t\t#{transcribed_total_time.fps}fps\n"
  file.write "Total lenght of Transcribed interview: \t\t#{transcribed_total_time} |\t#{transcribed_total_time.hours} Hour, #{transcribed_total_time.minutes} Min, #{transcribed_total_time.seconds} Seconds, #{transcribed_total_time.frames} Frames \t|\t #{transcribed_total_time.to_seconds} Seconds \t|\t Frame count: \n"
  file.write "Total lenght of Video interview: \t\t#{tc_total} |\t#{tc_total.hours} Hour, #{tc_total.minutes} Min, #{tc_total.seconds} Seconds, #{tc_total.frames} Frames\t|\t #{tc_total.to_seconds} Seconds \t|\tFrame count: \n"
  file.write "Non transcribed time in video is: \t\t#{difference} |\t#{difference.hours} Hour, #{difference.minutes} Min, #{difference.seconds} Seconds, #{difference.frames} Frames \t|\t #{difference.to_seconds} Seconds \t|\t Frame count: \n"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment