Skip to content

Instantly share code, notes, and snippets.

@pietrop
Last active September 23, 2015 14:50
Show Gist options
  • Save pietrop/b3229885ff73c0bf941e to your computer and use it in GitHub Desktop.
Save pietrop/b3229885ff73c0bf941e to your computer and use it in GitHub Desktop.
=begin
Convert SBV to CSV and plain text.
Converts .sbv caption files generated by YouTube captioning into a CSV file and a plain-text file.
Run from the terminal as: $ ruby sbv_to_csv.rb Captions.sbv
(Note: the script currently reads the file named in its `filename` variable, not the command-line argument.)
Timecodes are converted with the timecode gem, e.g. from 0:09:07.730 to 00:09:07:18.
=end
require 'csv'
require 'timecode'
# One caption entry read from an .sbv file: its sequence number, the in/out
# timestamps exactly as they appear in the file, and the caption text.
class Line
  # tc_in_converted / tc_out_converted are exposed for callers but are never
  # assigned by this class itself.
  attr_accessor :n, :tc_in, :tc_out, :text, :tc_in_converted, :tc_out_converted

  # tc_in::  caption start timestamp string, e.g. "0:09:07.730"
  # tc_out:: caption end timestamp string
  # text::   caption text
  # n::      sequence number of the caption
  def initialize(tc_in, tc_out, text, n)
    @tc_in = tc_in
    @tc_out = tc_out
    @text = text
    @n = n
  end

  # Parses an SBV timestamp string into a Timecode at 25 fps.
  # Returns nil when +tc+ is nil.
  #
  # SBV timestamps are written with a single-digit hour ("0:09:07.730"), so a
  # leading zero is added before parsing. A timestamp whose hour field already
  # has two digits is passed through unchanged; prepending another zero would
  # turn it into a three-digit hour (presumably not parseable by the gem --
  # confirm against the timecode gem's accepted formats).
  def tc_convert(tc)
    return nil if tc.nil?

    padded = tc =~ /\A\s*\d{2}:/ ? tc : "0" + tc
    Timecode.parse(padded, 25)
  end
end
# Parses an SBV timestamp string (e.g. "0:09:07.730") into a Timecode at 25 fps.
# Returns nil when +tc+ is nil.
#
# SBV timestamps are written with a single-digit hour, so a leading zero is
# added before parsing. A timestamp whose hour field already has two digits is
# passed through unchanged; prepending another zero would turn it into a
# three-digit hour (presumably not parseable by the gem -- confirm against the
# timecode gem's accepted formats).
def tc_convert(tc)
  return nil if tc.nil?

  padded = tc =~ /\A\s*\d{2}:/ ? tc : "0" + tc
  Timecode.parse(padded, 25)
end
# Parses a timecode string as-is (no leading zero added) into a Timecode at
# 25 fps. Used for values that already carry a two-digit hour, such as the
# clip's metadata start timecode. Returns nil when +tc+ is nil.
def tc_convert_no_zero(tc)
  return nil if tc.nil?

  Timecode.parse(tc, 25)
end
=begin
# Had trouble reading user input here; ideally the script would prompt for: reel, tc_meta, clip_name
print "what's the reel name in the metadata?"
reel = gets.chomp
print "what's TC start for this file in the metadata? \n ie 00:49:04:20 "
tc_meta = gets.chomp
print "what's filename of the clip? ie CC0027_01.MOV"
clip_name = gets.chomp
=end
# Input settings. The caption file name and the clip's metadata start timecode
# are hard-coded for now; edit them here before running.
filename = "Glass_english.sbv"
tc_meta = "10:51:57:00"
# Running caption number, incremented once per caption while the CSV is written.
n = 0
# Read the whole caption file into one string. File.read closes the file
# handle itself, whereas a bare File.open without a block leaves it open.
lines = File.read(filename)
puts lines.inspect
# Split into caption chunks (timecode line plus text lines). Chunks are
# separated by a blank line; accept both CRLF and LF line endings so files
# saved with Unix newlines are not treated as one single chunk.
lines = lines.split(/\r?\n\r?\n/)
puts "*" * 10
puts lines.size
puts "*" * 10
# Caption texts, collected for the word count.
all_the_text = []
# [tc_in, tc_out] string pairs, collected for the duration totals.
tc_count = []
# Lines of the plain-text report, with and without timecodes.
@to_print = []
@to_print_no_tc = []
# Write one CSV row per caption into a file named after the .sbv file
# (its name up to the first dot, plus ".csv").
CSV.open("#{filename.split('.')[0]}.csv", "wb") do |csv|
  # Header row. Data rows only fill the first seven columns; "Speaker Name",
  # "Comments" and "Tags" are left empty for later manual annotation.
  csv << ["N", "Time Code In", "Time Code Out", "tc_meta", "tc_in_meta", "tc_out_meta", "Transcribed Speech", "Speaker Name", "Comments", "Tags"]

  # The metadata start timecode is the same for every row, so parse it once.
  meta_offset = tc_convert_no_zero(tc_meta)

  lines.each do |l|
    # Chunk layout: first line is "tc_in,tc_out", every following line is text.
    tc, *text_lines = l.split("\n")
    # Skip blank chunks (e.g. stray empty lines) instead of crashing on nil.
    next if tc.nil? || tc.strip.empty?

    n += 1
    # Keep every text line (not just the first two), drop stray carriage
    # returns and surrounding whitespace, and join the lines with a space.
    text = text_lines.map(&:strip).reject(&:empty?).join(" ")
    # The trailing space keeps the words of adjacent captions apart when the
    # entries are later joined for the word count.
    all_the_text << "#{text} "

    # Divide the timecode line into in and out points.
    tc_in, tc_out = tc.strip.split(',')
    tc_count << [tc_in, tc_out]

    # Line instance does the conversion, e.g. 0:09:07.730 -> 00:09:07:18.
    ln = Line.new(tc_in, tc_out, text, n)
    tc_in_25 = ln.tc_convert(ln.tc_in)
    tc_out_25 = ln.tc_convert(ln.tc_out)

    # In/out points, then the same points offset by the clip's metadata start.
    csv << [n, tc_in_25, tc_out_25, tc_meta.upcase, tc_in_25 + meta_offset, tc_out_25 + meta_offset, text]

    # Preview in the terminal and collect lines for the plain-text report.
    puts "#{n}\t| #{tc_in_25} | #{tc_out_25} | #{text}\n"
    @to_print << "#{n}|#{tc_in_25}|#{tc_out_25}|\t#{text}\n"
    @to_print_no_tc << "#{text}\n"
  end
end
# Name of the CSV file: the caption file name up to its first dot, plus ".csv".
csvfilename = "#{filename.split('.').first}.csv"
rule = "-" * 90
# Tell the user where the CSV file was saved.
puts rule
puts "CSV file saved: \n\t#{Dir.pwd}\n\t#{csvfilename}"
puts rule
# Word count: all caption texts joined, then split on whitespace.
word_total = all_the_text.join.split.size
puts "Total word Count:\t#{word_total} Words"
# Total transcribed time: the sum of every caption's duration (out - in).
# This can differ from the length of the video, since gaps between captions
# are not counted.
zero_tc = Timecode.parse("00:00:00:00", 25)
tc_bucket = tc_count.map do |start_tc, end_tc|
  tc_convert(end_tc) - tc_convert(start_tc)
end
transcribed_total_time = tc_bucket.inject(zero_tc) { |sum, duration| sum + duration }

# Span covered by the transcription: from the first caption's in point to the
# last caption's out point. This is a difference (end - start); adding the two
# would overstate the span by twice the first in point.
if tc_count.empty?
  # No captions at all: report a zero-length span instead of crashing.
  tc_total = zero_tc
else
  tc_start = tc_convert(tc_count.first[0])
  tc_end = tc_convert(tc_count.last[1])
  tc_total = tc_end - tc_start
end
# Write the plain-text report next to the CSV: file names, the numbered
# caption lines collected in @to_print, and the word count / duration summary.
File.open("#{filename.split('.')[0]}.text", "wb") do |file|
  separator = ("-" * 90) + "\n"

  file.write("Transcription file names:\n\t #{filename.split('.')[0]}.text\t Sbv Filename : #{filename} \t CSV Filename #{csvfilename} \n")
  # Newline added so this line no longer runs into the reel-name line below.
  file.write("Video file clip name: NA\n")
  file.write("Card Metadata Reel Name: \t Video timecode Metadata Start: #{tc_meta}\n")
  file.write(separator)
  file.write(@to_print.join)
  file.write(separator)
  file.write("CSV file saved: \n\t#{Dir.pwd}\n\t#{csvfilename}\n")
  file.write(separator)
  file.write("Total word Count:\t#{all_the_text.join.split.size} Words\n")
  file.write("FPS: \t\t\t#{transcribed_total_time.fps}fps\n")
  file.write("Total lenght of Transcribed interview: \t\t#{transcribed_total_time} |\t#{transcribed_total_time.hours} Hour, #{transcribed_total_time.minutes} Min, #{transcribed_total_time.seconds} Seconds, #{transcribed_total_time.frames} Frames \t|\t #{transcribed_total_time.to_seconds} Seconds \t|\t Frame count: #{transcribed_total_time.total}\n")
  file.write("Total lenght of Video interview: \t\t#{tc_total} |\t#{tc_total.hours} Hour, #{tc_total.minutes} Min, #{tc_total.seconds} Seconds, #{tc_total.frames} Frames\t|\t #{tc_total.to_seconds} Seconds \t|\tFrame count: #{tc_total.total}\n")
  # Time inside the span that no caption covers; computed once and reused.
  difference = tc_total - transcribed_total_time
  file.write("Non transcribed time in video is: \t\t#{difference} |\t#{difference.hours} Hour, #{difference.minutes} Min, #{difference.seconds} Seconds, #{difference.frames} Frames \t|\t #{difference.to_seconds} Seconds \t|\t Frame count: #{difference.total}\n")
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment