Last active
September 23, 2015 14:50
-
-
Save pietrop/b3229885ff73c0bf941e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
=begin | |
Convert sbv to CSV and Plain Text | |
Converts .sbv caption files generated by YouTube captioning to CSV and plain-text formats.
run from terminal as $ ruby sbv_to_csv.rb Captions.sbv
Timecodes are also converted using the timecode gem, e.g. from 0:09:07.730 to 00:09:07:18.
=end | |
require 'csv' | |
require 'timecode' | |
# One caption entry taken from the sbv file: its position (n), the raw
# in/out timecode strings, and the caption text.
class Line
  attr_accessor :n, :tc_in, :tc_out, :text, :tc_in_converted, :tc_out_converted

  # tc_in / tc_out are the timecode strings as written in the sbv file,
  # text is the caption text, n is the caption's sequence number.
  def initialize(tc_in, tc_out, text, n)
    @tc_in = tc_in
    @tc_out = tc_out
    @text = text
    @n = n
  end

  # Prefixes the timecode string with "0" (sbv hours are a single digit,
  # e.g. 0:09:07.730) and parses it with Timecode at 25 fps.
  # Returns nil when tc is nil.
  def tc_convert(tc)
    return nil if tc.nil?

    padded = "0" + tc
    Timecode.parse(padded, 25)
  end
end
# Parses an sbv-style timecode string (single-digit hours, e.g. 0:09:07.730)
# at 25 fps after prefixing it with "0". Returns nil when tc is nil.
def tc_convert(tc)
  return nil if tc.nil?

  padded = "0" + tc
  Timecode.parse(padded, 25)
end
# Parses a timecode string exactly as given (no "0" prefix added) at 25 fps.
# Returns nil when tc is nil.
def tc_convert_no_zero(tc)
  return nil if tc.nil?

  Timecode.parse(tc, 25)
end
=begin | |
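# Had trouble getting user input here (NOTE(review): possibly because Kernel#gets reads from the file named in ARGV when the script is run with an argument; $stdin.gets may avoid that -- TODO confirm). Ideally the script would prompt for: reel, tc_meta, clip_name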
print "what's the reel name in the metadata?" | |
reel = gets.chomp | |
print "what's TC start for this file in the metadata? \n ie 00:49:04:20 " | |
tc_meta = gets.chomp | |
print "what's filename of the clip? ie CC0027_01.MOV" | |
clip_name = gets.chomp | |
=end | |
# ---------------------------------------------------------------------------
# Script body: read the sbv file, write a CSV with one row per caption
# (including timecodes offset by the clip's metadata start timecode), then
# write a plain-text report with the captions and summary statistics.
# ---------------------------------------------------------------------------

# Input file and the clip's metadata start timecode. Both may be passed on the
# command line (ruby sbv_to_csv.rb Captions.sbv 10:51:57:00); the previously
# hard-coded values are kept as defaults.
filename = ARGV[0] || "Glass_english.sbv"
tc_meta = ARGV[1] || "10:51:57:00"

# Output files share the input's name with the extension swapped.
basename = filename.chomp(File.extname(filename))
csvfilename = "#{basename}.csv"
textfilename = "#{basename}.text"

# File.read closes the file handle for us.
raw = File.read(filename)
puts raw.inspect

# Normalise CRLF / CR line endings so the blank-line split below works on any
# platform, then cut the file into captions: each chunk holds one timecode
# line followed by its text lines.
chunks = raw.gsub(/\r\n?/, "\n").split(/\n{2,}/).map(&:strip).reject(&:empty?)

puts "*" * 10
puts chunks.size
puts "*" * 10

# Stop before creating any output file when there is nothing to convert.
abort "No captions found in #{filename}" if chunks.empty?

# The metadata start timecode is the same for every row, so parse it once.
meta_offset = tc_convert_no_zero(tc_meta)

# for word count
all_the_text = []
# converted [tc_in, tc_out] pair of every caption, for the duration totals
tc_count = []
@to_print = []
@to_print_no_tc = []

# setup to create a csv file with the same name of the sbv file
CSV.open(csvfilename, "wb") do |csv|
  # give header/first row default names
  csv << ["N", "Time Code In", "Time Code Out", "tc_meta", "tc_in_meta", "tc_out_meta",
          "Transcribed Speech", "Speaker Name", "Comments", "Tags"]

  chunks.each.with_index(1) do |chunk, n|
    # First line is "tc_in,tc_out"; every remaining line is caption text.
    tc, *text_lines = chunk.split("\n")
    text = text_lines.map(&:strip).join(" ")
    all_the_text << text

    # divide tc into tc in and tc out
    tc_in, tc_out = tc.split(",").map(&:strip)

    # Line does the conversion from e.g. 0:09:07.730 to 00:09:07:18.
    ln = Line.new(tc_in, tc_out, text, n)
    in_tc = ln.tc_convert(ln.tc_in)
    out_tc = ln.tc_convert(ln.tc_out)
    tc_count << [in_tc, out_tc]

    csv << [n, in_tc, out_tc, tc_meta.upcase, in_tc + meta_offset, out_tc + meta_offset, text]

    # print preview in terminal
    puts "#{n}\t| #{in_tc} | #{out_tc} | #{text}\n"
    @to_print << "#{n}|#{in_tc}|#{out_tc}|\t#{text}\n"
    @to_print_no_tc << "#{text}\n"
  end
end

# print name of csv file saved
puts "-" * 90
puts "CSV file saved: \n\t#{Dir.pwd}\n\t#{csvfilename}"
puts "-" * 90

# Join captions with a space so the last word of one caption and the first
# word of the next are not counted as a single word.
word_count = all_the_text.join(" ").split.size
puts "Total word Count:\t#{word_count} Words"

# Total of transcribed text time (sum of each caption's out - in), which is
# different from the total of the video.
transcribed_total_time = tc_count.reduce(Timecode.parse("00:00:00:00", 25)) do |total, (tc_i, tc_o)|
  total + (tc_o - tc_i)
end

# Span covered by the transcription: from the first caption's in-point to the
# last caption's out-point. A duration is end minus start.
tc_start = tc_count.first.first
tc_end = tc_count.last.last
tc_total = tc_end - tc_start
difference = tc_total - transcribed_total_time

rule = "#{'-' * 90}\n"

File.open(textfilename, "wb") do |file|
  file.write "Transcription file names:\n\t #{textfilename}\t Sbv Filename : #{filename} \t CSV Filename #{csvfilename} \n"
  file.write "Video file clip name: NA\n"
  file.write "Card Metadata Reel Name: \t Video timecode Metadata Start: #{tc_meta}\n"
  file.write rule
  file.write @to_print.join
  file.write rule
  file.write "CSV file saved: \n\t#{Dir.pwd}\n\t#{csvfilename}\n"
  file.write rule
  file.write "Total word Count:\t#{word_count} Words\n"
  file.write "FPS: \t\t\t#{transcribed_total_time.fps}fps\n"
  file.write "Total lenght of Transcribed interview: \t\t#{transcribed_total_time} |\t#{transcribed_total_time.hours} Hour, #{transcribed_total_time.minutes} Min, #{transcribed_total_time.seconds} Seconds, #{transcribed_total_time.frames} Frames \t|\t #{transcribed_total_time.to_seconds} Seconds \t|\t Frame count: #{transcribed_total_time.total}\n"
  file.write "Total lenght of Video interview: \t\t#{tc_total} |\t#{tc_total.hours} Hour, #{tc_total.minutes} Min, #{tc_total.seconds} Seconds, #{tc_total.frames} Frames\t|\t #{tc_total.to_seconds} Seconds \t|\tFrame count: #{tc_total.total}\n"
  file.write "Non transcribed time in video is: \t\t#{difference} |\t#{difference.hours} Hour, #{difference.minutes} Min, #{difference.seconds} Seconds, #{difference.frames} Frames \t|\t #{difference.to_seconds} Seconds \t|\t Frame count: #{difference.total}\n"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment