pietrop · September 23, 2015 14:50
diff --git a/sbv_to_csv.rb b/sbv_to_csv.rb
 =begin 
 Convert sbv to CSV and Plain Text
 To convert sbv files generated using youtube captioning to CSV and plain text format.

 run from terminal as $ ruby sbv_to_csv.rb Captions.sbv
 Converts an sbv file from YouTube into a CSV file, also converting the timecodes with the timecode gem, e.g. from 0:09:07.730 to 00:09:07:18
 =end

 require 'csv'
 require 'timecode'


 # One caption entry taken from an .sbv file.
 #
 # Attributes:
 #   n      - sequence number of the caption
 #   tc_in  - in-point as written in the .sbv file (e.g. "0:09:07.730")
 #   tc_out - out-point as written in the .sbv file
 #   text   - caption text
 #   tc_in_converted / tc_out_converted - not set by this class; available
 #     for callers that want to cache the converted timecodes.
 class Line
   attr_accessor :n, :tc_in, :tc_out, :text, :tc_in_converted, :tc_out_converted

   def initialize(tc_in, tc_out, text, n)
     @tc_in = tc_in
     @tc_out = tc_out
     @text = text
     @n = n
   end

   # Converts an .sbv timestamp (H:MM:SS.mmm) into a 25 fps Timecode.
   #
   # tc - timestamp String, or nil.
   #
   # Returns whatever Timecode.parse returns, or nil when tc is nil.
   def tc_convert(tc)
     return if tc.nil?

     # Pad the hour to two digits only when it is a single digit, so that
     # "0:09:07.730" becomes "00:09:07.730" while "10:09:07.730" is left alone.
     # NOTE(review): assumes Timecode.parse wants a two-digit hour field - confirm against the gem docs.
     Timecode.parse(tc.strip.sub(/\A(\d):/, '0\1:'), 25)
   end
 end


 # Converts an .sbv timestamp (H:MM:SS.mmm) into a 25 fps Timecode.
 #
 # tc - timestamp String, or nil.
 #
 # Returns whatever Timecode.parse returns, or nil when tc is nil.
 def tc_convert(tc)
   return if tc.nil?

   # Pad the hour to two digits only when it is a single digit, so that
   # "0:09:07.730" becomes "00:09:07.730" while "10:09:07.730" is left alone.
   # NOTE(review): assumes Timecode.parse wants a two-digit hour field - confirm against the gem docs.
   Timecode.parse(tc.strip.sub(/\A(\d):/, '0\1:'), 25)
 end

    # Parses a timecode string exactly as given (no zero is prepended) at 25 fps.
    #
    # tc - timecode String, or nil.
    #
    # Returns the result of Timecode.parse, or nil when tc is nil.
    def tc_convert_no_zero(tc)
      return nil if tc.nil?

      Timecode.parse(tc, 25)
    end
 =begin
 #troubles in getting user input, but ideally would like to prompt for: reel, tc_meta, clip_name

 print "what's the reel name in the metadata?"
 reel = gets.chomp

 print "what's TC start for this file in the metadata? \n ie 00:49:04:20 "
 tc_meta = gets.chomp
 print "what's filename of the clip? ie CC0027_01.MOV"
 clip_name = gets.chomp
 =end

 # puts "please put the filename, reel, tc_meta, and clip name"
 # Main script: reads a YouTube .sbv caption file, writes a CSV with one row
 # per caption (timecodes converted at 25 fps) and a plain-text report holding
 # the transcript plus word-count and timing totals.
 #
 # Usage: ruby sbv_to_csv.rb [captions.sbv] [metadata_start_timecode]
 # Both arguments are optional and fall back to the defaults below.
 filename = ARGV[0] || "Glass_english.sbv"

 # Start timecode of the clip taken from the camera metadata; it is added to
 # each caption timecode to fill the tc_in_meta / tc_out_meta CSV columns.
 tc_meta = ARGV[1] || "10:51:57:00"

 # Output files share the input's name minus its extension. Only the final
 # extension is stripped so names or paths containing other dots survive.
 extension = File.extname(filename)
 basename = extension.empty? ? filename : filename[0...-extension.length]
 csvfilename = "#{basename}.csv"
 textfilename = "#{basename}.text"

 # File.read opens, reads and closes the file in one call.
 contents = File.read(filename)
 puts contents.inspect

 # Normalise CRLF / CR line endings to LF so files from any platform split the
 # same way; caption blocks are separated by one or more blank lines.
 blocks = contents.gsub(/\r\n?/, "\n").split(/\n{2,}/).map(&:strip).reject(&:empty?)
 puts "*" * 10
 puts blocks.size
 puts "*" * 10

 # Turn every block into a Line. The first line of a block is "tc_in,tc_out";
 # every remaining line is caption text and is joined with single spaces so
 # words on neighbouring lines do not run together.
 entries = []
 blocks.each do |block|
   tc, *text_lines = block.split("\n")
   tc_in, tc_out = tc.split(',').map(&:strip)
   if tc_in.to_s.empty? || tc_out.to_s.empty?
     warn "Skipping caption block without an in,out timecode line: #{block.inspect}"
     next
   end

   ln = Line.new(tc_in, tc_out, text_lines.map(&:strip).join(" "), entries.size + 1)
   # Convert once and cache on the Line; the values are reused for the CSV,
   # the preview, the report and the totals.
   ln.tc_in_converted = ln.tc_convert(ln.tc_in)
   ln.tc_out_converted = ln.tc_convert(ln.tc_out)
   entries << ln
 end

 # Nothing to report on: stop before creating any output file.
 abort "No captions found in #{filename}" if entries.empty?

 @to_print = []
 @to_print_no_tc = []
 meta_offset = tc_convert_no_zero(tc_meta)

 # Write the CSV (same name as the sbv file) and echo a preview to the terminal.
 CSV.open(csvfilename, "wb") do |csv|
   # Header row; the last three columns are left empty for manual annotation.
   csv << [ "N", "Time Code In", "Time Code Out","tc_meta", "tc_in_meta", "tc_out_meta","Transcribed Speech", "Speaker Name", "Comments", "Tags" ]

   entries.each do |ln|
     csv << [ ln.n, ln.tc_in_converted, ln.tc_out_converted, tc_meta.upcase, ln.tc_in_converted + meta_offset, ln.tc_out_converted + meta_offset, ln.text ]

     puts "#{ln.n}\t|  #{ln.tc_in_converted}  |  #{ln.tc_out_converted}  |   #{ln.text}\n"

     @to_print << "#{ln.n}|#{ln.tc_in_converted}|#{ln.tc_out_converted}|\t#{ln.text}\n"
     @to_print_no_tc << "#{ln.text}\n"
   end
 end

 puts "-" * 90
 puts "CSV file saved: \n\t#{Dir.pwd}\n\t#{csvfilename}"
 puts "-" * 90

 # Word count over all captions; joining with a space keeps the last word of
 # one caption separate from the first word of the next.
 word_count = entries.map(&:text).join(" ").split.size
 puts "Total word Count:\t#{word_count} Words"

 # Total transcribed time: the sum of every caption's own duration.
 transcribed_total_time = Timecode.parse("00:00:00:00", 25)
 entries.each do |ln|
   transcribed_total_time += ln.tc_out_converted - ln.tc_in_converted
 end

 # Span covered by the captions: from the first in-point to the last out-point.
 tc_start = entries.first.tc_in_converted
 tc_end = entries.last.tc_out_converted
 tc_total = tc_end - tc_start

 # Time inside that span that has no caption.
 difference = tc_total - transcribed_total_time

 rule = ("-" * 90) + "\n"
 File.open(textfilename, "wb") do |file|
   file.write "Transcription file names:\n\t #{textfilename}\t Sbv Filename : #{filename} \t CSV Filename #{csvfilename} \n"
   file.write "Video file clip name: NA\n"
   file.write "Card Metadata Reel Name: \t Video timecode Metadata Start: #{tc_meta}\n"
   file.write rule
   file.write @to_print.join
   file.write rule
   file.write "CSV file saved: \n\t#{Dir.pwd}\n\t#{csvfilename}\n"
   file.write rule
   file.write "Total word Count:\t#{word_count} Words\n"
   file.write "FPS: \t\t\t#{transcribed_total_time.fps}fps\n"
   file.write "Total lenght of Transcribed interview: \t\t#{transcribed_total_time} |\t#{transcribed_total_time.hours} Hour, #{transcribed_total_time.minutes} Min, #{transcribed_total_time.seconds} Seconds, #{transcribed_total_time.frames} Frames \t|\t  #{transcribed_total_time.to_seconds} Seconds \t|\t Frame count: #{transcribed_total_time.total}\n"
   file.write "Total lenght of Video interview: \t\t#{tc_total} |\t#{tc_total.hours} Hour,  #{tc_total.minutes} Min, #{tc_total.seconds} Seconds, #{tc_total.frames} Frames\t|\t #{tc_total.to_seconds} Seconds \t|\tFrame count: #{tc_total.total}\n"
   file.write "Non transcribed time in video is: \t\t#{difference} |\t#{difference.hours} Hour, #{difference.minutes} Min, #{difference.seconds} Seconds, #{difference.frames} Frames \t|\t  #{difference.to_seconds} Seconds \t|\t Frame count: #{difference.total}\n"
 end
	=begin
	Convert sbv to CSV and Plain Text
	To convert sbv files generated using youtube captioning to CSV and plain text format.

	run from terminal as $ ruby sbv_to_csv.rb Captions.sbv
	Converts an sbv file from YouTube into a CSV file, also converting the timecodes with the timecode gem, e.g. from 0:09:07.730 to 00:09:07:18
	=end

	require 'csv'
	require 'timecode'


	# One caption entry taken from an .sbv file.
	#
	# Attributes:
	#   n      - sequence number of the caption
	#   tc_in  - in-point as written in the .sbv file (e.g. "0:09:07.730")
	#   tc_out - out-point as written in the .sbv file
	#   text   - caption text
	#   tc_in_converted / tc_out_converted - not set by this class; available
	#     for callers that want to cache the converted timecodes.
	class Line
	  attr_accessor :n, :tc_in, :tc_out, :text, :tc_in_converted, :tc_out_converted

	  def initialize(tc_in, tc_out, text, n)
	    @tc_in = tc_in
	    @tc_out = tc_out
	    @text = text
	    @n = n
	  end

	  # Converts an .sbv timestamp (H:MM:SS.mmm) into a 25 fps Timecode.
	  #
	  # tc - timestamp String, or nil.
	  #
	  # Returns whatever Timecode.parse returns, or nil when tc is nil.
	  def tc_convert(tc)
	    return if tc.nil?

	    # Pad the hour to two digits only when it is a single digit, so that
	    # "0:09:07.730" becomes "00:09:07.730" while "10:09:07.730" is left alone.
	    # NOTE(review): assumes Timecode.parse wants a two-digit hour field - confirm against the gem docs.
	    Timecode.parse(tc.strip.sub(/\A(\d):/, '0\1:'), 25)
	  end
	end


	# Converts an .sbv timestamp (H:MM:SS.mmm) into a 25 fps Timecode.
	#
	# tc - timestamp String, or nil.
	#
	# Returns whatever Timecode.parse returns, or nil when tc is nil.
	def tc_convert(tc)
	  return if tc.nil?

	  # Pad the hour to two digits only when it is a single digit, so that
	  # "0:09:07.730" becomes "00:09:07.730" while "10:09:07.730" is left alone.
	  # NOTE(review): assumes Timecode.parse wants a two-digit hour field - confirm against the gem docs.
	  Timecode.parse(tc.strip.sub(/\A(\d):/, '0\1:'), 25)
	end

	# Parses a timecode string exactly as given (no zero is prepended) at 25 fps.
	#
	# tc - timecode String, or nil.
	#
	# Returns the result of Timecode.parse, or nil when tc is nil.
	def tc_convert_no_zero(tc)
	  return nil if tc.nil?

	  Timecode.parse(tc, 25)
	end
	=begin
	#troubles in getting user input, but ideally would like to prompt for: reel, tc_meta, clip_name

	print "what's the reel name in the metadata?"
	reel = gets.chomp

	print "what's TC start for this file in the metadata? \n ie 00:49:04:20 "
	tc_meta = gets.chomp
	print "what's filename of the clip? ie CC0027_01.MOV"
	clip_name = gets.chomp
	=end

	# puts "please put the filename, reel, tc_meta, and clip name"
	# Main script: reads a YouTube .sbv caption file, writes a CSV with one row
	# per caption (timecodes converted at 25 fps) and a plain-text report holding
	# the transcript plus word-count and timing totals.
	#
	# Usage: ruby sbv_to_csv.rb [captions.sbv] [metadata_start_timecode]
	# Both arguments are optional and fall back to the defaults below.
	filename = ARGV[0] || "Glass_english.sbv"

	# Start timecode of the clip taken from the camera metadata; it is added to
	# each caption timecode to fill the tc_in_meta / tc_out_meta CSV columns.
	tc_meta = ARGV[1] || "10:51:57:00"

	# Output files share the input's name minus its extension. Only the final
	# extension is stripped so names or paths containing other dots survive.
	extension = File.extname(filename)
	basename = extension.empty? ? filename : filename[0...-extension.length]
	csvfilename = "#{basename}.csv"
	textfilename = "#{basename}.text"

	# File.read opens, reads and closes the file in one call.
	contents = File.read(filename)
	puts contents.inspect

	# Normalise CRLF / CR line endings to LF so files from any platform split the
	# same way; caption blocks are separated by one or more blank lines.
	blocks = contents.gsub(/\r\n?/, "\n").split(/\n{2,}/).map(&:strip).reject(&:empty?)
	puts "*" * 10
	puts blocks.size
	puts "*" * 10

	# Turn every block into a Line. The first line of a block is "tc_in,tc_out";
	# every remaining line is caption text and is joined with single spaces so
	# words on neighbouring lines do not run together.
	entries = []
	blocks.each do |block|
	  tc, *text_lines = block.split("\n")
	  tc_in, tc_out = tc.split(',').map(&:strip)
	  if tc_in.to_s.empty? || tc_out.to_s.empty?
	    warn "Skipping caption block without an in,out timecode line: #{block.inspect}"
	    next
	  end

	  ln = Line.new(tc_in, tc_out, text_lines.map(&:strip).join(" "), entries.size + 1)
	  # Convert once and cache on the Line; the values are reused for the CSV,
	  # the preview, the report and the totals.
	  ln.tc_in_converted = ln.tc_convert(ln.tc_in)
	  ln.tc_out_converted = ln.tc_convert(ln.tc_out)
	  entries << ln
	end

	# Nothing to report on: stop before creating any output file.
	abort "No captions found in #{filename}" if entries.empty?

	@to_print = []
	@to_print_no_tc = []
	meta_offset = tc_convert_no_zero(tc_meta)

	# Write the CSV (same name as the sbv file) and echo a preview to the terminal.
	CSV.open(csvfilename, "wb") do |csv|
	  # Header row; the last three columns are left empty for manual annotation.
	  csv << [ "N", "Time Code In", "Time Code Out","tc_meta", "tc_in_meta", "tc_out_meta","Transcribed Speech", "Speaker Name", "Comments", "Tags" ]

	  entries.each do |ln|
	    csv << [ ln.n, ln.tc_in_converted, ln.tc_out_converted, tc_meta.upcase, ln.tc_in_converted + meta_offset, ln.tc_out_converted + meta_offset, ln.text ]

	    puts "#{ln.n}\t|  #{ln.tc_in_converted}  |  #{ln.tc_out_converted}  |   #{ln.text}\n"

	    @to_print << "#{ln.n}|#{ln.tc_in_converted}|#{ln.tc_out_converted}|\t#{ln.text}\n"
	    @to_print_no_tc << "#{ln.text}\n"
	  end
	end

	puts "-" * 90
	puts "CSV file saved: \n\t#{Dir.pwd}\n\t#{csvfilename}"
	puts "-" * 90

	# Word count over all captions; joining with a space keeps the last word of
	# one caption separate from the first word of the next.
	word_count = entries.map(&:text).join(" ").split.size
	puts "Total word Count:\t#{word_count} Words"

	# Total transcribed time: the sum of every caption's own duration.
	transcribed_total_time = Timecode.parse("00:00:00:00", 25)
	entries.each do |ln|
	  transcribed_total_time += ln.tc_out_converted - ln.tc_in_converted
	end

	# Span covered by the captions: from the first in-point to the last out-point.
	tc_start = entries.first.tc_in_converted
	tc_end = entries.last.tc_out_converted
	tc_total = tc_end - tc_start

	# Time inside that span that has no caption.
	difference = tc_total - transcribed_total_time

	rule = ("-" * 90) + "\n"
	File.open(textfilename, "wb") do |file|
	  file.write "Transcription file names:\n\t #{textfilename}\t Sbv Filename : #{filename} \t CSV Filename #{csvfilename} \n"
	  file.write "Video file clip name: NA\n"
	  file.write "Card Metadata Reel Name: \t Video timecode Metadata Start: #{tc_meta}\n"
	  file.write rule
	  file.write @to_print.join
	  file.write rule
	  file.write "CSV file saved: \n\t#{Dir.pwd}\n\t#{csvfilename}\n"
	  file.write rule
	  file.write "Total word Count:\t#{word_count} Words\n"
	  file.write "FPS: \t\t\t#{transcribed_total_time.fps}fps\n"
	  file.write "Total lenght of Transcribed interview: \t\t#{transcribed_total_time} |\t#{transcribed_total_time.hours} Hour, #{transcribed_total_time.minutes} Min, #{transcribed_total_time.seconds} Seconds, #{transcribed_total_time.frames} Frames \t|\t  #{transcribed_total_time.to_seconds} Seconds \t|\t Frame count: #{transcribed_total_time.total}\n"
	  file.write "Total lenght of Video interview: \t\t#{tc_total} |\t#{tc_total.hours} Hour,  #{tc_total.minutes} Min, #{tc_total.seconds} Seconds, #{tc_total.frames} Frames\t|\t #{tc_total.to_seconds} Seconds \t|\tFrame count: #{tc_total.total}\n"
	  file.write "Non transcribed time in video is: \t\t#{difference} |\t#{difference.hours} Hour, #{difference.minutes} Min, #{difference.seconds} Seconds, #{difference.frames} Frames \t|\t  #{difference.to_seconds} Seconds \t|\t Frame count: #{difference.total}\n"
	end