These scripts were written to prepare the FTS database for use in Otacon507's Chinese Example sentences plugin: https://github.com/otacon507/zh-examples
The database importer will hopefully be rewritten in Python and integrated into the above repo so end users can easily create their own databases. I originally used Dr. Xian Qian's 2MParallelCorpus (https://github.com/qxred/2MParallelCorpus). Note that although this data is generally of high quality, it does contain some non-English/non-Chinese sentences (an estimated 3,000+, found using langid), roughly 900+ HTML fragments, and an unknown (but probably small) number of nonsense sentence fragments.
Regarding the nonsense sentence fragments: we partially avoided this issue in the plugin by sorting results by the distance of each sentence's length from the average sentence length of the result set. This is accomplished with two queries:
# Ruby sketch using the sqlite3 gem (the database path is a placeholder):
require 'sqlite3'

db  = SQLite3::Database.new('examples.db')
str = '你好'

# First query: average length of the sentences matching the search term.
avg_len = db.get_first_value(
  'SELECT AVG(LENGTH(zh)) FROM example_sentences WHERE zh MATCH ?', str)

# Second query: the matches themselves, ordered by distance from that average.
results = db.execute(
  'SELECT * FROM example_sentences WHERE zh MATCH ?
   ORDER BY ABS(? - LENGTH(zh))', [str, avg_len])
In the plugin, these results are then further processed into HTML lists and returned to the user's flashcard.
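The exact markup is handled inside the plugin, but the idea is roughly the following (an illustrative sketch only, assuming each result row comes back as an (english, chinese) pair; the plugin's actual field order and formatting may differ):

# Illustrative only: wrap each matched pair in a simple HTML list item.
items = results.map { |en, zh| "<li>#{zh}<br>#{en}</li>" }
html  = "<ul>#{items.join}</ul>"   # real code should also HTML-escape the text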
When Otacon507 first approached me with the idea, I wrote a simple prototype for the plugin that scraped data from dict.cn: https://gist.github.com/Takemikazuchi/9907875
There were several issues with that approach: (1) dependency on an external data source with no guarantee of continued existence; (2) slow response times (an entire HTML page has to be downloaded to get a couple of sentences); and (3) many characters/words did not have any sentences at all.
At the end of the day, relying on HTML scraping is too brittle to distribute in a plugin for end users. After extensive research, and after learning of the bilingual corpora used in linguistics research, we shifted focus to acquiring high-quality corpora with permissive licenses to distribute with the plugin. Although this comes with the space penalty of large corpus files that need to be downloaded, we believe the speed and flexibility it affords us in processing the data is well worth it. We also considered the Glosbe API, but unfortunately most of its Chinese data comes from sources like the UN/EU corpora or from extremely technical documents not suitable for educational use.
The corpora approach has issues of its own, though: many of the high-quality corpora are behind paywalls or restricted to academic use, and conversely, finding high-quality free corpora with conversational sentences is extremely difficult.
Step 1:
Prepare a CSV file with the following two-column format: english,chinese
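For example (illustrative rows; English sentences containing commas must be quoted per normal CSV rules):

How are you?,你好吗？
"Yes, I understand.",是的，我明白。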
Step 2:
Create an FTS4 virtual table in SQLite (http://www.sqlite.org/fts3.html)
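A minimal sketch of the table definition, assuming the example_sentences table and zh column referenced by the plugin's queries above (en is an assumed name for the English column, and the database path is a placeholder):

require 'sqlite3'

db = SQLite3::Database.new('examples.db')
# FTS4 virtual table: both columns are full-text indexed; the plugin searches zh.
db.execute('CREATE VIRTUAL TABLE IF NOT EXISTS example_sentences USING fts4(en, zh)')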
Step 3:
Import the data from the CSV file into the virtual table. The import is done in batches of 500, is multithreaded, and is wrapped in a single transaction due to the size of the data. (The threads are Ruby's green threads, but the import still benefits from context switching during wait-heavy operations.) A minimal sketch of this step follows.
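This sketch assumes the table from Step 2 and a reasonably recent Ruby; the file names and the reader-thread/writer split are illustrative rather than the exact original script:

require 'csv'
require 'sqlite3'

BATCH_SIZE = 500
queue = SizedQueue.new(10)                   # bounded hand-off between threads

# Reader thread: parse the CSV and enqueue rows in batches of BATCH_SIZE.
reader = Thread.new do
  batch = []
  CSV.foreach('sentences.csv') do |en, zh|   # placeholder file name
    batch << [en, zh]
    if batch.size == BATCH_SIZE
      queue << batch
      batch = []
    end
  end
  queue << batch unless batch.empty?
  queue << :done                             # sentinel: no more batches
end

# Main thread: insert every batch inside a single enclosing transaction.
db = SQLite3::Database.new('examples.db')    # placeholder path
db.transaction do
  insert = db.prepare('INSERT INTO example_sentences (en, zh) VALUES (?, ?)')
  while (batch = queue.pop) != :done
    batch.each { |en, zh| insert.execute(en, zh) }
  end
  insert.close
end
reader.join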