Created
May 11, 2023 15:06
-
-
Save lankz/ca3c9fbe83d9c915880facd861793c02 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'csv'
require 'fileutils'
require 'tmpdir'

require 'dotenv'
require 'parallel'
require 'ruby/openai'
require 'ruby-progressbar'
# Load environment configuration via dotenv before anything reads ENV.
Dotenv.load

# Every .txt file under training-data/ (at any depth) is one document to embed.
training_data = Dir.glob("training-data/**/*.txt")

# Progress bar sized to the number of input files; it is advanced once per file.
progress = ProgressBar.create(
  format: "%a %e %P% Processed: %c from %C",
  total: training_data.size
)
# The progress bar is advanced from several worker threads at once; it is not
# assumed to be thread-safe, so increments are serialized through this lock.
progress_lock = Mutex.new

# Embed the training files in slices of 100, using 8 threads. Each slice writes
# its rows (embedding, text) to its own temporary CSV file, and the block
# returns that file's path. Parallel.map collects the paths in slice order, so
# no shared array is mutated from multiple threads and the later merge is
# deterministic.
csv_files = Parallel.map(training_data.each_slice(100).to_a, in_threads: 8) do |files|
  # ENV.fetch fails fast with a KeyError when the key is missing, instead of
  # sending unauthenticated requests.
  openai = OpenAI::Client.new(access_token: ENV.fetch('OPENAI_API_KEY'))

  # Create a temporary directory to hold this slice's CSV file. The block form
  # of Dir.mktmpdir is deliberately not used: it would delete the directory as
  # soon as the block returned, but the file is still needed afterwards.
  temp_dir = Dir.mktmpdir
  csv_filename = File.join(temp_dir, "embeddings.csv")

  CSV.open(csv_filename, "w") do |csv|
    files.each do |file|
      progress_lock.synchronize { progress.increment }

      # NOTE(review): String#dump escapes the text and wraps it in quotes, so
      # the escaped form is what gets embedded and stored - confirm intended.
      text = File.read(file).dump

      response = openai.embeddings(
        parameters: {
          model: "text-embedding-ada-002",
          input: text
        }
      )

      # An error payload carries no 'data' entry; report it clearly rather
      # than crashing with NoMethodError on nil.
      data = Array(response['data'])
      if data.empty?
        raise "No embedding returned for #{file}: #{response['error'].inspect}"
      end

      csv << [data.first['embedding'], text]
    end
  end

  csv_filename
end
# csv_files now holds the path of one temporary CSV file per slice.
# Combine them into a single embeddings.csv with a header row.
CSV.open("embeddings.csv", "w") do |csv|
  csv << [:embedding, :text]

  csv_files.each do |csv_file|
    CSV.foreach(csv_file) { |row| csv << row }
  end
end

# Each temporary CSV lives in its own directory created by Dir.mktmpdir, which
# is not removed automatically. Clean up only after the merge has succeeded, so
# a failed merge leaves the (paid-for) per-slice results on disk for recovery.
csv_files.each do |csv_file|
  FileUtils.remove_entry(File.dirname(csv_file))
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment