Last active
June 23, 2024 08:48
-
-
Save pstaender/3733075a538d0c6743363382d7b6f134 to your computer and use it in GitHub Desktop.
Rename your poorly named invoice PDFs to `YYYY-MM-DD_name-of-the-biller_invoice-number.pdf` (the code below asks for the name in German; remove `Please answer in german.` from the prompt if you want English output)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# usage: bundle exec ruby rename_invoice_pdfs.rb folder/*.pdf
require "fileutils"
require "json"
require "open3"
require "shellwords"

require "openai"
# Flip to true to print the planned renames without moving any file.
dry_run = false

# Prompt sent to the model; the extracted PDF text is appended below it,
# delimited by triple quotes.
question = "Please give me a filename for the text delimited by triple quotes. Please answer in german. The filename shoould start with the invoice date in the format year-month-day. Followd by a hyphen and contains the name of biller and the the invoice number. Please return the filename in the json field filename."

# Extracts the plain text of a PDF by piping it through the
# `kadock/pdftotext` Docker image.
#
# The command is spawned without a shell, so the path needs no escaping.
#
# @param file [String] path of the PDF to read
# @return [String, nil] the extracted text, or nil when the file cannot be
#   read, docker cannot be started, or the container exits non-zero
def extract_pdf_text(file)
  output, status = Open3.capture2(
    "docker", "run", "--rm", "-i", "kadock/pdftotext",
    stdin_data: File.binread(file), binmode: true
  )
  return unless status.success?

  # The pipe is read in binary mode; re-tag the bytes as UTF-8 and replace
  # invalid sequences so the text can be embedded in the JSON request.
  output.force_encoding(Encoding::UTF_8).scrub
rescue SystemCallError => e
  warn "#{file}: #{e.message}"
  nil
end

# Pulls the `filename` field out of a chat completion response.
#
# @param response [Hash] response hash as returned by `client.chat`
# @return [String, nil] the suggested filename, or nil when the message is
#   missing, is not valid JSON, or carries no string `filename` field
def suggested_filename(response)
  content = response.dig("choices", 0, "message", "content")
  return if content.nil?

  payload = JSON.parse(content)
  name = payload["filename"] if payload.is_a?(Hash)
  name if name.is_a?(String)
rescue JSON::ParserError
  nil
end

# Normalises a filename suggestion into a lower-case basename without the
# `.pdf` extension.
#
# Directory components are dropped and whitespace runs become underscores.
# Only a trailing `.pdf` is removed, so dots inside an invoice number
# survive.
#
# @param suggestion [String, nil] filename proposed by the model
# @return [String] sanitised basename; empty when nothing usable remains
def invoice_basename(suggestion)
  File.basename(suggestion.to_s.strip)
      .sub(/\.pdf\z/i, "")
      .gsub(/\s+/, "_")
      .downcase
end

# Chooses the path a PDF should be moved to, next to the original file.
#
# @param file [String] current path of the PDF
# @param basename [String] sanitised basename without extension
# @param now [Time] clock used for the collision suffix
# @return [String, nil] a path that does not exist yet, or nil when the
#   file already carries the suggested name
def rename_target(file, basename, now: Time.now)
  dir = File.dirname(file)
  target = File.join(dir, "#{basename}.pdf")
  return if File.expand_path(target) == File.expand_path(file)
  return target unless File.exist?(target)

  stamped = File.join(dir, "#{basename}_#{now.to_i}")
  return "#{stamped}.pdf" unless File.exist?("#{stamped}.pdf")

  # Two collisions within the same second would otherwise share a target and
  # the second move would overwrite the first.
  1.step.lazy.map { |n| "#{stamped}_#{n}.pdf" }.find { |path| !File.exist?(path) }
end

# Asks the model for a descriptive name for every `.pdf` path and renames
# the file accordingly.
#
# The API key is read from the OPENAI_ACCESS_TOKEN environment variable,
# falling back to the placeholder below.
#
# @param paths [Array<String>] candidate paths; entries not ending in `.pdf`
#   are ignored
# @param question [String] prompt placed in front of the PDF text
# @param dry_run [Boolean] when true, only print the planned renames
# @return [void]
def rename_invoices(paths, question:, dry_run: false)
  files = paths.select { |path| path.end_with?(".pdf") }
  return if files.empty?

  client = OpenAI::Client.new(
    access_token: ENV.fetch("OPENAI_ACCESS_TOKEN", "your-api-key"),
    log_errors: true # Useful in development; may leak private data to logs in production.
  )

  files.each do |file|
    text = extract_pdf_text(file)
    next if text.nil? || text.strip.empty?

    response = client.chat(
      parameters: {
        model: "gpt-4-turbo", # Required.
        response_format: { type: "json_object" },
        messages: [{ role: "user", content: "#{question}\n\n\"\"\"#{text}\"\"\"" }] # Required.
      }
    )

    basename = invoice_basename(suggested_filename(response))
    next if basename.empty?

    target = rename_target(file, basename)
    next if target.nil? # already named as suggested

    puts "#{file} -> #{target}"
    FileUtils.mv(file, target) unless dry_run
  end
end

rename_invoices(ARGV, question: question, dry_run: dry_run)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment