Created
August 11, 2018 16:35
-
-
Save missingno15/761a6c78a25148d1e5cf46f6809f3e46 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "csv"
require "base64"
require "typhoeus" # HTTP toolkit that is backed by cURL
require "pry" # debugger
require "json" # JSON parser that comes with standard Ruby library
require "oj" # JSON parser but uses C extensions
# Collected entries; this array is what gets serialized to keywords.json at the
# end of the script. Each entry is intended to have this shape (key names match
# the `headers` list below):
#
#   {
#     "Chapter Number"    => 1,
#     "Chapter Title"     => "Basic Theory",
#     "Section Number"    => 1.1,
#     "Section Title"     => "Basic Theory",
#     "Topic"             => "Discrete mathematics",
#     "Importance Rating" => 3, # scale of 1-3 where 3 is most important
#     "Keyword count"     => 2, # presumably the size of "Keywords" -- confirm
#     "Keywords"          => ["First keyword", "Second keyword"]
#   }
keywords = []

# Column names, in the order they are listed here.
headers = [
  "Chapter Number",
  "Chapter Title",
  "Section Title",
  "Section Number",
  "Topic",
  "Importance Rating",
  "Keyword count",
  "Keywords"
]
# Endpoint of the Cloud Vision `images:annotate` REST method.
GOOGLE_VISION = "https://vision.googleapis.com/v1/images:annotate".freeze

# API key that is appended to the request URL. Supply the real key through the
# GOOGLE_VISION_API_KEY environment variable so it never has to be committed;
# when the variable is unset the placeholder below is used, as before.
API_KEY = ENV.fetch(
  "GOOGLE_VISION_API_KEY",
  "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
).freeze
# Get all the images that I took of the section keywords.
# The directory is resolved once, relative to this script, and used both for
# listing and for building the absolute paths, so the script no longer depends
# on the current working directory.
image_dir = File.expand_path("images", __dir__)

images = Dir.children(image_dir)
            .reject { |name| /DS_Store/.match?(name) } # skip macOS Finder metadata
            .sort                                      # Dir.children order is OS-dependent
            .map { |name| File.join(image_dir, name) }
# Go through each image and run Google's OCR through it
images.each do |image|
  # Prepare a POST body payload to send to Google.
  # The file is read in binary mode and encoded without embedded line feeds so
  # the image bytes reach the API unaltered.
  payload = {
    "requests" => [{
      "image" => {
        "content" => Base64.strict_encode64(File.binread(image))
      },
      # `features` is a list of feature objects in the API, not a single object
      "features" => [{
        "type" => "TEXT_DETECTION"
      }]
    }]
  }

  # Send the request; the `fields` mask in the URL limits the reply to
  # responses/fullTextAnnotation/text.
  http_response = Typhoeus.post(
    "#{GOOGLE_VISION}?fields=responses%2FfullTextAnnotation%2Ftext&key=#{API_KEY}",
    body: JSON.dump(payload),
    headers: { "Content-Type" => "application/json" }
  )

  # Transform the reply into a Ruby Hash/Map
  response = Oj.load(http_response.body)

  # Pull out results (nil when the reply carries no text annotation)
  text = response.dig("responses", 0, "fullTextAnnotation", "text")

  if text
    # Clean up output so it's more manageable: keep only letters, parentheses
    # and whitespace on each line, trim, capitalize, and drop lines that end up
    # empty (e.g. lines that were only digits or punctuation).
    words = text
      .split("\n")
      .map { |keyword| keyword.gsub(/[^A-Za-z()\s]/, "").strip.capitalize }
      .reject(&:empty?)

    # Interactive breakpoint for inspecting `words`.
    # NOTE(review): nothing here appends `words` to `keywords`; presumably that
    # is done by hand from this pry session -- confirm.
    binding.pry
  end

  # Breakpoint reached for every image, whether or not text was found.
  binding.pry
end
# Persist everything collected in `keywords` as pretty-printed JSON. The path is
# relative to the current working directory.
File.write("keywords.json", JSON.pretty_generate(keywords))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment