Skip to content

Instantly share code, notes, and snippets.

@HoneyLuka
Created July 23, 2020 10:51
Show Gist options
  • Save HoneyLuka/17c180497c0f7d5719a089c836aa38b8 to your computer and use it in GitHub Desktop.
Save HoneyLuka/17c180497c0f7d5719a089c836aa38b8 to your computer and use it in GitHub Desktop.
require 'open-uri'
require 'net/http'
require 'nokogiri'
require 'json'
require 'digest/md5'
# Path of the plain-text file that records URLs to skip; URLs are appended to it
# one per line and looked up again before any request is made.
$ignore_file_name = "ignore_url.cache"
# Accumulates the per-tag hashes gathered by the main script before they are
# serialized to JSON.
$array = []
# Appends +url+ as a new line to the ignore-list file named by $ignore_file_name.
#
# The operation is best-effort: any error while opening or writing the file is
# reported on stdout and otherwise ignored, so callers never see an exception.
#
# url - the URL string to record; it is never modified.
#
# Returns nil.
def append_ignore_url url
  begin
    # Interpolate into a fresh string instead of `url << "\n"`, which would
    # mutate the caller's string (and fail outright on a frozen one).
    # The block form guarantees the handle is closed even if the write raises.
    File.open($ignore_file_name, 'a') { |file| file.write("#{url}\n") }
  rescue StandardError => exception
    puts "html_url:#{url} append to ignore_list failed: #{exception.message}"
  end
  nil
end
# Reports whether +url+ is recorded in the ignore-list file named by
# $ignore_file_name.
#
# The file holds one URL per line, so the lookup compares whole lines rather
# than searching for a substring; a URL that is merely part of an ignored URL
# is not treated as ignored.
#
# url - the URL string to look up.
#
# Returns true when a line equal to +url+ exists; false when the file is
# missing, unreadable, or contains no such line.
def is_ignore_url url
  return false unless File.exist?($ignore_file_name)

  # chomp strips the trailing line terminator before the exact comparison.
  File.foreach($ignore_file_name).any? { |line| line.chomp == url }
rescue StandardError
  false
end
# Looks up the cached HTML for +url+.
#
# The cache entry lives at ./html_cache/<md5 hex digest of url>.html.
#
# url - the URL string whose cached page is wanted.
#
# Returns the file's content as a String, or nil when no cache file exists or
# it cannot be read.
def get_html_cache_from_cache url
  cache_path = "./html_cache/#{Digest::MD5.hexdigest(url)}.html"
  return nil unless File.exist?(cache_path)

  begin
    File.open(cache_path, 'r') { |cache_file| cache_file.read }
  rescue StandardError
    # An unreadable cache entry is treated the same as a missing one.
    nil
  end
end
# Stores +html_content+ in the on-disk cache under a name derived from +url+.
#
# The target is ./html_cache/<md5 hex digest of url>.html; the cache folder is
# created first via create_cache_folder. Saving is best-effort: a failure is
# reported on stdout and otherwise ignored, so callers never see an exception
# from the write itself.
#
# html_content - the page body to store; nothing happens when nil.
# url          - the URL the content was fetched from; nothing happens when nil.
#
# Returns nil.
def save_html_to_cache html_content, url
  return if html_content.nil? || url.nil?

  create_cache_folder
  file_name = "./html_cache/#{Digest::MD5.hexdigest(url)}.html"
  begin
    # Block form closes the handle even when the write raises.
    File.open(file_name, 'w') { |file| file.write(html_content) }
  rescue StandardError => exception
    puts "html_url:#{url} save to cache failed: #{exception.message}"
  end
  nil
end
# Makes sure the 'html_cache' folder exists in the current working directory.
#
# Does nothing when a filesystem entry with that name is already present, and
# quietly ignores any error raised while creating the directory.
def create_cache_folder
  cache_dir = 'html_cache'
  Dir.mkdir(cache_dir) unless File.exist?(cache_dir)
rescue StandardError
  # Creation is best-effort; a failure surfaces later when the cache is written.
  nil
end
# Downloads the page at +url+, trying up to three times.
#
# * URLs already present in the ignore list are skipped without a request.
# * A 200 response is stored through save_html_to_cache and its body returned.
# * A 404 response on ANY attempt (including the last one) adds the URL to the
#   ignore list and stops retrying.
# * Any other response, or an exception raised by the request, counts as a
#   failed attempt and is retried.
#
# url - the URL string to fetch.
#
# Returns the response body as a String, or nil when the URL is ignored,
# missing (404), or still failing after three attempts.
def request_html_content url
  if is_ignore_url(url)
    puts "html_url:#{url} is in ignore_list, ignore"
    return nil
  end

  uri = URI.parse(url)
  3.times do |attempt|
    puts "html_url:#{url} request failed, retry" if attempt > 0

    response = begin
      Net::HTTP.get_response(uri)
    rescue StandardError
      # Network-level errors are treated like any other failed attempt.
      nil
    end

    # Classify the response right away so that a 404 on the final attempt is
    # still recorded instead of falling through to "give up".
    case response
    when Net::HTTPOK
      content = response.body
      save_html_to_cache(content, url)
      return content
    when Net::HTTPNotFound
      puts "html_url:#{url} is 404, append to ignore_list"
      append_ignore_url(url)
      return nil
    end
  end

  puts "html_url:#{url} request failed, give up"
  nil
end
# Builds a Hash describing one tag page.
#
# The HTML is taken from the local cache when available, otherwise fetched via
# request_html_content. From the element <div id="main"> the method collects:
#
# * one entry per table row, keyed by the text of the row's first <th> and
#   holding the text of its last <td> (rows whose header reads "IFD" and rows
#   lacking a <th> or <td> are skipped);
# * a "Description" entry made of the inner HTML of the consecutive <p>
#   elements that follow the child element whose text is exactly
#   "Description", each paragraph terminated by "<br>". The entry is an empty
#   string when no such heading exists.
#
# url - the URL string of the page to parse.
#
# Returns the Hash, or nil when no HTML could be obtained or the page has no
# <div id="main">.
def extract_exif_info_from_url url
  html_content = get_html_cache_from_cache(url) || request_html_content(url)
  if html_content.nil?
    puts "html_url:#{url} catch failed"
    return nil
  end

  doc = Nokogiri::HTML(html_content)
  return nil if doc.nil?

  div = doc.xpath("//div[@id='main']").first
  return nil if div.nil?

  json_dict = {}
  # NOTE(review): a leading "//" makes this XPath search the whole document,
  # not just +div+; ".//table/tr" may have been intended - confirm against the
  # real pages before changing it.
  div.xpath("//table/tr").each do |tr|
    th = tr.xpath("th").first
    td = tr.xpath("td").last
    # Rows without a header or a value cell cannot yield a key/value pair;
    # skip them instead of failing with NoMethodError on nil.
    next if th.nil? || td.nil?

    name = th.text
    next if name == "IFD"

    json_dict[name] = td.text
  end

  name = "Description"
  # Walk the direct children of +div+ until the "Description" heading is found.
  desc_ele = div.first_element_child
  desc_ele = desc_ele.next_element while desc_ele && desc_ele.text != name
  if desc_ele.nil?
    json_dict[name] = ""
    return json_dict
  end

  # Concatenate every <p> that directly follows the heading, normalizing
  # whitespace inside each paragraph.
  desc_value = +""
  desc_p = desc_ele.next_element
  while desc_p && desc_p.name == "p"
    desc_value << desc_p.inner_html.gsub("\r\n", " ").squeeze(" ").strip << "<br>"
    desc_p = desc_p.next_element
  end
  json_dict[name] = desc_value.gsub("<br> ", "<br>").gsub(" <br>", "<br>")
  json_dict
end
#main
# Fetches the reference page of every EXIF tag listed below, collects the
# extracted hashes in $array and writes them to exif.json.
#
# The list contains "SubsecTimeDigitized" twice; it is de-duplicated (by the
# lower-cased name that is used to build the URL) so no page is requested twice
# and no record appears twice in the output.
tags = ["ApertureValue", "BrightnessValue", "ColorSpace", "ComponentsConfiguration", "ExposureTime", "FNumber", "ExposureProgram", "SpectralSensitivity", "ISOSpeed", "ISOSpeedLatitudeyyy", "ISOSpeedLatitudezzz", "ISOSpeedRatings", "OECF", "Version", "DateTimeOriginal", "DateTimeDigitized", "CompressedBitsPerPixel", "ShutterSpeedValue", "ExposureBiasValue", "MaxApertureValue", "SubjectDistance", "MeteringMode", "LightSource", "Flash", "FocalLength", "SubjectArea", "MakerNote", "UserComment", "SubsecTime", "SubsecTimeDigitized", "FlashPixVersion", "PixelXDimension", "PixelYDimension", "RelatedSoundFile", "FlashEnergy", "SpatialFrequencyResponse", "FocalPlaneXResolution", "FocalPlaneYResolution", "FocalPlaneResolutionUnit", "SubjectLocation", "ExposureIndex", "SensingMethod", "FileSource", "SceneType", "CFAPattern", "CustomRendered", "ExposureMode", "WhiteBalance", "DigitalZoomRatio", "FocalLenIn35mmFilm", "SceneCaptureType", "GainControl", "Contrast", "Saturation", "Sharpness", "DeviceSettingDescription", "SubjectDistRange", "ImageUniqueID", "Gamma", "CameraOwnerName", "BodySerialNumber", "LensSpecification", "LensMake", "LensModel", "LensSerialNumber", "SensitivityType", "StandardOutputSensitivity", "RecommendedExposureIndex", "SubsecTimeDigitized", "SubsecTimeOriginal", "offsettime", "OffsetTimeOriginal", "OffsetTimeDigitized", "CompositeImage", "SourceImageNumberOfCompositeImage", "SourceExposureTimesOfCompositeImage"]
tags.uniq { |tag| tag.downcase }.each do |item|
  url = "https://www.awaresystems.be/imaging/tiff/tifftags/privateifd/exif/#{item.downcase}.html"
  json_dict = extract_exif_info_from_url(url)
  # Pause between pages to stay polite towards the remote server.
  puts "sleep 3"
  sleep 3
  next if json_dict.nil? || json_dict.empty?

  $array << json_dict
end

result_file_name = "exif.json"
# Any previous result is removed up front, even if this run produced nothing.
if File.exist?(result_file_name)
  puts "result file exist, delete"
  File.delete(result_file_name)
end

if $array.empty?
  puts "array is empty ignore"
else
  # The "<br>" markers inserted while extracting descriptions become JSON
  # newline escapes (a backslash followed by "n") in the serialized text.
  json = JSON.pretty_generate($array).gsub("<br>", "\\n")
  # Block form closes the file even if the write raises.
  File.open(result_file_name, 'w') { |file| file.write(json) }
  puts "result file generate success"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment