Created
July 23, 2020 10:51
-
-
Save HoneyLuka/17c180497c0f7d5719a089c836aa38b8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri' | |
require 'net/http' | |
require 'nokogiri' | |
require 'json' | |
require 'digest/md5' | |
# Path of the plain-text file listing URLs that should not be requested again
# (one URL per line).
$ignore_file_name = "ignore_url.cache"
# Collects one hash of extracted fields per page that was parsed successfully.
$array = []
# Appends +url+ as a new line to the ignore list stored at $ignore_file_name.
#
# The caller's string is never modified, so it may be frozen or reused after
# the call. A failed write is reported on stderr and otherwise ignored, so it
# never aborts the caller.
#
# @param url [String] URL that should be skipped on later requests
# @return [nil]
def append_ignore_url(url)
  # The block form closes the file even when the write raises.
  File.open($ignore_file_name, 'a') { |file| file.write("#{url}\n") }
  nil
rescue StandardError => e
  warn "could not append #{url} to #{$ignore_file_name}: #{e.message}"
  nil
end
# Tells whether +url+ is recorded in the ignore list at $ignore_file_name.
#
# Entries are compared line by line for exact equality, so a URL that is merely
# a prefix (or any other substring) of a recorded URL is not treated as ignored.
#
# @param url [String] URL to look up
# @return [Boolean] true when a line of the ignore file equals +url+; false when
#   the file is missing, cannot be read, or holds no such line
def is_ignore_url(url)
  return false unless File.exist?($ignore_file_name)

  # File.foreach opens and closes the file itself and stops at the first match.
  File.foreach($ignore_file_name).any? { |line| line.chomp == url }
rescue StandardError
  false
end
# Looks up the cached HTML for +url+.
#
# Cache entries live in ./html_cache and are named after the MD5 hex digest of
# the URL, with an ".html" extension.
#
# @param url [String] URL whose cached page is wanted
# @return [String, nil] the cached file content, or nil when there is no cache
#   entry or it cannot be read
def get_html_cache_from_cache(url)
  file_name = "./html_cache/#{Digest::MD5.hexdigest(url)}.html"
  return nil unless File.exist?(file_name)

  begin
    # File.read opens, reads and closes in one step, so no handle can leak.
    File.read(file_name)
  rescue StandardError
    nil
  end
end
# Stores +html_content+ in the on-disk cache under a name derived from +url+.
#
# The file is written to ./html_cache/<MD5 hex digest of url>.html after
# create_cache_folder has been called. Nothing happens when either argument is
# nil. A failed write is reported on stderr and otherwise ignored.
#
# @param html_content [String, nil] page body to store
# @param url [String, nil] URL the body was fetched from
# @return [nil]
def save_html_to_cache(html_content, url)
  return if html_content.nil? || url.nil?

  create_cache_folder
  file_name = "./html_cache/#{Digest::MD5.hexdigest(url)}.html"
  begin
    # The block form closes the file even when the write raises.
    File.open(file_name, 'w') { |file| file.write(html_content) }
  rescue StandardError => e
    warn "could not cache #{url} at #{file_name}: #{e.message}"
  end
  nil
end
# Makes sure the "html_cache" directory exists in the current working directory.
#
# Does nothing when the directory is already there. A failure to create it
# (for example because a regular file with that name exists) is reported on
# stderr and otherwise ignored.
#
# @return [nil]
def create_cache_folder
  folder_name = 'html_cache'
  # Dir.exist? is false for a regular file of the same name, so that case is
  # reported by the rescue below instead of passing as "already present".
  return if Dir.exist?(folder_name)

  Dir.mkdir(folder_name)
  nil
rescue StandardError => e
  warn "could not create #{folder_name}: #{e.message}"
  nil
end
# Downloads the page at +url+, trying up to three times.
#
# URLs found in the ignore list (is_ignore_url) are skipped without a request.
# A 200 response is stored through save_html_to_cache and its body returned.
# A 404 response on any attempt, including the last one, adds the URL to the
# ignore list through append_ignore_url and ends the attempts. Any other
# response, or an exception raised by the request, counts as a failed attempt.
#
# @param url [String] absolute URL of the page
# @return [String, nil] response body on success, otherwise nil
def request_html_content(url)
  if is_ignore_url(url)
    puts "html_url:#{url} is in ignore_list, ignore"
    return nil
  end

  uri = URI.parse(url)
  max_attempts = 3
  max_attempts.times do |attempt|
    puts "html_url:#{url} request failed, retry" if attempt > 0

    response = begin
      Net::HTTP.get_response(uri)
    rescue StandardError
      nil
    end

    # Inspect the response right away so the outcome of the final attempt is
    # handled exactly like the outcome of the earlier ones.
    case response
    when Net::HTTPOK
      content = response.body
      save_html_to_cache(content, url)
      return content
    when Net::HTTPNotFound
      puts "html_url:#{url} is 404, append to ignore_list"
      append_ignore_url(url)
      return nil
    end
  end

  puts "html_url:#{url} request failed, give up"
  nil
end
# Builds a hash of fields from the page at +url+.
#
# The HTML comes from get_html_cache_from_cache when available, otherwise from
# request_html_content. Inside the element with id "main", every table row
# contributes one entry: the text of its first <th> is the key and the text of
# its last <td> is the value (rows whose header is "IFD" are left out). The
# "Description" entry holds the inner HTML of the consecutive <p> elements that
# follow the child element whose text is "Description", each paragraph
# terminated by a literal "<br>"; it is an empty string when no such element
# exists.
#
# @param url [String] URL of the page to parse
# @return [Hash{String => String}, nil] extracted fields, or nil when the HTML
#   could not be obtained or has no div with id "main"
def extract_exif_info_from_url(url)
  html_content = get_html_cache_from_cache(url) || request_html_content(url)
  if html_content.nil?
    puts "html_url:#{url} catch failed"
    return nil
  end

  doc = Nokogiri::HTML(html_content)
  return nil if doc.nil?

  div = doc.xpath("//div[@id='main']").first
  return nil if div.nil?

  json_dict = {}
  # The leading "." keeps the search inside div#main; a bare "//" would start
  # again from the document root.
  div.xpath(".//table/tr").each do |tr|
    th = tr.xpath("th").first
    td = tr.xpath("td").last
    # Rows lacking a header or a value cell carry no key/value pair.
    next if th.nil? || td.nil?

    name = th.text
    next if name == "IFD"

    json_dict[name] = td.text
  end

  # Walk the direct children of div#main until the "Description" heading.
  heading = div.first_element_child
  heading = heading.next_element while heading && heading.text != "Description"

  # Collect the run of <p> siblings that directly follows the heading.
  paragraphs = []
  node = heading&.next_element
  while node && node.name == "p"
    paragraphs << node.inner_html.gsub("\r\n", " ").squeeze(" ").strip
    node = node.next_element
  end

  description = paragraphs.map { |paragraph| "#{paragraph}<br>" }.join
  json_dict["Description"] = description.gsub("<br> ", "<br>").gsub(" <br>", "<br>")
  json_dict
end
#main
# Fetches the reference page of every Exif tag listed below, collects the
# extracted fields in $array and writes them to exif.json as pretty-printed JSON.
tags = %w[
  ApertureValue BrightnessValue ColorSpace ComponentsConfiguration ExposureTime
  FNumber ExposureProgram SpectralSensitivity ISOSpeed ISOSpeedLatitudeyyy
  ISOSpeedLatitudezzz ISOSpeedRatings OECF Version DateTimeOriginal
  DateTimeDigitized CompressedBitsPerPixel ShutterSpeedValue ExposureBiasValue
  MaxApertureValue SubjectDistance MeteringMode LightSource Flash FocalLength
  SubjectArea MakerNote UserComment SubsecTime SubsecTimeDigitized
  FlashPixVersion PixelXDimension PixelYDimension RelatedSoundFile FlashEnergy
  SpatialFrequencyResponse FocalPlaneXResolution FocalPlaneYResolution
  FocalPlaneResolutionUnit SubjectLocation ExposureIndex SensingMethod
  FileSource SceneType CFAPattern CustomRendered ExposureMode WhiteBalance
  DigitalZoomRatio FocalLenIn35mmFilm SceneCaptureType GainControl Contrast
  Saturation Sharpness DeviceSettingDescription SubjectDistRange ImageUniqueID
  Gamma CameraOwnerName BodySerialNumber LensSpecification LensMake LensModel
  LensSerialNumber SensitivityType StandardOutputSensitivity
  RecommendedExposureIndex SubsecTimeDigitized SubsecTimeOriginal offsettime
  OffsetTimeOriginal OffsetTimeDigitized CompositeImage
  SourceImageNumberOfCompositeImage SourceExposureTimesOfCompositeImage
].uniq # SubsecTimeDigitized is listed twice; keep a single record per tag

tags.each do |item|
  url = "https://www.awaresystems.be/imaging/tiff/tifftags/privateifd/exif/#{item.downcase}.html"
  json_dict = extract_exif_info_from_url(url)
  # Pause between pages.
  puts "sleep 3"
  sleep 3
  next if json_dict.nil? || json_dict.empty?

  $array << json_dict
end

result_file_name = "exif.json"
if File.exist?(result_file_name)
  puts "result file exist, delete"
  File.delete(result_file_name)
end

if $array.empty?
  puts "array is empty ignore"
else
  # Turn the "<br>" markers inside the generated JSON text into the JSON
  # newline escape (backslash followed by "n").
  json = JSON.pretty_generate($array).gsub("<br>", "\\n")
  # File.write opens, writes and closes in one step.
  File.write(result_file_name, json)
  puts "result file generate success"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment