mkdir -p data image
ruby download.rb
ruby import.rb
pg_dump --dbname=orca_test --encoding=UTF8 --no-owner --clean --table "tbl_yakujyo" | gzip --best --to-stdout > tbl_yakujyo.sql.gz
pg_dump --dbname=orca_test --encoding=UTF8 --no-owner --clean --table "tbl_med_image" | gzip --best --to-stdout > tbl_med_image.sql.gz
gzcat tbl_yakujyo.sql.gz | psql --host=127.0.0.1 --port=15432 --username=orca --dbname=orca
gzcat tbl_med_image.sql.gz | psql --host=127.0.0.1 --port=15432 --username=orca --dbname=orca
Last active
January 9, 2018 08:24
-
-
Save miyucy/186d94d15103a50ca61d513d5b3bc9cc to your computer and use it in GitHub Desktop.
medimg
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'httpclient' | |
require 'nokogiri' | |
require 'thread' | |
require 'time' | |
require 'fileutils' | |
require 'json' | |
class Download | |
attr_reader :link_html, :link_filter, :download_dir, :status_file, | |
:http_client, :download_urls, :download_errors | |
def initialize(link_html, link_filter, download_dir, status_file, http_client) | |
@link_html = link_html | |
@link_filter = link_filter | |
@download_dir = download_dir | |
@status_file = status_file | |
@http_client = http_client | |
@download_urls = Queue.new | |
@download_errors = [] | |
@mutex = Mutex.new | |
end | |
def perform | |
response = http_client.get link_html | |
unless response.ok? | |
puts response.status | |
puts response.body | |
exit 1 | |
end | |
mtimes = if File.exist? status_file | |
JSON.parse File.read(status_file) | |
else | |
{} | |
end | |
puts "mtimes.size=#{mtimes.size}" | |
html = Nokogiri::HTML response.body | |
(html / 'a').each do |node| | |
href = node.attribute("href")&.value | |
next unless href | |
next unless link_filter.call(href) | |
url = link_html + href | |
download_urls.push [url, mtimes[href]] | |
end | |
download_threads = 4.times.map do | |
Thread.new do | |
while tuple = download_urls.pop | |
url, mtime = tuple | |
file = File.basename url | |
path = File.join download_dir, file | |
request_headers = {} | |
request_headers['If-Modified-Since'] = mtime if mtime | |
request_headers.clear unless File.size?(path) | |
response = http_client.get(url, header: request_headers) | |
case response.status | |
when 304 | |
# skip | |
when 200 | |
File.binwrite(path, response.body) | |
mtime = response.headers['Last-Modified'] || response.headers['Date'] | |
@mutex.synchronize do | |
mtimes[file] = mtime | |
end if mtime | |
puts "file=#{file} size=#{File.size?(path)} mtime=#{mtime}" | |
else | |
download_errors << [url, response.status, response.body] | |
end | |
end | |
end | |
end | |
download_threads.each { download_urls.push nil } | |
download_threads.each(&:join) | |
unless download_errors.empty? | |
puts "Error: #{download_errors.size}" | |
download_errors.each do |url, status, body| | |
puts "url = #{url}" | |
puts "status = #{status}" | |
puts "body = #{body.inspect}" | |
puts "" | |
end | |
end | |
File.write status_file, mtimes.to_json | |
end | |
end | |
http_client = HTTPClient.new | |
dl = Download.new('http://ftp.orca.med.or.jp/pub/orca_data/master/siori/data/', | |
->(href) { href =~ /\.p7m\z/i }, | |
File.join(__dir__, 'data'), | |
'data.json', | |
http_client) | |
dl.perform | |
dl = Download.new('http://ftp.orca.med.or.jp/pub/orca_data/master/siori/image/', | |
->(href) { href =~ /\.jpg\z/i }, | |
File.join(__dir__, 'image'), | |
'image.json', | |
http_client) | |
dl.perform |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'openssl' | |
require 'nkf' | |
require 'pg' | |
require 'base64' | |
class Import | |
CREYMD = Time.now.strftime('%Y%m%d') | |
PATTERN = { | |
':TERMID:' => '', | |
':HOSPNUM:' => '1', | |
':CREYMD:' => CREYMD, | |
} | |
PATTERN_REG = Regexp.union(*PATTERN.keys.map { |e| Regexp.escape e }) | |
def self.import | |
new.perform | |
end | |
def initialize | |
@pg = PG::Connection.new('postgres:///orca_test') | |
end | |
def perform | |
@pg.exec "TRUNCATE tbl_yakujyo" | |
@pg.exec "TRUNCATE tbl_med_image" | |
@pg.exec "TRUNCATE tbl_yakujyo_temp" | |
Dir['data/*.p7m'].each do |path| | |
transaction do | |
#puts "Processing: #{File.basename(path)}" | |
srycd = File.basename(path).sub('-siori.dat.p7m', '') | |
@pg.exec "SAVEPOINT tmp#{srycd}" | |
data = replace_placeholder(NKF.nkf('-Ewx', extract_pkcs7_data(File.binread(path)))) | |
begin | |
@pg.exec data | |
rescue PG::UniqueViolation => e | |
puts "Error: #{File.basename(path)}\n#{data}\n#{e.inspect}" | |
puts "" | |
@pg.exec "ROLLBACK TO SAVEPOINT tmp#{srycd}" | |
break | |
end | |
# ファイル名のsrycdとインサートされたデータのsrycdが違う | |
sql = "SELECT COUNT(*) FROM tbl_yakujyo_temp WHERE srycd = $1" | |
params = [srycd] | |
result = @pg.exec sql, params | |
unless result[0]["count"].to_i > 0 | |
puts "Error: #{File.basename(path)}\n#{data}" | |
puts "" | |
@pg.exec "ROLLBACK TO SAVEPOINT tmp#{srycd}" | |
break | |
end | |
rows = @pg.exec("SELECT * FROM tbl_yakujyo_temp ORDER BY rennum ASC").each.map { |row| row } | |
@pg.exec "ROLLBACK TO SAVEPOINT tmp#{srycd}" | |
photo = "" | |
%w[photo1_filename photo2_filename photo3_filename].each_with_index do |attr_name, idx| | |
photo_type = idx + 1 | |
rows.each do |row| | |
filename = row[attr_name] | |
next unless filename | |
next if filename.empty? | |
photo_filepath = File.join(__dir__, "image", filename) | |
#puts "Photo type=#{photo_type} file=#{File.basename(photo_filepath)}" | |
next unless File.exist? photo_filepath | |
image_data = Base64.strict_encode64(File.binread(photo_filepath)) | |
params = [row["hospnum"], srycd, photo_type, image_data] | |
sql = <<~SQL | |
INSERT INTO tbl_med_image | |
(hospnum, srycd, imagekbn, imagedata) | |
VALUES | |
($1, $2, $3, $4) | |
SQL | |
@pg.exec sql, params | |
photo = filename if photo.empty? | |
#{ srycd: srycd, photo_type: photo_type, filename: filename, image_data: image_data } | |
break | |
end | |
end | |
rows.each do |row| | |
sql = <<~SQL | |
INSERT INTO tbl_yakujyo | |
(teknum, termid, srycd, shape, color, mark, effect, caution, creymd, hospnum, name, photo) | |
VALUES | |
(1, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11) | |
SQL | |
params = [ | |
row["termid"], srycd, row["shape"], row["color"], row["mark"], | |
row["effect"], row["caution"], row["creymd"], row["hospnum"], row["name"], photo | |
] | |
@pg.exec sql, params | |
break | |
end | |
end | |
end | |
end | |
def transaction | |
@pg.exec "BEGIN" | |
yield | |
ensure | |
@pg.exec "COMMIT" | |
# @pg.exec "ABORT" | |
end | |
def extract_pkcs7_data(smime) | |
pkcs7 = OpenSSL::PKCS7.read_smime(smime) | |
signed_data = OpenSSL::ASN1.decode(pkcs7).value[1].value[0] | |
encap_content_info = signed_data.value[2] if signed_data | |
econtent = encap_content_info.value[1] if encap_content_info | |
data = econtent.value[0].value if econtent | |
data | |
end | |
def replace_placeholder(str) | |
str.gsub(PATTERN_REG, PATTERN) | |
end | |
end | |
Import.import |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment