miyucy · January 9, 2018 08:24
diff --git a/README.md b/README.md
diff --git a/download.rb b/download.rb
 require 'httpclient'
 require 'nokogiri'
 require 'thread'
 require 'time'
 require 'fileutils'
 require 'json'

 class Download
  attr_reader :link_html, :link_filter, :download_dir, :status_file,
              :http_client, :download_urls, :download_errors

  def initialize(link_html, link_filter, download_dir, status_file, http_client)
    @link_html = link_html
    @link_filter = link_filter
    @download_dir = download_dir
    @status_file = status_file
    @http_client = http_client
    @download_urls = Queue.new
    @download_errors = []
    @mutex = Mutex.new
  end

  def perform
    response = http_client.get link_html
    unless response.ok?
      puts response.status
      puts response.body
      exit 1
    end

    mtimes = if File.exist? status_file
               JSON.parse File.read(status_file)
             else
               {}
             end
    puts "mtimes.size=#{mtimes.size}"

    html = Nokogiri::HTML response.body
    (html / 'a').each do |node|
      href = node.attribute("href")&.value
      next unless href
      next unless link_filter.call(href)
      url = link_html + href
      download_urls.push [url, mtimes[href]]
    end

    download_threads = 4.times.map do
      Thread.new do
        while tuple = download_urls.pop
          url, mtime = tuple
          file = File.basename url
          path = File.join download_dir, file

          request_headers = {}
          request_headers['If-Modified-Since'] = mtime if mtime
          request_headers.clear unless File.size?(path)

          response = http_client.get(url, header: request_headers)
          case response.status
          when 304
            # skip
          when 200
            File.binwrite(path, response.body)
            mtime = response.headers['Last-Modified'] || response.headers['Date']
            @mutex.synchronize do
              mtimes[file] = mtime
            end if mtime
            puts "file=#{file} size=#{File.size?(path)} mtime=#{mtime}"
          else
            download_errors << [url, response.status, response.body]
          end
        end
      end
    end

    download_threads.each { download_urls.push nil }
    download_threads.each(&:join)

    unless download_errors.empty?
      puts "Error: #{download_errors.size}"
      download_errors.each do |url, status, body|
        puts "url = #{url}"
        puts "status = #{status}"
        puts "body = #{body.inspect}"
        puts ""
      end
    end

    File.write status_file, mtimes.to_json
  end
 end

 http_client = HTTPClient.new

 dl = Download.new('http://ftp.orca.med.or.jp/pub/orca_data/master/siori/data/',
                  ->(href) { href =~ /\.p7m\z/i },
                  File.join(__dir__, 'data'),
                  'data.json',
                  http_client)
 dl.perform

 dl = Download.new('http://ftp.orca.med.or.jp/pub/orca_data/master/siori/image/',
                  ->(href) { href =~ /\.jpg\z/i },
                  File.join(__dir__, 'image'),
                  'image.json',
                  http_client)
 dl.perform
diff --git a/import.rb b/import.rb
 require 'openssl'
 require 'nkf'
 require 'pg'
 require 'base64'

 class Import
  CREYMD = Time.now.strftime('%Y%m%d')
  PATTERN = {
    ':TERMID:'  => '',
    ':HOSPNUM:' => '1',
    ':CREYMD:'  => CREYMD,
  }
  PATTERN_REG = Regexp.union(*PATTERN.keys.map { |e| Regexp.escape e })

  def self.import
    new.perform
  end

  def initialize
    @pg = PG::Connection.new('postgres:///orca_test')
  end

  def perform
    @pg.exec "TRUNCATE tbl_yakujyo"
    @pg.exec "TRUNCATE tbl_med_image"
    @pg.exec "TRUNCATE tbl_yakujyo_temp"
    Dir['data/*.p7m'].each do |path|
      transaction do
        #puts "Processing: #{File.basename(path)}"

        srycd = File.basename(path).sub('-siori.dat.p7m', '')
        @pg.exec "SAVEPOINT tmp#{srycd}"

        data = replace_placeholder(NKF.nkf('-Ewx', extract_pkcs7_data(File.binread(path))))
        begin
          @pg.exec data
        rescue PG::UniqueViolation => e
          puts "Error: #{File.basename(path)}\n#{data}\n#{e.inspect}"
          puts ""
          @pg.exec "ROLLBACK TO SAVEPOINT tmp#{srycd}"
          break
        end

        # ファイル名のsrycdとインサートされたデータのsrycdが違う
        sql = "SELECT COUNT(*) FROM tbl_yakujyo_temp WHERE srycd = $1"
        params = [srycd]
        result = @pg.exec sql, params
        unless result[0]["count"].to_i > 0
          puts "Error: #{File.basename(path)}\n#{data}"
          puts ""
          @pg.exec "ROLLBACK TO SAVEPOINT tmp#{srycd}"
          break
        end

        rows = @pg.exec("SELECT * FROM tbl_yakujyo_temp ORDER BY rennum ASC").each.map { |row| row }
        @pg.exec "ROLLBACK TO SAVEPOINT tmp#{srycd}"

        photo = ""
        %w[photo1_filename photo2_filename photo3_filename].each_with_index do |attr_name, idx|
          photo_type = idx + 1
          rows.each do |row|
            filename = row[attr_name]
            next unless filename
            next if filename.empty?
            photo_filepath = File.join(__dir__, "image", filename)
            #puts "Photo type=#{photo_type} file=#{File.basename(photo_filepath)}"
            next unless File.exist? photo_filepath
            image_data = Base64.strict_encode64(File.binread(photo_filepath))
            params = [row["hospnum"], srycd, photo_type, image_data]
            sql = <<~SQL
            INSERT INTO tbl_med_image
              (hospnum, srycd, imagekbn, imagedata)
            VALUES
              ($1, $2, $3, $4)
            SQL
            @pg.exec sql, params
            photo = filename if photo.empty?
            #{ srycd: srycd, photo_type: photo_type, filename: filename, image_data: image_data }
            break
          end
        end

        rows.each do |row|
          sql = <<~SQL
          INSERT INTO tbl_yakujyo
            (teknum, termid, srycd, shape, color, mark, effect, caution, creymd, hospnum, name, photo)
          VALUES
            (1, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
          SQL
          params = [
            row["termid"], srycd, row["shape"], row["color"], row["mark"],
            row["effect"], row["caution"], row["creymd"], row["hospnum"], row["name"], photo
          ]
          @pg.exec sql, params
          break
        end
      end
    end
  end

  def transaction
    @pg.exec "BEGIN"
    yield
  ensure
    @pg.exec "COMMIT"
    # @pg.exec "ABORT"
  end

  def extract_pkcs7_data(smime)
    pkcs7 = OpenSSL::PKCS7.read_smime(smime)
    signed_data = OpenSSL::ASN1.decode(pkcs7).value[1].value[0]
    encap_content_info = signed_data.value[2] if signed_data
    econtent = encap_content_info.value[1] if encap_content_info
    data = econtent.value[0].value if econtent
    data
  end

  def replace_placeholder(str)
    str.gsub(PATTERN_REG, PATTERN)
  end
 end

 Import.import
	require 'httpclient'
	require 'nokogiri'
	require 'thread'
	require 'time'
	require 'fileutils'
	require 'json'

	class Download
	attr_reader :link_html, :link_filter, :download_dir, :status_file,
	:http_client, :download_urls, :download_errors

	def initialize(link_html, link_filter, download_dir, status_file, http_client)
	@link_html = link_html
	@link_filter = link_filter
	@download_dir = download_dir
	@status_file = status_file
	@http_client = http_client
	@download_urls = Queue.new
	@download_errors = []
	@mutex = Mutex.new
	end

	def perform
	response = http_client.get link_html
	unless response.ok?
	puts response.status
	puts response.body
	exit 1
	end

	mtimes = if File.exist? status_file
	JSON.parse File.read(status_file)
	else
	{}
	end
	puts "mtimes.size=#{mtimes.size}"

	html = Nokogiri::HTML response.body
	(html / 'a').each do \|node\|
	href = node.attribute("href")&.value
	next unless href
	next unless link_filter.call(href)
	url = link_html + href
	download_urls.push [url, mtimes[href]]
	end

	download_threads = 4.times.map do
	Thread.new do
	while tuple = download_urls.pop
	url, mtime = tuple
	file = File.basename url
	path = File.join download_dir, file

	request_headers = {}
	request_headers['If-Modified-Since'] = mtime if mtime
	request_headers.clear unless File.size?(path)

	response = http_client.get(url, header: request_headers)
	case response.status
	when 304
	# skip
	when 200
	File.binwrite(path, response.body)
	mtime = response.headers['Last-Modified'] \|\| response.headers['Date']
	@mutex.synchronize do
	mtimes[file] = mtime
	end if mtime
	puts "file=#{file} size=#{File.size?(path)} mtime=#{mtime}"
	else
	download_errors << [url, response.status, response.body]
	end
	end
	end
	end

	download_threads.each { download_urls.push nil }
	download_threads.each(&:join)

	unless download_errors.empty?
	puts "Error: #{download_errors.size}"
	download_errors.each do \|url, status, body\|
	puts "url = #{url}"
	puts "status = #{status}"
	puts "body = #{body.inspect}"
	puts ""
	end
	end

	File.write status_file, mtimes.to_json
	end
	end

	http_client = HTTPClient.new

	dl = Download.new('http://ftp.orca.med.or.jp/pub/orca_data/master/siori/data/',
	->(href) { href =~ /\.p7m\z/i },
	File.join(__dir__, 'data'),
	'data.json',
	http_client)
	dl.perform

	dl = Download.new('http://ftp.orca.med.or.jp/pub/orca_data/master/siori/image/',
	->(href) { href =~ /\.jpg\z/i },
	File.join(__dir__, 'image'),
	'image.json',
	http_client)
	dl.perform
	require 'openssl'
	require 'nkf'
	require 'pg'
	require 'base64'

	class Import
	CREYMD = Time.now.strftime('%Y%m%d')
	PATTERN = {
	':TERMID:' => '',
	':HOSPNUM:' => '1',
	':CREYMD:' => CREYMD,
	}
	PATTERN_REG = Regexp.union(*PATTERN.keys.map { \|e\| Regexp.escape e })

	def self.import
	new.perform
	end

	def initialize
	@pg = PG::Connection.new('postgres:///orca_test')
	end

	def perform
	@pg.exec "TRUNCATE tbl_yakujyo"
	@pg.exec "TRUNCATE tbl_med_image"
	@pg.exec "TRUNCATE tbl_yakujyo_temp"
	Dir['data/*.p7m'].each do \|path\|
	transaction do
	#puts "Processing: #{File.basename(path)}"

	srycd = File.basename(path).sub('-siori.dat.p7m', '')
	@pg.exec "SAVEPOINT tmp#{srycd}"

	data = replace_placeholder(NKF.nkf('-Ewx', extract_pkcs7_data(File.binread(path))))
	begin
	@pg.exec data
	rescue PG::UniqueViolation => e
	puts "Error: #{File.basename(path)}\n#{data}\n#{e.inspect}"
	puts ""
	@pg.exec "ROLLBACK TO SAVEPOINT tmp#{srycd}"
	break
	end

	# ファイル名のsrycdとインサートされたデータのsrycdが違う
	sql = "SELECT COUNT(*) FROM tbl_yakujyo_temp WHERE srycd = $1"
	params = [srycd]
	result = @pg.exec sql, params
	unless result[0]["count"].to_i > 0
	puts "Error: #{File.basename(path)}\n#{data}"
	puts ""
	@pg.exec "ROLLBACK TO SAVEPOINT tmp#{srycd}"
	break
	end

	rows = @pg.exec("SELECT * FROM tbl_yakujyo_temp ORDER BY rennum ASC").each.map { \|row\| row }
	@pg.exec "ROLLBACK TO SAVEPOINT tmp#{srycd}"

	photo = ""
	%w[photo1_filename photo2_filename photo3_filename].each_with_index do \|attr_name, idx\|
	photo_type = idx + 1
	rows.each do \|row\|
	filename = row[attr_name]
	next unless filename
	next if filename.empty?
	photo_filepath = File.join(__dir__, "image", filename)
	#puts "Photo type=#{photo_type} file=#{File.basename(photo_filepath)}"
	next unless File.exist? photo_filepath
	image_data = Base64.strict_encode64(File.binread(photo_filepath))
	params = [row["hospnum"], srycd, photo_type, image_data]
	sql = <<~SQL
	INSERT INTO tbl_med_image
	(hospnum, srycd, imagekbn, imagedata)
	VALUES
	($1, $2, $3, $4)
	SQL
	@pg.exec sql, params
	photo = filename if photo.empty?
	#{ srycd: srycd, photo_type: photo_type, filename: filename, image_data: image_data }
	break
	end
	end

	rows.each do \|row\|
	sql = <<~SQL
	INSERT INTO tbl_yakujyo
	(teknum, termid, srycd, shape, color, mark, effect, caution, creymd, hospnum, name, photo)
	VALUES
	(1, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
	SQL
	params = [
	row["termid"], srycd, row["shape"], row["color"], row["mark"],
	row["effect"], row["caution"], row["creymd"], row["hospnum"], row["name"], photo
	]
	@pg.exec sql, params
	break
	end
	end
	end
	end

	def transaction
	@pg.exec "BEGIN"
	yield
	ensure
	@pg.exec "COMMIT"
	# @pg.exec "ABORT"
	end

	def extract_pkcs7_data(smime)
	pkcs7 = OpenSSL::PKCS7.read_smime(smime)
	signed_data = OpenSSL::ASN1.decode(pkcs7).value[1].value[0]
	encap_content_info = signed_data.value[2] if signed_data
	econtent = encap_content_info.value[1] if encap_content_info
	data = econtent.value[0].value if econtent
	data
	end

	def replace_placeholder(str)
	str.gsub(PATTERN_REG, PATTERN)
	end
	end

	Import.import