Skip to content

Instantly share code, notes, and snippets.

@munky69rock
Last active May 24, 2022 04:25
Show Gist options
  • Save munky69rock/a6aa7587bb0751b509bfba6ed0a5a2d3 to your computer and use it in GitHub Desktop.
Save munky69rock/a6aa7587bb0751b509bfba6ed0a5a2d3 to your computer and use it in GitHub Desktop.
法人番号公表サイトから都道府県別の全件データをまるっとダウンロードするスクリプト
# frozen_string_literal: true
require 'nokogiri'
require 'open-uri'
require 'http'
# User-Agent header value sent with every download POST request.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:62.0) Gecko/20100101 Firefox/62.0'
# Index page that is fetched and parsed to obtain the request token and the file ids.
DOWNLOAD_INDEX_URL = 'http://www.houjin-bangou.nta.go.jp/download/zenken/'
# Endpoint that receives the download form POST for a single file id.
DOWNLOAD_URL = 'http://www.houjin-bangou.nta.go.jp/download/zenken/index.html'
# Name of the input element on the index page that holds the request token;
# the same name is used as the form field when posting the token back.
TOKEN_KEY = 'jp.go.nta.houjin_bangou.framework.web.common.CNSFWTokenProcessor.request.token'
# Scrapes the download index page for the values needed to request each file.
#
# Fetches DOWNLOAD_INDEX_URL, reads the value of the input element named
# TOKEN_KEY, and collects the numeric argument found in the `onclick`
# attribute of every link inside the second element matching `.tbl02`.
#
# @return [Hash] `{ ids: Array<String>, token: String }`
def get_download_params
  # URI.open (from open-uri) is used explicitly: Kernel#open no longer
  # accepts URLs on Ruby 3.0 and later.
  html = URI.open(DOWNLOAD_INDEX_URL, &:read)
  doc = Nokogiri::HTML(html)
  token = doc.css("input[name='#{TOKEN_KEY}']").first.attribute('value').value
  # Each link's onclick handler carries the file id as its numeric argument;
  # the gsub keeps only those digits.
  ids = doc.css('.tbl02')[1].css('a').map do |a|
    a.attribute('onclick').value.gsub(/^.*\((\d+)\).*$/, '\1')
  end
  {
    ids: ids,
    token: token
  }
end
# Requests one file from DOWNLOAD_URL and saves it next to this script.
#
# Posts the token and file id as form fields, derives the local file name from
# the response's Content-Disposition header, and writes the body to disk in
# chunks. A file that already exists at the target path is left untouched.
#
# @param id [String, Integer] file id sent as the `selDlFileNo` form field
# @param token [String] request token sent under the TOKEN_KEY form field
# @return [String] absolute path of the (possibly pre-existing) local file
# @raise [RuntimeError] when the response has no Content-Disposition header
def download_files(id, token)
  puts "downloading ... #{id}"
  response = HTTP
             .headers(user_agent: USER_AGENT)
             .post(DOWNLOAD_URL, form: {
                     TOKEN_KEY.to_sym => token,
                     selDlFileNo: id,
                     event: :download
                   })

  disposition = response.headers['Content-Disposition']
  # Without this header there is no file name to save under; fail with a clear
  # message instead of a NoMethodError on nil.
  raise "no Content-Disposition header in response for file #{id}" if disposition.nil?

  # The name comes from the server, so strip any directory components to keep
  # the output inside this script's directory.
  filename = File.basename(disposition.sub("attachment; filename*=utf-8'jp'", ''))
  puts filename

  filepath = File.join(File.expand_path(__dir__), filename)
  unless File.exist?(filepath)
    File.open(filepath, 'wb') do |f|
      response.body.each { |chunk| f.write(chunk) }
    end
  end
  filepath
end
# Fetches the token and file ids, downloads every listed file in order, and
# prints the resulting local paths, one per line.
def main
  download_params = get_download_params
  token = download_params[:token]
  saved_paths = download_params[:ids].map do |file_id|
    download_files(file_id, token)
  end
  puts saved_paths
end
# Start the download run as soon as this file is loaded.
main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment