Last active
May 24, 2022 04:25
-
-
Save munky69rock/a6aa7587bb0751b509bfba6ed0a5a2d3 to your computer and use it in GitHub Desktop.
法人番号公表サイトから都道府県別の全件データをまるっとダウンロードするスクリプト
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frozen_string_literal: true

require 'open-uri'

require 'http'
require 'nokogiri'
# User-Agent header value sent with the download POST request.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:62.0) Gecko/20100101 Firefox/62.0'

# Page that is scraped for the request token and the list of file ids.
# NOTE(review): both URLs use plain http — confirm whether the site now requires https.
DOWNLOAD_INDEX_URL = 'http://www.houjin-bangou.nta.go.jp/download/zenken/'

# Endpoint that the download form is POSTed to.
DOWNLOAD_URL = 'http://www.houjin-bangou.nta.go.jp/download/zenken/index.html'

# Name of the hidden form input (and of the POST field) that carries the request token.
TOKEN_KEY = 'jp.go.nta.houjin_bangou.framework.web.common.CNSFWTokenProcessor.request.token'
# Scrapes the download index page for everything needed to request files.
#
# Fetches DOWNLOAD_INDEX_URL, reads the value of the hidden input named
# TOKEN_KEY, and collects one id per link found in the second element
# matching `.tbl02`. Each id is the number inside the parentheses of the
# link's `onclick` attribute.
#
# @return [Hash] `{ ids: Array<String>, token: String }`
# @raise [RuntimeError] if the token input or the second `.tbl02` element
#   is not present in the page
def get_download_params
  # URI.open is required here: Kernel#open no longer opens URLs on Ruby 3.0+.
  html = URI.open(DOWNLOAD_INDEX_URL, &:read)
  doc = Nokogiri::HTML(html)

  token_input = doc.css("input[name='#{TOKEN_KEY}']").first
  raise "request token input not found in #{DOWNLOAD_INDEX_URL}" unless token_input

  # The links are taken from the SECOND `.tbl02` element (index 1).
  # NOTE(review): which table that is on the live page is not visible here — confirm.
  table = doc.css('.tbl02')[1]
  raise "second .tbl02 element not found in #{DOWNLOAD_INDEX_URL}" unless table

  ids = table.css('a').map do |a|
    # Reduce the onclick handler text to the number between its parentheses.
    a.attribute('onclick').value.gsub(/^.*\((\d+)\).*$/, '\1')
  end

  {
    ids: ids,
    token: token_input.attribute('value').value
  }
end
# Requests one file from the download endpoint and saves it next to this script.
#
# POSTs the request token, the file id and `event=download` to DOWNLOAD_URL,
# derives the file name from the response's Content-Disposition header and
# writes the response body to that name in this script's directory. When a
# file with that name already exists, the body is not written and the
# existing file is left untouched.
#
# @param id [String] file id, sent as the `selDlFileNo` form field
# @param token [String] request token, sent under the TOKEN_KEY form field
# @return [String] absolute path of the (new or pre-existing) file
# @raise [RuntimeError] if the response has no Content-Disposition header or
#   no usable file name can be derived from it
def download_files(id, token)
  puts "downloading ... #{id}"
  response = HTTP
             .headers(user_agent: USER_AGENT)
             .post(DOWNLOAD_URL, form: {
                     TOKEN_KEY => token,
                     selDlFileNo: id,
                     event: :download
                   })

  disposition = response.headers['Content-Disposition']
  unless disposition
    raise "no Content-Disposition header in response for id #{id} (status: #{response.status})"
  end

  # The file name comes from the server: keep only its last path component so
  # it can never point outside the target directory.
  filename = File.basename(disposition.sub("attachment; filename*=utf-8'jp'", ''))
  if filename.empty? || %w[. .. /].include?(filename)
    raise "could not derive a file name from Content-Disposition #{disposition.inspect}"
  end
  puts filename

  filepath = File.join(File.expand_path('.', __dir__), filename)
  unless File.exist?(filepath)
    File.open(filepath, 'wb') do |f|
      # Write chunk by chunk rather than building the whole body in memory.
      response.body.each { |chunk| f.write(chunk) }
    end
  end
  filepath
end
# Downloads every file listed on the index page and prints the saved paths.
#
# Looks up the file ids and the request token once, downloads each id in
# order with that same token, then prints one resulting path per line.
def main
  params = get_download_params
  token = params[:token]
  files = params[:ids].map { |id| download_files(id, token) }
  puts files
end
# Run the downloads only when this file is executed directly, so that
# requiring or loading it does not trigger network requests.
main if $PROGRAM_NAME == __FILE__
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment