Last active
November 3, 2015 16:23
-
-
Save tumugin/9669fd6eac37242e0ad3 to your computer and use it in GitHub Desktop.
東進の過去問DLサービスの問題PDFと解答PDFを自動生成するスクリプト
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8 | |
require 'nokogiri' | |
require 'open-uri' | |
require 'net/http' | |
require 'kconv' | |
require "pdf/toolkit" | |
#AUTH INFO
# Credentials for HTTP Basic authentication. They are read from the
# TOSHIN_USERNAME / TOSHIN_PASSWORD environment variables when those are set,
# so real credentials need not be written into this file; otherwise the
# original placeholders (to be edited by hand) are used.
# They stay globals because the download helpers read $username / $password.
$username = ENV.fetch("TOSHIN_USERNAME", "(ENTER YOUR USERNAME)")
$password = ENV.fetch("TOSHIN_PASSWORD", "(ENTER YOUR PASSWORD)")
#URL
# Page URL taken from the first command-line argument (nil when none is given).
mainurl = ARGV[0]
#HTTP GET | |
# Fetches +url+ with HTTP Basic authentication and returns the response body
# converted to UTF-8.
#
# Credentials are taken from the globals $username and $password.
# Despite the method name the request is sent as a POST, exactly as before
# (NOTE(review): presumably the server accepts/expects POST -- confirm before
# switching to GET).
#
# url - absolute http:// or https:// URL (String).
#
# Returns the body as a String converted with Kconv's String#toutf8.
# Raises RuntimeError when the response status is not 2xx, so that an error
# page is never mistaken for real content.
def ToshinAuthHTTPGET(url)
  uri = URI.parse(url)
  # request_uri keeps the query string and yields "/" for an empty path;
  # uri.path alone would drop "?a=b" and be "" for "http://host".
  req = Net::HTTP::Post.new(uri.request_uri)
  req.basic_auth($username, $password)
  # Enable TLS for https URLs; without it an https URL is spoken to in plain HTTP.
  res = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
    http.request(req)
  end
  unless res.is_a?(Net::HTTPSuccess)
    raise "HTTP #{res.code} #{res.message} for #{url}"
  end
  # body can be nil for bodiless responses; to_s keeps toutf8 from failing on nil.
  res.body.to_s.toutf8
end
# Downloads +url+ with HTTP Basic authentication and writes the raw response
# body to +savepath+ in binary mode.
#
# Credentials are taken from the globals $username and $password. The request
# is sent as a POST, exactly as before (NOTE(review): presumably what the
# server expects -- confirm before changing).
#
# url      - absolute http:// or https:// URL (String).
# savepath - path of the local file to create or overwrite (String).
#
# Returns the number of bytes written.
# Raises RuntimeError when the response status is not 2xx, so that an error
# page is never saved in place of the requested file.
def ToshinAuthSaveBinary(url, savepath)
  uri = URI.parse(url)
  # request_uri keeps the query string and yields "/" for an empty path.
  req = Net::HTTP::Post.new(uri.request_uri)
  req.basic_auth($username, $password)
  # Enable TLS for https URLs.
  res = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
    http.request(req)
  end
  unless res.is_a?(Net::HTTPSuccess)
    raise "HTTP #{res.code} #{res.message} for #{url}"
  end
  # File.open instead of Kernel#open: only a plain local file is wanted here,
  # and Kernel#open gives special meaning to some path strings.
  File.open(savepath, "wb") { |f| f.write(res.body.to_s) }
end
#main logic
# A page URL is required as the first command-line argument. Report a missing
# or blank URL on stderr and exit with a non-zero status (Kernel#abort) so that
# calling shells and scripts can detect the failure.
if mainurl.nil? || mainurl.strip.empty?
  abort "URL empty."
end
#kakomon INFO
# Fetch and parse the page given on the command line.
mainpageRAW = ToshinAuthHTTPGET(mainurl)
#print mainpageRAW
# Page URLs to visit later; filled in by the link scan that follows.
urls = []
doc = Nokogiri::HTML(mainpageRAW)
# Get the university name from the page title.
title_node = doc.search("title")[0]
if title_node.nil?
  abort "No <title> found in #{mainurl}; cannot determine the university name."
end
# Drop the site-name prefix (dot escaped so only a literal "." matches).
univName = title_node.content.gsub(/nyushimondai\.com - /, "")
# Replace characters that cannot be used in file names. ":" is included as
# well because it is also not allowed in Windows file names.
univName = univName.gsub(/[?"\\\/<>*|:]/, "_")
# Split the title into its three " - " separated parts; only the first (univ)
# and the last (subject) are used for the output name.
match = /(?<univ>.*) - (?<faculty>.*) - (?<subject>.*)/.match(univName)
if match.nil?
  # Fail with a clear message instead of a NoMethodError on nil.
  abort "Unexpected page title #{univName.inspect}; expected \"<univ> - <faculty> - <subject>\"."
end
p match
univName = "#{match[:univ]} #{match[:subject]}"
print univName + "\n"
# The page given on the command line is itself the first question, so it is
# added to the list of pages to visit.
monpdf = nil
urls.push(mainurl)
doc.search("a").each do |atag|
  href = atag["href"]
  # An anchor without an href cannot be resolved to a URL; skip it instead of
  # letting URI.join fail on nil.
  next if href.nil?
  label = atag.content
  if label =~ /第\d*問/
    # Convert the relative path into an absolute URL.
    urls.push(URI.join(mainurl, href).to_s)
    print "Found " + label + "\n"
  elsif label =~ /問題PDF/
    # Link to the question PDF; when several match, the last one wins.
    monpdf = URI.join(mainurl, href).to_s
    print "Found " + label + "\n"
  end
end
# The same page may be linked more than once; visit each URL only once.
urls.uniq!
#p urls
# Look on every page in urls for the link to its answer page.
answer_URL = []
urls.each do |murl|
  print "fetching " + murl + "\n"
  html = Nokogiri::HTML(ToshinAuthHTTPGET(murl))
  # Only the first link whose text contains 解答 (and that has an href) is used.
  link = html.search("a").find { |tag| tag.content =~ /解答/ && tag["href"] }
  if link.nil?
    warn "No answer link found on #{murl}; skipping."
    next
  end
  # A relative href must be resolved against the page it was found on (murl);
  # resolving against mainurl is only right when both share a directory.
  answer_URL.push(URI.join(murl, link["href"]).to_s)
end
p answer_URL
# Find the answer image (GIF) on every page listed in answer_URL.
gif_url = []
answer_URL.each do |ans|
  print "fetching " + ans + "\n"
  html = Nokogiri::HTML(ToshinAuthHTTPGET(ans))
  # The first <img> that has a src attribute is taken as the answer image.
  img = html.search("img").find { |tag| tag["src"] }
  if img.nil?
    # Skip instead of failing with a NoMethodError on nil.
    warn "No image found on #{ans}; skipping."
    next
  end
  # Resolve a relative src against the page it was found on (ans).
  gif_url.push(URI.join(ans, img["src"]).to_s)
end
p gif_url
# Download every image listed in gif_url into a directory named after
# univName, creating that directory first when it does not exist yet.
Dir.exist?(univName) || Dir.mkdir(univName)
gif_url.each do |image_url|
  print "Saving #{image_url}\n"
  # The local file keeps the last path component of the URL as its name.
  ToshinAuthSaveBinary(image_url, "#{univName}/#{File.basename(image_url)}")
end
#trim images with ImageMagick
print "Trimming images....\n"
# univName comes from the remote page title and may contain "$" or "`", which
# a shell expands even inside double quotes. List the GIFs ourselves and pass
# every argument to the tools separately so that no shell is involved.
# Files are sorted by name (case-sensitive ".gif" suffix, like the old
# "*.gif" pattern). NOTE(review): assumed to match the order the former glob
# expansion produced -- confirm the page order of the resulting PDF.
gif_files = Dir.entries(univName).grep(/\.gif\z/).sort.map { |name| File.join(univName, name) }
abort "No GIF files found in #{univName}; nothing to convert." if gif_files.empty?
answer_pdf = "#{univName}/#{univName} 解答.pdf"
# Trimming is cosmetic, so a failure here is reported but not fatal.
unless system("mogrify", "-trim", "+repage", *gif_files)
  warn "mogrify failed; images are left untrimmed."
end
#convert it to PDF
print "Converting to PDF....\n"
# Without the PDF nothing below can work, so stop with a clear message.
unless system("convert", *gif_files, answer_pdf)
  abort "convert failed; could not create #{answer_pdf}."
end
# Fix mojibake: per the original author's note, ImageMagick mishandles
# multibyte text, so the Title metadata is rewritten with PDF::Toolkit.
print "Fixing mojibake....\n"
my_pdf = PDF::Toolkit.open(answer_pdf)
my_pdf["Title"] = "#{univName} 解答"
my_pdf.save!
#Save PDF
# monpdf is still nil when no 問題PDF link was found on the page; stop with a
# clear message instead of failing inside URI.parse(nil).
if monpdf.nil?
  abort "No question PDF link (問題PDF) found on #{mainurl}; cannot download the question PDF."
end
print "Downloading kakomon files....\n"
question_pdf = "#{univName}/#{univName}.pdf"
ToshinAuthSaveBinary(monpdf, question_pdf)
#Fix title for some PDFs
# Overwrite the Title metadata of the downloaded PDF with univName.
print "Fixing title....\n"
k_pdf = PDF::Toolkit.open(question_pdf)
k_pdf["Title"] = univName
k_pdf.save!
#Complete
print "ALL OK!!\n"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Update 3
間違えて最初のやつのgist消してしまいました。ごめんなさい。
PDFの文字化けを直したりとか何やらかんやら処理加えてます。
かなり切羽詰っているので処理も大分適当になってます。