Last active
January 4, 2018 17:20
-
-
Save Apkawa/e1fa971c774c700254e5 to your computer and use it in GitHub Desktop.
Downloader for e-books purchased from Books.ru
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# encoding=utf-8
# = INSTALL =
# sudo apt-get install ruby-mechanize
# sudo gem install vcr
require 'uri' | |
require 'fileutils' | |
require 'pathname' | |
require 'vcr' | |
require 'mechanize' | |
require 'yaml' | |
require 'yaml/dbm' | |
require 'optparse' | |
# Base URL that every relative Books.ru link in this script is resolved against.
# Frozen so the shared constant cannot be mutated in place.
SITE_URL = "http://www.books.ru".freeze
# Shortens +filename+ so that it, plus 4 more bytes reserved for a suffix
# appended later by the caller, fits into 254 bytes.
#
# Truncation happens on character boundaries, so a multi-byte character is
# either kept whole or dropped; the result never exceeds the byte budget.
#
# @param filename [String] candidate file name (without extension)
# @return [String] +filename+ itself when it already fits, otherwise the
#   longest leading run of whole characters that fits
def smart_truncate_filename(filename)
  max_bytes = 254 - 4
  return filename if filename.bytesize <= max_bytes

  used = 0
  kept = filename.each_char.take_while do |char|
    # Account for the character *before* accepting it, so a wide character
    # cannot push the result over the limit.
    used += char.bytesize
    used <= max_bytes
  end
  kept.join
end
# Scrapes the e-books listed in a Books.ru account's orders and downloads
# them into a category-based directory tree below a root directory.
#
# Metadata of every book seen so far is kept in a YAML::DBM store created
# from the path "<root_dir>/books.yaml"; books whose id is already present in
# that store are not scraped again.
class BooksRuDownloader
  # Directory (below the root directory) used for books that end up without
  # any category directory.
  UNCATEGORIZED_DIR = "Без категории"

  # @param login [String] account login submitted through the login form
  # @param password [String] account password submitted through the login form
  # @param root_dir [String] directory that receives the books and the
  #   metadata store; created when it does not exist
  # @param short_dir_tree [Boolean] when true, only the first category path
  #   of a book is used and only its last component becomes a directory
  def initialize(login, password, root_dir = '/tmp/books_ru/', short_dir_tree = false)
    puts root_dir, short_dir_tree
    @login = login
    @password = password
    @root_dir = root_dir
    @short_dir = short_dir_tree
    @client = Mechanize.new
    # NOTE(review): per the Mechanize docs, responses above this size are
    # spooled to a temp file instead of memory - confirm 1 GiB is intended.
    @client.max_file_buffer = 1024 * 1024 * 1024
    # Responses without a dedicated parser are handled as Mechanize::Download.
    @client.pluggable_parser.default = Mechanize::Download
    FileUtils.makedirs @root_dir unless File.directory? @root_dir
    @yaml_file = File.join(@root_dir, "books.yaml")
    @data = YAML::DBM.new(@yaml_file)
  end

  # Fills in and submits the form with id "loginform" on /member/login.php.
  def do_login
    @client.get(URI.join(SITE_URL, "/member/login.php")) do |page|
      page.form_with :id => "loginform" do |login_form|
        login_form.login = @login
        login_form.password = @password
      end.click_button
    end
  end

  # @return [Nokogiri::XML::NodeSet] href attributes of the links found in
  #   table rows with class "closed" on /member/orders/
  def orders_links
    order_page = @client.get(URI.join(SITE_URL, "/member/orders/"))
    order_page.parser.xpath("//tr[@class='closed']/td/a/@href")
  end

  # Visits every order page and records each book that is not yet in the
  # metadata store (id, title, categories, authors and download URLs).
  #
  # @return [Array<Hash, nil>] one entry per order row: the newly recorded
  #   book hash, or nil for a book that was already stored
  def collect_books_data
    orders_links.flat_map do |link|
      order_detail_page = @client.get(link)
      rows = order_detail_page.parser.xpath(
        "//table[@class='catalog']/tbody/tr[not(@class='total')]"
      )
      rows.collect do |tr_el|
        desc_a = tr_el.xpath("td[@class='descr']//a").to_a
        # The book id is the first run of digits in the path of the book link.
        book_id = URI.parse(desc_a[0].attribute("href")).path.slice(/[\d]+/).to_s
        unless @data.has_key?(book_id)
          book = {}
          book[:id] = book_id
          book[:title] = desc_a[0].text.to_s
          detail_page = @client.get(SITE_URL + desc_a[0].attribute("href"))
          # One array of link texts per "path" element.
          book[:categories] = detail_page.parser
            .xpath("//div[@class='inline-catalog']/div[@class='path']")
            .collect { |c| c.xpath(".//a//text()").collect { |a| a.to_s } }
          book[:authors] = detail_page.parser
            .xpath("//td[@class='book-info']//p[@class='author']/a/text()")
            .collect { |a| a.to_s }
          # Strip the download-agreement redirect so only its target URL is kept.
          book[:download_urls] = tr_el.xpath("td[@class='status']/a/@href").collect do |url|
            url.to_s.gsub('/member/download_agreement.php?back_url=', '')
          end
          @data[book_id] = book
          book
        end
      end
    end
  end

  # Downloads every file of one book into its first category directory and
  # symlinks it from the remaining category directories.
  #
  # A file that already exists is skipped when its size matches the size
  # reported by a HEAD request; otherwise it is downloaded again.
  #
  # @param book_data [Hash] book record with :title, :authors, :categories
  #   and :download_urls, as built by #collect_books_data
  # @return [Array<String>] the book's :download_urls
  def download_book book_data
    dirs = book_directories(book_data)
    book_file_name = build_file_name(book_data)
    puts book_file_name
    d_root = dirs[0]
    book_data[:download_urls].each do |url|
      file_name = book_file_name + '.' + file_type_of(url)
      file_path = File.join(d_root, file_name)
      download_url = URI.join(SITE_URL, url)
      is_exists = File.exist? file_path
      is_valid = !is_exists || remote_size(download_url) == File.size?(file_path)
      if is_exists && is_valid
        puts "SKIP #{book_file_name}"
      else
        fetch_file(download_url, file_path, is_exists)
      end
      link_into(dirs.drop(1), d_root, file_name)
    end
  end

  # Runs #download_book for every record in the metadata store.
  def download_books
    @data.values.each { |book_data| download_book book_data }
  end

  private

  # Creates and returns the directories a book belongs to; the first one
  # holds the real files. Category paths starting with 'Акции' are ignored
  # and a leading 'книги' component is dropped.
  def book_directories(book_data)
    category_paths = book_data[:categories]
    # first(1)/last(1) return [] on empty input, where indexing or slicing
    # would produce nil and crash further down.
    category_paths = category_paths.first(1) if @short_dir
    dirs = category_paths
      .reject { |path| path[0] == 'Акции' }
      .collect { |path|
        path = path.last(1) if @short_dir
        path = path.drop(1) if path[0] == 'книги'
        dir = File.join(@root_dir, path)
        FileUtils.makedirs dir
        dir
      }
    return dirs unless dirs.empty?

    # The fallback must be a real, existing directory below the root, just
    # like the category directories above.
    fallback = File.join(@root_dir, UNCATEGORIZED_DIR)
    FileUtils.makedirs fallback
    [fallback]
  end

  # Builds "<authors> - <title>" (or just the title when there are no
  # authors), removes the ' (файл PDF)' marker, replaces '/' so the name
  # cannot span directories, and truncates it to a safe length.
  def build_file_name(book_data)
    authors = book_data[:authors]
    prefix = authors.empty? ? "" : "#{authors.join(', ')} - "
    # Plain interpolation: a '%' in an author name must not be treated as a
    # format directive.
    name = "#{prefix}#{book_data[:title]}"
      .gsub(' (файл PDF)', '')
      .gsub('/', "\u2215")
    smart_truncate_filename(name)
  end

  # Returns the value of the 'file_type' query parameter of a download URL.
  def file_type_of(url)
    # URI.decode no longer exists on current Rubies; the parser's unescape
    # performs the same percent-decoding.
    parsed_uri = URI.parse(URI::DEFAULT_PARSER.unescape(url))
    Hash[URI.decode_www_form(parsed_uri.query)]['file_type']
  end

  # Size the server reports for +download_url+, taken from a HEAD request.
  def remote_size(download_url)
    headers = @client.head(download_url).header
    if headers.has_key? 'content-range'
      # NOTE(review): the +48 offset comes from the original code, where it
      # was marked as unexplained; kept unchanged.
      headers['content-range'].split('/')[1].to_i + 48
    else
      headers['content-length'].to_i
    end
  end

  # Downloads +download_url+ to +file_path+. Failures are reported and
  # swallowed so one broken book does not abort the whole run.
  def fetch_file(download_url, file_path, replace_existing)
    puts "SAVE #{file_path}"
    download = @client.get(download_url)
    # Mechanize's #save is documented to pick a free file name instead of
    # overwriting, so the stale file is removed once the new response is in.
    File.delete(file_path) if replace_existing
    download.save(file_path)
  rescue StandardError => e
    puts "FAIL SAVE: #{e}"
  end

  # Creates a relative symlink to the file in +d_root+ inside each of +dirs+.
  def link_into(dirs, d_root, file_name)
    dirs.each do |dir|
      symlink_path = File.join(dir, file_name)
      # File.exist? follows links, so dangling symlinks are checked separately.
      next if File.symlink?(symlink_path) || File.exist?(symlink_path)

      rel_dir = Pathname.new(d_root).relative_path_from(Pathname.new(dir)).to_s
      rel_path = File.join(rel_dir, file_name)
      File.symlink(rel_path, symlink_path)
      puts "SYMLINK #{rel_path} -> #{symlink_path}"
    end
  end
end
# Parses the command line in +argv+, removing recognized options from it.
#
# @param argv [Array<String>] command-line arguments; modified in place so
#   that only the positional arguments remain
# @return [Hash] :root_dir (String) and :short_dir_tree (Boolean)
# @raise [OptionParser::MissingArgument] when fewer than two positional
#   arguments (USERNAME and PASSWORD) are left
def parse_options(argv)
  options = {
    :root_dir => "/tmp/books_ru/",
    :short_dir_tree => false,
  }
  parser = OptionParser.new do |opts|
    opts.banner = "Usage: books_ru.rb [options] USERNAME PASSWORD"
    # The DIR placeholder makes the option take a value; without it the
    # block would only ever receive `true`.
    opts.on("--root_dir DIR", "Directory to download books into") do |value|
      options[:root_dir] = value
    end
    opts.on("--short_dirs", "Use only the last component of the first category") do
      options[:short_dir_tree] = true
    end
  end
  parser.parse!(argv)
  if argv.length < 2
    raise OptionParser::MissingArgument, "Require USERNAME and PASSWORD"
  end
  options
end

# Script entry point: parses ARGV, logs in, refreshes the book metadata and
# downloads every recorded book.
def main
  options = parse_options(ARGV)
  login, password = ARGV
  downloader = BooksRuDownloader.new(
    login, password, options[:root_dir], options[:short_dir_tree]
  )
  downloader.do_login
  downloader.collect_books_data
  downloader.download_books
end

main
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment