Skip to content

Instantly share code, notes, and snippets.

@Apkawa
Last active January 4, 2018 17:20
Show Gist options
  • Save Apkawa/e1fa971c774c700254e5 to your computer and use it in GitHub Desktop.
Save Apkawa/e1fa971c774c700254e5 to your computer and use it in GitHub Desktop.
Downloader for purchased e-books from Books.ru
#!/usr/bin/env ruby
# encoding=utf-8
# = INSTALL =
# sudo apt-get install ruby-mechanize
# sudo gem install vcr
require 'uri'
require 'fileutils'
require 'pathname'
require 'vcr'
require 'mechanize'
require 'yaml'
require 'yaml/dbm'
require 'optparse'
SITE_URL = "http://www.books.ru"
# Shortens +filename+ so that a 4-byte suffix (such as ".pdf") can still be
# appended without the result exceeding 254 bytes.
#
# The budget is counted in bytes, not characters, and the cut always falls on
# a character boundary, so multibyte (e.g. Cyrillic) names stay valid text.
#
# filename - the base file name, without extension.
#
# Returns +filename+ itself when it already fits, otherwise the longest
# leading run of whole characters whose byte size is at most 250.
def smart_truncate_filename(filename)
  limit = 254 - 4
  return filename if filename.bytesize <= limit

  used = 0
  # Count the candidate character's own bytes before accepting it; checking
  # only the bytes accumulated so far would let a multibyte character
  # overshoot the budget.
  filename.each_char.take_while { |char| (used += char.bytesize) <= limit }.join
end
# Downloads the e-books listed in the closed orders of one Books.ru account
# into a local directory tree named after the books' catalogue categories.
#
# Scraped book metadata is kept in a YAML::DBM store at "books.yaml" inside
# the root directory; books whose id is already in the store are not scraped
# again.
class BooksRuDownloader
  # login          - account login submitted by #do_login.
  # password       - account password submitted by #do_login.
  # root_dir       - directory receiving the books and the metadata store;
  #                  created when missing.
  # short_dir_tree - when truthy, only the first category of a book is used
  #                  and only its last path component becomes the directory.
  def initialize(login, password, root_dir = '/tmp/books_ru/', short_dir_tree = false)
    puts root_dir, short_dir_tree
    @login = login
    @password = password
    @root_dir = root_dir
    @short_dir = short_dir_tree
    @client = Mechanize.new
    @client.max_file_buffer = 1024 * 1024 * 1024
    # Treat every response without a dedicated parser as a file download.
    @client.pluggable_parser.default = Mechanize::Download
    FileUtils.mkdir_p @root_dir
    @yaml_file = File.join(@root_dir, "books.yaml")
    @data = YAML::DBM.new(@yaml_file)
  end

  # Fetches the login page, fills the form with id "loginform" with the
  # stored credentials and submits it.
  def do_login
    @client.get(URI.join(SITE_URL, "/member/login.php")) do |page|
      page.form_with :id => "loginform" do |login_form|
        login_form.login = @login
        login_form.password = @password
      end.click_button
    end
  end

  # Returns the href attributes of the links found in the rows marked
  # class="closed" on the member orders page.
  def orders_links
    order_page = @client.get(URI.join(SITE_URL, "/member/orders/"))
    order_page.parser.xpath("//tr[@class='closed']/td/a/@href")
  end

  # Visits every order returned by #orders_links, scrapes each book row that
  # is not yet in the metadata store and records it there.
  #
  # Returns an Array with one Hash per newly recorded book (keys :id, :title,
  # :categories, :authors, :download_urls). Books that were already stored
  # are left out of the result.
  def collect_books_data
    orders_links.collect do |link|
      order_detail_page = @client.get(link)
      order_detail_page.parser.xpath(
        "//table[@class='catalog']/tbody/tr[not(@class='total')]"
      ).collect { |tr_el| scrape_new_book(tr_el) }
    end.flatten.compact
  end

  # Downloads every file of one book into the directory of its first
  # category and symlinks it from the directories of the other categories.
  #
  # A file that already exists is fetched again only when its size differs
  # from the size reported by a HEAD request. A failed download is reported
  # on stdout and does not stop the remaining downloads.
  #
  # book_data - Hash as stored by #collect_books_data.
  def download_book book_data
    dirs = category_dirs(book_data)
    book_file_name = build_file_name(book_data)
    puts book_file_name
    d_root = dirs[0]

    book_data[:download_urls].each do |url|
      parsed_uri = URI.parse(unescape_url(url))
      parsed_query = URI.decode_www_form(parsed_uri.query)
      file_type = Hash[parsed_query]['file_type']
      file_name = book_file_name + '.' + file_type
      file_path = File.join(d_root, file_name)
      download_url = URI.join(SITE_URL, url)

      # The HEAD request is only issued when there is a local file to compare.
      up_to_date = File.exist?(file_path) && remote_size(download_url) == File.size?(file_path)
      if up_to_date
        puts "SKIP #{book_file_name}"
      else
        puts "SAVE #{file_path}"
        begin
          @client.get(download_url).save(file_path)
        rescue StandardError => e
          # Only ordinary errors are swallowed; interrupts, exit requests and
          # other non-StandardError exceptions propagate.
          puts "FAIL SAVE: #{e}"
        end
      end

      link_into_other_dirs(dirs.drop(1), d_root, file_name)
    end
  end

  # Runs #download_book for every book in the metadata store.
  def download_books
    @data.values.each { |book_data| download_book book_data }
  end

  private

  # Scrapes one order-table row. Returns nil when the book id is already in
  # the metadata store, otherwise stores and returns the new book Hash.
  def scrape_new_book(tr_el)
    title_link = tr_el.xpath("td[@class='descr']//a").to_a[0]
    book_href = title_link.attribute("href").to_s
    # The book id is the first run of digits in the path of the book URL.
    book_id = URI.parse(book_href).path.slice(/[\d]+/).to_s
    return nil if @data.has_key?(book_id)

    book = Hash.new
    book[:id] = book_id
    book[:title] = title_link.text.to_s
    detail_page = @client.get(SITE_URL + book_href)
    book[:categories] = detail_page.parser
      .xpath("//div[@class='inline-catalog']/div[@class='path']")
      .collect { |c| c.xpath(".//a//text()").collect { |a| a.to_s } }
    book[:authors] = detail_page.parser
      .xpath("//td[@class='book-info']//p[@class='author']/a/text()")
      .collect { |a| a.to_s }
    book[:download_urls] = tr_el.xpath("td[@class='status']/a/@href").collect do |url|
      url.to_s.gsub('/member/download_agreement.php?back_url=', '')
    end
    @data[book_id] = book
    book
  end

  # Builds and creates the target directories for a book, one per category
  # path. Paths starting with 'Акции' are skipped and a leading 'книги'
  # component is dropped. A book left without any directory goes to
  # "Без категории" under the root directory.
  def category_dirs(book_data)
    category_paths = Array(book_data[:categories])
    # first(1) yields [] for a book without categories instead of [nil].
    category_paths = category_paths.first(1) if @short_dir

    dirs = category_paths
      .reject { |path| path[0] == 'Акции' }
      .collect do |path|
        path = path.last(1) if @short_dir
        path = path.drop(1) if path[0] == 'книги'
        File.join(@root_dir, path)
      end
    dirs = [File.join(@root_dir, "Без категории")] if dirs.empty?
    dirs.each { |dir| FileUtils.makedirs dir }
    dirs
  end

  # Builds the extension-less file name "<authors> - <title>" (or just the
  # title when there are no authors), made safe for use as one path component.
  def build_file_name(book_data)
    authors = ""
    unless book_data[:authors].empty?
      authors = "%s - " % book_data[:authors].join(", ")
    end
    # Plain interpolation: a '%' inside an author name must not be read as a
    # format directive.
    name = "#{authors}#{book_data[:title]}"
    name = name.gsub(' (файл PDF)', '')
    # '/' would split the name into directories; use the look-alike U+2215.
    name = name.gsub('/', "\u2215")
    smart_truncate_filename(name)
  end

  # Percent-decodes +url+ the way the removed URI.decode did.
  def unescape_url(url)
    @url_unescaper ||= (defined?(URI::RFC2396_Parser) ? URI::RFC2396_Parser : URI::Parser).new
    @url_unescaper.unescape(url)
  end

  # Returns the file size reported by a HEAD request for +download_url+.
  def remote_size(download_url)
    resp = @client.head download_url
    if resp.header.has_key? 'content-range'
      # NOTE(review): the 48-byte offset was flagged as unexplained ("WTF")
      # by the original author; it is kept unchanged.
      resp.header['content-range'].split('/')[1].to_i + 48 # WTF
    else
      resp.header['content-length'].to_i
    end
  end

  # Creates a relative symlink to d_root/file_name in each of +other_dirs+
  # unless something already occupies that path.
  def link_into_other_dirs(other_dirs, d_root, file_name)
    other_dirs.each do |dir|
      symlink_path = File.join(dir, file_name)
      # File.exist? follows links, so a dangling symlink must be detected
      # separately or File.symlink would fail on the occupied path.
      next if File.symlink?(symlink_path) || File.exist?(symlink_path)

      rel_dir = Pathname.new(d_root).relative_path_from(Pathname.new(dir))
      rel_path = File.join(rel_dir.to_s, file_name)
      File.symlink(rel_path, symlink_path)
      puts "SYMLINK #{rel_path} -> #{symlink_path}"
    end
  end
end
# Command-line entry point: parses the switches, checks that USERNAME and
# PASSWORD were given, then logs in, scrapes the orders and downloads the
# books.
options = {
  :root_dir => "/tmp/books_ru/",
  :short_dir_tree => false,
}
OptionParser.new do |opts|
  opts.banner = "Usage: books_ru.rb USERNAME PASSWORD"
  # The DIR placeholder makes the switch take an argument; without it the
  # block would receive `true` instead of the directory.
  opts.on("--root_dir DIR") do |value|
    options[:root_dir] = value
  end
  opts.on("--short_dirs") do
    options[:short_dir_tree] = true
  end
end.parse!
if ARGV.length < 2
  raise OptionParser::MissingArgument, "Require USERNAME and PASSWORD"
end

# Runs the downloader with the credentials left in ARGV after option parsing.
#
# options - Hash with :root_dir and :short_dir_tree. It is passed in
#           explicitly because a method body cannot see top-level locals.
#
# A third and fourth positional argument are still accepted as root
# directory and short-tree flag, as they were when ARGV was splatted into
# the constructor; a positional root directory wins over --root_dir.
def main(options = {})
  login, password, positional_root_dir, positional_short_dirs = ARGV
  root_dir = positional_root_dir || options.fetch(:root_dir, "/tmp/books_ru/")
  short_dir_tree = options.fetch(:short_dir_tree, false) || !positional_short_dirs.nil?

  downloader = BooksRuDownloader.new(login, password, root_dir, short_dir_tree)
  downloader.do_login
  downloader.collect_books_data
  downloader.download_books
end
main(options)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment