Apkawa · January 4, 2018 17:20
diff --git a/books_ru.rb b/books_ru.rb
 #!/usr/bin/env ruby
 # encoding=utf-8
 # = INSTALL =
 # sudo apt-get install ruby-mechanize
 # sudo gem install vcr
 require 'uri'
 require 'fileutils'
 require 'pathname'

 require 'vcr'
 require 'mechanize'
 require 'yaml'
 require 'yaml/dbm'

 require 'optparse'


 SITE_URL = "http://www.books.ru"

 # Shortens +filename+ so that it still fits a byte limit once a suffix
 # (such as a file extension) has been appended to it.
 #
 # The name is cut on character boundaries, never in the middle of a
 # multi-byte character, and the byte size of each character is counted
 # before it is kept, so the result never exceeds the budget.
 #
 # filename       - the name to shorten (returned unchanged when it fits).
 # max_bytes      - largest allowed size, in bytes, of name plus suffix.
 # reserved_bytes - bytes to keep free for the suffix added by the caller.
 #
 # Returns the longest prefix of +filename+ whose byte size is at most
 # max_bytes - reserved_bytes.
 def smart_truncate_filename(filename, max_bytes = 254, reserved_bytes = 4)
   budget = max_bytes - reserved_bytes
   return filename if filename.bytesize <= budget

   used = 0
   # Add the size of the candidate character first: checking only the size
   # accumulated so far lets a 2-4 byte character overshoot the budget.
   filename.each_char.take_while { |char| (used += char.bytesize) <= budget }.join
 end

 # Downloads the e-books of a books.ru account into a local directory tree.
 #
 # Book metadata is scraped from the account's order pages and cached in a
 # YAML::DBM store inside the root directory, so a book's detail page is
 # requested only the first time the book is seen. Files are saved into
 # directories named after the book's catalogue categories; additional
 # categories receive symlinks pointing at the first copy.
 class BooksRuDownloader
   # First path element of categories that never get a directory.
   PROMO_CATEGORY = 'Акции'
   # Leading category element that is dropped from directory paths.
   ROOT_CATEGORY = 'книги'
   # Directory (below the root) used when a book has no usable category.
   UNCATEGORIZED_DIR = "Без категории"
   # Added to the total taken from a Content-Range header before it is
   # compared with the local file size.
   # NOTE(review): the original marks this offset "WTF"; presumably it was
   # found empirically - confirm against the server before changing it.
   CONTENT_RANGE_SIZE_OFFSET = 48
   # URI.decode was removed in Ruby 3.0. It delegated to the RFC 2396
   # parser's #unescape, which is called directly here (Ruby >= 2.2).
   URL_UNESCAPER = URI::RFC2396_Parser.new

   # login, password - books.ru credentials.
   # root_dir        - directory receiving the books and the metadata cache;
   #                   created when missing.
   # short_dir_tree  - when truthy, only the first category is used and only
   #                   its last path element becomes the directory.
   def initialize(login, password, root_dir = '/tmp/books_ru/', short_dir_tree = false)
     puts root_dir, short_dir_tree
     @login = login
     @password = password
     @root_dir = root_dir
     @short_dir = short_dir_tree
     @client = Mechanize.new
     @client.max_file_buffer = 1024 * 1024 * 1024
     # Treat every response without a dedicated parser as a download.
     @client.pluggable_parser.default = Mechanize::Download
     FileUtils.makedirs @root_dir unless File.directory? @root_dir

     @yaml_file = File.join(@root_dir, "books.yaml")
     @data = YAML::DBM.new(@yaml_file)
   end

   # Submits the site's login form with the stored credentials.
   def do_login
     @client.get(URI.join(SITE_URL, "/member/login.php")) do |page|
       page.form_with :id => "loginform" do |login_form|
         login_form.login = @login
         login_form.password = @password
       end.click_button
     end
   end

   # Returns the href attributes of the rows with class "closed" on the
   # orders page.
   def orders_links
     order_page = @client.get(URI.join(SITE_URL, "/member/orders/"))
     order_page.parser.xpath("//tr[@class='closed']/td/a/@href")
   end

   # Visits every order returned by #orders_links and stores the metadata of
   # each book that is not in the cache yet.
   #
   # Returns an Array with the newly stored book hashes only; books that
   # were already cached, and rows without a title link, contribute nothing.
   def collect_books_data
     orders_links.flat_map do |link|
       order_detail_page = @client.get(link)
       rows = order_detail_page.parser.xpath(
         "//table[@class='catalog']/tbody/tr[not(@class='total')]"
       )
       rows.map { |tr_el| collect_book(tr_el) }
     end.compact
   end

   # Downloads every file of one book.
   #
   # book_data - Hash with the keys :categories (Array of category paths),
   #             :authors, :title and :download_urls, as stored by
   #             #collect_books_data.
   #
   # A file that already exists with the size reported by the server is
   # skipped. A failed download is reported and does not stop the run.
   def download_book book_data
     dirs = category_dirs(book_data)
     book_file_name = build_file_name(book_data)

     puts book_file_name

     book_data[:download_urls].each do |url|
       file_type = file_type_of(url)
       if file_type.nil?
         # Without a type there is no extension to build the name from;
         # report it like any other failed file instead of raising.
         puts "FAIL SAVE: no file_type in #{url}"
         next
       end

       file_name = book_file_name + '.' + file_type
       d_root = dirs[0]
       file_path = File.join(d_root, file_name)
       download_url = URI.join(SITE_URL, url)

       if File.exist?(file_path) && same_size_as_remote?(download_url, file_path)
         puts "SKIP #{book_file_name}"
       else
         puts "SAVE #{file_path}"
         begin
           @client.get(download_url).save(file_path)
         rescue StandardError => e
           # StandardError only: interrupts and exit requests must still
           # terminate the program.
           puts "FAIL SAVE: #{e}"
         end
       end

       link_into(dirs.drop(1), d_root, file_name)
     end
   end

   # Downloads the files of every book in the metadata cache.
   def download_books
     @data.values.each { |book_data| download_book book_data }
   end

   private

   # Builds and caches the metadata of the book shown in one order row.
   # Returns the new Hash, or nil when the row has no title link or the
   # book is already cached.
   def collect_book(tr_el)
     title_link = tr_el.xpath("td[@class='descr']//a").first
     return nil if title_link.nil?

     href = title_link['href'].to_s
     book_id = URI.parse(href).path.slice(/[\d]+/).to_s
     return nil if @data.has_key?(book_id)

     detail_page = @client.get(SITE_URL + href)
     book = {
       :id => book_id,
       :title => title_link.text.to_s,
       :categories => detail_page.parser
                                 .xpath("//div[@class='inline-catalog']/div[@class='path']")
                                 .map { |path| path.xpath(".//a//text()").map(&:to_s) },
       :authors => detail_page.parser
                              .xpath("//td[@class='book-info']//p[@class='author']/a/text()")
                              .map(&:to_s),
       :download_urls => tr_el.xpath("td[@class='status']/a/@href").map do |url|
         url.to_s.gsub('/member/download_agreement.php?back_url=', '')
       end,
     }
     @data[book_id] = book
     book
   end

   # Returns the (created) directories for a book, primary directory first.
   # Falls back to UNCATEGORIZED_DIR below the root when no category is
   # usable, so the result is never empty.
   def category_dirs(book_data)
     paths = book_data[:categories] || []
     # first(1) / last(1) return [] for empty input, unlike [list[0]] and
     # slice(-1, 1), which produced nil and crashed short-dir mode.
     paths = paths.first(1) if @short_dir

     dirs = paths
            .reject { |path| path[0] == PROMO_CATEGORY }
            .map do |path|
              path = path.last(1) if @short_dir
              path = path.drop(1) if path[0] == ROOT_CATEGORY
              File.join(@root_dir, *path)
            end
            .uniq
     dirs = [File.join(@root_dir, UNCATEGORIZED_DIR)] if dirs.empty?
     dirs.each { |dir| FileUtils.makedirs dir }
     dirs
   end

   # Returns the file name (without extension) for a book:
   # "<authors> - <title>", cleaned up and shortened.
   def build_file_name(book_data)
     authors = book_data[:authors] || []
     prefix = authors.empty? ? "" : "#{authors.join(', ')} - "
     # Plain interpolation: feeding the author list to String#% would make a
     # "%" inside an author name act as a format directive.
     name = "#{prefix}#{book_data[:title]}"
     name = name.gsub(' (файл PDF)', '').gsub('/', "\u2215")
     smart_truncate_filename(name)
   end

   # Returns the value of the file_type query parameter of a download URL,
   # or nil when the URL has none.
   def file_type_of(url)
     query = URI.parse(URL_UNESCAPER.unescape(url)).query
     Hash[URI.decode_www_form(query.to_s)]['file_type']
   end

   # Issues a HEAD request and tells whether the size it reports equals the
   # size of the local file.
   def same_size_as_remote?(download_url, file_path)
     resp = @client.head download_url
     size = if resp.header.has_key? 'content-range'
              resp.header['content-range'].split('/')[1].to_i + CONTENT_RANGE_SIZE_OFFSET
            else
              resp.header['content-length'].to_i
            end
     size == File.size?(file_path)
   end

   # Creates, in each of +dirs+, a relative symlink to file_name in d_root
   # unless something already occupies that path.
   def link_into(dirs, d_root, file_name)
     dirs.each do |dir|
       symlink_path = File.join(dir, file_name)
       # File.exist? follows links, so a dangling symlink also needs the
       # symlink? check or File.symlink would raise EEXIST.
       next if File.exist?(symlink_path) || File.symlink?(symlink_path)

       rel_dir = Pathname.new(d_root).relative_path_from(Pathname.new(dir)).to_s
       rel_path = File.join(rel_dir, file_name)
       File.symlink(rel_path, symlink_path)
       puts "SYMLINK #{rel_path} -> #{symlink_path}"
     end
   end
 end

 # Defaults for the command-line flags; the parser below overrides them.
 options = {
   :root_dir => "/tmp/books_ru/",
   :short_dir_tree => false,
 }

 OptionParser.new do |opts|
   opts.banner = "Usage: books_ru.rb USERNAME PASSWORD"
   # The DIR placeholder is what makes the flag take a value; without it
   # the block receives `true` instead of the path.
   opts.on("--root_dir DIR", "Directory to store the books in") do |value|
     options[:root_dir] = value
   end
   opts.on("--short_dirs", "Use only the last element of the first category") do
     options[:short_dir_tree] = true
   end
 end.parse!

 if ARGV.length < 2
   # The comma matters: `MissingArgument "..."` is parsed as a method call
   # on OptionParser and fails with NoMethodError.
   raise OptionParser::MissingArgument, "Require USERNAME and PASSWORD"
 end

 # Logs in, refreshes the metadata cache and downloads every cached book.
 #
 # options - Hash that may hold :root_dir and :short_dir_tree.
 #
 # USERNAME and PASSWORD are read from ARGV. A third and fourth positional
 # argument are still accepted as root directory and short-tree switch and
 # take precedence over +options+.
 def main(options = {})
   login, password, root_dir, short_dir_tree = ARGV
   root_dir ||= options.fetch(:root_dir, '/tmp/books_ru/')
   short_dir_tree ||= options.fetch(:short_dir_tree, false)

   downloader = BooksRuDownloader.new(login, password, root_dir, short_dir_tree)
   downloader.do_login
   downloader.collect_books_data
   downloader.download_books
 end

 main(options)
	#!/usr/bin/env ruby
	# encoding=utf-8
	# = INSTALL =
	# sudo apt-get install ruby-mechanize
	# sudo gem install vcr
	require 'uri'
	require 'fileutils'
	require 'pathname'

	require 'vcr'
	require 'mechanize'
	require 'yaml'
	require 'yaml/dbm'

	require 'optparse'


	SITE_URL = "http://www.books.ru"

	# Shortens +filename+ so that it still fits a byte limit once a suffix
	# (such as a file extension) has been appended to it.
	#
	# The name is cut on character boundaries, never in the middle of a
	# multi-byte character, and the byte size of each character is counted
	# before it is kept, so the result never exceeds the budget.
	#
	# filename       - the name to shorten (returned unchanged when it fits).
	# max_bytes      - largest allowed size, in bytes, of name plus suffix.
	# reserved_bytes - bytes to keep free for the suffix added by the caller.
	#
	# Returns the longest prefix of +filename+ whose byte size is at most
	# max_bytes - reserved_bytes.
	def smart_truncate_filename(filename, max_bytes = 254, reserved_bytes = 4)
	  budget = max_bytes - reserved_bytes
	  return filename if filename.bytesize <= budget

	  used = 0
	  # Add the size of the candidate character first: checking only the size
	  # accumulated so far lets a 2-4 byte character overshoot the budget.
	  filename.each_char.take_while { |char| (used += char.bytesize) <= budget }.join
	end

	# Downloads the e-books of a books.ru account into a local directory tree.
	#
	# Book metadata is scraped from the account's order pages and cached in a
	# YAML::DBM store inside the root directory, so a book's detail page is
	# requested only the first time the book is seen. Files are saved into
	# directories named after the book's catalogue categories; additional
	# categories receive symlinks pointing at the first copy.
	class BooksRuDownloader
	  # First path element of categories that never get a directory.
	  PROMO_CATEGORY = 'Акции'
	  # Leading category element that is dropped from directory paths.
	  ROOT_CATEGORY = 'книги'
	  # Directory (below the root) used when a book has no usable category.
	  UNCATEGORIZED_DIR = "Без категории"
	  # Added to the total taken from a Content-Range header before it is
	  # compared with the local file size.
	  # NOTE(review): the original marks this offset "WTF"; presumably it was
	  # found empirically - confirm against the server before changing it.
	  CONTENT_RANGE_SIZE_OFFSET = 48
	  # URI.decode was removed in Ruby 3.0. It delegated to the RFC 2396
	  # parser's #unescape, which is called directly here (Ruby >= 2.2).
	  URL_UNESCAPER = URI::RFC2396_Parser.new

	  # login, password - books.ru credentials.
	  # root_dir        - directory receiving the books and the metadata cache;
	  #                   created when missing.
	  # short_dir_tree  - when truthy, only the first category is used and only
	  #                   its last path element becomes the directory.
	  def initialize(login, password, root_dir = '/tmp/books_ru/', short_dir_tree = false)
	    puts root_dir, short_dir_tree
	    @login = login
	    @password = password
	    @root_dir = root_dir
	    @short_dir = short_dir_tree
	    @client = Mechanize.new
	    @client.max_file_buffer = 1024 * 1024 * 1024
	    # Treat every response without a dedicated parser as a download.
	    @client.pluggable_parser.default = Mechanize::Download
	    FileUtils.makedirs @root_dir unless File.directory? @root_dir

	    @yaml_file = File.join(@root_dir, "books.yaml")
	    @data = YAML::DBM.new(@yaml_file)
	  end

	  # Submits the site's login form with the stored credentials.
	  def do_login
	    @client.get(URI.join(SITE_URL, "/member/login.php")) do |page|
	      page.form_with :id => "loginform" do |login_form|
	        login_form.login = @login
	        login_form.password = @password
	      end.click_button
	    end
	  end

	  # Returns the href attributes of the rows with class "closed" on the
	  # orders page.
	  def orders_links
	    order_page = @client.get(URI.join(SITE_URL, "/member/orders/"))
	    order_page.parser.xpath("//tr[@class='closed']/td/a/@href")
	  end

	  # Visits every order returned by #orders_links and stores the metadata of
	  # each book that is not in the cache yet.
	  #
	  # Returns an Array with the newly stored book hashes only; books that
	  # were already cached, and rows without a title link, contribute nothing.
	  def collect_books_data
	    orders_links.flat_map do |link|
	      order_detail_page = @client.get(link)
	      rows = order_detail_page.parser.xpath(
	        "//table[@class='catalog']/tbody/tr[not(@class='total')]"
	      )
	      rows.map { |tr_el| collect_book(tr_el) }
	    end.compact
	  end

	  # Downloads every file of one book.
	  #
	  # book_data - Hash with the keys :categories (Array of category paths),
	  #             :authors, :title and :download_urls, as stored by
	  #             #collect_books_data.
	  #
	  # A file that already exists with the size reported by the server is
	  # skipped. A failed download is reported and does not stop the run.
	  def download_book book_data
	    dirs = category_dirs(book_data)
	    book_file_name = build_file_name(book_data)

	    puts book_file_name

	    book_data[:download_urls].each do |url|
	      file_type = file_type_of(url)
	      if file_type.nil?
	        # Without a type there is no extension to build the name from;
	        # report it like any other failed file instead of raising.
	        puts "FAIL SAVE: no file_type in #{url}"
	        next
	      end

	      file_name = book_file_name + '.' + file_type
	      d_root = dirs[0]
	      file_path = File.join(d_root, file_name)
	      download_url = URI.join(SITE_URL, url)

	      if File.exist?(file_path) && same_size_as_remote?(download_url, file_path)
	        puts "SKIP #{book_file_name}"
	      else
	        puts "SAVE #{file_path}"
	        begin
	          @client.get(download_url).save(file_path)
	        rescue StandardError => e
	          # StandardError only: interrupts and exit requests must still
	          # terminate the program.
	          puts "FAIL SAVE: #{e}"
	        end
	      end

	      link_into(dirs.drop(1), d_root, file_name)
	    end
	  end

	  # Downloads the files of every book in the metadata cache.
	  def download_books
	    @data.values.each { |book_data| download_book book_data }
	  end

	  private

	  # Builds and caches the metadata of the book shown in one order row.
	  # Returns the new Hash, or nil when the row has no title link or the
	  # book is already cached.
	  def collect_book(tr_el)
	    title_link = tr_el.xpath("td[@class='descr']//a").first
	    return nil if title_link.nil?

	    href = title_link['href'].to_s
	    book_id = URI.parse(href).path.slice(/[\d]+/).to_s
	    return nil if @data.has_key?(book_id)

	    detail_page = @client.get(SITE_URL + href)
	    book = {
	      :id => book_id,
	      :title => title_link.text.to_s,
	      :categories => detail_page.parser
	                                .xpath("//div[@class='inline-catalog']/div[@class='path']")
	                                .map { |path| path.xpath(".//a//text()").map(&:to_s) },
	      :authors => detail_page.parser
	                             .xpath("//td[@class='book-info']//p[@class='author']/a/text()")
	                             .map(&:to_s),
	      :download_urls => tr_el.xpath("td[@class='status']/a/@href").map do |url|
	        url.to_s.gsub('/member/download_agreement.php?back_url=', '')
	      end,
	    }
	    @data[book_id] = book
	    book
	  end

	  # Returns the (created) directories for a book, primary directory first.
	  # Falls back to UNCATEGORIZED_DIR below the root when no category is
	  # usable, so the result is never empty.
	  def category_dirs(book_data)
	    paths = book_data[:categories] || []
	    # first(1) / last(1) return [] for empty input, unlike [list[0]] and
	    # slice(-1, 1), which produced nil and crashed short-dir mode.
	    paths = paths.first(1) if @short_dir

	    dirs = paths
	           .reject { |path| path[0] == PROMO_CATEGORY }
	           .map do |path|
	             path = path.last(1) if @short_dir
	             path = path.drop(1) if path[0] == ROOT_CATEGORY
	             File.join(@root_dir, *path)
	           end
	           .uniq
	    dirs = [File.join(@root_dir, UNCATEGORIZED_DIR)] if dirs.empty?
	    dirs.each { |dir| FileUtils.makedirs dir }
	    dirs
	  end

	  # Returns the file name (without extension) for a book:
	  # "<authors> - <title>", cleaned up and shortened.
	  def build_file_name(book_data)
	    authors = book_data[:authors] || []
	    prefix = authors.empty? ? "" : "#{authors.join(', ')} - "
	    # Plain interpolation: feeding the author list to String#% would make a
	    # "%" inside an author name act as a format directive.
	    name = "#{prefix}#{book_data[:title]}"
	    name = name.gsub(' (файл PDF)', '').gsub('/', "\u2215")
	    smart_truncate_filename(name)
	  end

	  # Returns the value of the file_type query parameter of a download URL,
	  # or nil when the URL has none.
	  def file_type_of(url)
	    query = URI.parse(URL_UNESCAPER.unescape(url)).query
	    Hash[URI.decode_www_form(query.to_s)]['file_type']
	  end

	  # Issues a HEAD request and tells whether the size it reports equals the
	  # size of the local file.
	  def same_size_as_remote?(download_url, file_path)
	    resp = @client.head download_url
	    size = if resp.header.has_key? 'content-range'
	             resp.header['content-range'].split('/')[1].to_i + CONTENT_RANGE_SIZE_OFFSET
	           else
	             resp.header['content-length'].to_i
	           end
	    size == File.size?(file_path)
	  end

	  # Creates, in each of +dirs+, a relative symlink to file_name in d_root
	  # unless something already occupies that path.
	  def link_into(dirs, d_root, file_name)
	    dirs.each do |dir|
	      symlink_path = File.join(dir, file_name)
	      # File.exist? follows links, so a dangling symlink also needs the
	      # symlink? check or File.symlink would raise EEXIST.
	      next if File.exist?(symlink_path) || File.symlink?(symlink_path)

	      rel_dir = Pathname.new(d_root).relative_path_from(Pathname.new(dir)).to_s
	      rel_path = File.join(rel_dir, file_name)
	      File.symlink(rel_path, symlink_path)
	      puts "SYMLINK #{rel_path} -> #{symlink_path}"
	    end
	  end
	end

	# Defaults for the command-line flags; the parser below overrides them.
	options = {
	  :root_dir => "/tmp/books_ru/",
	  :short_dir_tree => false,
	}

	OptionParser.new do |opts|
	  opts.banner = "Usage: books_ru.rb USERNAME PASSWORD"
	  # The DIR placeholder is what makes the flag take a value; without it
	  # the block receives `true` instead of the path.
	  opts.on("--root_dir DIR", "Directory to store the books in") do |value|
	    options[:root_dir] = value
	  end
	  opts.on("--short_dirs", "Use only the last element of the first category") do
	    options[:short_dir_tree] = true
	  end
	end.parse!

	if ARGV.length < 2
	  # The comma matters: `MissingArgument "..."` is parsed as a method call
	  # on OptionParser and fails with NoMethodError.
	  raise OptionParser::MissingArgument, "Require USERNAME and PASSWORD"
	end

	# Logs in, refreshes the metadata cache and downloads every cached book.
	#
	# options - Hash that may hold :root_dir and :short_dir_tree.
	#
	# USERNAME and PASSWORD are read from ARGV. A third and fourth positional
	# argument are still accepted as root directory and short-tree switch and
	# take precedence over +options+.
	def main(options = {})
	  login, password, root_dir, short_dir_tree = ARGV
	  root_dir ||= options.fetch(:root_dir, '/tmp/books_ru/')
	  short_dir_tree ||= options.fetch(:short_dir_tree, false)

	  downloader = BooksRuDownloader.new(login, password, root_dir, short_dir_tree)
	  downloader.do_login
	  downloader.collect_books_data
	  downloader.download_books
	end

	main(options)