Created
October 20, 2011 00:17
-
-
Save naquad/1300063 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby -w | |
# encoding: utf-8 | |
require 'bundler/setup' | |
require 'nokogiri' | |
require 'httpclient' | |
require 'uri' | |
require 'ruby-debug' | |
require 'unicode_utils' | |
class Object
  # Calls +what+ on the receiver unless the receiver is falsy.
  #
  # For +nil+ and +false+ the receiver itself is returned and nothing is
  # sent; for every other object the call is forwarded with the given
  # arguments and block. +send+ is used, so private methods are reachable.
  #
  # @param what [Symbol, String] name of the method to invoke
  # @param args [Array] positional arguments forwarded to the method
  # @return [Object, nil, false] the method's result, or the falsy receiver
  def try(what, *args, &block)
    self && send(what, *args, &block)
  end
end
class String
  # Expose every module-level function of UnicodeUtils as a String instance
  # method, passing the receiver as the first argument. A function whose name
  # matches an existing String method replaces that method.
  UnicodeUtils.methods(false).each do |name|
    define_method(name) do |*args, &block|
      UnicodeUtils.send(name, self, *args, &block)
    end
  end

  # Whether this particular string object has been flagged by #escape!.
  #
  # The flag lives in a per-object instance variable, so it is checked with
  # +instance_variable_defined?+ first: strings that were never flagged
  # answer +false+ without touching an uninitialized variable.
  #
  # @return [Boolean]
  def escaped?
    instance_variable_defined?(:@escaped) && @escaped ? true : false
  end

  # Flags the string as already escaped.
  #
  # @return [String] self, to allow chaining
  def escape!
    @escaped = true
    self
  end

  # Clears the "already escaped" flag.
  #
  # @return [String] self, to allow chaining
  def unescape!
    @escaped = false
    self
  end
end
module HTTP
  class Message
    class << self
      # Keep a handle on the library's escape routine. The guard makes the
      # patch idempotent: aliasing a second time would point real_escape at
      # the wrapper below and make it call itself forever.
      alias_method :real_escape, :escape unless method_defined?(:real_escape)

      # Escapes +str+ unless it has been flagged with String#escape!.
      #
      # Flagged strings are passed through unescaped, but as a copy, so a
      # caller that appends to the returned value cannot modify the original.
      # Objects that do not respond to +escaped?+ go straight to the original
      # implementation.
      #
      # @param str [String] value to escape
      # @return [String] escaped value, or a copy of the pre-escaped input
      def escape(str)
        return real_escape(str) unless str.respond_to?(:escaped?) && str.escaped?

        str.dup
      end
    end
  end
end
class Kinozal < HTTPClient | |
attr_reader :response | |
# User-Agent string used when none has been configured.
DEFAULT_AGENT_NAME = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100724 Firefox/3.6.8'

# Site root; relative links are resolved against it.
BASE_URL = 'http://kinozal.tv'
# Endpoint the login form is posted to.
LOGIN_URL = "#{BASE_URL}/takelogin.php"
# First page of the torrent listing.
BROWSE_URL = "#{BASE_URL}/browse.php"

# Maps a lower-cased field label found on a torrent page to the symbol used
# as the key in the parsed result. Several labels map to the same symbol, and
# a few keys are deliberate misspellings ('субтиры', 'язяк',
# 'орининальное название') so that mistyped labels are still recognized.
# NOTE(review): the key 'cубтитры' appears to start with a Latin "c" — keep it
# byte-for-byte. :joury looks like a typo of :jury but is part of the result
# contract, so it is left unchanged.
# Frozen because it is a read-only lookup table.
FIELD_TR = {
  'cdvd-привод' => :drive,
  'cd-привод' => :drive,
  'cубтитры' => :subtitles,
  'dvd-привод' => :drive,
  'автор' => :author,
  'автор и ведущий' => :author,
  'альбом' => :album,
  'аудио' => :audio,
  'аудиокарта' => :soundcard,
  'балет и оркестр' => :orchestra,
  'ведущая' => :spokesman,
  'ведущие' => :spokesman,
  'ведущий' => :spokesman,
  'версия' => :version,
  'видео' => :video,
  'видеокарта' => :videocard,
  'водяные знаки' => :watermarks,
  'в ролях' => :cast,
  'выпущено' => :produced,
  'глубина цвета' => :color,
  'год выпуска' => :year,
  'год выхода' => :year,
  'год издания' => :year,
  'год издания аудиокниги' => :year,
  'год издания аудио книги' => :year,
  'дата релиз' => :release_date,
  'дата релиза' => :release_date,
  'дата составления' => :release_date,
  'дирижер' => :conductor,
  'жанр' => :genre,
  'жюри' => :joury,
  'занимаемое место на жд' => :space,
  'игровая платформа' => :platform,
  'издатель' => :publisher,
  'издательство' => :publisher,
  'интерфейс' => :interface,
  'исполнители' => :artist,
  'исполнитель' => :artist,
  'качество' => :quality,
  'количество' => :amount,
  'количество страниц' => :pages,
  'комментатор' => :commenter,
  'комментаторы' => :commenter,
  'композитор' => :composer,
  'название' => :title,
  'озвучивает' => :voice,
  'операционная система' => :platform,
  'оригинальное название' => :original_title,
  'орининальное название' => :original_title,
  'оркестр' => :orchestra,
  'память' => :memory,
  'перевод' => :translation,
  'продолжительность' => :duration,
  'производство' => :publisher,
  'процессор' => :cpu,
  'размер' => :size,
  'размеры изображений' => :image_size,
  'размеры листа' => :page_size,
  'разработчик' => :developer,
  'разрешение' => :resolution,
  'режиссер' => :director,
  'режиссёр' => :director,
  'релиз' => :release_date,
  'роли дублировали' => :dub,
  'роли озвучивали' => :dub,
  'свободное место на жд' => :space,
  'серия' => :episode,
  'составитель' => :releaser,
  'страна' => :country,
  'субтиры' => :subtitles,
  'субтитры' => :subtitles,
  'текст читает' => :spokesman,
  'формат' => :format,
  'формат 3d' => :format,
  'хор' => :chorus,
  'язык' => :language,
  'язяк' => :language,
}.freeze
# Builds the client exactly like the parent class does, then falls back to
# DEFAULT_AGENT_NAME when no agent name is set afterwards.
#
# NOTE(review): if the parent constructor always assigns some agent name,
# the fallback below never fires — confirm against the installed httpclient.
def initialize(*args, &block)
  super(*args, &block)
  return if agent_name

  self.agent_name = DEFAULT_AGENT_NAME
end
# Issues a POST through #request.
#
# The +nil+ placed after +uri+ occupies #request's +query+ parameter, so the
# remaining arguments line up with +body+ and +extheader+.
#
# @param uri [String] target URL
# @param args [Array] forwarded as body and extra headers
# @return whatever #request returns
def post(uri, *args, &block)
  forwarded = [uri, nil, *args]
  request(:post, *forwarded, &block)
end
# Posts the credentials to LOGIN_URL and checks the outcome.
#
# A 302 response is treated as success; any other status raises.
#
# @param login [String] account name
# @param password [String] account password
# @return [nil]
# @raise [RuntimeError] when the response status is not 302
def login(login, password)
  credentials = {
    username: login,
    password: password,
    returnto: ''
  }
  headers = { Referer: BASE_URL }

  post(LOGIN_URL, credentials, headers)
  return if @response.code == 302

  raise RuntimeError, "Login failed! CAPTCHA?"
end
# Resolves a possibly relative link against BASE_URL.
#
# Links taken from HTML attributes may carry leading or trailing whitespace,
# which URI.join rejects, so String arguments are stripped first. Any other
# argument is handed to URI.join untouched.
#
# @param u [String, URI] relative or absolute link
# @return [String] absolute URL
def base(u)
  link = u.is_a?(String) ? u.strip : u
  URI.join(BASE_URL, link).to_s
end
# Walks the torrent listing page by page, starting at +url+ (BROWSE_URL by
# default) and following the "вперед" link until there is none.
#
# Each listing row becomes a hash with the keys :category_id, :category,
# :title, :url, :comments, :downloaded, :seeders and :peers. Rows are yielded
# to the block (if given) as they are parsed and also collected into the
# returned array.
#
# The first row of the listing table is skipped (the original code skipped
# index 0 as well). Rows that do not have the expected cells or links are
# ignored, and a page without a listing table contributes no rows.
#
# @param url [String, nil] page to start from
# @yieldparam item [Hash] one parsed listing row
# @return [Array<Hash>] every parsed row, in page order
def latest_torrents(url = nil)
  url ||= BROWSE_URL
  collected = []

  while url
    get(url)
    page = html_parser
    break unless page

    # Category id => name, read once from the filter drop-down and reused.
    @categories ||= begin
      selector = page.at_css('select[name="c"]')
      selector && Hash[selector.css('option').map { |option| [option[:value], option.text] }]
    end

    listing = page.css('table.mn table.mn2').last
    rows = listing ? listing.css('tr').drop(1) : []
    rows.each do |row|
      item = parse_browse_row(row.css('td'))
      next unless item

      yield item if block_given?
      collected << item
    end

    next_link = page.at_xpath('//a[text()="вперед" and contains(@href, "browse")]/@href')
    url = next_link && base(next_link.value)
  end

  collected
end

# Turns the cells of one listing row into a result hash, or returns nil when
# the row lacks the expected cells or links.
def parse_browse_row(cells)
  return nil if cells.length < 8

  category_link = cells[0].at_css('a/@href')
  details_link = cells[1].at_css('a/@href')
  category_id = category_link && category_link.value[/c=(\d+)/, 1]
  return nil unless category_id && details_link

  {
    category_id: category_id,
    category: (@categories || {})[category_id],
    # Keep only the part of the title before the first "/".
    title: cells[1].text.strip.gsub(/\s*\/.*/, ''),
    url: base(details_link.value),
    comments: cells[2].text.strip.to_i,
    downloaded: cells[5].text.strip.to_i,
    seeders: cells[6].text.strip.to_i,
    peers: cells[7].text.strip.to_i
  }
end
private :parse_browse_row
# Status codes that count as a successful request.
OK_CODES = [301, 302, 200].freeze
# Status codes that trigger another attempt.
RETRYABLE_CODES = [404, 503].freeze
# Upper bound on attempts per request, the first one included.
MAX_REQUEST_ATTEMPTS = 5

# Performs the request through the parent class and post-processes the reply.
#
# * Clears the cached charset and parsed document from the previous request.
# * Repeats the request while the status is 404 or 503, up to
#   MAX_REQUEST_ATTEMPTS attempts in total.
# * For text/* responses that declare a charset, re-encodes the body to UTF-8
#   in place. If that fails, the declared charset is remembered instead so
#   the HTML parser can be told about it.
# * Stores the response in @response.
#
# @return the response object
# @raise [RuntimeError] when the final status is not one of OK_CODES
def request(method, uri, query = nil, body = nil, extheader = {}, &block)
  @charset = nil
  @html_parser = nil

  attempts = 0
  loop do
    @response = super(method, uri, query, body, extheader, &block)
    attempts += 1
    break unless attempts < MAX_REQUEST_ATTEMPTS && RETRYABLE_CODES.include?(@response.code)
  end

  # Take only the charset token: optional quotes are dropped and anything
  # after a following ";" or whitespace is ignored.
  declared = @response.contenttype.to_s[/text\/[^;]+;\s*charset=["']?([^;"'\s]+)/i, 1]
  if declared
    begin
      # The body object exposes no setter here, hence the direct ivar write.
      @response.http_body.instance_variable_set(:@body, @response.content.encode('utf-8', declared))
      @charset = 'UTF-8'
    rescue StandardError
      # Best effort: leave the body as received and remember its charset.
      @charset = declared
    end
  end

  unless OK_CODES.include?(@response.code)
    raise RuntimeError, "#{method.to_s.upcase} request to #{uri} failed: #{@response.code}"
  end

  @response
end
# Returns the parsed document for the current response, parsing it on first
# use and caching it in @html_parser.
#
# Nothing is parsed (and nil is returned) when there is no response yet, or
# when the response is not text/html and +force+ is false.
#
# @param force [Boolean] parse even if the content type is not text/html
# @return [Nokogiri::HTML::Document, nil]
def html_parser(force = false)
  return @html_parser if @html_parser
  return nil unless @response
  return nil unless @response.contenttype =~ /text\/html/i || force

  @html_parser = Nokogiri::HTML(@response.content, nil, @charset)
end
# Extracts "label: value" pairs from a block of the torrent page into +result+.
#
# The node is rewritten in place first: every <br> becomes a newline and
# every <b> becomes a "(%{text})" marker. Each resulting line is then split
# at its first marker; when that yields a label and a value, and the label
# (trailing colon removed, lower-cased) is a key of FIELD_TR, the value is
# stored under the mapped symbol. Markers left inside the value are turned
# back into <b>…</b>.
#
# @param result [Hash] hash that receives the parsed fields
# @param table [#css, #text, nil] node to read; nil is tolerated
# @return the +table+ argument
def parse_section(result, table)
  # A missing section simply contributes no fields.
  return table if table.nil?

  table.css('br').each { |br| br.replace("\n") }
  table.css('b').each { |bold| bold.replace("(%{#{bold.text}})") }

  table.text.split("\n").each do |line|
    label, value = line.strip.split(/\s*\(%\{([^}]+)\}\)\s*/, 2).grep(/\S/)
    next unless value

    field = FIELD_TR[label.strip.sub(/\s*:\z/, '').downcase]
    next unless field

    result[field] = value.gsub(/\(%\{([^}]+)\}\)/, '<b>\1</b>').strip
  end

  table
end
# Fetches a torrent details page and extracts its data.
#
# The result always contains :url and :download_url, plus :description and
# whatever fields #parse_section recognizes. :cover is present when a cover
# image is found, and :screenshots (a hash of link href => thumbnail src)
# when the screenshots tab exists and lists at least one image.
#
# @param url [String] address of the details page
# @return [Hash]
# @raise [RuntimeError] when the page is not HTML, has no download link, or
#   the link is not inside a table
def parse_torrent(url)
  get(url)
  page = html_parser
  raise RuntimeError, "No HTML document returned for #{url}" unless page

  result = { url: url }

  download_anchor = page.at_xpath('//a[contains(@href, "download.php")]')
  raise RuntimeError, "Download link not found on #{url}" unless download_anchor

  result[:download_url] = base(download_anchor['href'])

  cover = page.at_xpath('//a[contains(@href, "details.php")]/img/@src')
  result[:cover] = base(cover.value) if cover

  # The page carries no ids or classes on its blocks, so the nearest table
  # around the download link is used as the anchor for sibling navigation.
  layout = download_anchor.ancestors('table').first
  raise RuntimeError, "Unexpected page layout on #{url}" unless layout

  # Work out the screenshots address now, while @response is still the
  # details page: the path is embedded in the raw page source and the tab
  # number comes from the onclick handler of the "Скриншоты" link.
  path_match = @response.content.match(/(\/get_srv_details\.php\?id=\d+&torrent_size=\d+&pagesd=)/)
  tab_handler = path_match && page.at_xpath('//a[@onclick and text()="Скриншоты"]/@onclick')
  tab_number = tab_handler && tab_handler.value[/showtab\((\d+)\)/, 1]
  screenshots_path = tab_number && (path_match[1] + tab_number)

  # NOTE(review): the sibling hops below mirror the original navigation and
  # presumably depend on the exact node sequence (including whitespace text
  # nodes) of the live page — verify if the site markup changes.
  details = layout.next.next.next
  parse_section(result, details)

  description_block = details.next.next
  result[:description] = normalize_description(description_block.at_css('div'))

  tabs_block = description_block.next.next.next
  parse_section(result, tabs_block.at_css('#tabs'))

  if screenshots_path
    request(:get, base(screenshots_path), nil, nil, { Referer: url })
    gallery = html_parser
    shots = {}
    if gallery
      gallery.xpath('//a[img]').each do |anchor|
        shots[anchor['href']] = anchor.at_xpath('.//img/@src').value
      end
    end
    result[:screenshots] = shots unless shots.empty?
  end

  result
end
# Flattens the description block to text, keeping bold runs as <b>…</b>.
#
# The node is modified in place. When it contains exactly one <b>, that
# element is removed; otherwise each <b> is replaced by its stripped text
# wrapped in "])" / "([" markers. Every <br> becomes a newline, and the
# markers are finally turned into <b>…</b> tags in the extracted text.
#
# The marker pattern allows empty and multi-line bold text, so such runs are
# converted too rather than leaving raw markers in the output.
#
# @param div [#css, #text, nil] description node; nil is tolerated
# @return [String, nil] normalized description, or nil when +div+ is nil
def normalize_description(div)
  return nil if div.nil?

  bolds = div.css('b')
  if bolds.length == 1
    bolds.remove
  else
    bolds.each { |bold| bold.replace("])#{bold.text.strip}([") }
  end

  div.css('br').each { |br| br.replace("\n") }
  div.text.gsub(/\]\)(.*?)\(\[/m, '<b>\1</b>').strip
end
end | |
# Demo driver: logs in, walks the listing and pretty-prints every torrent.
# Runs only when the file is executed directly, so loading it from another
# script does not trigger any network traffic.
if __FILE__ == $PROGRAM_NAME
  require 'ap'

  client = Kinozal.new
  # Credentials come from the environment; 'xxxx' is the original placeholder.
  client.login(ENV.fetch('KINOZAL_LOGIN', 'xxxx'), ENV.fetch('KINOZAL_PASSWORD', 'xxxx'))

  client.latest_torrents do |item|
    puts "!!!! #{item[:url]}"
    ap client.parse_torrent(item[:url])
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment