Skip to content

Instantly share code, notes, and snippets.

@naquad
Created October 20, 2011 00:17
Show Gist options
  • Save naquad/1300063 to your computer and use it in GitHub Desktop.
Save naquad/1300063 to your computer and use it in GitHub Desktop.
#!/usr/bin/ruby -w
# encoding: utf-8
require 'bundler/setup'
require 'nokogiri'
require 'httpclient'
require 'uri'
require 'ruby-debug'
require 'unicode_utils'
class Object
  # Sends +what+ to the receiver unless the receiver is +nil+ or +false+.
  #
  # A falsy receiver is handed back untouched (+nil.try(:x)+ is +nil+,
  # +false.try(:x)+ is +false+). Otherwise the message is dispatched with
  # +send+, so extra arguments and the block are forwarded and private
  # methods are reachable.
  def try(what, *args, &block)
    return self unless self

    send(what, *args, &block)
  end
end
class String
  # Mirror every method UnicodeUtils defines directly on itself as a String
  # instance method that passes the receiver as the first argument, i.e.
  # str.foo(*args) calls UnicodeUtils.foo(str, *args). A same-named method
  # already present on String is replaced by the mirrored one.
  UnicodeUtils.methods(false).each do |method|
    define_method method do |*args, &block|
      UnicodeUtils.send(method, self, *args, &block)
    end
  end

  # Whether escape! has been called on this string without a later unescape!.
  #
  # Always returns true or false. The flag is checked with
  # instance_variable_defined? first so that a string which was never marked
  # does not read an unset instance variable (which warns under `ruby -w` on
  # older interpreters) and does not leak nil to callers.
  def escaped?
    instance_variable_defined?(:@escaped) && @escaped
  end

  # Marks this string as already escaped and returns the string itself.
  def escape!
    @escaped = true
    self
  end

  # Clears the "already escaped" mark and returns the string itself.
  def unescape!
    @escaped = false
    self
  end
end
module HTTP
  class Message
    class << self
      # Keep the library's own implementation reachable under a new name.
      alias :real_escape :escape

      # Escapes +str+ for use in a query/body unless it has been marked as
      # already escaped (see String#escaped?), in which case its content is
      # passed through unchanged.
      #
      # The pass-through value is a copy rather than +str+ itself.
      # NOTE(review): httpclient's query builder appears to append to the
      # string returned by escape (`escape(key) << '='`); returning the
      # caller's own object would let that append modify the caller's string.
      # Objects that do not respond to escaped? go to the real escape.
      def escape(str)
        return real_escape(str) unless str.respond_to?(:escaped?) && str.escaped?

        str.dup
      end
    end
  end
end
class Kinozal < HTTPClient
attr_reader :response
DEFAULT_AGENT_NAME = 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100724 Firefox/3.6.8'
BASE_URL = 'http://kinozal.tv'
LOGIN_URL = 'http://kinozal.tv/takelogin.php'
BROWSE_URL = 'http://kinozal.tv/browse.php'
# Lookup table from a lower-cased field label (as it appears in bold on a
# torrent page) to the symbol used as the key in the parsed result.
#
# Several labels map to the same symbol on purpose (spelling variants,
# singular/plural forms). Some keys look misspelled; presumably they mirror
# typos present on the site, so they must be kept as they are.
# The table is frozen: it is read-only lookup data.
FIELD_TR = {
  'cdvd-привод' => :drive,
  'cd-привод' => :drive,
  'cубтитры' => :subtitles,
  'dvd-привод' => :drive,
  'автор' => :author,
  'автор и ведущий' => :author,
  'альбом' => :album,
  'аудио' => :audio,
  'аудиокарта' => :soundcard,
  'балет и оркестр' => :orchestra,
  'ведущая' => :spokesman,
  'ведущие' => :spokesman,
  'ведущий' => :spokesman,
  'версия' => :version,
  'видео' => :video,
  'видеокарта' => :videocard,
  'водяные знаки' => :watermarks,
  'в ролях' => :cast,
  'выпущено' => :produced,
  'глубина цвета' => :color,
  'год выпуска' => :year,
  'год выхода' => :year,
  'год издания' => :year,
  'год издания аудиокниги' => :year,
  'год издания аудио книги' => :year,
  'дата релиз' => :release_date,
  'дата релиза' => :release_date,
  'дата составления' => :release_date,
  'дирижер' => :conductor,
  'жанр' => :genre,
  'жюри' => :joury,
  'занимаемое место на жд' => :space,
  'игровая платформа' => :platform,
  'издатель' => :publisher,
  'издательство' => :publisher,
  'интерфейс' => :interface,
  'исполнители' => :artist,
  'исполнитель' => :artist,
  'качество' => :quality,
  'количество' => :amount,
  'количество страниц' => :pages,
  'комментатор' => :commenter,
  'комментаторы' => :commenter,
  'композитор' => :composer,
  'название' => :title,
  'озвучивает' => :voice,
  'операционная система' => :platform,
  'оригинальное название' => :original_title,
  'орининальное название' => :original_title,
  'оркестр' => :orchestra,
  'память' => :memory,
  'перевод' => :translation,
  'продолжительность' => :duration,
  'производство' => :publisher,
  'процессор' => :cpu,
  'размер' => :size,
  'размеры изображений' => :image_size,
  'размеры листа' => :page_size,
  'разработчик' => :developer,
  'разрешение' => :resolution,
  'режиссер' => :director,
  'режиссёр' => :director,
  'релиз' => :release_date,
  'роли дублировали' => :dub,
  'роли озвучивали' => :dub,
  'свободное место на жд' => :space,
  'серия' => :episode,
  'составитель' => :releaser,
  'страна' => :country,
  'субтиры' => :subtitles,
  'субтитры' => :subtitles,
  'текст читает' => :spokesman,
  'формат' => :format,
  'формат 3d' => :format,
  'хор' => :chorus,
  'язык' => :language,
  'язяк' => :language,
}.freeze
# Constructs the client by passing every argument and the block straight to
# the superclass, then applies DEFAULT_AGENT_NAME when no agent name ended up
# being configured.
def initialize(*args, &block)
  super
  return if agent_name

  self.agent_name = DEFAULT_AGENT_NAME
end
# Issues a POST through #request with the query slot fixed to nil, so the
# extra positional arguments line up as (body, extheader). The block, if any,
# is forwarded as well.
def post(uri, *args, &block)
  forwarded = [uri, nil, *args]
  request(:post, *forwarded, &block)
end
# Posts the credentials to LOGIN_URL (with BASE_URL as Referer) and treats
# anything other than a 302 response as a failed login.
#
# Returns nil on success; raises RuntimeError otherwise.
def login(login, password)
  credentials = { username: login, password: password, returnto: '' }
  post(LOGIN_URL, credentials, { Referer: BASE_URL })
  return if @response.code == 302

  raise RuntimeError, "Login failed! CAPTCHA?"
end
# Resolves +u+ against BASE_URL with URI.join and returns the resulting
# address as a String.
def base(u)
  absolute = URI.join(BASE_URL, u)
  absolute.to_s
end
# Walks the torrent listing starting at +url+ (BROWSE_URL by default) and
# follows the "next page" link until there is none.
#
# For every listing row a Hash is built with :category_id, :category, :title,
# :url, :comments, :downloaded, :seeders and :peers. Each Hash is yielded to
# the block when one is given, and all of them are returned in an Array.
#
# Category names are read from the category <select> the first time a page
# containing it is seen and cached in a class variable. If no such page has
# been seen, :category is nil instead of raising. A page without the results
# table contributes no rows.
def latest_torrents(url = nil)
  url ||= BROWSE_URL
  ret = []
  while url
    get(url)
    parser = html_parser

    if !defined?(@@categories) && (cats = parser.at_css('select[name="c"]'))
      @@categories = Hash[cats.css('option').map { |x| [x[:value], x.text] }]
    end
    # Fall back to an empty map so a missing <select> cannot raise NameError.
    categories = defined?(@@categories) ? @@categories : {}

    listing = parser.css('table.mn table.mn2').last
    # The first <tr> is skipped (the original code treats it as a header row).
    rows = listing ? listing.css('tr').drop(1) : []
    rows.each do |row|
      cells = row.css('td')
      result = {}
      result[:category_id] = cells[0].at_css('a/@href').value.match(/c=(\d+)/)[1]
      result[:category] = categories[result[:category_id]]
      # Keep only the part of the title before the first "/".
      result[:title] = cells[1].text.strip.gsub(/\s*\/.*/, '')
      result[:url] = base(cells[1].at_css('a/@href').value)
      result[:comments] = cells[2].text.strip.to_i
      result[:downloaded] = cells[5].text.strip.to_i
      result[:seeders] = cells[6].text.strip.to_i
      result[:peers] = cells[7].text.strip.to_i
      yield result if block_given?
      ret << result
    end

    # A nil link ends the loop.
    next_link = parser.at_xpath('//a[text()="вперед" and contains(@href, "browse")]/@href')
    url = next_link ? base(next_link.value) : nil
  end
  ret
end
OK_CODES = [301, 302, 200]

# Performs the HTTP request through the superclass and remembers the response
# in @response.
#
# * Clears the cached charset and HTML parser first.
# * Repeats the request while the status is 404 or 503, for at most five
#   attempts in total.
# * When the content type is text/* with a charset, tries to re-encode the
#   body to UTF-8 in place; if that fails, the declared charset is remembered
#   in @charset instead.
# * Raises RuntimeError unless the final status is one of OK_CODES.
#
# Returns the response.
def request(method, uri, query = nil, body = nil, extheader = {}, &block)
  @charset = nil
  @html_parser = nil

  attempts = 0
  retryable = true
  while retryable
    @response = super
    attempts += 1
    retryable = attempts < 5 && [404, 503].include?(@response.code)
  end

  if @response.contenttype =~ /text\/[^;]+;\s*charset=(.+)/i
    declared = Regexp.last_match(1)
    begin
      converted = @response.content.encode('utf-8', declared)
      # Overwrite the body stored inside the response object.
      @response.http_body.instance_variable_set(:@body, converted)
      @charset = 'UTF-8'
    rescue
      @charset = declared
    end
  end

  unless OK_CODES.include? @response.code
    raise RuntimeError, "#{method.to_s.upcase} request to #{uri} failed: #{@response.code}"
  end

  @response
end
# Returns a Nokogiri HTML document for the current response, building it on
# first use and caching it in @html_parser.
#
# Yields nil when there is no response, or when the response is not text/html
# and +force+ is false. A cached parser is returned regardless of +force+.
def html_parser(force = false)
  return @html_parser if @html_parser
  return unless @response
  return unless @response.contenttype =~ /text\/html/i || force

  @html_parser = Nokogiri::HTML(@response.content, nil, @charset)
end
# Extracts "label: value" pairs from +table+ into +result+ and returns
# +table+.
#
# The node is rewritten in place first: every <br> becomes a newline and every
# <b> becomes a "(%{...})" marker around its text. Each resulting text line is
# then split at its first marker; the label (trailing colon removed,
# lower-cased) is looked up in FIELD_TR and unknown labels are ignored. Any
# markers left inside the value are turned back into <b>...</b>.
def parse_section(result, table)
  table.css('br').each { |node| node.replace "\n" }
  table.css('b').each { |node| node.replace("(%{#{node.text}})") }

  lines = table.text.split("\n")
  lines.each do |line|
    pieces = line.strip.split(/\s*\(%\{([^}]+)\}\)\s*/, 2).grep(/\S/)
    label, content = pieces
    next unless content

    name = label.strip.gsub(/\s*:$$/, '').downcase
    next unless FIELD_TR.key?(name)

    restored = content.gsub(/\(%\{([^}]+)\}\)/, '<b>\1</b>')
    result[FIELD_TR[name]] = restored.strip
  end

  table
end
# Fetches the torrent details page at +url+ and scrapes it into a Hash.
#
# The result carries :url, :download_url and :cover, plus whatever fields
# parse_section recognises in the two data sections, plus :description.
# :screenshots (a Hash of link href => image src) is present only when the
# page has a screenshots tab and that tab yields at least one image link.
#
# NOTE(review): every lookup assumes its element exists; a missing download
# link, cover image or expected sibling ends in NoMethodError on nil.
def parse_torrent(url)
get(url)
result = { url: url }
parser = html_parser
download_anchor = parser.at_xpath('//a[contains(@href, "download.php")]')
result[:download_url] = base(download_anchor['href'])
result[:cover] = base(parser.at_xpath('//a[contains(@href, "details.php")]/img/@src').value)
# Climb from the download link up to the <table> that encloses it.
begin
download_anchor = download_anchor.parent
end while download_anchor.name != 'table'
# Screenshots are fetched with a separate request further down. Its address is
# the get_srv_details.php prefix found in the raw page followed by the number
# taken from the showtab(N) onclick handler of the "Скриншоты" link.
if scn = @response.content.match(/(\/get_srv_details\.php\?id=\d+&torrent_size=\d+&pagesd=)/) and tn = @html_parser.at_xpath('//a[@onclick and text()="Скриншоты"]/@onclick')
result[:screenshots] = scn[1] + tn.value.match(/showtab\((\d+)\)/)[1]
end
parse_section(result, base_data = download_anchor.next.next.next) # the markup has no ids or classes to anchor on, so sections are found by sibling position
# Two siblings further on: the node whose first <div> holds the description.
base_data = base_data.next.next
result[:description] = normalize_description(base_data.at_css('div'))
# Three siblings further on: the node containing the #tabs element.
base_data = base_data.next.next.next
parse_section(result, base_data.at_css('#tabs'))
if result[:screenshots]
# request is called directly so the Referer header can be supplied; it also
# resets the cached parser, so html_parser below reads the new response.
request(:get, base(result[:screenshots]), nil, nil, { Referer: url })
result[:screenshots] = {}
html_parser.xpath('//a[img]').each do |anchor|
result[:screenshots][anchor['href']] = anchor.at_xpath('.//img/@src').value
end
# Drop the key entirely when the tab contained no image links.
result.delete(:screenshots) if result[:screenshots].empty?
end
result
end
# Flattens the description node +div+ into plain text with <b> tags kept.
#
# A single <b> in the node is removed outright; when there are several (or
# none) each one is swapped for a "])text([" marker. <br> elements become
# newlines, the node's text is taken, and the markers are turned back into
# <b>text</b>. The result is stripped of surrounding whitespace.
def normalize_description(div)
  bold_nodes = div.css('b')
  case bold_nodes.length
  when 1
    bold_nodes.remove
  else
    bold_nodes.each do |node|
      node.replace("])#{node.text.strip}([")
    end
  end

  div.css('br').each { |line_break| line_break.replace("\n") }

  flattened = div.text
  flattened.gsub(/\]\)(.+?)\(\[/, '<b>\1</b>').strip
end
end
# Demo driver: log in, walk the latest torrents and pretty-print the parsed
# details of each one. Guarded so that it only runs when this file is executed
# directly; loading the file from other code no longer triggers a login and a
# crawl as a side effect.
if __FILE__ == $0
  require 'ap'

  client = Kinozal.new
  client.login 'xxxx', 'xxxx' # placeholder credentials
  client.latest_torrents do |item|
    puts "!!!! #{item[:url]}"
    ap client.parse_torrent(item[:url])
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment