Created
March 8, 2017 04:00
-
-
Save alexkojin/5ab9ce9622641d9ec32e1db0f21d0c58 to your computer and use it in GitHub Desktop.
Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# redtube.com crawler
module Crawler::Redtube
  # Field declarations for a redtube.com video page.
  #
  # Every `scrape` line defines an instance method of the same name (see
  # Crawler::BaseParser.scrape). String arguments are CSS selectors, a
  # two-element array is [selector, attribute], and a lambda receives the
  # parser instance and computes the value itself.
  class Parser < Crawler::BaseParser
    # The numeric id is the last path segment of the page URL.
    scrape :video_id, ->(i) { i.url.split('/').last.to_i }
    scrape :title, '.videoTitle'
    # The author is rendered as a link when available, otherwise as a plain span.
    scrape :author_name, '.video-details tr:eq(1) td:eq(2) a', alt: '.video-details tr:eq(1) td:eq(2) span'
    scrape :views, '.video-details tr:eq(2) td:eq(2)', type: :integer
    scrape :rating_percent, '.percent-likes', type: :integer
    scrape :duration, '.video-duration'
    scrape :uploaded_at, '.added-time', remove: /ADDED/, type: :date
    scrape :tags, '.video-details tr:eq(4) td.links a', array: true
    scrape :categories, '.video-details tr:eq(1) td.links a', array: true
    scrape :pornstars, '.video-details tr:eq(2) td.links a:not(.add-pornstar-plus)', array: true
    # A video counts as HD when either its tags or its categories list 'HD'.
    scrape :hd, ->(i) { i.tags.include?('HD') || i.categories.include?('HD') }
    scrape :embed_url, ["meta[name='twitter:player']", :content]
    scrape :embed_code, ->(i) { %(<iframe src="#{i.embed_url}&bgcolor=000000" frameborder="0" width="560" height="315" scrolling="no" allowfullscreen></iframe>) }
    scrape :cover_url, ['#redtube_flv_player noscript video', :poster]
    scrape :thumb_url, ["meta[property='og:image']", :content]
    scrape :comments_count, '.comments-btn', remove: /\D/, type: :integer
    # Builds one hash per `.comment` node. Missing author/body nodes yield nil
    # for that field instead of aborting the whole comment list.
    scrape :comments, ->(i) {
      i.doc.css('.comment').map do |comment|
        # A reply carries a CSS class of the form "replyTo<parent id>".
        reply_class = comment[:class].split(/\s/).find { |c| c.start_with?('replyTo') }
        author = comment.at_css('.user-date a') || comment.at_css('.user-date span')
        content = comment.at_css('.comment-content')
        {
          comment_id: comment[:id].to_s.gsub(/comment_/, '').to_i,
          reply_to_id: reply_class && reply_class.gsub('replyTo', '').to_i,
          author_name: author && author.text,
          body: content && content.inner_html
        }
      end
    }
    # hrefs are prefixed with the site origin (the `append:` option prepends).
    scrape :related_videos, ['.video-listing a', :href], append: 'https://www.redtube.com', array: true
  end
end
# Base class
module Crawler
  # Declarative base class for page parsers.
  #
  # Subclasses list the fields they extract with the `scrape` macro. Each
  # declaration becomes a public instance method, and `to_hash` collects the
  # values of all declared fields plus `url`. The page is fetched with HTTParty
  # and parsed with Nokogiri the first time `doc` is needed.
  class BaseParser
    attr_reader :url

    # Defines an instance method `name` that extracts one field.
    #
    # @param name [Symbol] name of the generated method / hash key
    # @param query_or_proc [String, Array, Proc] a CSS selector, a
    #   [selector, attribute] pair, or a callable that receives the parser
    # @param alt [String, Array, Proc, nil] fallback query, used when the main
    #   query yields a blank result
    # @param type [Symbol] target type passed to `convert_to_type`
    # @param remove [Regexp, String, nil] pattern deleted from string results
    # @param append [String, nil] text placed IN FRONT of string results
    #   (despite the name, the value is prepended)
    # @param array [Boolean] collect every match instead of the first one
    # @raise [NoMethodError] re-raised from the generated method with the
    #   field name and query added to the message
    def self.scrape(name, query_or_proc, alt: nil, type: :string, remove: nil, append: nil, array: false)
      define_method(name) do
        begin
          result = run_query(query_or_proc, array)
          # run alternative query if a main returns nil or empty array
          result = run_query(alt, array) if alt && result.blank?
          normalize(result, type, remove, append)
        rescue NoMethodError => e
          # Same error class and message as before, but keep the original backtrace.
          raise NoMethodError, "#{name} [#{query_or_proc}] : #{e.message}", e.backtrace
        end
      end
      own_scrape_names << name
    end

    # Field names visible to this class: `:url`, then fields declared on
    # ancestors, then fields declared on this class, without duplicates.
    # Sibling parser classes do not see each other's fields.
    #
    # @return [Array<Symbol>]
    def self.scrape_names
      inherited = superclass.respond_to?(:scrape_names) ? superclass.scrape_names : [:url]
      (inherited + own_scrape_names).uniq
    end

    # Fields declared directly on this class (class-level instance variable,
    # so every subclass gets its own list).
    def self.own_scrape_names
      @own_scrape_names ||= []
    end
    private_class_method :own_scrape_names

    # @param url [String] address of the page to parse
    def initialize(url)
      @url = url
    end

    # Memoized HTTP response for `url`.
    def response
      @response ||= HTTParty.get(@url)
    end

    # Memoized Nokogiri document built from the response body.
    def doc
      @doc ||= Nokogiri::HTML(response.body)
    end

    # First element matching `query`: its attribute `attr` when given,
    # otherwise its inner HTML. Returns nil when nothing matches.
    def get(query, attr = nil)
      element = doc.at_css(query)
      return unless element

      attr ? element[attr] : element.inner_html
    end

    # Like `get`, but returns one entry for every element matching `query`.
    def get_array(query, attr = nil)
      elements = doc.css(query)
      if attr
        elements.map { |el| el[attr] }
      else
        elements.map(&:inner_html)
      end
    end

    # Cleans a scraped string without modifying the string passed in.
    #
    # @param value [String]
    # @param remove [Regexp, String, nil] pattern to delete
    # @param append [String, nil] text to put in front of the value
    # @return [String] new string with surrounding whitespace stripped
    def sanitize(value, remove = nil, append = nil)
      # remove any signs
      value = value.gsub(remove, '') if remove
      # prepend a string
      value = append + value if append
      # remove leading and trailing spaces
      value.strip
    end

    # convert to special type
    #
    # `:date` and `:time` rely on String#to_date / String#to_time, which plain
    # Ruby strings do not provide (presumably ActiveSupport is loaded).
    def convert_to_type(value, type)
      case type
      when :integer
        # remove whitespace, non-breaking spaces and thousands separators
        value.gsub(/\u{a0}|\s|,/, '').to_i
      when :date
        value.to_date
      when :time
        value.to_time
      when :boolean
        value.downcase == 'true'
      else
        value
      end
    end

    # @return [Hash{Symbol => Object}] every field of this parser class
    def to_hash
      self.class.scrape_names.each_with_object({}) do |name, hash|
        hash[name] = public_send(name)
      end
    end

    private

    # Evaluates a callable with the parser, or runs a CSS query on the document.
    def run_query(query, array)
      return query.call(self) if query.kind_of?(Proc)

      array ? get_array(*query) : get(*query)
    end

    # convert and sanitize: strings are cleaned and converted, arrays are
    # processed per element and made unique, anything else is returned as is.
    def normalize(result, type, remove, append)
      case result
      when String
        convert_to_type(sanitize(result, remove, append), type)
      when Array
        # Decide per element so nils or hashes in the array never reach sanitize.
        cleaned = result.map do |item|
          item.kind_of?(String) ? convert_to_type(sanitize(item, remove, append), type) : item
        end
        cleaned.uniq
      else
        result
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment