# redtube.com crawler
module Crawler::Redtube
  class Parser < Crawler::BaseParser
    scrape :video_id, ->(i) { i.url.split('/').reverse.first.to_i }
    scrape :title, '.videoTitle'
    scrape :author_name, '.video-details tr:eq(1) td:eq(2) a', alt: '.video-details tr:eq(1) td:eq(2) span'
    scrape :views, '.video-details tr:eq(2) td:eq(2)', type: :integer
    scrape :rating_percent, '.percent-likes', type: :integer
    scrape :duration, '.video-duration'
    scrape :uploaded_at, '.added-time', remove: /ADDED/, type: :date
    scrape :tags, '.video-details tr:eq(4) td.links a', array: true
    scrape :categories, '.video-details tr:eq(1) td.links a', array: true
    scrape :pornstars, '.video-details tr:eq(2) td.links a:not(.add-pornstar-plus)', array: true
    scrape :hd, ->(i) { i.tags.include?('HD') || i.categories.include?('HD') }
    scrape :embed_url, ["meta[name='twitter:player']", :content]
    scrape :embed_code, ->(i) { %(<iframe src="#{i.embed_url}&bgcolor=000000" frameborder="0" width="560" height="315" scrolling="no" allowfullscreen></iframe>) }
    scrape :cover_url, ['#redtube_flv_player noscript video', :poster]
    scrape :thumb_url, ["meta[property='og:image']", :content]
    scrape :comments_count, '.comments-btn', remove: /\D/, type: :integer
    scrape :comments, ->(i) {
      i.doc.css('.comment').map do |comment|
        {
          comment_id: comment[:id].to_s.gsub(/comment_/, '').to_i,
          # .to_s guards against comments without a class attribute
          reply_to_id: comment[:class].to_s.split(/\s/).select { |c| c.start_with?('replyTo') }.first.try(:gsub, 'replyTo', '').try(:to_i),
          author_name: (comment.at_css('.user-date a') || comment.at_css('.user-date span')).text,
          body: comment.at_css('.comment-content').inner_html
        }
      end
    }
    scrape :related_videos, ['.video-listing a', :href], append: 'https://www.redtube.com', array: true
  end
end
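
# Example usage (a sketch: assumes HTTParty, Nokogiri and ActiveSupport are
# loaded as in the base class below, and that the page markup still matches
# the selectors above; the URL is made up):
#
#   parser = Crawler::Redtube::Parser.new('https://www.redtube.com/123456')
#   parser.title     # => inner HTML of '.videoTitle'
#   parser.hd        # => true/false
#   parser.to_hash   # => { url: ..., video_id: 123456, title: ..., ... }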
# Base class
require 'httparty'
require 'nokogiri'
require 'active_support/all' # blank?, try, to_date, to_time come from ActiveSupport

module Crawler
  class BaseParser
    attr_reader :url

    # Defines an instance method `name` that runs a CSS query (or a Proc)
    # against the parsed page, falls back to `alt` when the main query comes
    # back blank, then sanitizes and type-casts the result.
    def self.scrape(name, query_or_proc, alt: nil, type: :string, remove: nil, append: nil, array: false)
      define_method(name) do
        begin
          result = if query_or_proc.kind_of?(Proc)
                     query_or_proc.call(self)
                   else
                     array ? get_array(*query_or_proc) : get(*query_or_proc)
                   end
          # run the alternative query if the main one returns nil or an empty array
          if result.blank? && alt
            result = if alt.kind_of?(Proc)
                       alt.call(self)
                     else
                       array ? get_array(*alt) : get(*alt)
                     end
          end
          # convert and sanitize
          if result.kind_of?(String)
            convert_to_type(sanitize(result, remove, append), type)
          elsif result.kind_of?(Array)
            if !result.empty? && result[0].kind_of?(String)
              result = result.map { |i| convert_to_type(sanitize(i, remove, append), type) }
            end
            # drop duplicate array items
            result.uniq
          else
            result
          end
        rescue NoMethodError => e
          raise NoMethodError, "#{name} [#{query_or_proc}] : #{e.message}"
        end
      end
      # NOTE: @@scrape_names is a class variable, so it is shared by every
      # BaseParser subclass; to_hash lists names declared by all of them.
      @@scrape_names ||= [:url]
      @@scrape_names << name
    end

    def initialize(url)
      @url = url
    end

    def response
      @response ||= HTTParty.get(@url)
    end

    def doc
      @doc ||= Nokogiri::HTML(response.body)
    end

    def get(query, attr = nil)
      element = doc.at_css(query)
      return unless element
      attr ? element[attr] : element.inner_html
    end

    def get_array(query, attr = nil)
      elements = doc.css(query)
      if attr
        elements.map { |el| el[attr] }
      else
        elements.map(&:inner_html)
      end
    end

    def sanitize(value, remove = nil, append = nil)
      # strip out unwanted characters
      value = value.gsub(remove, '') if remove
      # prepend a string (e.g. a host for relative URLs)
      value = append + value if append
      # trim surrounding whitespace
      value.strip
    end

    # convert to a specific type
    def convert_to_type(value, type)
      case type
      when :integer
        # drop spaces, non-breaking spaces and thousands separators first
        value.gsub(/\u{a0}|\s|,/, '').to_i
      when :date
        value.to_date
      when :time
        value.to_time
      when :boolean
        value.downcase == 'true'
      else
        value
      end
    end

    def to_hash
      @@scrape_names.reduce({}) { |hash, name| hash[name] = send(name); hash }
    end
  end
end
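
# Sketch of how the scrape options compose, with hypothetical selectors and
# markup (not part of the crawler above):
#
#   scrape :price, '.price', remove: /[A-Za-z ]/, type: :integer
#   #   "1,234 credits" --sanitize--> "1,234" --convert_to_type--> 1234
#
#   scrape :links, ['a.item', :href], append: 'https://example.com', array: true
#   #   collects every matching href, prefixes the host, de-duplicates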