alexkojin · March 8, 2017 04:00
diff --git a/parser.rb b/parser.rb
 # redtube.com crawler
 #
 # Declares, via the `scrape` macro, every field read from a video page.
 # Plain strings are CSS selectors, two-element arrays are
 # [selector, attribute] pairs, and lambdas receive the parser instance.
 module Crawler::Redtube
   class Parser < Crawler::BaseParser
     # The id is the last path segment of the page URL.
     scrape :video_id, lambda { |page| page.url.split('/').last.to_i }
     scrape :title, '.videoTitle'
     # Falls back to the <span> when the author cell holds no link.
     scrape :author_name,
            '.video-details tr:eq(1) td:eq(2) a',
            alt: '.video-details tr:eq(1) td:eq(2) span'
     scrape :views, '.video-details tr:eq(2) td:eq(2)', type: :integer
     scrape :rating_percent, '.percent-likes', type: :integer
     scrape :duration, '.video-duration'
     scrape :uploaded_at, '.added-time', remove: /ADDED/, type: :date
     scrape :tags, '.video-details tr:eq(4) td.links a', array: true
     scrape :categories, '.video-details tr:eq(1) td.links a', array: true
     scrape :pornstars, '.video-details tr:eq(2) td.links a:not(.add-pornstar-plus)', array: true
     # HD when either the tags or the categories list contains 'HD';
     # categories are only consulted when the tags do not match.
     scrape :hd, lambda { |page|
       %i[tags categories].any? { |field| page.public_send(field).include?('HD') }
     }
     scrape :embed_url, ["meta[name='twitter:player']", :content]
     scrape :embed_code, lambda { |page|
       %(<iframe src="#{page.embed_url}&bgcolor=000000" frameborder="0" width="560" height="315" scrolling="no" allowfullscreen></iframe>)
     }
     scrape :cover_url, ['#redtube_flv_player noscript video', :poster]
     scrape :thumb_url, ["meta[property='og:image']", :content]
     scrape :comments_count, '.comments-btn', remove: /\D/, type: :integer
     # One hash per `.comment` node: its id, the id it replies to (nil when
     # the node has no `replyTo...` class), the author name and the body HTML.
     scrape :comments, lambda { |page|
       page.doc.css('.comment').map do |node|
         reply_marker = node[:class].split(/\s/).find { |css_class| css_class.start_with?('replyTo') }
         author_node = node.at_css('.user-date a') || node.at_css('.user-date span')

         {
           comment_id: node[:id].to_s.gsub(/comment_/, '').to_i,
           reply_to_id: reply_marker.try(:gsub, 'replyTo', '').try(:to_i),
           author_name: author_node.text,
           body: node.at_css('.comment-content').inner_html
         }
       end
     }

     # `append:` puts the host in front of each scraped href.
     scrape :related_videos, ['.video-listing a', :href], append: 'https://www.redtube.com', array: true
   end
 end

 # Base class
 #
 # Fetches a page and exposes scraped fields. Subclasses declare fields with
 # the `scrape` macro; each declaration becomes an instance method and
 # #to_hash collects all of them. Uses HTTParty and Nokogiri, plus `blank?`,
 # `to_date` and `to_time`, which are not Ruby core methods (presumably
 # ActiveSupport core extensions loaded by the application).
 module Crawler
   class BaseParser
     attr_reader :url

     # Declares a scraped field: defines the instance method +name+ and
     # registers it for #to_hash on the declaring class.
     #
     # name          - Symbol naming the reader method.
     # query_or_proc - CSS selector String, [selector, attribute] Array, or a
     #                 Proc that is called with the parser instance.
     # alt:          - fallback of the same shape, used when the main lookup
     #                 is blank.
     # type:         - :integer, :date, :time or :boolean; anything else
     #                 leaves the String untouched.
     # remove:       - pattern deleted (gsub) from each String result.
     # append:       - String placed IN FRONT of each String result, despite
     #                 the option's name.
     # array:        - when true, collect every matching node, not the first.
     #
     # The generated method re-raises NoMethodError with the field name and
     # query in the message.
     def self.scrape(name, query_or_proc, alt: nil, type: :string, remove: nil, append: nil, array: false)
       define_method(name) do
         begin
           result = resolve_source(query_or_proc, array)

           # run the alternative query if the main one returns nil or an empty array
           result = resolve_source(alt, array) if alt && result.blank?

           # convert and sanitize
           case result
           when String
             convert_to_type(sanitize(result, remove, append), type)
           when Array
             # Only the first element decides whether this is a list of
             # strings; any other array is passed through unconverted.
             if result.first.kind_of?(String)
               result = result.map { |item| convert_to_type(sanitize(item, remove, append), type) }
             end

             # make array items unique
             result.uniq
           else
             result
           end
         rescue NoMethodError => e
           # Same exception class, but the message names the failing field;
           # the original backtrace is kept so the real call site is visible.
           raise NoMethodError, "#{name} [#{query_or_proc}] : #{e.message}", e.backtrace
         end
       end

       own_scrape_names << name unless own_scrape_names.include?(name)
     end

     # Field names exported by #to_hash for this class: :url, then the names
     # declared by ancestors, then the names declared on this class.
     def self.scrape_names
       inherited = superclass.respond_to?(:scrape_names) ? superclass.scrape_names : [:url]
       inherited | own_scrape_names
     end

     # Names declared directly on this class. Stored per class (not in a
     # class variable) so sibling parsers do not see each other's fields.
     def self.own_scrape_names
       @own_scrape_names ||= []
     end
     private_class_method :own_scrape_names

     def initialize(url)
       @url = url
     end

     # HTTP response for #url, fetched on first use and then memoized.
     def response
       @response ||= HTTParty.get(@url)
     end

     # Parsed HTML document built from the response body, memoized.
     def doc
       @doc ||= Nokogiri::HTML(response.body)
     end

     # Returns the inner HTML (or the +attr+ attribute) of the first node
     # matching +query+, or nil when nothing matches.
     def get(query, attr = nil)
       element = doc.at_css(query)
       return unless element

       attr ? element[attr] : element.inner_html
     end

     # Returns the inner HTML (or the +attr+ attribute) of every node
     # matching +query+.
     def get_array(query, attr = nil)
       elements = doc.css(query)
       if attr
         elements.map { |el| el[attr] }
       else
         elements.map(&:inner_html)
       end
     end

     # Cleans up a scraped String without modifying the argument: deletes
     # everything matching +remove+, puts +append+ in front of the value,
     # then strips leading and trailing whitespace.
     def sanitize(value, remove = nil, append = nil)
       # non-mutating gsub: the caller's string (possibly frozen) is left alone
       value = value.gsub(remove, '') if remove

       # despite the name, the string goes in front of the value
       value = append + value if append

       value.strip
     end

     # convert to special type
     def convert_to_type(value, type)
       case type
       when :integer
         # drop non-breaking spaces, whitespace and thousands separators
         value.gsub(/\u{a0}|\s|,/, '').to_i
       when :date
         value.to_date
       when :time
         value.to_time
       when :boolean
         value.downcase == 'true'
       else
         value
       end
     end

     # Hash of :url plus every scraped field registered for this class.
     def to_hash
       self.class.scrape_names.each_with_object({}) { |name, hash| hash[name] = send(name) }
     end

     private

     # Runs one lookup: a Proc is called with the parser, anything else is
     # treated as a CSS selector or a [selector, attribute] pair.
     def resolve_source(source, array)
       return source.call(self) if source.kind_of?(Proc)

       # splat so that a [selector, attribute] pair arrives as two arguments
       array ? get_array(*source) : get(*source)
     end
   end
 end
	# redtube.com crawler
	#
	# Declares, via the `scrape` macro, every field read from a video page.
	# Plain strings are CSS selectors, two-element arrays are
	# [selector, attribute] pairs, and lambdas receive the parser instance.
	module Crawler::Redtube
	  class Parser < Crawler::BaseParser
	    # The id is the last path segment of the page URL.
	    scrape :video_id, ->(i) { i.url.split('/').reverse.first.to_i }
	    scrape :title, '.videoTitle'
	    # Falls back to the <span> when the author cell holds no link.
	    scrape :author_name, '.video-details tr:eq(1) td:eq(2) a', alt: '.video-details tr:eq(1) td:eq(2) span'
	    scrape :views, '.video-details tr:eq(2) td:eq(2)', type: :integer
	    scrape :rating_percent, '.percent-likes', type: :integer
	    scrape :duration, '.video-duration'
	    scrape :uploaded_at, '.added-time', remove: /ADDED/, type: :date
	    scrape :tags, '.video-details tr:eq(4) td.links a', array: true
	    scrape :categories, '.video-details tr:eq(1) td.links a', array: true
	    scrape :pornstars, '.video-details tr:eq(2) td.links a:not(.add-pornstar-plus)', array: true
	    # HD when either the tags or the categories list contains 'HD'.
	    scrape :hd, ->(i) { i.tags.include?('HD') || i.categories.include?('HD') }
	    scrape :embed_url, ["meta[name='twitter:player']", :content]
	    scrape :embed_code, ->(i) { %(<iframe src="#{i.embed_url}&bgcolor=000000" frameborder="0" width="560" height="315" scrolling="no" allowfullscreen></iframe>) }
	    scrape :cover_url, ['#redtube_flv_player noscript video', :poster]
	    scrape :thumb_url, ["meta[property='og:image']", :content]
	    scrape :comments_count, '.comments-btn', remove: /\D/, type: :integer
	    # One hash per `.comment` node: its id, the id it replies to (nil when
	    # the node has no `replyTo...` class), the author name and the body HTML.
	    scrape :comments, ->(i) {
	      i.doc.css('.comment').map do |comment|
	        {
	          comment_id: comment[:id].to_s.gsub(/comment_/, '').to_i,
	          reply_to_id: comment[:class].split(/\s/).select { |c| c.starts_with?('replyTo') }.first.try(:gsub, 'replyTo', '').try(:to_i),
	          author_name: (comment.at_css('.user-date a') || comment.at_css('.user-date span')).text,
	          body: comment.at_css('.comment-content').inner_html
	        }
	      end
	    }

	    # `append:` puts the host in front of each scraped href.
	    scrape :related_videos, ['.video-listing a', :href], append: 'https://www.redtube.com', array: true
	  end
	end

	# Base class
	#
	# Fetches a page and exposes scraped fields. Subclasses declare fields with
	# the `scrape` macro; each declaration becomes an instance method and
	# #to_hash collects all of them. Uses HTTParty and Nokogiri, plus `blank?`,
	# `to_date` and `to_time`, which are not Ruby core methods (presumably
	# ActiveSupport core extensions loaded by the application).
	module Crawler
	  class BaseParser
	    attr_reader :url

	    # Declares a scraped field: defines the instance method +name+ and
	    # registers it for #to_hash on the declaring class.
	    #
	    # name          - Symbol naming the reader method.
	    # query_or_proc - CSS selector String, [selector, attribute] Array, or a
	    #                 Proc that is called with the parser instance.
	    # alt:          - fallback of the same shape, used when the main lookup
	    #                 is blank.
	    # type:         - :integer, :date, :time or :boolean; anything else
	    #                 leaves the String untouched.
	    # remove:       - pattern deleted (gsub) from each String result.
	    # append:       - String placed IN FRONT of each String result, despite
	    #                 the option's name.
	    # array:        - when true, collect every matching node, not the first.
	    #
	    # The generated method re-raises NoMethodError with the field name and
	    # query in the message.
	    def self.scrape(name, query_or_proc, alt: nil, type: :string, remove: nil, append: nil, array: false)
	      define_method(name) do
	        begin
	          result = resolve_source(query_or_proc, array)

	          # run the alternative query if the main one returns nil or an empty array
	          result = resolve_source(alt, array) if alt && result.blank?

	          # convert and sanitize
	          case result
	          when String
	            convert_to_type(sanitize(result, remove, append), type)
	          when Array
	            # Only the first element decides whether this is a list of
	            # strings; any other array is passed through unconverted.
	            if result.first.kind_of?(String)
	              result = result.map { |item| convert_to_type(sanitize(item, remove, append), type) }
	            end

	            # make array items unique
	            result.uniq
	          else
	            result
	          end
	        rescue NoMethodError => e
	          # Same exception class, but the message names the failing field;
	          # the original backtrace is kept so the real call site is visible.
	          raise NoMethodError, "#{name} [#{query_or_proc}] : #{e.message}", e.backtrace
	        end
	      end

	      own_scrape_names << name unless own_scrape_names.include?(name)
	    end

	    # Field names exported by #to_hash for this class: :url, then the names
	    # declared by ancestors, then the names declared on this class.
	    def self.scrape_names
	      inherited = superclass.respond_to?(:scrape_names) ? superclass.scrape_names : [:url]
	      inherited | own_scrape_names
	    end

	    # Names declared directly on this class. Stored per class (not in a
	    # class variable) so sibling parsers do not see each other's fields.
	    def self.own_scrape_names
	      @own_scrape_names ||= []
	    end
	    private_class_method :own_scrape_names

	    def initialize(url)
	      @url = url
	    end

	    # HTTP response for #url, fetched on first use and then memoized.
	    def response
	      @response ||= HTTParty.get(@url)
	    end

	    # Parsed HTML document built from the response body, memoized.
	    def doc
	      @doc ||= Nokogiri::HTML(response.body)
	    end

	    # Returns the inner HTML (or the +attr+ attribute) of the first node
	    # matching +query+, or nil when nothing matches.
	    def get(query, attr = nil)
	      element = doc.at_css(query)
	      return unless element

	      attr ? element[attr] : element.inner_html
	    end

	    # Returns the inner HTML (or the +attr+ attribute) of every node
	    # matching +query+.
	    def get_array(query, attr = nil)
	      elements = doc.css(query)
	      if attr
	        elements.map { |el| el[attr] }
	      else
	        elements.map(&:inner_html)
	      end
	    end

	    # Cleans up a scraped String without modifying the argument: deletes
	    # everything matching +remove+, puts +append+ in front of the value,
	    # then strips leading and trailing whitespace.
	    def sanitize(value, remove = nil, append = nil)
	      # non-mutating gsub: the caller's string (possibly frozen) is left alone
	      value = value.gsub(remove, '') if remove

	      # despite the name, the string goes in front of the value
	      value = append + value if append

	      value.strip
	    end

	    # convert to special type
	    def convert_to_type(value, type)
	      case type
	      when :integer
	        # drop non-breaking spaces, whitespace and thousands separators
	        value.gsub(/\u{a0}|\s|,/, '').to_i
	      when :date
	        value.to_date
	      when :time
	        value.to_time
	      when :boolean
	        value.downcase == 'true'
	      else
	        value
	      end
	    end

	    # Hash of :url plus every scraped field registered for this class.
	    def to_hash
	      self.class.scrape_names.each_with_object({}) { |name, hash| hash[name] = send(name) }
	    end

	    private

	    # Runs one lookup: a Proc is called with the parser, anything else is
	    # treated as a CSS selector or a [selector, attribute] pair.
	    def resolve_source(source, array)
	      return source.call(self) if source.kind_of?(Proc)

	      # splat so that a [selector, attribute] pair arrives as two arguments
	      array ? get_array(*source) : get(*source)
	    end
	  end
	end