Ruin0x11 · September 20, 2019 02:16
diff --git a/scrape.rb b/scrape.rb
 require 'mechanize'
 require 'nokogiri'
 require 'json'
 require 'pp'

 class String
   # Returns the text enclosed by the first two occurrences of +marker+
   # (newlines included), or nil when the string does not contain two markers.
   # The marker is matched literally, never as a regular expression.
   def between(marker)
     delimiter = Regexp.escape(marker)
     found = match(/#{delimiter}(.*?)#{delimiter}/m)
     found && found[1]
   end
 end

 # Scraper for the Native Instruments Reaktor User Library.
 #
 # Logs in, walks the paginated "latest" listing, saves every download it
 # finds and writes a "<saved file>.json" sidecar with the metadata scraped
 # from the entry's detail page.
 class Reaktor
   BASE = "https://www.native-instruments.com"
   LOGIN = "#{BASE}/typo3conf/ext/ni_account/login.php?api_path=auth/token"
   URL = "#{BASE}/en/reaktor-community/reaktor-user-library"
   # Credentials posted to LOGIN; they are empty here and must be filled in.
   USERNAME = ""
   PASSWORD = ""
   # Page size used to turn the result count into a number of listing pages.
   # NOTE(review): presumes the site lists 15 entries per page - confirm.
   PER_PAGE = 15
   # Upper bound on attempts per listing entry before it is skipped.
   MAX_ATTEMPTS = 100

   # Creates the Mechanize agent shared by every request.
   def initialize
     @agent = Mechanize.new
     @agent.user_agent_alias = 'Mac Safari'
   end

   # Returns the URL of listing page number +id+ (pages are requested from 1).
   def page_url(id)
     "#{URL}/all/all/all/all/all/latest/#{id}/all/"
   end

   # Posts the credentials as JSON and stores the returned token in the
   # agent's cookie jar as the "access-token" cookie.
   #
   # Raises RuntimeError when the response carries no token.
   def login
     payload = { username: USERNAME, password: PASSWORD }.to_json
     response = @agent.post LOGIN, payload, { 'Content-Type' => 'application/json' }
     token = JSON.parse(response.body).dig("response_body", "access", "token")
     # Fail with a readable message instead of a NoMethodError on nil.
     raise "Login failed: no access token in response" if token.nil?

     cookie = Mechanize::Cookie.new :domain => '.native-instruments.com', :name => 'access-token', :value => token, :path => '/', :expires => (Date.today + 1).to_s
     @agent.cookie_jar << cookie
     puts "Logged in."
   end

   # Entry point: logs in, reads the total result count from the library
   # front page and processes every listing page in order.
   def go
     login
     front = @agent.get(URL)
     # `strip`, not `strip!`: the bang form returns nil when there is nothing
     # to trim, which would make the following `split` raise.
     results = Integer(front.at(".info-result").text.strip.split(" ")[0])
     # Ceiling division, so an exact multiple of PER_PAGE does not produce a
     # request for one page past the end.
     page_count = (results + PER_PAGE - 1) / PER_PAGE
     1.upto(page_count) do |id|
       page = @agent.get(page_url(id))
       puts "===== Page #{id} ====="
       download(page)
     end
   end

   # Downloads every entry found on the listing +page+.
   #
   # Each entry is retried up to MAX_ATTEMPTS times on any StandardError; an
   # entry that keeps failing is reported on stderr and skipped.
   def download(page)
     items = page.search(".//li[contains(@class, 'item-box')]")

     items.each do |item|
       attempts = 0
       begin
         # The download path is the first single-quoted string in the entry's script tag.
         path = item.at("script").text.between("'")
         file = @agent.get(BASE + path)
         # NOTE(review): an "index.html" response is treated as "session no
         # longer valid" - log in again and retry the entry.
         if file.filename == "index.html"
           login
           raise "Got index.html instead of a download; logged in again"
         end
         puts "> #{file.filename}"
         next if File.file?(file.filename)

         # Scrape the metadata before saving: if it fails, the retry must not
         # find the file already on disk and skip the entry without a sidecar.
         details = get_details(item)
         filename = file.save
         File.write("#{filename}.json", JSON.pretty_generate(details), encoding: 'UTF-8')
       rescue StandardError => e
         attempts += 1
         retry if attempts < MAX_ATTEMPTS
         warn "Giving up on entry after #{attempts} attempts: #{e.class}: #{e.message}"
       end
     end
   end

   # Fetches the detail page linked from listing entry +i+ and returns a Hash
   # of its metadata (link, name, headline, description, rating, rating_count,
   # downloads, author, version, created, made_with, category, tags, comments).
   def get_details(i)
     url = BASE + i.at_css('div.description-title a')['href']
     detail = @agent.get(url)

     {
       link: url,
       name: detail.at("div.detail-headline h2").text,
       headline: detail.at("div.detail-headline h3").text,
       description: detail.at("div.detail-description p").text.strip,
       rating: detail.at('div#rating')["data-average"],
       rating_count: detail.at('span#vote-number').text,
       downloads: detail.at("span.download-count").text.strip,
       author: get_detail(detail, "Author"),
       version: get_detail(detail, "Version"),
       created: get_detail(detail, "Created"),
       made_with: get_detail(detail, "Made with"),
       category: get_array_detail(detail, "Category"),
       tags: get_array_detail(detail, "Tags"),
       comments: get_comments(detail)
     }
   end

   # Returns the text that follows the "<name>:" label inside the detail-info
   # box, or "" when the label is missing or has no value.
   def get_detail(detail, name)
     label = detail.search(".//div[contains(@class, 'detail-info')]").at("label:contains('#{name}:')")
     return "" if label.nil?

     # Split on the first colon only so values that contain ':' stay intact;
     # `to_s` covers a label with nothing after it.
     label.parent.text.strip.split(":", 2)[1].to_s.strip
   end

   # Like get_detail, but returns the value as an Array with one stripped,
   # non-empty element per line.
   def get_array_detail(detail, name)
     get_detail(detail, name).split("\n").map(&:strip).reject(&:empty?)
   end

   # Returns an Array of { author:, body:, time: } Hashes, one per comment row
   # on the detail page except the last row.
   def get_comments(detail)
     comments = detail.css("div.comment.row").map do |c|
       {
         author: c.at("div.author").text.strip,
         # The body is read from the second sibling after div.clear.
         body: c.at("div.clear").next_sibling.next_sibling.text.strip,
         time: c.at("span.time").text.strip
       }
     end

     # NOTE(review): the final row is always discarded - presumably it is not
     # a real comment; confirm against the page markup.
     comments.pop
     comments
   end
 end

 # Script entry point: build the scraper and run the full crawl.
 Reaktor.new.go
	require 'mechanize'
	require 'nokogiri'
	require 'json'
	require 'pp'

	class String
	  # Returns the text enclosed by the first two occurrences of +marker+
	  # (newlines included), or nil when the string does not contain two markers.
	  # The marker is matched literally, never as a regular expression.
	  def between(marker)
	    delimiter = Regexp.escape(marker)
	    found = match(/#{delimiter}(.*?)#{delimiter}/m)
	    found && found[1]
	  end
	end

	# Scraper for the Native Instruments Reaktor User Library.
	#
	# Logs in, walks the paginated "latest" listing, saves every download it
	# finds and writes a "<saved file>.json" sidecar with the metadata scraped
	# from the entry's detail page.
	class Reaktor
	  BASE = "https://www.native-instruments.com"
	  LOGIN = "#{BASE}/typo3conf/ext/ni_account/login.php?api_path=auth/token"
	  URL = "#{BASE}/en/reaktor-community/reaktor-user-library"
	  # Credentials posted to LOGIN; they are empty here and must be filled in.
	  USERNAME = ""
	  PASSWORD = ""
	  # Page size used to turn the result count into a number of listing pages.
	  # NOTE(review): presumes the site lists 15 entries per page - confirm.
	  PER_PAGE = 15
	  # Upper bound on attempts per listing entry before it is skipped.
	  MAX_ATTEMPTS = 100

	  # Creates the Mechanize agent shared by every request.
	  def initialize
	    @agent = Mechanize.new
	    @agent.user_agent_alias = 'Mac Safari'
	  end

	  # Returns the URL of listing page number +id+ (pages are requested from 1).
	  def page_url(id)
	    "#{URL}/all/all/all/all/all/latest/#{id}/all/"
	  end

	  # Posts the credentials as JSON and stores the returned token in the
	  # agent's cookie jar as the "access-token" cookie.
	  #
	  # Raises RuntimeError when the response carries no token.
	  def login
	    payload = { username: USERNAME, password: PASSWORD }.to_json
	    response = @agent.post LOGIN, payload, { 'Content-Type' => 'application/json' }
	    token = JSON.parse(response.body).dig("response_body", "access", "token")
	    # Fail with a readable message instead of a NoMethodError on nil.
	    raise "Login failed: no access token in response" if token.nil?

	    cookie = Mechanize::Cookie.new :domain => '.native-instruments.com', :name => 'access-token', :value => token, :path => '/', :expires => (Date.today + 1).to_s
	    @agent.cookie_jar << cookie
	    puts "Logged in."
	  end

	  # Entry point: logs in, reads the total result count from the library
	  # front page and processes every listing page in order.
	  def go
	    login
	    front = @agent.get(URL)
	    # `strip`, not `strip!`: the bang form returns nil when there is nothing
	    # to trim, which would make the following `split` raise.
	    results = Integer(front.at(".info-result").text.strip.split(" ")[0])
	    # Ceiling division, so an exact multiple of PER_PAGE does not produce a
	    # request for one page past the end.
	    page_count = (results + PER_PAGE - 1) / PER_PAGE
	    1.upto(page_count) do |id|
	      page = @agent.get(page_url(id))
	      puts "===== Page #{id} ====="
	      download(page)
	    end
	  end

	  # Downloads every entry found on the listing +page+.
	  #
	  # Each entry is retried up to MAX_ATTEMPTS times on any StandardError; an
	  # entry that keeps failing is reported on stderr and skipped.
	  def download(page)
	    items = page.search(".//li[contains(@class, 'item-box')]")

	    items.each do |item|
	      attempts = 0
	      begin
	        # The download path is the first single-quoted string in the entry's script tag.
	        path = item.at("script").text.between("'")
	        file = @agent.get(BASE + path)
	        # NOTE(review): an "index.html" response is treated as "session no
	        # longer valid" - log in again and retry the entry.
	        if file.filename == "index.html"
	          login
	          raise "Got index.html instead of a download; logged in again"
	        end
	        puts "> #{file.filename}"
	        next if File.file?(file.filename)

	        # Scrape the metadata before saving: if it fails, the retry must not
	        # find the file already on disk and skip the entry without a sidecar.
	        details = get_details(item)
	        filename = file.save
	        File.write("#{filename}.json", JSON.pretty_generate(details), encoding: 'UTF-8')
	      rescue StandardError => e
	        attempts += 1
	        retry if attempts < MAX_ATTEMPTS
	        warn "Giving up on entry after #{attempts} attempts: #{e.class}: #{e.message}"
	      end
	    end
	  end

	  # Fetches the detail page linked from listing entry +i+ and returns a Hash
	  # of its metadata (link, name, headline, description, rating, rating_count,
	  # downloads, author, version, created, made_with, category, tags, comments).
	  def get_details(i)
	    url = BASE + i.at_css('div.description-title a')['href']
	    detail = @agent.get(url)

	    {
	      link: url,
	      name: detail.at("div.detail-headline h2").text,
	      headline: detail.at("div.detail-headline h3").text,
	      description: detail.at("div.detail-description p").text.strip,
	      rating: detail.at('div#rating')["data-average"],
	      rating_count: detail.at('span#vote-number').text,
	      downloads: detail.at("span.download-count").text.strip,
	      author: get_detail(detail, "Author"),
	      version: get_detail(detail, "Version"),
	      created: get_detail(detail, "Created"),
	      made_with: get_detail(detail, "Made with"),
	      category: get_array_detail(detail, "Category"),
	      tags: get_array_detail(detail, "Tags"),
	      comments: get_comments(detail)
	    }
	  end

	  # Returns the text that follows the "<name>:" label inside the detail-info
	  # box, or "" when the label is missing or has no value.
	  def get_detail(detail, name)
	    label = detail.search(".//div[contains(@class, 'detail-info')]").at("label:contains('#{name}:')")
	    return "" if label.nil?

	    # Split on the first colon only so values that contain ':' stay intact;
	    # `to_s` covers a label with nothing after it.
	    label.parent.text.strip.split(":", 2)[1].to_s.strip
	  end

	  # Like get_detail, but returns the value as an Array with one stripped,
	  # non-empty element per line.
	  def get_array_detail(detail, name)
	    get_detail(detail, name).split("\n").map(&:strip).reject(&:empty?)
	  end

	  # Returns an Array of { author:, body:, time: } Hashes, one per comment row
	  # on the detail page except the last row.
	  def get_comments(detail)
	    comments = detail.css("div.comment.row").map do |c|
	      {
	        author: c.at("div.author").text.strip,
	        # The body is read from the second sibling after div.clear.
	        body: c.at("div.clear").next_sibling.next_sibling.text.strip,
	        time: c.at("span.time").text.strip
	      }
	    end

	    # NOTE(review): the final row is always discarded - presumably it is not
	    # a real comment; confirm against the page markup.
	    comments.pop
	    comments
	  end
	end

	# Script entry point: build the scraper and run the full crawl.
	Reaktor.new.go