Created
September 20, 2019 02:16
-
-
Save Ruin0x11/e3fb433d1f91ee0a6daa6db8164105fb to your computer and use it in GitHub Desktop.
Scrape Reaktor User Library
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'mechanize' | |
require 'nokogiri' | |
require 'json' | |
require 'pp' | |
# Core extension used by the scraper to pull a quoted value out of inline
# script text.
class String
  # Returns the text found between the first two occurrences of +marker+
  # (non-greedy, and allowed to span newlines), or nil when the string does
  # not contain two markers.
  def between(marker)
    delimiter = Regexp.escape(marker)
    enclosed = /#{delimiter}(.*?)#{delimiter}/m
    slice(enclosed, 1)
  end
end
# Scrapes the Native Instruments Reaktor User Library: logs in, walks every
# listing page, saves each item's file into the current directory and writes a
# "<saved file>.json" sidecar holding the item's metadata and comments.
class Reaktor
  BASE = "https://www.native-instruments.com".freeze
  LOGIN = "#{BASE}/typo3conf/ext/ni_account/login.php?api_path=auth/token".freeze
  URL = "#{BASE}/en/reaktor-community/reaktor-user-library".freeze
  # Credentials default to empty strings (as before); they can be supplied
  # through the environment instead of editing this file.
  USERNAME = ENV.fetch("REAKTOR_USERNAME", "")
  PASSWORD = ENV.fetch("REAKTOR_PASSWORD", "")
  # Divisor used to turn the result count into a page count.
  # NOTE(review): assumes the listing shows 15 items per page - confirm.
  PER_PAGE = 15
  # Upper bound on attempts per item before it is skipped.
  MAX_RETRIES = 100

  def initialize
    @agent = Mechanize.new
    @agent.user_agent_alias = 'Mac Safari'
  end

  # Builds the URL of listing page number +id+ (1-based).
  def page_url(id)
    "#{URL}/all/all/all/all/all/latest/#{id}/all/"
  end

  # Posts the credentials as JSON to LOGIN and stores the returned token in
  # the agent's cookie jar as the "access-token" cookie.
  #
  # Raises RuntimeError when the response carries no access token.
  def login
    credentials = { username: USERNAME, password: PASSWORD }
    response = @agent.post(LOGIN, credentials.to_json, { 'Content-Type' => 'application/json' })
    json = JSON.parse(response.body)
    # Walk the nesting defensively so a rejected login produces a clear error
    # instead of a NoMethodError on nil.
    access = (json["response_body"] || {})["access"] || {}
    token = access["token"]
    raise "Login failed: no access token in response" if token.nil?

    cookie = Mechanize::Cookie.new(domain: '.native-instruments.com',
                                   name: 'access-token',
                                   value: token,
                                   path: '/',
                                   expires: (Date.today + 1).to_s)
    @agent.cookie_jar << cookie
    puts "Logged in."
  end

  # Entry point: logs in, reads the total result count from the first listing
  # page and downloads every listing page in turn.
  def go
    login
    listing = @agent.get(URL)
    # Non-destructive strip: strip! returns nil when there is nothing to trim,
    # which would break the chained split.
    results = Integer(listing.at(".info-result").text.strip.split(" ").first)
    # Ceiling division, so an exact multiple of PER_PAGE (or zero results)
    # does not request an extra page.
    page_count = (results + PER_PAGE - 1) / PER_PAGE
    1.upto(page_count) do |id|
      page = @agent.get(page_url(id))
      puts "===== Page #{id} ====="
      download(page)
    end
  end

  # Downloads every "item-box" entry found on +page+. Each item is attempted up
  # to MAX_RETRIES times; an item that still fails is reported on stderr and
  # skipped so the rest of the page is still processed.
  def download(page)
    page.search(".//li[contains(@class, 'item-box')]").each do |item|
      attempts = 0
      begin
        download_item(item)
      rescue StandardError => e
        attempts += 1
        retry if attempts < MAX_RETRIES
        warn "! Giving up on item after #{attempts} attempts: #{e.class}: #{e.message}"
      end
    end
  end

  # Fetches the detail page linked from listing item +i+ and returns a Hash of
  # its metadata (symbol keys: link, name, headline, description, rating,
  # rating_count, downloads, author, version, created, made_with, category,
  # tags, comments).
  def get_details(i)
    link = i.css('div.description-title a').map { |anchor| anchor['href'] }.first
    detail = @agent.get(BASE + link)

    {
      link: BASE + link,
      name: detail.at("div.detail-headline h2").text,
      headline: detail.at("div.detail-headline h3").text,
      description: detail.at("div.detail-description p").text.strip,
      rating: detail.at('div#rating')["data-average"],
      rating_count: detail.at('span#vote-number').text,
      downloads: detail.at("span.download-count").text.strip,
      author: get_detail(detail, "Author"),
      version: get_detail(detail, "Version"),
      created: get_detail(detail, "Created"),
      made_with: get_detail(detail, "Made with"),
      category: get_array_detail(detail, "Category"),
      tags: get_array_detail(detail, "Tags"),
      comments: get_comments(detail)
    }
  end

  # Returns the value that follows the "<name>:" label inside the page's
  # detail-info block, or "" when the label is missing or has no value.
  def get_detail(detail, name)
    label = detail.search(".//div[contains(@class, 'detail-info')]").at("label:contains('#{name}:')")
    return "" if label.nil?

    # Split on the first colon only, so values that themselves contain a colon
    # (for example a timestamp) are kept whole; to_s covers an empty value.
    label.parent.text.strip.split(":", 2)[1].to_s.strip
  end

  # Same as get_detail, but returns the value as an Array with one stripped,
  # non-empty entry per line.
  def get_array_detail(detail, name)
    get_detail(detail, name).split("\n").map(&:strip).reject(&:empty?)
  end

  # Returns the comments on a detail page as an Array of
  # { author:, body:, time: } hashes.
  def get_comments(detail)
    comments = detail.css("div.comment.row").map do |c|
      {
        author: c.at("div.author").text.strip,
        # The body is taken from the second sibling after the "clear" div.
        body: c.at("div.clear").next_sibling.next_sibling.text.strip,
        time: c.at("span.time").text.strip
      }
    end
    # The last matched row is always discarded.
    # NOTE(review): presumably it is not a real comment - confirm.
    comments[0...-1]
  end

  private

  # Downloads one listing item and writes its metadata sidecar. Raises on any
  # failure so that #download can retry.
  def download_item(item)
    path = item.at("script").text.between("'")
    file = @agent.get(BASE + path)
    if file.filename == "index.html"
      # NOTE(review): an index.html response is treated as "not logged in";
      # presumably the site serves a page instead of the file once the session
      # has lapsed - confirm.
      login
      raise "got index.html instead of a download for #{path}; logged in again"
    end
    puts "> #{file.filename}"

    # Reuse a copy that is already on disk, but still make sure its sidecar
    # exists: a failure after saving the file must not lose the metadata when
    # the item is retried.
    saved_name = File.file?(file.filename) ? file.filename : file.save
    details_name = saved_name + ".json"
    return if File.file?(details_name)

    File.write(details_name, JSON.pretty_generate(get_details(item)), encoding: 'UTF-8')
  end
end
# Script entry point: build a scraper and crawl the whole user library.
Reaktor.new.go
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment