Last active
May 20, 2019 09:48
-
-
Save DataKinds/ffe12eeeed6d2e61e856e0dfbf513f13 to your computer and use it in GitHub Desktop.
Directed graph of TVTropes links
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frozen_string_literal: true | |
source "https://rubygems.org" | |
git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } | |
gem "nokogiri" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "nokogiri" | |
require "open-uri" | |
require "thread" | |
require "set" | |
require "pry" | |
# Destination for the edge list; the workers append one "source->target" line
# per discovered link.
$OUTPUT_FILE = File.open("output", "w")
# The crawl has no natural end and is stopped by killing the process, so write
# every edge through immediately instead of leaving it in Ruby's IO buffer.
$OUTPUT_FILE.sync = true

# Work queue of trope names still to be fetched, seeded with the start page.
$URL_QUEUE = Queue.new
$URL_QUEUE.push "PrincessClassic"

# Trope names that have already been handled, shared by all worker threads.
$COMPLETED_URLS = Set.new
# Guards mutation of $COMPLETED_URLS across worker threads.
$COMPLETED_URLS_LOCK = Mutex.new
# Reports whether +href+ links to an article in the TVTropes "Main" namespace.
#
# The pattern is anchored only at the end, so both site-relative and absolute
# URLs are accepted as long as they finish with
# "/pmwiki/pmwiki.php/Main/<word characters>".
#
# @param href [String, nil] value of an anchor's href attribute (nil when the
#   anchor has no href)
# @return [Boolean] true for a Main-namespace article link, false otherwise
def is_valid_tvtropes_href?(href)
  # to_s turns a missing href into "", which can never match; match? yields a
  # real boolean and skips building MatchData.
  href.to_s.match?(/\/pmwiki\/pmwiki\.php\/Main\/\w+\z/)
end
# Downloads one TVTropes "Main" page and returns the names of the Main-namespace
# articles it links to.
#
# @param url [String] article name, appended to
#   "https://tvtropes.org/pmwiki/pmwiki.php/Main/"
# @return [Array<String>] linked article names in document order; repeated
#   links are kept
# @raise [StandardError] whatever open-uri raises for HTTP or network failures
#   (e.g. OpenURI::HTTPError); nothing is rescued here
def get_tvtropes_linklist(url)
  # URI.open is required: the bare Kernel#open form no longer handles URLs on
  # Ruby 3.0+. The block form closes the response body once it is parsed.
  page = URI.open("https://tvtropes.org/pmwiki/pmwiki.php/Main/#{url}") do |io|
    Nokogiri::HTML(io)
  end

  page.css("a")
      .map { |link| link["href"] }
      .select { |href| is_valid_tvtropes_href?(href) }
      .map { |href| href[/\/pmwiki\/pmwiki\.php\/Main\/(\w+)\z/, 1] }
end
# Interactive console for poking at the helpers before the crawl starts.
# Opt-in, so an unattended run is not parked at a REPL prompt.
pry if ENV["TVTROPES_DEBUG"]

# Pool of 24 crawler threads. Each worker repeatedly takes a trope name from
# $URL_QUEUE, fetches its outgoing links, appends "source->target" edges to
# $OUTPUT_FILE and enqueues the newly seen targets.
# NOTE: the workers never exit, so the final join blocks until the process is
# interrupted.
$THREAD_POOL = Array.new(24) do
  Thread.new do
    loop do
      # Idle poll interval, and a crude per-worker request throttle.
      sleep 1

      # Non-blocking pop: checking empty? and then calling pop lets another
      # worker steal the item in between and leaves this one blocked forever.
      url = begin
        $URL_QUEUE.pop(true)
      rescue ThreadError
        nil
      end
      next unless url

      # Claim the URL atomically. Set#add? returns nil when the name is already
      # present, so only one worker ever fetches a given page. The name is
      # recorded before the fetch, which means a page that fails is not retried.
      claimed = $COMPLETED_URLS_LOCK.synchronize { $COMPLETED_URLS.add?(url) }
      next unless claimed

      begin
        linklist = get_tvtropes_linklist(url)
      rescue StandardError => e
        # One dead link or network hiccup must not kill the worker (and, via
        # join, the whole program).
        warn "failed #{url}: #{e.class}: #{e.message}"
        next
      end

      # output the rips
      linklist.each do |point|
        $OUTPUT_FILE.puts "#{url}->#{point}"
      end
      puts "completed #{url}"

      # update the URL queue, skipping names that are already claimed so the
      # queue does not fill up with pages that would be discarded on pop
      pending = $COMPLETED_URLS_LOCK.synchronize do
        linklist.uniq.reject { |point| $COMPLETED_URLS.include?(point) }
      end
      pending.each { |point| $URL_QUEUE << point }
    end
  end
end

$THREAD_POOL.each(&:join)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment