Created
June 22, 2015 20:17
-
-
Save zloyrusskiy/99cc8661c8b5d59f9199 to your computer and use it in GitHub Desktop.
wiki_path
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogumbo'
require 'uri'
# Prints the chain of predecessors recorded for +path+, nearest first.
#
# Starting from +path+, repeatedly looks up the current entry in +visited+
# and prints it (percent-decoded) until a nil or missing entry is reached.
# +path+ itself is not printed.
#
# @param visited [Hash{String => String, nil}] maps a path to the path it was
#   reached from; the starting path maps to nil
# @param path [String] path whose ancestry should be printed
# @return [nil]
def print_parents(visited, path)
  parent = path
  # URI.unescape was removed in Ruby 3.0; the default parser still offers it.
  while (parent = visited[parent])
    puts format('parent: %s', URI::DEFAULT_PARSER.unescape(parent))
  end
end
# Fetches +url+ and returns the hrefs of the article links found on the page.
#
# The (percent-decoded) URL is printed first as a progress trace. An href is
# kept only when it starts with "/wiki/" and contains neither ":" nor "#".
#
# @param url [String] absolute URL of the page to download and parse
# @return [Array<String>] matching hrefs in document order (duplicates kept)
def find_links(url)
  # URI.unescape was removed in Ruby 3.0; the default parser still offers it.
  puts URI::DEFAULT_PARSER.unescape(url)
  doc = Nokogiri::HTML5.get(url)
  doc.css('a')
     .map { |link| link['href'] }
     .compact # anchors without an href attribute yield nil
     .select { |href| href.start_with?('/wiki/') }
     .reject { |href| href.include?(':') || href.include?('#') }
end
# Breadth-first search for a chain of wiki links from one article to another.
#
# Pages are fetched through find_links(get_url(path, lang)). When a link equal
# to the target path is seen, the target and its chain of predecessors are
# printed via print_parents and the method returns; if the queue is exhausted
# first, "not found" is printed.
#
# @param from_term [String] title of the start article
# @param to_term [String] title of the article to reach
# @param lang [String] language code handed to get_url
# @return [nil]
def search_path(from_term, to_term, lang)
  # Spaces become underscores so the target can match hrefs scraped from pages,
  # which use "_" rather than "%20". URI.escape was removed in Ruby 3.0.
  from_path = URI::DEFAULT_PARSER.escape("/wiki/#{from_term.tr(' ', '_')}")
  to_path = URI::DEFAULT_PARSER.escape("/wiki/#{to_term.tr(' ', '_')}")

  visited = { from_path => nil } # path => path it was first reached from
  queue = [from_path]

  until queue.empty?
    path = queue.shift
    find_links(get_url(path, lang)).each do |link|
      unless visited.key?(link)
        visited[link] = path
        queue << link
      end
      next unless link == to_path

      puts "\n>> Found: #{URI::DEFAULT_PARSER.unescape(link)}"
      print_parents visited, link
      return
    end
  end

  puts 'not found'
end
# Builds the absolute Wikipedia URL for a site-relative path.
#
# @param path [String] path beginning with "/", e.g. "/wiki/Sort"
# @param lang [String] subdomain placed before ".wikipedia.org", e.g. "ru"
# @return [String] "https://<lang>.wikipedia.org<path>"
def get_url(path, lang)
  format('https://%s.wikipedia.org%s', lang, path)
end
# Script entry point: look for a link chain from "Sort" to "SAP" on the "ru"
# wiki. Guarded so that loading this file from other code does not start the
# network crawl.
search_path('Sort', 'SAP', 'ru') if $PROGRAM_NAME == __FILE__
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
На Crystal еще быстрее будет, наверное (пришел отсюда http://habrahabr.ru/post/260883/#comment_8472729)