Last active
September 18, 2015 16:51
-
-
Save boxmein/45d5b83df2825abb6c4c to your computer and use it in GitHub Desktop.
quick script to scrape definitions off of the Oxford Learner's Dictionary
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# encoding: utf-8
# Dictionary Scraper
# Sends a GET request to a dictionary site for each input word, scrapes the
# response with Nokogiri, and prints one "word<TAB>definition" line per word.
# usage: cat wordlist.txt | ruby scrape-definitions.rb 1>words.txt # 2>errors.txt
require 'open-uri'
require 'thread'
require 'nokogiri'
# Thread-safe work queue shared by all worker threads.
words = Queue.new
# Collect one word per input line. ARGF reads the files named on the command
# line, or standard input when there are none (the same source Kernel#gets
# uses). Lines are stripped here and blank lines are skipped, so the count
# below reflects real words and no request is made for an empty word.
ARGF.each_line do |line|
  word = line.strip
  words << word unless word.empty?
end
STDERR.puts "collected #{words.length} words"
# Number of worker threads fetching definitions concurrently.
NTHREAD = 4
# URL template for the HTTP request; the literal "{{word}}" placeholder is
# replaced with the transformed word. Frozen so the shared constant cannot be
# mutated by accident.
REQUEST_URL = "http://www.oxfordlearnersdictionaries.com/definition/english/{{word}}".freeze
# REQUEST_URL = "http://enet.animato.ee/index.php?otsida={{word}}"
# How to transform a word before substituting it into the URL template.
# Takes the word as a String and returns a new String: spaces become hyphens,
# then everything outside [A-Za-z0-9*_.-] is percent-encoded so that non-ASCII
# or reserved characters cannot produce an invalid URL.
# URI is loaded by the file's `require 'open-uri'`.
WORD_TRANSFORM = lambda do |word|
  URI.encode_www_form_component(word.tr(' ', '-'))
end
# WORD_TRANSFORM = lambda do |word|
#   word.tr ' ', '+'
# end
# How to extract the definition from a fetched page.
# Takes a Nokogiri document (anything responding to `css`) and returns a
# String: the stripped text of the first two `.def` matches joined with
# ' // ', or an empty string when nothing matches.
HTML_MATCH = lambda do |htmldoc|
  # Take the first two nodes before mapping so only those are processed.
  htmldoc.css('.def').first(2).map { |node| node.text.strip }.join(' // ')
end
# HTML_MATCH = lambda do |htmldoc|
#   htmldoc.css('a[href^="/index.php?otsida="]').map(&:text).map(&:strip).join ' // '
# end
# Spawn NTHREAD workers that drain the shared `words` queue. For each word a
# worker builds the URL, fetches and parses the page, and prints
# "word<TAB>definition" to stdout. Progress and failures go to stderr.
threads = Array.new(NTHREAD) do
  Thread.new do
    loop do
      # Non-blocking pop: testing `empty?` and then calling a blocking `pop`
      # races with the other workers, and the loser would wait forever on an
      # empty queue. ThreadError signals that the queue is drained.
      word = begin
        words.pop(true).strip
      rescue ThreadError
        break
      end

      begin
        word_url = WORD_TRANSFORM.call(word)
        # Block form of gsub inserts the word verbatim (no backslash handling).
        url = REQUEST_URL.gsub('{{word}}') { word_url }
        STDERR.puts "GET #{url}"
        # URI.open is open-uri's entry point (Kernel#open no longer accepts
        # URLs since Ruby 3.0); the block form closes the handle afterwards.
        html = URI.open(url) { |io| Nokogiri::HTML(io) }
        definition = HTML_MATCH.call(html)
        puts "#{word}\t#{definition}"
      rescue StandardError => e
        # Best-effort: report the failure and move on to the next word.
        # StandardError (not Exception) so Ctrl-C and exit still work.
        STDERR.puts "#{word} FAILED because #{e.inspect}"
      end

      # Pause between requests made by this worker.
      sleep 0.5
    end
  end
end
# Wait for every worker to finish before the script exits.
threads.each(&:join)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment