Created
November 11, 2017 17:49
-
-
Save savetheclocktower/0c218c810bb3103ee37fa38c7adf8508 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Scrape the JavaScript documentation on the Mozilla Developer Network (MDN)
# and store an index as a YAML file.
# | |
# The data is organized by "token." If someone hits ^H in the middle of the | |
# word "toUpperCase", we want to link them to the MDN page for | |
# `String.prototype.toUpperCase`. If they're in the middle of the word | |
# "indexOf", we want to let them choose between `Array.prototype.indexOf` and | |
# `String.prototype.indexOf`. | |
# | |
# So the YAML dump is a giant hash with the token as the key and arrays as | |
# values containing all possible options for that token. (YAML is the storage | |
# format rather than JSON because Ruby 1.8 doesn't have built-in JSON.) | |
# | |
# The point is not to generate an _exhaustive_ index, because there are a lot | |
# of obscure APIs out there. It's to cover the stuff that gets used most | |
# often. | |
# | |
# END USERS WILL NOT HAVE TO RUN THIS SCRIPT. This is the script that | |
# generates the YAML file, which should then be checked into source control. | |
# It ought to be regenerated if there's a major new addition to the language | |
# (or to the Web APIs) that's worth including. | |
require 'nokogiri' | |
require 'yaml' | |
require 'open-uri' | |
# The token index under construction. Looking up a token that has not been
# seen yet stores a fresh empty array under that token and returns it.
$result = Hash.new do |index, token|
  index[token] = []
end
# Maps a bare keyword to the title of the page that documents it, for the
# keywords that are covered by a combined page (e.g. `if` -> `if...else`).
ALIASES = {
  'async' => 'async function',
  'catch' => 'try...catch',
  'do'    => 'do...while',
  'else'  => 'if...else',
  'if'    => 'if...else',
  'in'    => 'for...in',
  'of'    => 'for...of',
  'try'   => 'try...catch'
}
# Returns the ALIASES entry for +token+, or +token+ itself when there is no
# (truthy) entry for it.
def aliased(token)
  replacement = ALIASES[token]
  return token unless replacement

  replacement
end
# Builds the absolute MDN address for +path+, a site-relative path such as
# "/API/window", by appending it to the English "Web" docs root.
def url(path)
  format('https://developer.mozilla.org/en-US/docs/Web%s', path)
end
# Global functions. Each is indexed with the page path
# /JavaScript/Reference/Global_Objects/$term
GLOBAL_FUNCTIONS = %w[
  decodeURI
  decodeURIComponent
  encodeURI
  encodeURIComponent
  eval
  isFinite
  isNaN
  parseFloat
  parseInt
]
# Non-function global tokens. Each is indexed with the page path
# /JavaScript/Reference/Global_Objects/$term
GLOBAL_OBJECTS = %w[undefined null NaN Infinity]
# Index every global function and global object under its own token, pointing
# at the page of the same (possibly aliased) name under Global_Objects.
[*GLOBAL_FUNCTIONS, *GLOBAL_OBJECTS].each do |token|
  page_name = aliased(token)
  entry = {
    :name => page_name,
    :url => "/JavaScript/Reference/Global_Objects/#{page_name}"
  }
  $result[token] << entry
end
# Operator tokens. Each is indexed with the page path
# /JavaScript/Reference/Operators/$term
OPERATORS = %w[
  await
  delete
  get
  in
  instanceof
  let
  new
  set
  this
  typeof
  void
  yield*
]
# Index every operator under its own token, pointing at the page of the same
# name under Operators.
#
# The alias table is deliberately NOT applied here: its entries are statement
# page titles (e.g. 'in' => 'for...in'), so applying it would send the `in`
# operator to the path /Operators/for...in instead of /Operators/in.
OPERATORS.each do |token|
  $result[token] << {
    :name => token,
    :url => "/JavaScript/Reference/Operators/#{token}"
  }
end
# Statement keywords. Trailing comments give the title of the combined page
# that documents the keyword when it differs from the keyword itself (see
# ALIASES). Each keyword must appear only once; a repeated keyword would be
# indexed once per occurrence.
STATEMENTS = [
  'async', # async function
  'break',
  'catch', # try...catch
  'class',
  'const',
  'continue',
  'debugger',
  'do', # do...while
  'else', # if...else
  'export',
  'for',
  'function',
  'function*',
  'if', # if...else
  'import',
  'in', # for...in
  'label',
  'let',
  'of', # for...of
  'return',
  'switch',
  'throw',
  'try', # try...catch
  'var',
  'while',
  'with',
  'yield'
]
# Index every statement keyword under the keyword itself (the token a user
# would look up), with the title of the page that documents it as the name.
# This matches how the other token lists are indexed: token as key, page
# title as :name.
#
# `uniq` guards against a keyword being listed more than once.
STATEMENTS.uniq.each do |token|
  title = aliased(token)
  $result[token] << {
    :name => title,
    # Page slugs use underscores where titles have spaces
    # ("async function" -> "async_function").
    :url => "/JavaScript/Reference/Statements/#{title.tr(' ', '_')}"
  }
end
# Built-in global objects. Each one is added to TOC and therefore crawled for
# links to its methods and properties.
BUILTINS = %w[
  Array
  Date
  Function
  Error
  Boolean
  JSON
  Map
  Number
  Object
  Promise
  Proxy
  RangeError
  ReferenceError
  RegExp
  Set
  String
  Symbol
  SyntaxError
  TypeError
  WeakMap
  WeakSet
]
# Extra tokens to index that none of the other lists cover, mapped to the
# page each one should point to.
MISC = { Image: '/API/HTMLImageElement/Image' }
# Index each MISC entry under the string form of its key, with the (possibly
# aliased) token as the name and the mapped page as the URL.
MISC.each_pair do |symbol, page|
  token = symbol.to_s
  $result[token] << { :name => aliased(token), :url => page }
end
# Pages that get screen-scraped for links to methods/properties.
#
# The keys are for reference, but aren't used as tokens.
#
# Array.prototype is listed explicitly: the rest of the builtins have their
# instance methods listed on their base pages, but Array doesn't.
TOC = {
  :'Array.prototype' => '/JavaScript/Reference/Global_Objects/Array/prototype'
}

# DOM pages, all living at /API/$name. Scraping _all_ DOM classes yields way
# too many dupes, so only this subset is included.
%w[window document console Node Event HTMLElement HTMLCollection].each do |page|
  TOC[page.to_sym] = "/API/#{page}"
end
# Add the base page of every builtin to TOC so that it gets scraped as well.
BUILTINS.each_with_object(TOC) do |name, toc|
  toc[name.to_sym] = "/JavaScript/Reference/Global_Objects/#{name}"
end
# Crawl every page listed in TOC. The page itself is indexed under its TOC
# key; then every method/property linked from one of the page's definition
# lists is indexed under the last dotted segment of its name (with any
# parenthesised argument list removed).
#
# Progress is reported on STDERR. An HTTP error aborts the script with exit
# status 1.
TOC.each do |key, value|
  page_url = url(value)
  $result[key.to_s] << { :name => key.to_s, :url => value }
  STDERR.puts "Crawling #{key}: #{page_url}"

  begin
    # Kernel#open no longer accepts URLs as of Ruby 3.0, so use URI.open when
    # open-uri provides it and fall back to Kernel#open on older Rubies. The
    # block form closes the underlying IO as soon as the page is parsed.
    parse = lambda { |io| Nokogiri::HTML(io) }
    doc = URI.respond_to?(:open) ? URI.open(page_url, &parse) : open(page_url, &parse)
  rescue OpenURI::HTTPError => e
    # Report the status the server actually returned, not a hard-coded 404.
    STDERR.puts "#{e.message}: #{page_url}"
    exit 1
  end

  count = 0
  # Thank god for semantic markup. All items are in definition lists.
  doc.css('dt').each do |dt|
    # Skip the ones that don't have wiki pages.
    next if dt.at_css('a.new')
    # Skip non-standard stuff. The title text will say something like
    # "...this is not standardized."
    next if dt.at_css('span[title*="standardized"]')

    # All methods and properties are inside CODE tags, so anything that
    # doesn't have one is a false positive.
    code = dt.at_css('a > code')
    next unless code

    # Read the href from the anchor that wraps the CODE tag; the first
    # anchor inside the DT is not necessarily that one.
    href = code.parent['href']
    next unless href

    name = code.text
    # An empty name has no segments to derive a token from.
    last_segment = name.split('.').last
    next unless last_segment

    token = last_segment.gsub(/\(.*?\)/, '')
    $result[token] << {
      :name => name,
      :url => href.gsub('/en-US/docs/Web', '')
    }
    count += 1
  end
  STDERR.puts " found #{count}"
end
# Emit the finished index on standard output as YAML.
$stdout.puts(YAML.dump($result))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment