Created
May 23, 2014 23:17
-
-
Save nogweii/3ff685cf73b15cbba8e9 to your computer and use it in GitHub Desktop.
Get a list of all of DuckDuckGo's bangs programmatically; useful for scripting.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri' | |
require 'open-uri' | |
require 'json' | |
require 'yaml' | |
# Download the HTML page listing DuckDuckGo's bangs and parse it with Nokogiri.
#
# URI.open (from open-uri) is called explicitly: Kernel#open stopped accepting
# URLs in Ruby 3.0, where the original call would look for a local file named
# after the URL instead of performing an HTTP request.
#
# `f` holds the raw response body; `n` is the parsed document that the rest of
# the script queries.
f = URI.open(
  'https://duckduckgo.com/bang.html',
  'User-Agent' => 'Ruby script parsing the full list of bangs/1.0 ; http://evaryont.me'
).read
n = Nokogiri.parse(f)
# Record describing a single bang.
#
# Members, in order: category, group, bang, aliases, full_name.
#
# The Struct is assigned to the constant directly instead of being subclassed,
# which avoids creating a throw-away anonymous superclass.
DDGBang = Struct.new(:category, :group, :bang, :aliases, :full_name) do
  # Serialize the record as a JSON array of its member values, in member order.
  #
  # Any arguments (the generator state that JSON.generate and
  # JSON.pretty_generate hand to #to_json) are forwarded to Array#to_json so
  # the nested array follows the caller's formatting instead of always being
  # emitted in compact form.
  def to_json(*args)
    values.to_json(*args)
  end
end
# Maps each bang name to its DDGBang record. full_name is left nil here and is
# filled in later from the alphabetical list.
bangs = {}

# Exclude the last H4, the full alphabetical list. That is treated specially.
n.css('h4')[0...-1].each do |bang_category|
  category_name = bang_category.text.strip

  # Next sibling is a text node ("\n"), then the sibling after that is the UL
  # containing an LI per group
  bang_category.next_sibling.next_sibling.css('li').each do |bang_group|
    group_name = bang_group.at_css('h6 b').text.chomp(':')

    # Whoo! We finally made it to iterating over each individual bang: the
    # second child node of the LI is serialized and read one line at a time.
    bang_group.children[1].to_s.split("\n").each do |line|
      # Strip before the emptiness check so whitespace-only lines are skipped
      # instead of being stored under an empty-string key.
      bang_name = line.strip
      next if bang_name.empty?

      aliases = []
      if bang_name.include?('(') # Includes parentheses, therefore there is an alias available
        # String#split with no argument discards empty tokens produced by
        # leading or repeated whitespace; tokens that were nothing but
        # parentheses are dropped as well.
        tokens = bang_name.split.map { |token| token.delete('()') }.reject(&:empty?)
        next if tokens.empty?

        bang_name, *aliases = tokens
      end

      bangs[bang_name] = DDGBang.new(category_name, group_name, bang_name, aliases, nil)
    end
  end
end
# Second pass: the block following the last H4 lists every bang together with
# its full name. Each line is read as "<full name words...> (<bang>)", and the
# full name is attached to the matching record collected earlier.
n.css('h4').last.next_sibling.next_sibling.css('span').text.split("\n").each do |bang_full_name|
  *name_words, bang = bang_full_name.split(' ')

  # Empty and whitespace-only lines split into no tokens at all; skip them
  # rather than calling a String method on nil.
  next if bang.nil?

  record = bangs[bang.delete('()')]
  # Bangs that only appear in the alphabetical list have no record to update.
  next if record.nil?

  record.full_name = name_words.join(' ')
end
# And some special case clean ups...
#
# These entries get their alias list reset to empty. A fresh array literal is
# created on every iteration so the records never share one Array object.
%w[!craigslist !wowarmoryeu !gv !parlysearch].each do |bang_name|
  bangs[bang_name].aliases = []
end

# '!hb' is dropped as an entry of its own and recorded as an alias of
# '!hummingbird' instead.
bangs.delete('!hb')
bangs['!hummingbird'].aliases = ['!hb']
# Dump the collected bangs to disk twice: first as YAML, then as
# pretty-printed JSON. Each file is opened for writing (truncating any
# previous contents) and closed automatically when its block returns.
File.open('/tmp/ddg_bangs.yml', 'w') { |yaml_file| yaml_file.write(bangs.to_yaml) }
File.open('/tmp/ddg_bangs.json', 'w') { |json_file| json_file.write(JSON.pretty_generate(bangs)) }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This would be a lot easier, of course, if there were an official API endpoint to get a list of all the bangs. In the meantime, this should do.