Created
March 7, 2013 22:19
-
-
Save gosuri/5112348 to your computer and use it in GitHub Desktop.
The Wikipedia API blows, writing this was faster
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri' | |
require 'nokogiri' | |
# Scrapes Wikipedia pages for lists of names (programming languages, web
# application frameworks, mobile platforms) by reading the text of links
# matched by a CSS selector.
# The API blows, writing this was faster.
class WikiImporter
  # Link texts that are excluded from the results.
  REJECT_WORDS = [
    "Comparison of web application frameworks",
    "List of Ajax frameworks",
    "Web application framework",
    "Template:Application frameworks",
    "Debuggers",
    "Emulator",
    "Integrated development environment"
  ].freeze

  attr_accessor :url
  # The reader is defined explicitly below (it fetches lazily), so only the
  # writer is generated here; this avoids a "method redefined" warning.
  attr_writer :doc

  # @param url [String, nil] page to scrape
  def initialize(url = nil)
    @url = url
  end

  # Fetches the page and parses it, memoizing the parsed document.
  #
  # Once a document has been fetched (or assigned through +doc=+), it is
  # returned as-is and the +url+ argument is ignored.
  #
  # @param url [String] url to fetch from; defaults to the instance's url
  # @return [Nokogiri::HTML::Document]
  def doc(url = @url)
    # URI.open is required here: since Ruby 3.0 Kernel#open no longer opens
    # URLs. The block form closes the stream once parsing has finished.
    @doc ||= URI.open(url) { |io| Nokogiri::HTML(io) }
  end

  # Fetches the document at +url+, yields it to the block, and returns the
  # text content of the nodes the block selects, with nils, duplicates, and
  # REJECT_WORDS removed.
  #
  # @param url [String] url to fetch the document from
  # @yieldparam doc [Nokogiri::HTML::Document] the parsed page
  # @yieldreturn [Enumerable] nodes that respond to +content+
  # @return [Array<String>] content of the selected nodes
  def self.content_for(url, &blk)
    nodes = yield(WikiImporter.new(url).doc)
    nodes.map(&:content).compact.uniq - REJECT_WORDS
  end

  # @return [Array<String>] list of programming languages
  def self.programing_languages(url = 'http://en.wikipedia.org/w/index.php?title=List_of_programming_languages&printable=yes')
    content_for(url) { |doc| doc.css('#mw-content-text .multicol a') }
  end

  # @return [Array<String>] list of web application frameworks
  def self.webapp_frameworks(url = 'http://en.wikipedia.org/w/index.php?title=Category:Web_application_frameworks&printable=yes')
    content_for(url) { |doc| doc.css('#mw-pages .mw-content-ltr a') }
  end

  # @return [Array<String>] list of mobile app platforms
  def self.mobile_app_development(url = 'http://en.wikipedia.org/w/index.php?title=Mobile_application_development&printable=yes')
    content_for(url) { |doc| doc.css('#mw-content-text .wikitable th a') }
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment