Created
June 10, 2012 09:50
-
-
Save lancejpollard/2904712 to your computer and use it in GitHub Desktop.
Scrape Periodic Table of Elements from Wikipedia in Node.js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://www.webelements.com/aluminium/ | |
# http://environmentalchemistry.com/yogi/periodic/Al.html | |
# http://www.chemicool.com/elements/aluminum.html | |
# Collect the link target of every element cell on the periodic-table page
# currently loaded into `$`.
#
# Returns an array with the `href` attribute of each anchor matched by
# "#mw-content-text table tr td[title] a", in the order `each` visits them.
parseWikipediaTableOfElements = ->
  hrefs = []
  $("#mw-content-text table tr td[title] a").each ->
    anchor = $(this)
    hrefs.push anchor.attr('href')
  hrefs
# Parse the infobox of the element page currently loaded into `$`.
#
# Walks every row matched by "#mw-content-text table.infobox tr" and returns a
# plain object mapping property names (header cell text) to their values (data
# cell text). The "Appearance" section is stored as an object holding
# `description` and, when the row contains an image, `image`.
parseElement = ->
  info = {}
  # Text of the most recent section heading row; reset to null once the
  # "Appearance" row has been consumed.
  section = null

  $('#mw-content-text table.infobox tr').each ->
    row = $(this)

    # Rows that wrap a nested table carry no data of their own.
    return if row.find('table').length

    # A row made of a single header cell starts a new section
    # (Appearance, General Properties, etc.).
    if row.children().length == 1 && row.find('th').length == 1
      section = row.find('th').text().trim()
      return

    if section == "Appearance"
      appearance = {}
      text = row.text().trim()
      # Image sources look like
      # http://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/Lanthanum-2.jpg/250px-Lanthanum-2.jpg
      src = row.find('img').attr('src')
      appearance.description = text if text?
      appearance.image = src if src?
      info[section] = appearance
      section = null
      return

    # Ordinary property row: the header cell may list several comma-separated
    # names whose values are comma-separated in the data cell.
    names = row.find('th').text().trim().split(/,\s+/)
    text = row.find('td').text().trim()
    if names.length == 1
      name = names[0]
      info[name] = text unless name == ''
    else
      parts = text.split(/,\s+/)
      for name, idx in names
        info[name] = parts[idx]
    return

  info
# Query function for the most recently fetched page. Each crawlWith* helper
# assigns it before continuing, and the parse* functions read it.
$ = undefined

# Libraries used by the crawler.
jsdom   = require 'jsdom'
agent   = require 'superagent'
async   = require 'async'
fs      = require 'fs'
cheerio = require 'cheerio'

# Parsed element records: appended to by crawlElement, serialized by crawlTable.
data = []
# Fetch `url` with superagent, hand the response body to jsdom, point the
# shared `$` at the resulting window's `jQuery`, then invoke `callback` with
# no arguments.
#
# NOTE(review): the `scripts` option below is disabled, so `window.jQuery` is
# presumably undefined here — confirm before switching `crawl` to this
# strategy. The `err` argument passed by jsdom is not inspected.
crawlWithJSDOM = (url, callback) ->
  agent.get(url).end (response) ->
    #scripts: [ "https://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js" ]
    options =
      html: response.text
    onWindow = (err, window) ->
      $ = window.jQuery
      callback()
    jsdom.env options, onWindow
# Load `url` through a Zombie browser, point the shared `$` at the object
# passed as the second argument of the visit callback, then invoke `callback`
# with no arguments.
#
# NOTE(review): `browser` is not defined anywhere in the visible file; a
# browser instance must be created before this strategy can be used.
crawlWithZombie = (url, callback) ->
  browser.visit url, (e, b) ->
    $ = b
    # A bare `callback` only references the function; the parentheses are
    # required to actually call it so the crawl can continue.
    callback()
# Fetch `url` with superagent, load the response body into cheerio, store the
# resulting query function in the shared `$`, then invoke `callback` with no
# arguments.
crawlWithCheerio = (url, callback) ->
  agent.get(url).end (res) ->
    markup = res.text
    $ = cheerio.load markup
    callback()

# Page-loading strategy used by crawlElement and crawlTable.
crawl = crawlWithCheerio
# Crawl a single element page and append its parsed infobox to `data`.
#
# url      - path of the page; it is appended to "http://en.wikipedia.org".
# callback - invoked with no arguments once the page has been parsed.
crawlElement = (url, callback) ->
  console.log(url)
  pageUrl = "http://en.wikipedia.org#{url}"
  onLoaded = ->
    data.push(parseElement())
    callback()
  crawl(pageUrl, onLoaded)
# Crawl the periodic-table page at `url`, then crawl every element page it
# links to one at a time, and finally write everything collected in `data` to
# 'tmp/elements.json' as pretty-printed JSON under the key `elements`.
crawlTable = (url) ->
  crawl url, ->
    # Append [0..1] to the call below to limit the crawl while testing.
    elementUrls = parseWikipediaTableOfElements()

    # Each page is started on a fresh tick rather than directly from the
    # previous page's completion callback.
    visitElement = (elementUrl, next) ->
      process.nextTick -> crawlElement elementUrl, next

    writeResults = ->
      console.log 'DONE'
      serialized = JSON.stringify({elements: data}, null, 2)
      fs.writeFileSync 'tmp/elements.json', serialized

    async.forEachSeries elementUrls, visitElement, writeResults
# Entry point: start the crawl from Wikipedia's large periodic table page.
crawlTable "http://en.wikipedia.org/wiki/Periodic_table_(large_version)"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment