Skip to content

Instantly share code, notes, and snippets.

@gotomypc
Forked from lancejpollard/index.coffee
Created October 28, 2012 15:58
Show Gist options
  • Save gotomypc/3968987 to your computer and use it in GitHub Desktop.
Save gotomypc/3968987 to your computer and use it in GitHub Desktop.
Scrape Periodic Table of Elements from Wikipedia in Node.js
# http://www.webelements.com/aluminium/
# http://environmentalchemistry.com/yogi/periodic/Al.html
# http://www.chemicool.com/elements/aluminum.html
# Collect the link targets of every anchor found inside a titled table cell
# of the page currently loaded into the shared `$`.
#
# Returns an array holding the `href` attribute of each matched anchor, in
# the order `each` visits them (entries are whatever `attr('href')` yields).
parseWikipediaTableOfElements = ->
  hrefs = []
  $("#mw-content-text table tr td[title] a").each (index, anchor) ->
    hrefs.push($(anchor).attr('href'))
  hrefs
# Parse the infobox of the element page currently loaded into the shared `$`.
#
# Every row of `table.infobox` is classified as follows:
#   * a row containing a nested table is ignored;
#   * a row whose single child is a lone <th> is a section heading
#     (Appearance, General Properties, etc.) and is remembered;
#   * the first row after an "Appearance" heading is stored under
#     result["Appearance"] as {description, image};
#   * any other row is a header/value pair. Comma-separated headers are
#     matched by position against comma-separated values.
#
# Returns a plain object mapping property names to the scraped values.
parseElement = ->
  properties = {}
  section = null
  listSeparator = /,\s+/

  $('#mw-content-text table.infobox tr').each (index, element) ->
    row = $(element)

    # Rows that wrap a nested table carry no property of their own.
    return if row.find('table').length

    headerCells = row.find('th')
    if row.children().length == 1 && headerCells.length == 1
      section = headerCells.text().trim()
      return

    if section == "Appearance"
      appearance = {}
      summary = row.text().trim()
      # The src is taken as-is from the markup. Example URL forms:
      # http://upload.wikimedia.org/wikipedia/commons/8/8c/Lanthanum-2.jpg
      # http://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/Lanthanum-2.jpg/600px-Lanthanum-2.jpg
      # http://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/Lanthanum-2.jpg/250px-Lanthanum-2.jpg
      picture = row.find('img').attr('src')
      appearance.description = summary if summary?
      appearance.image = picture if picture?
      properties[section] = appearance
      # Only the row directly after the heading belongs to this section.
      section = null
      return

    names = headerCells.text().trim().split(listSeparator)
    rawValue = row.find('td').text().trim()
    if names.length == 1
      # A row without header text yields a single empty name; skip it.
      properties[names[0]] = rawValue unless names[0] == ''
    else
      parts = rawValue.split(listSeparator)
      properties[name] = parts[position] for name, position in names
    return

  properties
# Shared handle to the most recently loaded page: assigned by the crawlWith*
# functions and read by the parse* functions.
$ = undefined
# Used only by crawlWithJSDOM.
jsdom = require("jsdom")
# HTTP client used to download pages.
agent = require('superagent')
# Provides forEachSeries for visiting element pages one at a time.
async = require('async')
# Used to write the collected data to disk.
fs = require('fs')
# Used by crawlWithCheerio to parse downloaded HTML.
cheerio = require('cheerio')
# Accumulates one parsed object per element page (filled by crawlElement).
data = []
# Download `url` with superagent, build a jsdom window from the response body
# and publish that window's jQuery as the shared `$`.
#
# url      - absolute URL of the page to load.
# callback - invoked with no arguments once `$` has been replaced.
#
# Throws (inside the request handler) when the request produced no response.
#
# NOTE(review): no jQuery script is injected into the window (the `scripts`
# option below is disabled), so `window.jQuery` is presumably undefined unless
# something else provides it. Confirm before pointing `crawl` at this function.
crawlWithJSDOM = (url, callback) ->
  # superagent documents the handler signature as (err, response). A
  # one-argument handler would receive the error slot (null on success)
  # where the response was expected.
  agent.get(url).end (err, response) =>
    unless response?
      throw err ? new Error("No response received from #{url}")
    jsdomOptions =
      html: response.text
      #scripts: [ "https://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js" ]
    # jsdom's own error argument is still ignored, as in the original.
    jsdom.env jsdomOptions, (domErrors, window) ->
      $ = window.jQuery
      callback()
# Visit `url` through `browser` and publish the object handed to the visit
# callback as the shared `$`.
#
# url      - address to visit.
# callback - invoked with no arguments once `$` has been replaced.
#
# NOTE(review): `browser` is not defined anywhere in the visible file. It is
# presumably meant to be a Zombie.js browser instance and must be created
# before this function can be used. The visit error argument is ignored.
crawlWithZombie = (url, callback) ->
  browser.visit url, (e, b) ->
    $ = b
    # Must be a call: a bare `callback` only evaluates the reference, so the
    # caller would never be notified and the crawl would stall.
    callback()
# Download `url` with superagent and load the response body into cheerio,
# publishing the resulting selector function as the shared `$`.
#
# url      - absolute URL of the page to load.
# callback - invoked with no arguments once `$` has been replaced.
#
# Throws (inside the request handler) when the request produced no response.
crawlWithCheerio = (url, callback) ->
  # superagent documents the handler signature as (err, response). A
  # one-argument handler would receive the error slot (null on success)
  # where the response was expected.
  agent.get(url).end (err, response) =>
    unless response?
      throw err ? new Error("No response received from #{url}")
    $ = cheerio.load(response.text)
    callback()
# Active page loader: crawlElement and crawlTable fetch pages through this
# alias, so swapping the right-hand side switches the loading backend.
crawl = crawlWithCheerio
# Load one element page and append its parsed infobox to `data`.
#
# url      - site-relative path of the element page; it is appended to
#            "http://en.wikipedia.org" to form the address that is fetched.
# callback - invoked with no arguments after the parsed result is stored.
crawlElement = (url, callback) ->
  # Progress trace: one line per element page.
  console.log(url)
  storeParsedPage = ->
    data.push(parseElement())
    callback()
  crawl("http://en.wikipedia.org#{url}", storeParsedPage)
# Crawl the periodic-table page at `url`, then every element page it links
# to (one at a time), and finally write the collected data as JSON to
# tmp/elements.json.
#
# url - absolute URL of the table page whose links are followed.
crawlTable = (url) ->
  outputDir = 'tmp'
  outputFile = "#{outputDir}/elements.json"

  crawl url, ->
    # Append a slice such as [0..1] to this call to limit the crawl while debugging.
    urls = parseWikipediaTableOfElements()

    # Defer each element fetch to the next tick before starting it.
    iterator = (elementUrl, next) ->
      process.nextTick ->
        crawlElement(elementUrl, next)

    async.forEachSeries urls, iterator, ->
      console.log 'DONE'
      json = elements: data
      # writeFileSync does not create missing directories; without this the
      # whole crawl would finish and then fail with ENOENT.
      fs.mkdirSync(outputDir) unless fs.existsSync(outputDir)
      fs.writeFileSync(outputFile, JSON.stringify(json, null, 2))
# Entry point: start the crawl from Wikipedia's large-format periodic table page.
crawlTable("http://en.wikipedia.org/wiki/Periodic_table_(large_version)")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment