Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save 525c1e21-bd67-4735-ac99-b4b0e5262290/1175439 to your computer and use it in GitHub Desktop.
Save 525c1e21-bd67-4735-ac99-b4b0e5262290/1175439 to your computer and use it in GitHub Desktop.
Wikipedia periodic table element scraper (i.e. elements.json)
scraper = require 'scraper'
async = require 'async'
# Scrape the Wikipedia periodic table page, follow every same-site link found
# in its first table, and collect each element's name/symbol/number and
# group/period/block from the first table of the linked page. Progress and the
# collected records are written to the console.
index = 'http://en.wikipedia.org/wiki/Periodic_table'
console.log "-scrape #{index}"
scraper index, (error, jQuery) ->
  if error
    # Without the index page there is nothing to crawl, and jQuery must not
    # be touched, so stop here instead of falling through.
    console.log error
    return
  console.log "+scrape #{index}"

  host = 'http://en.wikipedia.org'
  db = {}
  total = 0
  done = 0
  start = new Date

  # Print every record collected so far, one line per db key.
  dump = ->
    for key, value of db
      console.log "#{key}: #{JSON.stringify value}"
    return

  # Split a comma-separated table cell into whitespace-trimmed fields, so
  # "hydrogen, H, 1" yields ['hydrogen', 'H', '1'] rather than ' H', ' 1'.
  fields = (cell) -> (part.trim() for part in cell.text().split ',')

  # Periodic progress dump; the handle is kept so the timer can be cancelled
  # once crawling is finished (an active interval keeps the process alive).
  timer = setInterval dump, 5000

  # Worker: fetch one linked page and extract its element record.
  # Up to 12 pages are fetched concurrently.
  q = async.queue (task, callback) ->
    console.log "-scrape #{task}"
    scraper host + task, (error, jQuery) ->
      console.log "+scrape #{task}"
      if error
        # A failed page is reported and skipped; the queue keeps going.
        console.log error
      else
        number = null
        for tr in jQuery('table:first tr')
          row = jQuery tr
          heading = row.find('th').text()
          if heading is 'Name, symbol, number'
            [name, symbol, number] = fields row.find('td')
            # Only store a record when the cell really had a number field.
            if number
              db[number] = name: name, symbol: symbol, number: number
          else if heading is 'Group, period, block' and number and db[number]?
            # Guarded: this row is only meaningful after the name row has
            # created the record; otherwise db[number] would be undefined.
            [group, period, block] = fields row.find('td')
            db[number].group = group
            db[number].period = period
            db[number].block = block
      done++
      seconds_per_request = 1 / (done / ((new Date - start) / 1000))
      console.log "[#{done}/#{total}] (~#{seconds_per_request} seconds)"
      callback()
  , 12

  # When the last task completes: stop the periodic dump and print the final
  # state once, so the process can exit.
  q.drain = ->
    console.log 'drain'
    clearInterval timer
    dump()

  # Enqueue each distinct same-site link from the first table. Fragment-only
  # ('#...'), protocol-relative ('//...') and absolute links are skipped
  # because appending them to the host would not form a valid page URL.
  seen = {}
  for a in jQuery 'table:first td a'
    href = jQuery(a).attr 'href'
    continue unless href and href.charAt(0) is '/' and href.charAt(1) isnt '/'
    continue if seen.hasOwnProperty href
    seen[href] = true
    q.push href
    total++

  # Nothing was queued, so drain will never fire; release the timer now.
  clearInterval timer if total is 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment