Skip to content

Instantly share code, notes, and snippets.

@jedahan
Created September 18, 2012 15:12
Show Gist options
  • Select an option

  • Save jedahan/3743673 to your computer and use it in GitHub Desktop.

Select an option

Save jedahan/3743673 to your computer and use it in GitHub Desktop.
fs = require 'fs'
Zombie = require 'zombie'
jsdom = require 'jsdom'
# Source text of the local jQuery build; it is handed to jsdom so that every
# parsed page gets a `$` to query with.
jquery = fs.readFileSync './jquery.min.js', 'utf8'

# One shared headless browser, configured not to load CSS or run page scripts.
zombie = new Zombie
  loadCSS: false
  runScripts: false

# URL prefix that an object id is appended to.
base = 'http://www.metmuseum.org/Collections/search-the-collections/'
# Scrape one object page and save its metadata as JSON in objects/<id>.json.
#
# id - object identifier: appended to `base` to build the page URL, stored
#      (coerced to a number) in the output, and used for the output file name.
#
# The page is fetched with the shared `zombie` browser, its HTML is re-parsed
# by jsdom with jQuery injected, and the fields are pulled out with selectors.
# Results are delivered by writing the file; failures are logged with
# console.log and the object is skipped rather than throwing inside a callback.
#
# NOTE(review): every call reuses the single module-level `zombie` browser;
# if visits overlap, confirm that browser.html() still belongs to this id.
scrapeObject = (id) ->
  path = "objects/#{id}.json"
  zombie.visit base+id, (e, browser, status) ->
    console.log status if status isnt 200
    console.log e if e?
    # A failed visit may not hand back a browser; without one there is no HTML.
    return unless browser?
    jsdom.env
      html: browser.html()
      src: [ jquery ]
      done: (err, window) ->
        console.log err if err?
        # No window, or jQuery did not load into it: nothing can be queried.
        return unless window?.$?
        $ = window.$
        object = {}

        # Text clean-up pipeline, applied to both <dt> and <dd> text.
        # Split on CRLF line breaks.
        arrify = (str) -> str.split /\r\n/
        # Drop the first "(digits,commas)" group or the first colon in each
        # line, then trim surrounding whitespace.
        removeNums = (arr) -> str.replace(/\([0-9,]+\)|:/, '').trim() for str in arr
        # Discard lines that ended up empty.
        removeNull = (arr) -> arr.filter (s) -> s.length
        # A single remaining line is returned as a bare string, otherwise the array.
        flatten = (arr) -> if arr.length is 1 then arr[0] else arr
        # Named `clean` so it does not shadow Node's global `process`.
        clean = (str) -> flatten removeNull removeNums arrify str

        object.id = +id

        # Add the image uri if it starts with http
        object.image = $('a[name="art-object-fullscreen"] > img').attr('src')
        object.image = null unless /^http/.test object.image

        # Map each definition as its own key and value(s); the i-th <dd> is
        # paired with the i-th <dt>. The <dt> list is queried once, not per row.
        terms = $('dt')
        for dd, i in $('dd')
          key = clean $(terms[i]).text()
          object[key] = clean $(dd).text()

        # Make an array of related artwork ids: the first run of digits in each
        # link's href. Links with no href or no digits are skipped instead of
        # raising a TypeError.
        related = []
        for a in $('.object-info a')
          digits = $(a).attr('href')?.match(/[0-9]+/)
          related.push +digits[0] if digits?
        object['related-artworks'] = related

        fs.writeFileSync path, JSON.stringify object
# Driver: every command-line argument after the script name is passed to
# `require`, and each element of the loaded list is scraped as an object id.
for arg in process.argv.slice 2
  ids = require arg
  for id in ids
    scrapeObject id
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment