Created
September 18, 2012 15:12
-
-
Save jedahan/3743673 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Dependencies: Node's fs module, the zombie browser and jsdom.
fs     = require 'fs'
Zombie = require 'zombie'
jsdom  = require 'jsdom'

# jQuery source text, read once at start-up from the current working directory;
# it is handed to jsdom as an inline script for every scraped page.
jquery = fs.readFileSync './jquery.min.js', 'utf8'

# Browser instance with stylesheet loading and page scripts switched off.
zombie = new Zombie {loadCSS: false, runScripts: false}

# URL prefix; an object id is appended to it to address a single page.
base = 'http://www.metmuseum.org/Collections/search-the-collections/'
# Scrape one collection object page and write what was found to
# objects/<id>.json.
#
# id - object id; appended to `base` to form the page URL and used in the
#      output file name.
#
# The written JSON holds `id`, `image` (the fullscreen image src, or null when
# it does not start with "http"), one key per <dt>/<dd> pair on the page, and
# `related-artworks` (numeric ids taken from the links under `.object-info`).
# Nothing useful is returned; problems are logged to the console.
scrapeObject = (id) ->
  path = "objects/#{id}.json"

  # Callers start many scrapes back to back, so each visit gets its own browser
  # instead of sharing one whose current page a later visit could replace.
  visitor = new Zombie loadCSS: false, runScripts: false

  visitor.visit base+id, (e, browser, status) ->
    console.log status if status isnt 200
    console.log e if e?
    # Without a browser there is no markup to parse.
    return unless browser?

    jsdom.env
      html: browser.html()
      src: [ jquery ]
      done: (domErr, window) ->
        console.log domErr if domErr?
        # jsdom could not build a window; nothing to extract.
        return unless window?

        $ = window.$
        object = {}

        # Text clean-up pipeline: split on CRLF, strip the first "(1,234)"-style
        # group or colon from each line (the pattern has no g flag), trim, drop
        # empty lines, and unwrap a single-element result.
        # The composed helper is named `clean` so that it does not shadow
        # Node's global `process`.
        arrify = (str) -> str.split /\r\n/
        removeNums = (arr) -> line.replace(/\([0-9,]+\)|:/, '').trim() for line in arr
        removeNull = (arr) -> arr.filter (s) -> s.length
        flatten = (arr) -> if arr.length is 1 then arr[0] else arr
        clean = (str) -> flatten removeNull removeNums arrify str

        object.id = +id

        # Add the image uri if it starts with http
        object.image = $('a[name="art-object-fullscreen"] > img').attr('src')
        object.image = null unless /^http/.test object.image

        # Map each definition as its own key and value(s). The <dt> list is
        # queried once rather than once per <dd>.
        terms = $('dt')
        object[clean $(terms[i]).text()] = clean $(v).text() for v, i in $('dd')

        # Make an array of related artwork ids. Links with no href, or whose
        # href contains no digits, are skipped instead of throwing and
        # aborting the write.
        related = []
        for a in $('.object-info a')
          digits = ($(a).attr('href') or '').match(/[0-9]+/)
          related.push(+digits[0]) if digits?
        object['related-artworks'] = related

        fs.writeFileSync path, JSON.stringify object

        # Release the jsdom window once the data is on disk, when the jsdom
        # version in use provides close().
        window.close?()
# Command-line entry point: every argument after the script name is passed to
# `require`, and each id in the list it yields is scraped.
for source in process.argv.slice 2
  ids = require source
  for id in ids
    scrapeObject id
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment