Last active
August 29, 2015 14:19
-
-
Save lmccart/5e4262594db7ed580e9e to your computer and use it in GitHub Desktop.
web scraping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
////////////////////////////////////////////////////////// | |
//// basic | |
var cheerio = require('cheerio'); | |
var request = require('request'); | |
var url = 'http://itp.nyu.edu/sigs/program/?sortby=tier&semesteryear=Spring%202015';

// Fetch the page at `url` and print the text of every element with the
// class "course-title".
request(url, function(err, resp, body) {
  if (err) {
    // There is no body to parse when the request failed, so report and stop.
    console.log(err);
    return;
  }
  // Keep the cheerio handle local instead of leaking a global `$`.
  var $ = cheerio.load(body);
  $('.course-title').each(function() {
    console.log($(this).text());
  });
});
////////////////////////////////////////////////////////// | |
//// search inside element | |
var cheerio = require('cheerio'); | |
var request = require('request'); | |
var url = 'http://itp.nyu.edu/sigs/program/?sortby=tier&semesteryear=Spring%202015';

// Fetch the page at `url` and, for every <li> inside ".course-listing",
// print the text of its title, instructors and description elements.
request(url, function(err, resp, body) {
  if (err) {
    // There is no body to parse when the request failed, so report and stop.
    console.log(err);
    return;
  }
  // Keep the cheerio handle local instead of leaking a global `$`.
  var $ = cheerio.load(body);
  $('.course-listing li').each(function() {
    // Wrap the row once and search inside it for each field.
    var course = $(this);
    console.log(course.find('.course-title').text());
    console.log(course.find('.course-instructors').text());
    console.log(course.find('.course-desc').text());
  });
});
////////////////////////////////////////////////////////// | |
//// craigslist | |
var cheerio = require('cheerio'); | |
var request = require('request'); | |
var url = 'http://newyork.craigslist.org/search/aap';

// Fetch the search results page at `url` and, for every ".row" element,
// extract the headline, price, housing and ".pnr" text and print them.
request(url, function(err, resp, body) {
  if (err) {
    // There is no body to parse when the request failed, so report and stop.
    console.log(err);
    return;
  }
  // Keep the cheerio handle local instead of leaking a global `$`.
  var $ = cheerio.load(body);
  $('.row').each(function() {
    var row = $(this);
    var title = row.find('.hdrlnk').text();
    var price = row.find('.price').text();
    var rooms = row.find('.housing').text();
    var pnr = row.find('.pnr').text();
    // Print the extracted fields; previously they were computed and discarded.
    console.log(title, price, rooms, pnr);
  });
});
////////////////////////////////////////////////////////// | |
//// scrape multiple pages | |
var cheerio = require('cheerio'); | |
var request = require('request'); | |
var url = 'http://newyork.craigslist.org/search/aap';
var links = [];

// Step 1: fetch the search results page and collect the link of every row.
// Step 2: fetch each linked posting and print its map coordinates.
request(url, function(err, resp, body) {
  if (err) {
    // There is no body to parse when the request failed, so report and stop.
    console.log(err);
    return;
  }
  var $ = cheerio.load(body);
  $('.row').each(function() {
    var href = $(this).find('.hdrlnk').attr('href');
    // Rows without a headline link would otherwise yield "...orgundefined".
    if (href) {
      links.push('http://newyork.craigslist.org' + href);
    }
  });

  for (var i = 0; i < links.length; i++) {
    request(links[i], function(err, resp, body) {
      if (err) {
        // Skip postings that could not be fetched instead of crashing on an undefined body.
        console.log(err);
        return;
      }
      // Local handle: each posting is parsed independently of the results page.
      var $ = cheerio.load(body);
      // Only the coordinates are printed; the other fields are extracted
      // but not used further in this snippet.
      var title = $('title').text();
      var price = $('.postingtitle').find('.price').text();
      var cats = $('p.attrgroup:contains("cats are OK - purrr")').length;
      var dogs = $('p.attrgroup:contains("dogs are OK - wooof")').length;
      var addr = $('div.mapaddress').text();
      var lat = $('#map').attr('data-latitude');
      var lon = $('#map').attr('data-longitude');
      // Strip the currency sign and any thousands separators before parsing;
      // the result is NaN when the posting shows no price.
      price = parseInt(price.replace(/[^0-9.]/g, ''), 10);
      console.log(lat, lon);
    });
  }
});
////////////////////////////////////////////////////////// | |
//// using servi / database | |
var cheerio = require('cheerio'); | |
var request = require('request'); | |
// every servi application must have these 2 lines | |
var servi = require("servi"); | |
var app = new servi(true);

// set the port (defaults to 3000 if you leave out this line)
port(3001);

// set up a database
// looks for a file called "listings.db" or creates one if it doesn't exist
var db = useDatabase("listings");

var url = 'http://newyork.craigslist.org/search/aap';
var links = [];

// Step 1: fetch the search results page and collect the link of every row.
// Step 2: fetch each linked posting, extract its fields and store them in the database.
request(url, function(err, resp, body) {
  if (err) {
    // There is no body to parse when the request failed, so report and stop.
    console.log(err);
    return;
  }
  var $ = cheerio.load(body);
  $('.row').each(function() {
    var href = $(this).find('.hdrlnk').attr('href');
    // Rows without a headline link would otherwise yield "...orgundefined".
    if (href) {
      links.push('http://newyork.craigslist.org' + href);
    }
  });

  for (var i = 0; i < links.length; i++) {
    request(links[i], function(err, resp, body) {
      if (err) {
        // Skip postings that could not be fetched instead of crashing on an undefined body.
        console.log(err);
        return;
      }
      // Local handle: each posting is parsed independently of the results page.
      var $ = cheerio.load(body);
      var title = $('title').text();
      var price = $('.postingtitle').find('.price').text();
      // Strip the currency sign and any thousands separators before parsing;
      // the result is NaN when the posting shows no price.
      price = parseInt(price.replace(/[^0-9.]/g, ''), 10);
      // Number of attribute groups mentioning each pet policy (0 when absent).
      var cats = $('p.attrgroup:contains("cats are OK - purrr")').length;
      var dogs = $('p.attrgroup:contains("dogs are OK - wooof")').length;
      var addr = $('div.mapaddress').text();
      // Coordinates are read as attribute strings; undefined when the page has no #map.
      var lat = $('#map').attr('data-latitude');
      var lon = $('#map').attr('data-longitude');
      var listing = {
        title: title,
        price: price,
        cats: cats,
        dogs: dogs,
        addr: addr,
        lat: lat,
        lon: lon
      };
      db.add(listing);
    });
  }
});

// set up the routes
route('/all', showAll);
// Escape a value for safe insertion into HTML text. Listing fields come from
// scraped pages, so they must not be trusted to be markup-free.
function escapeHtml(value) {
  return String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

// show all the listings
// Reads every record from `db` and responds with one HTML paragraph per
// listing containing its title (bold), price and address.
function showAll(request){
  db.getAll(function(data) {
    var listingText = "";
    // `var` keeps the counter local; it used to leak as an implicit global.
    for (var i = 0; i < data.length; i++) {
      listingText += "<p><b>" + escapeHtml(data[i].title) + "</b><br/>";
      listingText += escapeHtml(data[i].price) + "<br/>";
      listingText += escapeHtml(data[i].addr) + "<br/></p>";
    }
    request.respond( listingText );
  });
}
// Called last, after the port, database and '/all' route above have been set up.
// NOTE(review): presumably this launches the servi server — confirm against the servi docs.
start(); | |
////////////////////////////////////////////////////////// | |
//// paths / json | |
// Register the handlers: '/all' is served by showAll and '/max/:num' by
// showListings, which reads the :num segment through request.params.num.
route('/all', showAll); | |
route('/max/:num', showListings); | |
// Respond with every record stored in `db`, serialized as a JSON string.
function showAll(request){
  // Handler for the records handed back by the database.
  var sendAsJson = function(records) {
    request.header("application/json");
    var payload = JSON.stringify(records);
    request.respond(payload);
  };
  db.getAll(sendAsJson);
}
// Respond with the listings whose price is at most the :num route parameter,
// serialized as a JSON string.
//
// request.params.num arrives as a string, so it is converted to a number
// explicitly. A non-numeric value yields NaN, which matches no listing.
// Listings without a usable price (null, undefined, NaN) are excluded;
// with implicit coercion a null price compared as 0 and was always included.
function showListings(request) {
  var max = Number(request.params.num);
  db.getAll(function(data) {
    var listings = [];
    for (var i = 0; i < data.length; i++) {
      var price = data[i].price;
      if (typeof price !== 'number') {
        // parseFloat gives NaN for null/undefined/'' so those never match.
        price = parseFloat(price);
      }
      if (price <= max) {
        listings.push(data[i]);
      }
    }
    request.header("application/json");
    request.respond(JSON.stringify(listings));
  });
}
////////////////////////////////////////////////////////// | |
//// add p5.js | |
// Sketch initialisation: create a canvas sized from windowWidth/windowHeight,
// paint it black, set the drawing defaults, then request '/all' with
// drawScene as the callback that receives the loaded data.
function setup() {
  createCanvas(windowWidth, windowHeight);
  background(0);
  noStroke();
  textAlign(CENTER);
  loadJSON('/all', drawScene);
}
// Callback for the '/all' request: logs the data, then draws each listing's
// title at the horizontal centre, with a vertical position obtained by
// mapping its price from the 1000..4000 range onto 0..height. Listings with
// a truthy `cats` value are drawn in red, the others in white.
function drawScene(data) {
  console.log(data);
  for (var idx = 0; idx < data.length; idx += 1) {
    var listing = data[idx];
    var yPos = map(listing.price, 1000, 4000, 0, height);
    if (!listing.cats) {
      fill(255);
    } else {
      fill(255, 0, 0);
    }
    text(listing.title, width / 2, yPos);
  }
}
////////////////////////////////////////////////////////// | |
//// mapbox.js | |
<!DOCTYPE html>
<html>
<head>
<!-- jQuery is loaded once, over https like the mapbox assets below.
     The page used to load it twice: a local jquery.js plus a protocol-relative CDN URL. -->
<script src="https://code.jquery.com/jquery-1.11.1.min.js"></script>
<script src='https://api.tiles.mapbox.com/mapbox.js/v2.1.8/mapbox.js'></script>
<link href='https://api.tiles.mapbox.com/mapbox.js/v2.1.8/mapbox.css' rel='stylesheet' />
<style>
body { margin:0; padding:0; }
#map { position:absolute; top:0; bottom:0; width:100%; }
</style>
</head>
<body>
<div id="map"></div>
<script>
// Provide your access token
L.mapbox.accessToken = 'YOUR_TOKEN_HERE';
// Create a map in the div #map
var map = L.mapbox.map('map', 'examples.map-zr0njcqy').setView([40.7127, -74.0059], 13);

// Load every listing from '/all' and add one marker per listing that has coordinates.
$.getJSON('/all', function(data) {
  console.log(data);
  for (var i = 0; i < data.length; i++) {
    var listing = data[i];
    // A marker needs both coordinates; skip listings missing either one.
    if (!listing.lat || !listing.lon) {
      continue;
    }
    // Opacity scales with price (price/5000), clamped to the valid 0..1 range.
    // Listings without a numeric price are drawn fully opaque.
    var opacity = listing.price / 5000;
    if (!isFinite(opacity)) {
      opacity = 1;
    }
    opacity = Math.max(0, Math.min(1, opacity));
    var options = {
      title: listing.title + ' $' + listing.price,
      opacity: opacity
    };
    L.marker([listing.lat, listing.lon], options).addTo(map);
  }
});
</script>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment