Skip to content

Instantly share code, notes, and snippets.

@lmccart
Last active August 29, 2015 14:19
Show Gist options
  • Save lmccart/5e4262594db7ed580e9e to your computer and use it in GitHub Desktop.
Save lmccart/5e4262594db7ed580e9e to your computer and use it in GitHub Desktop.
web scraping
//////////////////////////////////////////////////////////
//// basic
var cheerio = require('cheerio');
var request = require('request');
var url = 'http://itp.nyu.edu/sigs/program/?sortby=tier&semesteryear=Spring%202015';
// Fetch the course listing page and print the text of every element that
// matches the ".course-title" selector, one console line per element.
request(url, function(err, resp, body) {
  if (err) {
    // The request failed, so there is no body to parse: report and stop.
    console.log(err);
    return;
  }
  // Declared with var so the parsed document stays local to this callback
  // instead of leaking out as an implicit global.
  var $ = cheerio.load(body);
  $('.course-title').each(function() {
    console.log($(this).text());
  });
});
//////////////////////////////////////////////////////////
//// search inside element
var cheerio = require('cheerio');
var request = require('request');
var url = 'http://itp.nyu.edu/sigs/program/?sortby=tier&semesteryear=Spring%202015';
// Fetch the course listing page and, for every <li> inside ".course-listing",
// print the text of its title, instructors and description sub-elements.
request(url, function(err, resp, body) {
  if (err) {
    // The request failed, so there is no body to parse: report and stop.
    console.log(err);
    return;
  }
  // Declared with var so the parsed document stays local to this callback
  // instead of leaking out as an implicit global.
  var $ = cheerio.load(body);
  $('.course-listing li').each(function() {
    // Search only inside the current list item rather than the whole page.
    var item = $(this);
    console.log(item.find('.course-title').text());
    console.log(item.find('.course-instructors').text());
    console.log(item.find('.course-desc').text());
  });
});
//////////////////////////////////////////////////////////
//// craigslist
var cheerio = require('cheerio');
var request = require('request');
var url = 'http://newyork.craigslist.org/search/aap';
// Fetch the search results page and pull the headline, price, housing and
// ".pnr" text out of every ".row" element.
request(url, function(err, resp, body) {
  if (err) {
    // The request failed, so there is no body to parse: report and stop.
    console.log(err);
    return;
  }
  // Declared with var so the parsed document stays local to this callback
  // instead of leaking out as an implicit global.
  var $ = cheerio.load(body);
  $('.row').each(function() {
    var row = $(this);
    // console.log(row.text()); // uncomment to dump the full text of the row
    // The extracted values are not used further in this snippet.
    var title = row.find('.hdrlnk').text();
    var price = row.find('.price').text();
    var rooms = row.find('.housing').text();
    var pnr = row.find('.pnr').text();
  });
});
//////////////////////////////////////////////////////////
//// scrape multiple pages
var cheerio = require('cheerio');
var request = require('request');
var url = 'http://newyork.craigslist.org/search/aap';
var links = [];
// Step 1: fetch the search results page and collect the URL of every posting.
// Step 2: fetch each posting page and print its map coordinates.
request(url, function(err, resp, body) {
  if (err) {
    // The index request failed, so there is nothing to parse: report and stop.
    console.log(err);
    return;
  }
  // Local to this callback; the per-posting callbacks below parse their own
  // documents and must not overwrite a shared global.
  var $ = cheerio.load(body);
  $('.row').each(function() {
    var href = $(this).find('.hdrlnk').attr('href');
    // A row without a headline link would otherwise produce a URL ending in
    // the literal text "undefined".
    if (href) {
      links.push('http://newyork.craigslist.org' + href);
    }
  });
  links.forEach(function(link) {
    request(link, function(pageErr, pageResp, pageBody) {
      if (pageErr) {
        // One failed posting should not stop the others: report and skip it.
        console.log(pageErr);
        return;
      }
      var $ = cheerio.load(pageBody);
      var title = $('title').text();
      var price = $('.postingtitle').find('.price').text();
      // 1 when the posting carries the matching attribute text, otherwise 0.
      var cats = $('p.attrgroup:contains("cats are OK - purrr")').length;
      var dogs = $('p.attrgroup:contains("dogs are OK - wooof")').length;
      var addr = $('div.mapaddress').text();
      var lat = $('#map').attr('data-latitude');
      var lon = $('#map').attr('data-longitude');
      // Drop the first character (presumably the "$" sign) before parsing;
      // the result is NaN when no price text was found.
      price = parseInt(price.substring(1), 10);
      console.log(lat, lon);
    });
  });
});
//////////////////////////////////////////////////////////
//// using servi / database
var cheerio = require('cheerio');
var request = require('request');
// every servi application must have these 2 lines
// NOTE(review): port(), useDatabase(), route() and start() are called below
// without a receiver, so creating the servi instance presumably installs
// them as globals — confirm against the servi documentation.
var servi = require("servi");
var app = new servi(true);
// set the port (defaults to 3000 if you leave out this line)
port(3001);
// set up a database
// looks for a file called "listings.db" or creates one if it doesn't exist
// the returned handle is used below via db.add() and db.getAll()
var db = useDatabase("listings");
// Step 1: fetch the search results page and collect the URL of every posting.
// Step 2: fetch each posting page, extract its details and store them in db.
var url = 'http://newyork.craigslist.org/search/aap';
var links = [];
request(url, function(err, resp, body) {
  if (err) {
    // The index request failed, so there is nothing to parse: report and stop.
    console.log(err);
    return;
  }
  // Local to this callback; the per-posting callbacks below parse their own
  // documents and must not overwrite a shared global.
  var $ = cheerio.load(body);
  $('.row').each(function() {
    var href = $(this).find('.hdrlnk').attr('href');
    // A row without a headline link would otherwise produce a URL ending in
    // the literal text "undefined".
    if (href) {
      links.push('http://newyork.craigslist.org' + href);
    }
  });
  links.forEach(function(link) {
    request(link, function(pageErr, pageResp, pageBody) {
      if (pageErr) {
        // One failed posting should not stop the others: report and skip it.
        console.log(pageErr);
        return;
      }
      var $ = cheerio.load(pageBody);
      var title = $('title').text();
      var price = $('.postingtitle').find('.price').text();
      // Drop the first character (presumably the "$" sign) before parsing;
      // the result is NaN when no price text was found.
      price = parseInt(price.substring(1), 10);
      // 1 when the posting carries the matching attribute text, otherwise 0.
      var cats = $('p.attrgroup:contains("cats are OK - purrr")').length;
      var dogs = $('p.attrgroup:contains("dogs are OK - wooof")').length;
      var addr = $('div.mapaddress').text();
      // Attribute values are strings, or undefined when the page has no #map.
      var lat = $('#map').attr('data-latitude');
      var lon = $('#map').attr('data-longitude');
      var listing = {
        title: title,
        price: price,
        cats: cats,
        dogs: dogs,
        addr: addr,
        lat: lat,
        lon: lon
      };
      db.add(listing);
    });
  });
});
// set up the routes
// requests for the path '/all' are handled by the showAll function
route('/all', showAll);
/**
 * Route handler: responds with an HTML fragment listing every stored
 * listing's title, price and address.
 *
 * @param {Object} request - request object passed in by the router; only its
 *   respond(text) method is used here.
 */
function showAll(request){
  // Listing fields come from scraped third-party pages, so they are escaped
  // before being placed inside markup.
  function escapeHtml(value) {
    return String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }
  db.getAll(function(data) {
    var listingText = "";
    // The counter is declared with var; the undeclared `i` used previously
    // created an implicit global and throws in strict mode.
    for (var i = 0; i < data.length; i++) {
      listingText += "<p><b>" + escapeHtml(data[i].title) + "</b><br/>";
      listingText += escapeHtml(data[i].price) + "<br/>";
      listingText += escapeHtml(data[i].addr) + "<br/></p>";
    }
    request.respond( listingText );
  });
}
// NOTE(review): presumably starts the servi server once the routes above are
// registered — confirm against the servi documentation.
start();
//////////////////////////////////////////////////////////
//// paths / json
// requests for the path '/all' are handled by showAll
route('/all', showAll);
// requests for '/max/<value>' are handled by showListings, which reads the
// ':num' path segment as request.params.num
route('/max/:num', showListings);
/**
 * Route handler: sends every record in the database back to the client as a
 * JSON string.
 *
 * @param {Object} request - request object passed in by the router; its
 *   header() and respond() methods are used here.
 */
function showAll(request) {
  // Named callback: set the header value first, then serialise and send.
  function sendAsJson(records) {
    request.header("application/json");
    var payload = JSON.stringify(records);
    request.respond(payload);
  }
  db.getAll(sendAsJson);
}
/**
 * Route handler: responds with a JSON array of the stored listings whose
 * price is less than or equal to the ':num' route parameter.
 *
 * @param {Object} request - request object passed in by the router; reads
 *   request.params.num and uses header() and respond().
 */
function showListings(request) {
  // The route parameter is converted to a number once, so the comparison
  // below is always numeric and never a string-to-string comparison.
  // A non-numeric parameter becomes NaN and matches no listing.
  var maxPrice = Number(request.params.num);
  db.getAll(function(data) {
    var listings = [];
    for (var i = 0; i < data.length; i++) {
      var price = data[i].price;
      // `!= null` skips both null and undefined: a missing price would
      // otherwise coerce to 0 and satisfy every maximum.
      if (price != null && price <= maxPrice) {
        listings.push(data[i]);
      }
    }
    request.header("application/json");
    request.respond(JSON.stringify(listings));
  });
}
//////////////////////////////////////////////////////////
//// add p5.js
/**
 * Sketch initialisation: creates a canvas sized from windowWidth and
 * windowHeight, requests '/all' as JSON with drawScene as the callback, then
 * sets the drawing defaults (no stroke, centred text, background value 0).
 */
function setup() {
  var canvasWidth = windowWidth;
  var canvasHeight = windowHeight;
  createCanvas(canvasWidth, canvasHeight);

  // drawScene is handed the loaded data.
  var onListingsLoaded = drawScene;
  loadJSON('/all', onListingsLoaded);

  noStroke();
  textAlign(CENTER);
  background(0);
}
/**
 * Draws each listing's title at the horizontal centre of the canvas, at a
 * height obtained by mapping its price from the range 1000..4000 onto
 * 0..height. Titles are filled with (255, 0, 0) when the listing's `cats`
 * value is truthy and with 255 otherwise.
 *
 * @param {Array} data - listings, each read for `price`, `cats` and `title`.
 */
function drawScene(data) {
  console.log(data);
  var index = 0;
  while (index < data.length) {
    var listing = data[index];
    var yPos = map(listing.price, 1000, 4000, 0, height);
    if (!listing.cats) {
      fill(255);
    } else {
      fill(255, 0, 0);
    }
    text(listing.title, width / 2, yPos);
    index += 1;
  }
}
//////////////////////////////////////////////////////////
//// mapbox.js
<!DOCTYPE html>
<!-- Map page: loads the listings from the /all JSON route and places one
     marker per listing that has coordinates. -->
<html>
<head>
<meta charset='utf-8' />
<!-- NOTE(review): jQuery appears to be included twice (a local jquery.js and
     the CDN build) - confirm which one is intended and drop the other. -->
<script src='jquery.js'></script>
<script src="//code.jquery.com/jquery-1.11.1.min.js"></script>
<script src='https://api.tiles.mapbox.com/mapbox.js/v2.1.8/mapbox.js'></script>
<link href='https://api.tiles.mapbox.com/mapbox.js/v2.1.8/mapbox.css' rel='stylesheet' />
<style>
body { margin:0; padding:0; }
#map { position:absolute; top:0; bottom:0; width:100%; }
</style>
</head>
<body>
<div id="map"></div>
<script>
// Provide your access token
L.mapbox.accessToken = 'YOUR_TOKEN_HERE';
// Create a map in the div #map
var map = L.mapbox.map('map', 'examples.map-zr0njcqy').setView([40.7127, -74.0059], 13);
$.getJSON('/all', function(data) {
  console.log(data);
  for (var i = 0; i < data.length; i++) {
    var listing = data[i];
    // A marker needs both coordinates; skip listings missing either one.
    if (!listing.lat || !listing.lon) {
      continue;
    }
    // Opacity scales with price (5000 and above is fully opaque). It is
    // clamped to the 0..1 range, and a listing with no numeric price is drawn
    // fully opaque rather than with an invalid or invisible opacity.
    var opacity = 1;
    if (typeof listing.price === 'number' && isFinite(listing.price)) {
      opacity = Math.min(1, Math.max(0, listing.price / 5000));
    }
    var options = {
      title: listing.title + ' $' + listing.price,
      opacity: opacity
    };
    L.marker([listing.lat, listing.lon], options).addTo(map);
  }
}).fail(function(jqXHR, textStatus, error) {
  // Report a failed or unparsable /all response instead of failing silently.
  console.log('could not load /all:', textStatus, error);
});
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment