Last active
March 30, 2016 09:44
-
-
Save ronanguilloux/b587281fdb3b524d2c04918ffa03e59c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var express = require('express'); | |
var fs = require('fs'); | |
var request = require('request'); | |
var cheerio = require('cheerio'); | |
var SimpleJson2Csv = require('simple-json2csv'); | |
/* | |
package.json: | |
{ | |
"name": "akeneo-scraper", | |
"version": "0.0.1", | |
"description": "Scrape le web.", | |
"main": "server.js", | |
"author": "Akeneo", | |
"dependencies": { | |
"cheerio": "latest", | |
"express": "latest", | |
"request": "latest", | |
"simple-json2csv": "0.0.5" | |
} | |
} | |
*/ | |
var app = express();

/**
 * GET /scrape
 *
 * Downloads the attendee page, collects the text of every `.attendee`
 * element and writes the names as a one-column CSV ("Name") to `output.csv`
 * in the current working directory.
 *
 * Responses:
 *   - 200 'Check your console!' once the CSV has been fully written.
 *   - 502 when the page cannot be fetched (network error or non-200 status).
 *   - 500 when the CSV file cannot be written.
 */
app.get('/scrape', function(req, res){
    // Declared with `var` so it does not leak as an implicit global.
    var url = 'http://imagine.magento.com/attendees';

    // The callback receives an error (if any), the response object and the body html.
    request(url, function(error, response, html){
        // Bail out early instead of writing an empty CSV and reporting success.
        if (error || !response || response.statusCode !== 200) {
            var reason = error ? error.message : 'HTTP status ' + (response && response.statusCode);
            console.error('Could not fetch ' + url + ': ' + reason);
            res.status(502).send('Could not fetch the attendee list.');
            return;
        }

        // Shape expected by simple-json2csv: column definitions plus row objects.
        var json = {fields: [{ name: "name", header: "Name" }], data: []};

        // cheerio gives us a jQuery-like API over the downloaded markup.
        var $ = cheerio.load(html);

        // `.each` is the iteration method; `.filter` is meant for selecting elements.
        $('.attendee').each(function(){
            // Strip surrounding whitespace and every embedded newline
            // (a string pattern would only remove the first one).
            var name = $(this).text().trim().replace(/\n/g, '');
            json.data.push({name: name});
        });

        var outputFile = 'output.csv';
        var output = fs.createWriteStream(outputFile);

        // Surface file-system failures to the client instead of crashing on an
        // unhandled stream 'error' event.
        output.on('error', function(writeError){
            console.error('Could not write ' + outputFile + ': ' + writeError.message);
            if (!res.headersSent) {
                res.status(500).send('Could not write ' + outputFile + '.');
            }
        });

        // Only report success once the data has actually been flushed to disk.
        output.on('finish', function(){
            console.log('Data successfully scrapped and stored! - Check your project directory for the ' + outputFile + ' file');
            // This app has no UI, so just point the caller at the console.
            res.send('Check your console!');
        });

        // pipe() ends the write stream when the CSV source ends, which triggers 'finish'.
        // NOTE(review): assumes SimpleJson2Csv emits 'end' like a standard Readable - confirm.
        var json2Csv = new SimpleJson2Csv(json);
        json2Csv.pipe(output);
    });
});
// Start the HTTP server; the port is passed as the string '8081'.
app.listen('8081');
console.log('Magic happens on port 8081');

// Expose the Express app as this module's export, keeping the local
// `exports` alias pointing at the same object.
module.exports = app;
exports = module.exports;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment