Last active
July 10, 2017 13:21
-
-
Save YonathanMeguira/1df0f16419d2d258728c1a7a2f6d5736 to your computer and use it in GitHub Desktop.
NodeJs Scraping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// For the sake of the demonstration we are going to put all the code into one file,
// but I strongly encourage you to make your code as modular as possible.
// Also, please note that in a normal scenario, this function 'scrapeTheNews()' would be called from the index.js / starting point
// of your node app. Because this is an async call, you would have to wrap the inner part of the function inside a promise so that you
// would be able to retrieve the result asynchronously from index.js.
// If you want to see a real-world scenario with such wrapping, please visit: https://github.com/jonathanmeguira/news-scraper
// Used to request/retrieve the page we want to scrape.
var request = require('request');
var cheerio = require('cheerio');
var express = require('express');
var URL = require('url').URL;
var app = express();
/**
 * Resolve a link scraped from a page into an absolute URL.
 *
 * @param {string|undefined} link - Raw `href`/`src` attribute value; may be missing.
 * @param {string} pageUrl - URL of the page the link was found on (used as the base).
 * @returns {string|null} The absolute URL, or null when the attribute is
 *   absent, empty, or cannot be parsed.
 */
const toAbsoluteUrl = (link, pageUrl) => {
  if (!link) {
    return null;
  }
  try {
    return new URL(link, pageUrl).href;
  } catch (err) {
    // One malformed attribute should not abort the whole scrape.
    return null;
  }
};

/**
 * Scrape the TechNewsWorld cyber-security section and, once the page has been
 * parsed, expose the extracted articles on GET /news (port 3000).
 *
 * Relies on the file-level `request`, `cheerio` and `app` bindings. The HTTP
 * server is only started after a successful (HTTP 200) scrape; the articles
 * served are the snapshot taken at that moment.
 *
 * @returns {void}
 */
const scrapeTheNews = () => {
  const url2Scrape = 'http://www.technewsworld.com/perl/section/cyber-security/';

  request(url2Scrape, (error, response, body) => {
    if (error) {
      // Thrown from inside the request callback, so it surfaces as an
      // uncaught exception rather than something the caller can catch.
      throw error;
    }
    if (response.statusCode !== 200) {
      console.error(`scraping ${url2Scrape} failed with HTTP status ${response.statusCode}`);
      return;
    }

    const $ = cheerio.load(body);

    // .get() is mandatory: it turns the cheerio wrapper returned by .map()
    // into a plain array instead of a bunch of DOM object references.
    const articlesArr = $('.section-table')
      .map((_index, element) => {
        const article = $(element).find('.shadow');
        return {
          title: article.find('a').text(),
          date: article.find('span').html(),
          // Links are resolved against the scraped page so that relative,
          // root-relative and already-absolute values all come out valid.
          url: toAbsoluteUrl(article.find('a').attr('href'), url2Scrape),
          // Strip literal "\r"/"\n"/"\t" sequences as well as real control characters.
          description: article.find('div').text().replace(/(?:\\[rnt]|[\r\n\t]+)+/g, ""),
          image: toAbsoluteUrl(article.find('img').attr('src'), url2Scrape),
        };
      })
      .get();

    app.get('/news', (req, res) => {
      // here we send the result
      res.send(articlesArr);
    });

    app.listen(3000, () => {
      console.log('now serving the news on port 3000');
    });
  });
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment