Skip to content

Instantly share code, notes, and snippets.

@fczuardi
Created January 12, 2012 20:46
Show Gist options
  • Select an option

  • Save fczuardi/1602992 to your computer and use it in GitHub Desktop.

Select an option

Save fczuardi/1602992 to your computer and use it in GitHub Desktop.
Baixa todos os comentários de uma notícia do saocarlosagora, para vc nao ter que ficar dando clique no next de 5 em 5. Cospe JSON e CSV.
// Scrapes the reader comments of a news article on www.saocarlosagora.com.br.
// The script is based on the example from this blog post by David Trejo:
// http://blog.dtrejo.com/scraping-made-easy-with-jquery-and-selectorga
// npm requirements: http-agent, jsdom, jquery

// ---- configuration ----

// Host name of the site being scraped.
var site = "www.saocarlosagora.com.br";
// Path of the article whose comments are downloaded.
var path = "/policia/noticia/2012/01/09/25766/dig-prende-comerciante-acusado-de-estuprar-jovem-no-santa-felicia/";
// Query-string prefix; the page number is appended to it.
var page = "?page=";
// First and last comment page to fetch (both inclusive).
var start = 1;
var end = 7;
// Accumulators filled while scraping: one object per comment, and the CSV
// text, which starts out holding only the header row.
var comments = [];
var csv = 'id,author,datetime,content';
var util = require('util')
, url = require('url')
, httpAgent = require('http-agent')
, jsdom = require('jsdom').jsdom
, jquery = require('jquery');
// Extraction of the relevant elements from each page.
// May stop working if the site changes its HTML.

/**
 * Wraps a value as a double-quoted CSV field, doubling any embedded quotes.
 *
 * @param {*} value - Value to write; null/undefined produce an empty field.
 * @returns {string} The quoted field, including the surrounding quotes.
 */
function quoteCsvField(value){
  var text = (value === null || value === undefined) ? '' : String(value);
  return '"' + text.replace(/"/g, '""') + '"';
}

/**
 * Reads every `.comment` element from a fetched page, appends one object per
 * comment to the module-level `comments` array and one row per comment to the
 * module-level `csv` string.
 *
 * @param {Object} agent - Object whose `body` property holds the page HTML.
 */
function getComments(agent){
  var window = jsdom(agent.body).createWindow();
  var $ = jquery.create(window);

  $('.comment').each(function(index, item){
    var $item = $(item);

    // .html() gives no string when the selector matches nothing; fall back to
    // '' so that one malformed comment cannot abort the whole scrape.
    var rawDate = $item.find('span.commented_date').html() || '';
    var rawTime = $item.find('span.commented_time').html() || '';
    var rawContent = $item.find('div.comment_content p:gt(0)').html() || '';

    // Skip the 7-character prefix, then swap the first two groups of an
    // NN/NN/20NN date (presumably day/month -> month/day; confirm against
    // the site's markup).
    var datePart = rawDate.substring(7).replace(
      /([0-9][0-9])\/([0-9][0-9])\/(20[0-9][0-9])/, "$2/$1/$3");
    // Keep the first five characters and turn the first 'h' into ':'.
    var timePart = rawTime.substring(0, 5).replace('h', ':');

    var comment = {
      // The element id minus its first 8 characters.
      'id': item.id.substring(8)
      , 'author': $item.find('span.commented_by u').text()
      , 'datetime': datePart + ' ' + timePart
      , 'content': rawContent
    };
    comments.push(comment);

    // Author and content are free text, so both are quoted and escaped;
    // id and datetime are written bare, as before.
    csv += "\n" + comment.id + ',' + quoteCsvField(comment.author) + ',' +
      comment.datetime + ',' + quoteCsvField(comment.content);
  });
}
/**
 * Prints the collected results to the console: first the module-level
 * `comments` array under a JSON banner, then three blank lines, then the
 * module-level `csv` string under a CSV banner. Each section is closed by
 * the same separator line.
 */
function outputResults(){
  var footer = '========================';
  var sections = [
    { header: '======== JSON ============', payload: comments },
    { header: '======== CSV ============', payload: csv }
  ];

  sections.forEach(function(section, position){
    // Sections after the first are preceded by a spacer.
    if (position > 0) {
      console.log("\n\n\n");
    }
    console.log(section.header);
    console.log(section.payload);
    console.log(footer);
  });
}
// Build one URL per comment page, from `start` to `end` inclusive.
var urls = [];
for (var i = start; i <= end; i++) {
  urls.push(path + page + i);
}

// Use the configured host so that changing `site` actually takes effect.
var agent = httpAgent.create(site, urls);
console.log('Scraping', urls.length, 'pages from', agent.host);

// Page number reported in the progress log; kept separate so that the
// `start` setting is not modified while the script runs.
var currentPage = start;

// Fired once per requested page.
agent.addListener('next', function (err, pageAgent) {
  if (err) {
    // A failed request has no page to parse: report it and carry on with
    // the remaining pages instead of crashing inside getComments.
    console.log(err);
  } else {
    getComments(pageAgent);
  }
  console.log('Página '+(currentPage++));
  // Half-second pause between requests so the server is not overloaded.
  // The outer `agent` is used here because the handler's second argument
  // is not relied upon when `err` is set.
  setTimeout(function () { agent.next(); }, 500);
});

// On an agent-level error, report it and print whatever was collected.
agent.addListener('error', function (err, agent) {
  if (err) console.log(err);
  outputResults();
});

// When the agent stops, print the collected results.
agent.addListener('stop', function (err, agent) {
  if (err) console.log(err);
  outputResults();
});

agent.start();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment