Created
January 12, 2012 20:46
-
-
Save fczuardi/1602992 to your computer and use it in GitHub Desktop.
Baixa todos os comentários de uma notícia do saocarlosagora, para você não ter que ficar dando clique no next de 5 em 5. Cospe JSON e CSV.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Scrapes the comments of news articles from www.saocarlosagora.com.br.
// Script based on the example in this blog post by David Trejo:
// http://blog.dtrejo.com/scraping-made-easy-with-jquery-and-selectorga
// npm requirements: http-agent, jsdom, jquery

// Configuration

// Host name of the site to scrape.
var site = "www.saocarlosagora.com.br";
// Path of the article whose comments are downloaded.
var path = "/policia/noticia/2012/01/09/25766/dig-prende-comerciante-acusado-de-estuprar-jovem-no-santa-felicia/";
// Query-string prefix; the page number is appended right after it.
var page = "?page=";
// First and last comment pages to fetch (both inclusive).
var start = 1;
var end = 7;
// Accumulators filled while scraping: one object per comment, plus the CSV
// text, which starts out holding only the header row.
var comments = [];
var csv = 'id,author,datetime,content';
// Modules loaded by the scraper. Each one gets its own terminated declaration
// so that editing one line cannot break the others.
var util = require('util');
var url = require('url');
var httpAgent = require('http-agent');
// Only the `jsdom` property exported by the jsdom package is kept.
var jsdom = require('jsdom').jsdom;
var jquery = require('jquery');
// Returns the inner HTML of the first element matching `selector` under
// `item`, or '' when nothing matches (jQuery's .html() yields null/undefined
// in that case, which would otherwise break the string handling below).
function innerHtml($, item, selector) {
  var html = $(item).find(selector).html();
  return html == null ? '' : html;
}

// Wraps a value in double quotes for CSV output, doubling embedded quotes.
function csvQuote(value) {
  return '"' + String(value).replace(/"/g, '""') + '"';
}

// Extracts the relevant elements from one fetched page: every `.comment`
// element becomes an object pushed onto `comments` and a row appended to `csv`.
// May stop working if the site changes its HTML.
//
// agent: object whose `body` property holds the HTML of the fetched page.
function getComments(agent) {
  var window = jsdom(agent.body).createWindow();
  var $ = jquery.create(window);

  $('.comment').each(function (index, item) {
    // Skip the first 7 characters of the date markup (presumably a fixed
    // label - verify against the live HTML), then swap the first two
    // two-digit fields of the date (presumably dd/mm/yyyy -> mm/dd/yyyy).
    var date = innerHtml($, item, 'span.commented_date')
      .substring(7)
      .replace(/([0-9][0-9])\/([0-9][0-9])\/(20[0-9][0-9])/, '$2/$1/$3');
    // Keep the first 5 characters of the time and turn e.g. "14h35" into "14:35".
    var time = innerHtml($, item, 'span.commented_time')
      .substring(0, 5)
      .replace('h', ':');

    var comment = {
      // Drop the first 8 characters of the element id (presumably a
      // "comment_"-style prefix - verify against the live HTML).
      'id': item.id.substring(8),
      'author': $(item).find('span.commented_by u').text(),
      'datetime': date + ' ' + time,
      'content': innerHtml($, item, 'div.comment_content p:gt(0)')
    };
    comments.push(comment);

    // Author and content are free text, so both are quoted and escaped.
    csv += '\n' + comment.id + ',' + csvQuote(comment.author) + ',' +
      comment.datetime + ',' + csvQuote(comment.content);
  });
}
// Prints everything collected so far to stdout: first the comments as JSON,
// then the accumulated CSV text, each section framed by banner lines.
function outputResults() {
  console.log('======== JSON ============');
  // Serialize explicitly: console.log on an array prints util.inspect
  // notation, which is not valid JSON and abbreviates long arrays.
  console.log(JSON.stringify(comments, null, 2));
  console.log('========================');
  console.log("\n\n\n");
  console.log('======== CSV ============');
  console.log(csv);
  console.log('========================');
}
// Build the list of paginated URLs to fetch, one per page from `start` to
// `end` inclusive: <path>?page=<n>.
var urls = [];
for (var i = start; i <= end; i++) {
  urls.push(path + page + i);
}

// Use the configured host instead of repeating the literal, so changing
// `site` in the configuration actually changes the target.
var agent = httpAgent.create(site, urls);
console.log('Scraping', urls.length, 'pages from', agent.host);

// Progress counter for the log line below; kept separate so the `start`
// configuration value is not modified while scraping.
var currentPage = start;

agent.addListener('next', function (err, fetched) {
  if (err || !fetched) {
    // A failed request has no fresh body to parse; log it and move on
    // instead of re-parsing stale data or crashing on a missing agent.
    if (err) console.log(err);
  } else {
    getComments(fetched);
  }
  console.log('Página ' + (currentPage++));
  // Half-second pause between requests so the server is not overloaded.
  // NOTE(review): advances via the outer `agent`, assuming the object passed
  // to this listener is that same agent - confirm against http-agent.
  setTimeout(function () { agent.next(); }, 500);
});

// Both terminal events dump whatever has been collected so far.
agent.addListener('error', function (err, agent) {
  if (err) console.log(err);
  outputResults();
});

agent.addListener('stop', function (err, agent) {
  if (err) console.log(err);
  outputResults();
});

agent.start();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment