Skip to content

Instantly share code, notes, and snippets.

@artjomb
Created September 29, 2014 10:33
Show Gist options
  • Save artjomb/0be1f1d6b62116dfd0d3 to your computer and use it in GitHub Desktop.
Save artjomb/0be1f1d6b62116dfd0d3 to your computer and use it in GitHub Desktop.
// Run as: casperjs --cookies-file=cookies.txt SE_get_close_votes.js
// (the cookie file lets later runs skip the login), or simply: casperjs SE_get_close_votes.js
// Before running, fill in your credentials in the `account` object below.
// The login flow is not fully reliable and may fail; if it does, just run the script again.
// CasperJS instance with a fixed viewport size and a custom user agent string.
var casper = require('casper').create({
    viewportSize: { width: 1280, height: 800 },
    pageSettings: { userAgent: "PhantomJS (Close Vote Scraper)" }
});
var fs = require("fs");
// Shorthand for building XPath selectors.
var x = require('casper').selectXPath;
// Running index used by capture() to number its output files.
var i = 0;
// Login credentials; both fields must be filled in before running.
var account = { email: "", password: "" };
// Sites that are opened and scraped one after another.
var sites = ["http://stackoverflow.com"];

// Selectors used during login.
var loginLink = x("//a[text()='log in' and @class='login-link']");
var loginSpan = x("//span[text()='Log in using Stack Exchange']");
var reputationSpan = "span.reputation";

// Selectors used to navigate from the profile to the closure-vote history.
var profileLink = "a.profile-me";
var profileVotesTab = x("//div[@id='tabs']/a[normalize-space(text())='votes']");
var profileVotesTabClosure = x("//div[contains(@class,'subtabs')]/a[normalize-space(text())='closure']");
var profileVotesTabClosureTabClosureText = x("//td/span[text()='closure']");
var historyTableNext = "a>span.page-numbers.next";

// Output files. logFile is only referenced from a commented-out line in log().
var logFile = "action_close_votes.log";
var scrapeFile = "close_votes.json";
var scrapeFileCSV = "close_votes.csv";
var failedFile = "failed_votes.csv";

// NOTE(review): minQuestions/maxQuestions are not referenced in the visible part of the script.
var minQuestions = 1;
var maxQuestions = 3;
// Accumulates the rows returned by every scraped history page.
var scrapeList = [];
// When false, the remote.message / resource.error / page.error handlers stay silent.
var showMessages = true;
// Abort early when the credentials were left blank. A falsiness check also
// catches a field that was removed or set to null/undefined, which the
// previous `== ""` comparison let through.
if (!account.email || !account.password) {
    console.log("You didn't set login credentials. Please edit this file.");
    // A non-zero exit status lets calling scripts detect the misconfiguration.
    casper.exit(1);
}
// Handler for the CasperJS "error" event: prints the message, dumps the
// backtrace, prints the current page title and stores an error snapshot.
casper.on("error", function onCasperError(message, stack) {
    this.echo(message);
    var utils = require("utils");
    utils.dump(stack);
    var pageTitle = this.getTitle();
    this.echo("Title: " + pageTitle);
    capture(true);
});
// Relays "remote.message" events to the terminal while showMessages is enabled.
casper.on("remote.message", function onRemoteMessage(text) {
    if (showMessages) {
        this.echo("remote.msg: " + text);
    }
});
// Prints the JSON form of "resource.error" payloads while showMessages is enabled.
casper.on("resource.error", function onResourceError(details) {
    if (showMessages) {
        var serialized = JSON.stringify(details);
        this.echo("res.err: " + serialized);
    }
});
// Reports "page.error" events while showMessages is enabled.
// The event passes the error message and a stack trace; the trace used to be
// dropped because the handler only declared one argument. The first output
// line is unchanged, the trace is printed on an additional line when present.
casper.on("page.error", function(pageErr, trace){
    if (!showMessages) return;
    this.echo("page.err: " + JSON.stringify(pageErr));
    if (trace && trace.length) {
        this.echo("page.err trace: " + JSON.stringify(trace));
    }
});
/**
 * Saves a screenshot (.png) and the current HTML (.html) of the page.
 *
 * @param {*} err  When truthy, the files are named "cap_err.*" and overwrite
 *                 any earlier error capture; otherwise they are named
 *                 "cap_<i>.*" and the global counter `i` is advanced by one.
 */
function capture(err){
    var baseName = err ? "cap_err" : "cap_" + i;
    casper.capture(baseName + ".png");
    if (!err) {
        // Advance the counter only for numbered captures.
        i++;
    }
    fs.write(baseName + ".html", casper.getHTML());
}
/**
 * Queues a CasperJS step that takes a numbered capture when it runs.
 * The wrapper calls capture() without arguments, so whatever the step is
 * invoked with is never passed on as the `err` flag.
 */
function thenCapture(){
    var takeSnapshot = function takeSnapshot() {
        capture();
    };
    casper.then(takeSnapshot);
}
/**
 * Echoes a timestamped log line of the form "<ISO time> - <site> - <msg>\r\n".
 *
 * @param {string} site  Site the message relates to.
 * @param {string} msg   Message text.
 */
function log(site, msg){
    var timestamp = new Date().toISOString();
    var line = timestamp + " - " + site + " - " + msg + "\r\n";
    casper.echo(line);
    //fs.write(logFile, line, "a");
}
/**
 * Queues the navigation from the current page to the closure-vote history
 * (profile -> "votes" tab -> "closure" sub-tab) and then starts scraping at
 * page 1. Must be invoked with the casper instance as `this`.
 *
 * @param {string} site  Site being processed; handed on to scrapeClosurePage.
 */
function navigateAndScrape(site){
    // The step/wait methods are chained here; each call is queued in the same order as listed.
    this.thenClick(profileLink)
        .waitForSelector(profileVotesTab)
        .thenClick(profileVotesTab)
        .waitForSelector(profileVotesTabClosure)
        .thenClick(profileVotesTabClosure)
        .waitForSelector(profileVotesTabClosureTabClosureText); // maybe delete too?
    scrapeClosurePage.call(this, 1, site);
}
/**
 * Scrapes one page of the closure-vote history table into the global
 * scrapeList and, if a "next" pager link is visible, clicks it and queues
 * the scrape of the following page. Must be invoked with the casper
 * instance as `this`.
 *
 * @param {number} page  Page number the pager is expected to show.
 * @param {string} site  Site being processed; used in the warning log and
 *                       handed on to the recursive call.
 */
function scrapeClosurePage(page, site) {
    this.then(function(){
        this.echo("page " + page);
    });
    // Scrape only once the pager reports the expected page as current.
    this.waitFor(function check(){
        return this.fetchText("span.page-numbers.current") === ""+page;
    }, function then(){
        // The function below runs inside the page context.
        var newArticles = this.evaluate(function(){
            function endsWith(str, suffix) {
                // from http://stackoverflow.com/a/2548133
                return str.indexOf(suffix, str.length - suffix.length) !== -1;
            }
            var trList = document.querySelectorAll("table.history-table > tbody > tr:nth-child(2n-1)");
            return Array.prototype.map.call(trList, function(tr){
                var dateString = tr.querySelector("td > .date, td > .date > .date_brick").title,
                    question = tr.querySelector("a.question-hyperlink"),
                    title = question.innerText;
                return {
                    url: question.href,
                    title: title,
                    dateString: dateString,
                    timestamp: (new Date(dateString)).getTime(),
                    isClosed: endsWith(title, "[closed]") ||
                        endsWith(title, "[duplicate]") ||
                        endsWith(title, "[on hold]") ||
                        endsWith(title, "[migrated]"),
                    isDeleted: tr.className.indexOf("deleted-") !== -1, // TODO: just assuming, check this!
                    isSuccessful: tr.children[1].children.length === 2 // "(delete)" is present
                };
            });
        });
        // If the in-page function did not yield an array (presumably when it
        // throws, e.g. on a row without a question link), concat() would append
        // that null/undefined as an element and the export step would later
        // fail on it. Skip the page and report it instead.
        if (!Array.isArray(newArticles)) {
            log(site, "Warning: could not read vote rows on page " + page);
            return;
        }
        scrapeList = scrapeList.concat(newArticles);
    });
    this.then(function(){
        if (this.visible(historyTableNext)) {
            this.click(historyTableNext);
            scrapeClosurePage.call(this, page+1, site);
        }
    });
}
casper.start();
// For every configured site: open it, log in when a login link is shown,
// then navigate to the closure-vote history and scrape it.
casper.each(sites, function(self, site){
    // Runs once the login form is reachable: fills in the credentials,
    // submits the form, waits a second, captures, then starts scraping.
    function submitCredentials() {
        log(site, "login form opened");
        capture();
        this.fillSelectors("form#se-login-form", {
            "input[name='email']": account.email,
            "input[name='password']": account.password
        }, true);
        this.wait(1000);
        thenCapture();
        navigateAndScrape.call(this, site);
    }

    // Runs after the login link was clicked.
    function onLoginPage() {
        log(site, "login page opened");
        capture();
        if (!this.visible(reputationSpan)) {
            this.click(loginSpan);
            this.waitWhileVisible("#forgot-password", submitCredentials);
            return;
        }
        // SE inferred the account without login
        navigateAndScrape.call(this, site);
    }

    // Runs after the site's start page has been opened.
    function onSiteOpened() {
        if (!this.visible(loginLink)) {
            if (this.visible(reputationSpan)) {
                log(site, "already logged in");
                capture();
                navigateAndScrape.call(this, site);
            } else {
                log(site, "Warning: no log in link found");
            }
            return;
        }
        log(site, "do login");
        capture();
        this.thenClick(loginLink);
        this.then(onLoginPage);
    }

    self.then(function announceSite() {
        this.echo("Opening: " + site);
        log(site, "opening");
    });
    self.thenOpen(site, onSiteOpened);
});
// Runs the queued steps and then exports everything collected in scrapeList:
// a CSV of all rows, a CSV of the rows selected by the filter below, and a
// JSON dump of the raw list.
casper.run(function(){
    var separator = ";",
        header = ["Title", "URL", "Date", "closed", "deleted", "success"].join(separator) + "\n",
        failedVote = header,
        totalVote = header,
        failedLength = 0;

    // Formats one CSV field. Values containing the separator, a double quote
    // or a line break are wrapped in double quotes with embedded quotes
    // doubled, so such titles no longer shift or split columns. Other values
    // are emitted exactly as before.
    function csvField(value) {
        var text = String(value);
        if (text.indexOf(separator) === -1 && !/["\r\n]/.test(text)) {
            return text;
        }
        return '"' + text.replace(/"/g, '""') + '"';
    }

    scrapeList.forEach(function(article){
        var row = [
            article.title,
            article.url,
            article.dateString,
            article.isClosed,
            article.isDeleted,
            article.isSuccessful
        ].map(csvField).join(separator) + "\n";
        // Rows that are neither closed nor deleted but flagged isSuccessful
        // are additionally written to the "failed" file.
        if (!article.isClosed && !article.isDeleted && article.isSuccessful) {
            failedLength++;
            failedVote += row;
        }
        totalVote += row;
    });
    console.log("total length", scrapeList.length);
    console.log("failed length", failedLength);
    fs.write(failedFile, failedVote);
    fs.write(scrapeFileCSV, totalVote);
    fs.write(scrapeFile, JSON.stringify(scrapeList));
    log("none", "done");
    this.exit();
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment