Created
September 29, 2014 10:33
-
-
Save artjomb/0be1f1d6b62116dfd0d3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Run as: casperjs --cookies-file=cookies.txt SE_get_close_votes.js
// (the cookies file lets later runs skip the login), or simply: casperjs SE_get_close_votes.js
// Don't forget to enter your credentials in the `account` object below.
// The login flow is not fully reliable and may fail; if it does, just run the script again.
// CasperJS instance with a fixed viewport and a custom user agent string.
var casper = require('casper').create({
    viewportSize: {
        width: 1280,
        height: 800
    },
    pageSettings: {
        userAgent: "PhantomJS (Close Vote Scraper)"
    }
});

// Modules used throughout the script.
var fs = require("fs");
var x = require('casper').selectXPath;

// Running index used by capture() to number the non-error snapshots.
var i = 0;

// Login credentials; the script refuses to run while either one is empty.
var account = {
    email: "",
    password: ""
};

// Sites that are opened and scraped, in order.
var sites = [
    "http://stackoverflow.com"
];

// Selectors used by the login flow.
var loginLink = x("//a[text()='log in' and @class='login-link']");
var loginSpan = x("//span[text()='Log in using Stack Exchange']");
var reputationSpan = "span.reputation";

// Selectors used to navigate from the profile link to the closure votes list.
var profileLink = "a.profile-me";
var profileVotesTab = x("//div[@id='tabs']/a[normalize-space(text())='votes']");
var profileVotesTabClosure = x("//div[contains(@class,'subtabs')]/a[normalize-space(text())='closure']");
var profileVotesTabClosureTabClosureText = x("//td/span[text()='closure']");
var historyTableNext = "a>span.page-numbers.next";

// Output files. logFile is only referenced by a commented-out line in log().
var logFile = "action_close_votes.log";
var scrapeFile = "close_votes.json";
var scrapeFileCSV = "close_votes.csv";
var failedFile = "failed_votes.csv";

// Not referenced anywhere in the visible part of the script.
var minQuestions = 1;
var maxQuestions = 3;

// Accumulates the scraped vote entries across all pages and sites.
var scrapeList = [];

// When false, the remote.message / resource.error / page.error handlers stay silent.
var showMessages = true;
// Refuse to run while the credentials in `account` are still blank.
if (account.email === "" || account.password === "") {
    console.log("You didn't set login credentials. Please edit this file.");
    // Exit with a non-zero status so calling shells/scripts can detect the
    // misconfiguration instead of seeing a "successful" run.
    // NOTE(review): CasperJS documents exit() as asynchronous, so statements
    // further down may still execute before the process ends - confirm.
    casper.exit(1);
}
// Casper-level errors: print the message, dump the backtrace, print the
// current page title and store an error snapshot via capture(true).
casper.on("error", function(message, trace){
    var utils;
    this.echo(message);
    utils = require("utils");
    utils.dump(trace);
    this.echo("Title: " + this.getTitle());
    capture(true);
});

// Diagnostic events that are only echoed while showMessages is true.
// `asJson` selects whether the payload is JSON-encoded or appended as-is.
[
    { event: "remote.message", prefix: "remote.msg: ", asJson: false },
    { event: "resource.error", prefix: "res.err: ", asJson: true },
    { event: "page.error", prefix: "page.err: ", asJson: true }
].forEach(function(spec){
    casper.on(spec.event, function(payload){
        if (showMessages) {
            this.echo(spec.prefix + (spec.asJson ? JSON.stringify(payload) : payload));
        }
    });
});
/**
 * Stores a screenshot (.png) and the page HTML (.html) of the current page.
 *
 * @param {*} err  When truthy the files are named "cap_err.*" (overwritten on
 *                 every call); otherwise they are named "cap_<i>.*" and the
 *                 global counter `i` is advanced by one.
 */
function capture(err){
    var baseName = err ? "cap_err" : "cap_" + i;
    casper.capture(baseName + ".png");
    if (!err) {
        // Advance the counter before the HTML is fetched, as the original did.
        i++;
    }
    fs.write(baseName + ".html", casper.getHTML());
}
/**
 * Schedules a numbered (non-error) snapshot as a new Casper step.
 */
function thenCapture(){
    // Wrapped so that capture() is always called without arguments.
    var snapshotStep = function(){
        capture();
    };
    casper.then(snapshotStep);
}
/**
 * Echoes a timestamped log line of the form "<ISO time> - <site> - <msg>\r\n".
 *
 * @param {string} site  Site the message belongs to.
 * @param {string} msg   Message text.
 */
function log(site, msg){
    var stamp = (new Date()).toISOString();
    var prefix = stamp + " - " + site;
    var line = prefix + " - " + msg + "\r\n";
    casper.echo(line);
    // Writing to the log file is disabled:
    //fs.write(logFile, line, "a");
}
/**
 * Schedules the navigation from the profile link to the closure votes list
 * and then starts scraping at page 1. Must be called with `this` bound to
 * the Casper instance.
 *
 * @param {string} site  Site being scraped; passed on to scrapeClosurePage.
 */
function navigateAndScrape(site){
    var browser = this;
    // Each entry: [selector to click, selector to wait for afterwards].
    var hops = [
        [profileLink, profileVotesTab],
        [profileVotesTab, profileVotesTabClosure],
        [profileVotesTabClosure, profileVotesTabClosureTabClosureText] // last wait: maybe delete too?
    ];
    hops.forEach(function(hop){
        browser.thenClick(hop[0]);
        browser.waitForSelector(hop[1]);
    });
    scrapeClosurePage.call(browser, 1, site);
}
/**
 * Scrapes one page of the closure vote history table, appends the entries to
 * the global `scrapeList`, and continues with the next page while the "next"
 * pager element is visible. Must be called with `this` bound to the Casper
 * instance.
 *
 * Rows without a question link are skipped, and a page whose table could not
 * be read (evaluate() did not return an array) is reported and skipped, so
 * that `scrapeList` only ever contains article objects.
 *
 * @param {number} page  Page number expected in "span.page-numbers.current".
 * @param {string} site  Site being scraped; only passed on to the recursion.
 */
function scrapeClosurePage(page, site) {
    this.then(function(){
        this.echo("page " + page);
    });
    this.waitFor(function check(){
        // Wait until the pager shows the page we expect to be on.
        return this.fetchText("span.page-numbers.current") === ""+page;
    }, function then(){
        // Everything inside evaluate() runs in the page context and must be
        // self-contained (no access to variables of this script).
        var newArticles = this.evaluate(function(){
            function endsWith(str, suffix) {
                // from http://stackoverflow.com/a/2548133
                return str.indexOf(suffix, str.length - suffix.length) !== -1;
            }
            var trList = document.querySelectorAll("table.history-table > tbody > tr:nth-child(2n-1)"),
                articles = [];
            Array.prototype.forEach.call(trList, function(tr){
                var dateNode = tr.querySelector("td > .date, td > .date > .date_brick"),
                    question = tr.querySelector("a.question-hyperlink"),
                    actionCell = tr.children[1],
                    dateString,
                    title;
                if (!question) {
                    // Without the link there is neither a URL nor a title to record.
                    return;
                }
                dateString = dateNode ? dateNode.title : "";
                title = question.innerText;
                articles.push({
                    url: question.href,
                    title: title,
                    dateString: dateString,
                    timestamp: (new Date(dateString)).getTime(),
                    isClosed: endsWith(title, "[closed]") ||
                        endsWith(title, "[duplicate]") ||
                        endsWith(title, "[on hold]") ||
                        endsWith(title, "[migrated]"),
                    isDeleted: tr.className.indexOf("deleted-") !== -1, // TODO: just assuming, check this!
                    isSuccessful: !!actionCell && actionCell.children.length === 2 // "(delete)" is present
                });
            });
            return articles;
        });
        if (Array.isArray(newArticles)) {
            scrapeList = scrapeList.concat(newArticles);
        } else {
            // concat(null) would append a null entry and break the export later.
            this.echo("page " + page + ": could not read the history table, skipping");
        }
    });
    this.then(function(){
        if (this.visible(historyTableNext)) {
            this.click(historyTableNext);
            scrapeClosurePage.call(this, page+1, site);
        }
    });
}
casper.start();

// For every configured site: open it, log in if necessary, then scrape.
casper.each(sites, function(instance, siteUrl){
    // Runs once the "#forgot-password" element is no longer visible:
    // fill in and submit the login form, snapshot, then start scraping.
    var submitCredentials = function(){
        log(siteUrl, "login form opened");
        capture();
        this.fillSelectors("form#se-login-form", {
            "input[name='email']": account.email,
            "input[name='password']": account.password
        }, true);
        this.wait(1000);
        thenCapture();
        navigateAndScrape.call(this, siteUrl);
    };

    // Runs after the "log in" link was clicked.
    var afterLoginLinkClicked = function(){
        log(siteUrl, "login page opened");
        capture();
        if (this.visible(reputationSpan)) {
            // SE inferred the account without login
            navigateAndScrape.call(this, siteUrl);
        } else {
            this.click(loginSpan);
            this.waitWhileVisible("#forgot-password", submitCredentials);
        }
    };

    // Runs once the site's start page has been opened.
    var onSiteOpened = function(){
        if (!this.visible(loginLink)) {
            if (this.visible(reputationSpan)) {
                log(siteUrl, "already logged in");
                capture();
                navigateAndScrape.call(this, siteUrl);
            } else {
                log(siteUrl, "Warning: no log in link found");
            }
            return;
        }
        log(siteUrl, "do login");
        capture();
        this.thenClick(loginLink);
        this.then(afterLoginLinkClicked);
    };

    instance.then(function(){
        this.echo("Opening: " + siteUrl);
        log(siteUrl, "opening");
    });
    instance.thenOpen(siteUrl, onSiteOpened);
});
// Runs all scheduled steps; afterwards writes the collected votes as CSV
// (all votes and the "failed" subset) and as JSON, then exits.
casper.run(function(){
    var separator = ";",
        header = ["Title", "URL", "Date", "closed", "deleted", "success"].join(separator) + "\n",
        // Skip anything that is not an article object so that one bad page
        // cannot abort the export of everything else.
        articles = scrapeList.filter(function(article){
            return article !== null && typeof article === "object";
        }),
        failedRows = [],
        allRows = [];

    // Quotes a value RFC 4180 style when it contains the separator, a double
    // quote or a line break; plain values are written unchanged.
    function csvField(value){
        var text = String(value);
        if (text.indexOf(separator) !== -1 || /["\r\n]/.test(text)) {
            return '"' + text.replace(/"/g, '""') + '"';
        }
        return text;
    }

    articles.forEach(function(article){
        var row = [
            article.title,
            article.url,
            article.dateString,
            article.isClosed,
            article.isDeleted,
            article.isSuccessful
        ].map(csvField).join(separator) + "\n";
        // "Failed" selection: not closed, not deleted, and flagged isSuccessful.
        if (!article.isClosed && !article.isDeleted && article.isSuccessful) {
            failedRows.push(row);
        }
        allRows.push(row);
    });

    console.log("total length", articles.length);
    console.log("failed length", failedRows.length);
    fs.write(failedFile, header + failedRows.join(""));
    fs.write(scrapeFileCSV, header + allRows.join(""));
    fs.write(scrapeFile, JSON.stringify(articles));
    log("none", "done");
    this.exit();
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment