Last active
June 6, 2019 04:08
-
-
Save kaid/9931456 to your computer and use it in GitHub Desktop.
知乎热门问题抓取脚本
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//==== Displayer ==== | |
/**
 * Floating panel that renders the ranked question list.
 *
 * Constructing a Displayer prepends the panel markup to <body>, caches the
 * jQuery handles used by the methods below (`$el`, `$list`, `$count`) and
 * fades the panel in.
 */
var Displayer = function() {
  this.$el = jQuery("body").prepend(
    "<div class=\"hotqs-list\">" +
      "<div class=\"head\">" +
        "<div>关注</div>" +
        "<div>回答</div>" +
      "</div>" +
      "<div class=\"loading\">开始抓取问题</div>" +
      "<ul></ul>" +
    "</div>"
  ).find(".hotqs-list");
  this.$list = this.$el.find("ul");
  // The badge is closed with </div>; an opening <div> here would leave the
  // element unterminated and nest a stray empty div inside it.
  this.$count = this.$el.append("<div class=\"hotqs-count\">0</div>").find(".hotqs-count");
  this.$el.fadeIn();
};

/** Removes the "loading" placeholder from the panel. */
Displayer.prototype.start = function() {
  this.$el.find(".loading").remove();
};

/** Re-renders the counter badge from the current number of rows. */
Displayer.prototype.increment_count = function() {
  this.$count.text(this.count());
};

/** @returns {number} Number of rows currently in the list. */
Displayer.prototype.count = function() {
  return this.$list.children().length;
};

/**
 * Slides the last row up and removes it once the animation completes.
 * The badge is refreshed afterwards so it does not keep showing the
 * pre-removal row count.
 */
Displayer.prototype.remove_lowest = function() {
  var self = this;
  this.$list.children().last().slideUp(function() {
    jQuery(this).remove();
    self.increment_count();
  });
};

/**
 * Renders `question` as a new row.
 *
 * @param {{url: string, title: string, followers: *, answers: *}} question
 *   Values shown in the row; `url` becomes the link target.
 * @param {number} index Position of the existing row to insert before.
 *   `-1`, or any index that does not resolve to an existing row, appends
 *   the new row at the end instead.
 */
Displayer.prototype.insert_before = function(question, index) {
  var $li = jQuery("<li></li>").hide();
  var $anchor = jQuery("<a target=\"_blank\"></a>").attr("href", question.url).text(question.title);
  var $followers = jQuery("<div></div>").text(question.followers);
  var $answers = jQuery("<div></div>").text(question.answers);
  $li.append($anchor, $followers, $answers);

  var sibling = index >= 0 ? this.$list.children()[index] : undefined;
  if (sibling) {
    $li.insertBefore(sibling);
  } else {
    // Covers the empty list, the explicit -1 and out-of-range indexes;
    // inserting before a missing sibling would leave the row detached.
    this.start();
    this.$list.append($li);
  }

  $li.slideDown();
  this.increment_count();
  // Keep at most 64 rows on screen.
  if (this.count() > 64) this.remove_lowest();
};
//==== Queue ==== | |
/**
 * Serial request queue: runs one jQuery.ajax call at a time with a fixed
 * pause before each request.
 *
 * Fields:
 *   tasks      - pending jQuery.ajax settings objects
 *   waiting    - true while no request is scheduled or in flight
 *   completed  - number of finished requests (successful or not)
 *   stopped    - true after stop() until start() is called
 *   started_at - Date the queue was created
 *   time       - minutes elapsed since creation, as a two-decimal string,
 *                refreshed after every finished request
 *
 * The run state lives in `stopped` / `started_at` so that it does not
 * shadow the `stop()` and `start()` prototype methods.
 */
var Queue = function() {
  this.tasks = [];
  this.waiting = true;
  this.completed = 0;
  this.stopped = false;
  this.started_at = new Date;
};

/**
 * Adds a request to the queue and kicks off processing if the queue is idle.
 *
 * @param {Object} ajax_setting Settings object handed to jQuery.ajax.
 * @param {{priority: string}=} options `{priority: "high"}` puts the request
 *   at the front of the queue; anything else appends it.
 * @returns {Queue} this, for chaining.
 */
Queue.prototype.enqueue = function(ajax_setting, options) {
  if (options && "high" === options.priority) {
    this.tasks.unshift(ajax_setting);
  } else {
    this.tasks.push(ajax_setting);
  }
  if (this.waiting) {
    this.waiting = false;
    this.exec();
  }
  return this;
};

/**
 * Takes the next task, waits 640 ms, issues it, and calls itself again once
 * the request has finished. Goes idle when the queue is empty or stopped.
 */
Queue.prototype.exec = function() {
  if (0 === this.tasks.length || this.stopped) {
    this.waiting = true;
    return;
  }
  this.waiting = false;
  var self = this;
  var ajax_setting = this.tasks.shift();
  this.sleep(640, function() {
    jQuery.ajax(ajax_setting).fail(function(xhr) {
      // A 404 is dropped; every other failure is re-queued for another try.
      if (404 === xhr.status) return;
      self.enqueue(ajax_setting);
    }).always(function() {
      var now = new Date;
      self.completed++;
      self.time = ((now - self.started_at) / (1000 * 60)).toFixed(2);
      self.exec();
    });
  });
};

/** Pauses processing; a request that is already scheduled still runs. */
Queue.prototype.stop = function() {
  this.stopped = true;
};

/** Resumes processing after stop(). */
Queue.prototype.start = function() {
  this.stopped = false;
  // If a request is still in flight its completion handler continues the
  // chain; calling exec() here as well would run two chains in parallel.
  if (this.waiting) this.exec();
};

/**
 * Calls `fn` after `millis` milliseconds.
 *
 * @param {number} millis Delay in milliseconds.
 * @param {Function} fn Callback, invoked without arguments.
 */
Queue.prototype.sleep = function(millis, fn) {
  setTimeout(function() { fn(); }, millis);
};
//==== RequestParamsBuilder ==== | |
/** Builds the request payloads used by the fetchers. */
var RequetParamsBuilder = {
  /**
   * Payload for the activities POST request.
   *
   * @param {*} id Value sent as `start`.
   * @returns {{start: *, offset: number, _xsrf: string}} `_xsrf` is the value
   *   of the `_xsrf` cookie, or "" when the page has no such cookie.
   */
  post_param : function(id) {
    var xsrf = "";
    var pairs = document.cookie.split("; ");
    for (var i = 0; i < pairs.length; i++) {
      // Split on the first "=" only, so a value that itself contains "="
      // is kept whole.
      var eq = pairs[i].indexOf("=");
      if (-1 !== eq && "_xsrf" === pairs[i].slice(0, eq)) {
        xsrf = pairs[i].slice(eq + 1);
        break;
      }
    }
    return {start: id, offset: 15, _xsrf: xsrf};
  },

  /**
   * Query string for the "more" GET request.
   *
   * @param {number} offset Value sent as `offset` inside the JSON `params`.
   * @returns {string} Output of jQuery.param for `{params: <JSON string>}`.
   */
  get_param : function(offset) {
    return jQuery.param({
      params: JSON.stringify({
        offset : offset,
        type : "month"
      })
    });
  }
};
//==== Question ==== | |
/**
 * Plain record describing one question.
 *
 * @param {string} url Question link; only the leading "/question/<digits>"
 *   part is kept. A value that does not start with that pattern makes the
 *   constructor throw, because the regex match is null.
 * @param {*} title Stored as given.
 * @param {*} followers Coerced to a 32-bit integer (non-numeric input -> 0).
 * @param {*} answers Coerced to a 32-bit integer (non-numeric input -> 0).
 */
var Question = function(url, title, followers, answers) {
  var canonical_url = url.match(/^(\/question\/[0-9]+)(\/answer\/.+$)?/)[1];
  var follower_count = ~~followers;
  var answer_count = ~~answers;

  this.url = canonical_url;
  this.title = title;
  this.followers = follower_count;
  this.answers = answer_count;
};
//==== Recorder ====
/**
 * Keeps the 64 most-followed questions seen so far, ordered by `followers`
 * descending, and mirrors every insertion to `window.displayer`.
 *
 * Fields: `list` (the ranking), `highest` / `lowest` (first and last entry
 * of `list`, refreshed after each insertion).
 */
var Recorder = function() {
  this.list = [];
};

/**
 * Records a question.
 *
 * A record that cannot make it into a full ranking is ignored. A record
 * whose URL is already known only has its `followers` / `answers` updated
 * in place (the position is not changed). Otherwise the record is inserted
 * before the first entry with fewer followers, or appended.
 *
 * @param {{url: string, followers: number, answers: number}} record
 */
Recorder.prototype.store = function(record) {
  if (this.insufficient(record)) return;

  var found = this.find(record.url);
  if (found) {
    found.followers = record.followers;
    found.answers = record.answers;
    return;
  }

  // Position of the first entry with fewer followers; -1 means "append".
  var index = -1;
  for (var i = 0; i < this.list.length; i++) {
    if (this.list[i].followers < record.followers) {
      index = i;
      break;
    }
  }

  if (-1 === index) {
    this.list.push(record);
  } else {
    this.list.splice(index, 0, record);
  }
  window.displayer.insert_before(record, index);

  if (this.list.length > 64) this.list.pop();
  this.highest = this.list[0];
  this.lowest = this.list[this.list.length - 1];
};

/**
 * @param {{followers: number}} record
 * @returns {*} Truthy when the ranking is full and `record` does not have
 *   more followers than the current lowest entry.
 */
Recorder.prototype.insufficient = function(record) {
  return this.list.length >= 64 && this.lowest && this.lowest.followers >= record.followers;
};

/**
 * Looks up a stored question by URL. Only the leading "/question/<digits>"
 * part of `url` is compared.
 *
 * @param {string} url
 * @returns {Object|undefined} The stored record, or undefined when there is
 *   none or when `url` is not a question URL at all.
 */
Recorder.prototype.find = function(url) {
  var matched = "string" === typeof url && url.match(/^(\/question\/[0-9]+)(\/answer\/.+$)?/);
  if (!matched) return undefined;

  var key = matched[1];
  for (var i = 0; i < this.list.length; i++) {
    if (this.list[i].url === key) return this.list[i];
  }
  return undefined;
};
//==== QuestionsFetcher ==== | |
/**
 * Collects question links and fetches each question page.
 *
 * @param {Object} queue Request queue; only `enqueue(setting, options)` is used.
 * @param {Object=} recorder Receives the parsed questions; defaults to
 *   `window.recorder`.
 * @param {boolean=} deep When truthy, questions already known to the
 *   recorder are skipped and each fetched question also triggers a crawl of
 *   its followers page.
 *
 * Other fields: `$questions` (jQuery set filled by run()/more()) and
 * `question_urls` (work list consumed by load(); callers may pre-populate it).
 */
var QuestionsFetcher = function(queue, recorder, deep) {
  this.queue = queue;
  this.recorder = recorder || window.recorder;
  this.deep = deep;
  this.feed_url = "http://www.zhihu.com/explore";
  this.more_url = "http://www.zhihu.com/node/ExploreAnswerListV2";
};

/** Fetches the feed page, then starts paging through `more_url`. */
QuestionsFetcher.prototype.run = function() {
  var self = this;
  var ajax_setting = {
    url : this.feed_url,
    type : "GET",
    success : function(res) {
      console.log("start!");
      self.$questions = jQuery(res).find("[data-type=monthly]").children();
      self.more(5);
    }
  };
  this.queue.enqueue(ajax_setting);
};

/**
 * Fetches one more page of entries and keeps paging in steps of 5 until at
 * least 100 entries are collected or a page comes back empty, then calls
 * load().
 *
 * @param {number} offset Offset passed to the request.
 */
QuestionsFetcher.prototype.more = function(offset) {
  var self = this;
  var ajax_setting = {
    url : this.more_url,
    type : "GET",
    data : RequetParamsBuilder.get_param(offset),
    success : function(res) {
      var $more = jQuery(res);
      self.$questions = self.$questions.add($more);
      if (100 <= self.$questions.length || 0 === $more.length) {
        self.load();
        return;
      }
      self.more(offset + 5);
    }
  };
  this.queue.enqueue(ajax_setting);
};

/**
 * Schedules a fetch for every URL in `question_urls`, emptying the list.
 * The list is derived from `$questions` unless it was already set.
 */
QuestionsFetcher.prototype.load = function() {
  if (!this.question_urls) this.question_urls = this.urls();
  while (0 !== this.question_urls.length) {
    this.on(this.question_urls.shift());
  }
};

/** @returns {string[]} href of the "h2 a" link of every collected entry. */
QuestionsFetcher.prototype.urls = function() {
  return this.$questions.map(function() {
    return jQuery(this).find("h2 a").attr("href");
  }).toArray();
};

/**
 * Schedules a high-priority fetch of one question page and stores the
 * parsed result in the recorder.
 *
 * @param {string} url Question link. Anything that does not start with
 *   "/question/<digits>" is skipped, so that one unexpected link cannot
 *   abort the rest of load().
 */
QuestionsFetcher.prototype.on = function(url) {
  var matched = "string" === typeof url && url.match(/^(\/question\/[0-9]+)(\/answer\/.+$)?/);
  if (!matched) return;
  if (this.deep && this.recorder.find(url)) return;

  var question_url = matched[1];
  var self = this;
  var ajax_setting = {
    url : question_url,
    type : "GET",
    success : function(res) {
      var $res = jQuery(res);
      var $anchor = $res.find(".zh-question-followers-sidebar").find("a");
      var title = $res.find("#zh-question-title h2.zm-item-title").text();
      var followers = $anchor.text();
      var answers = $res.find("#zh-question-answer-num").data("num");
      self.recorder.store(new Question(question_url, title, followers, answers));

      // Without a followers link there is nothing to crawl further.
      var followers_url = $anchor.attr("href");
      if (self.deep && followers_url) {
        var followers_fetcher = new FollowersFetcher(followers_url, self.queue);
        followers_fetcher.run();
      }
    }
  };
  this.queue.enqueue(ajax_setting, {priority: "high"});
};
//==== FollowersFetcher ==== | |
/**
 * Fetches a followers page and starts an activities crawl for every
 * follower profile link found on it.
 *
 * @param {string} url Followers page to request.
 * @param {Object} queue Request queue; only `enqueue` is used here.
 *
 * `follower_urls` is filled by run()'s response handler and drained by load().
 */
var FollowersFetcher = function(url, queue) {
  this.url = url;
  this.queue = queue;
};

/** Queues the followers page request; its handler collects the profile links. */
FollowersFetcher.prototype.run = function() {
  var fetcher = this;
  this.queue.enqueue({
    url : this.url,
    type : "GET",
    success : function(res) {
      var $cards = jQuery(res).find(".zm-profile-card.zm-profile-section-item");
      fetcher.follower_urls = $cards.map(function() {
        return jQuery(this).find(".zm-item-link-avatar").attr("href");
      }).toArray();
      fetcher.load();
    }
  });
};

/** Starts a FollowerActivitiesFetcher for each pending URL, front to back. */
FollowersFetcher.prototype.load = function() {
  while (0 != this.follower_urls.length) {
    var next_url = this.follower_urls.shift();
    var activities_fetcher = new FollowerActivitiesFetcher(next_url, this.queue);
    activities_fetcher.run();
  }
};
//==== FollowerActivitiesFetcher ====
/**
 * Pages through one profile's activity items and hands the question links
 * found there to a deep QuestionsFetcher.
 *
 * @param {string} url Profile page; the paging endpoint is `url + "/activities"`.
 * @param {Object} queue Request queue shared with the nested QuestionsFetcher.
 *
 * `$activities` is the jQuery set of items collected so far; it is created
 * by run()'s response handler.
 */
var FollowerActivitiesFetcher = function(url, queue) {
  this.url = url;
  this.more_url = url + "/activities";
  this.queue = queue;
  this.questions_fetcher = new QuestionsFetcher(queue);
  this.questions_fetcher.deep = true;
};

/**
 * Paging cursor: the `id` attribute of the last collected item with its
 * first four characters removed.
 *
 * @returns {string|undefined} undefined when nothing has been collected or
 *   the last item carries no id.
 */
FollowerActivitiesFetcher.prototype._last_activity_id = function() {
  var id = this.$activities.last().attr("id");
  return id ? id.slice(4) : undefined;
};

/** Passes the collected question links on and starts loading them. */
FollowerActivitiesFetcher.prototype._finish = function() {
  if (!this.questions_fetcher.question_urls) {
    this.questions_fetcher.question_urls = this.urls();
  }
  this.questions_fetcher.load();
};

/** Queues the profile page request, then pages from its last item. */
FollowerActivitiesFetcher.prototype.run = function() {
  var self = this;
  var ajax_setting = {
    url : this.url,
    type : "GET",
    success : function(res) {
      self.$activities = jQuery(res).find(".zm-profile-section-item.zm-item");
      var act_id = self._last_activity_id();
      if (!act_id) {
        // A profile without activity items has no cursor to page from;
        // calling .slice on the missing id would throw in this handler.
        self._finish();
        return;
      }
      self.more(act_id);
    }
  };
  this.queue.enqueue(ajax_setting, {priority: "high"});
};

/**
 * Requests the items after `id` and repeats until at least 100 items are
 * collected, the response reports 0 in `msg[0]`, or no further cursor can
 * be derived.
 *
 * @param {string} id Cursor sent as `start`.
 */
FollowerActivitiesFetcher.prototype.more = function(id) {
  var self = this;
  var ajax_setting = {
    url : this.more_url,
    type : "POST",
    data : RequetParamsBuilder.post_param(id),
    success : function(res) {
      // The code reads msg[0] as a count and msg[1] as markup.
      // NOTE(review): confirm against the endpoint's response format.
      self.$activities = self.$activities.add(jQuery(res.msg[1]));
      var act_id = self._last_activity_id();
      if (100 <= self.$activities.length || 0 == res.msg[0] || !act_id) {
        self._finish();
        return;
      }
      self.more(act_id);
    }
  };
  this.queue.enqueue(ajax_setting, {priority: "high"});
};

/** @returns {string[]} Non-empty hrefs of the "a.question_link" anchors. */
FollowerActivitiesFetcher.prototype.urls = function() {
  return this.$activities.map(function() {
    return jQuery(this).find("a.question_link").attr("href");
  }).toArray().filter(function(i) { return i; });
};
//==== Ready! Steady! Go! ==== | |
// DOM-ready entry point: injects the panel stylesheet, creates the queue,
// displayer and recorder, exposes them on `window`, and starts a deep
// crawl from the feed page.
jQuery(function() {
  // `jQuery` is used here instead of the `$` alias, matching the rest of the
  // file, so the script does not depend on what `$` is bound to on the page.
  jQuery("head").append(
    "<style>" +
      ".hotqs-list {" +
        "position:fixed;top:0;bottom:0;left:0;right:0;margin:auto;display:none;z-index:9999;" +
        "height:480px;width:640px;" +
        "border:8px solid #ccc;background:white;" +
        "color:#888;font-size:20px;" +
        "box-shadow:0 0 16px 0px #888;" +
      "}" +
      ".hotqs-list ul {" +
        "height:432px;width:100%;" +
        "overflow:auto;overflow-x:hidden;" +
      "}" +
      ".hotqs-list .head, .hotqs-list ul li {" +
        "height:48px;width:624px;padding:0 8px 0 8px;" +
        "border-bottom:1px solid #ddd;" +
      "}" +
      ".hotqs-list ul li a {" +
        "padding:8px 0 8px 0;display:inline-block;height:32px;width:400px;text-overflow:ellipsis;overflow:hidden;white-space:nowrap;" +
      "}" +
      ".hotqs-list .head div, .hotqs-list ul li div {" +
        "height:32px;width:80px;float:right;margin-right:12px;padding:8px 0 8px 0;text-align:right;font-family:monospace;" +
      "}" +
      ".hotqs-list .hotqs-count {" +
        "position:absolute;right:-32px;top:-8px;display:inline-block;" +
        "height:24px;width:24px;" +
        "background:orange;color:white;font-size:14px;" +
        "text-align:center;" +
      "}" +
      ".hotqs-list .loading {" +
        "width:100%;margin-top:160px;text-align:center;" +
      "}" +
    "</style>"
  );

  var queue = new Queue();
  var displayer = new Displayer();
  var recorder = new Recorder();
  var question_fetcher = new QuestionsFetcher(queue, recorder, true);

  // Recorder.store reads window.displayer, and QuestionsFetcher falls back
  // to window.recorder when it is constructed without a recorder.
  window.displayer = displayer;
  window.recorder = recorder;
  window.queue = queue;

  question_fetcher.run();
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment