Skip to content

Instantly share code, notes, and snippets.

@huiliu
Last active August 2, 2021 14:47
Show Gist options
  • Save huiliu/6399321c7cba7798a3ffc02380c559db to your computer and use it in GitHub Desktop.
Save huiliu/6399321c7cba7798a3ffc02380c559db to your computer and use it in GitHub Desktop.
import puppeteer, { Page, Browser } from "puppeteer-core"
import axios from "axios";
import { stat } from "fs";
/** Flat record of text/attribute values read from one scraped DOM node. */
type ScrapedRecord = Record<string, string | null>;

/**
 * Puppeteer-based scrapers for zhihu.com topic and user-profile pages.
 *
 * The static `run*` helpers launch (or connect to) a browser themselves. The
 * instance methods attach to an already running Chrome through its DevTools
 * HTTP endpoint on localhost:9222, open a fresh tab per call, and release both
 * the tab and the DevTools connection when they finish.
 *
 * Functions handed to `page.evaluate` are serialized and executed inside the
 * page, so they must be self-contained and cannot close over Node-side values.
 */
export default class Main {
  static userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36";
  static selector = "div.item:nth-child(1) > div"
  static exePath = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe";

  /** Selector of the "next page" button shared by all paginated profile lists. */
  private static readonly nextPageSel = "div.Pagination > button.Button.PaginationButton.PaginationButton-next.Button--plain";

  /**
   * In-page extractor: name, URL and meta text of every column/topic card.
   * Keys are only set when the corresponding element exists.
   */
  private static readonly extractTitledCards = (): ScrapedRecord[] => {
    const cards = Array.from(document.querySelectorAll("div.ContentItem-main"));
    return cards.map((card) => {
      const record: ScrapedRecord = {};
      const title = card.querySelector(".ContentItem-title > a");
      if (title) {
        record.name = title.textContent;
        record.url = title.getAttribute("href");
      }
      const meta = card.querySelector(".ContentItem-meta");
      if (meta) {
        record.meta = meta.textContent;
      }
      return record;
    });
  };

  /** In-page extractor: name, URL, headline text and status of every user card. */
  private static readonly extractUserCards = (): ScrapedRecord[] => {
    const cards = Array.from(document.querySelectorAll("div.ContentItem-main"));
    return cards.map((card) => {
      const record: ScrapedRecord = {};
      const title = card.querySelector("div.UserItem-title > span > div.Popover > div > a");
      if (title) {
        record.name = title.textContent;
        record.url = title.getAttribute("href");
      }
      const headline = card.querySelector("div.ztext");
      if (headline) {
        record.ztext = headline.textContent;
      }
      const status = card.querySelector("div.ContentItem-status");
      if (status) {
        record.status = status.textContent;
      }
      return record;
    });
  };

  /** In-page extractor: name, URL and status line of every question/collection item. */
  private static readonly extractStatusItems = (): ScrapedRecord[] => {
    const items = Array.from(document.querySelectorAll("div.ContentItem"));
    return items.map((item) => {
      const record: ScrapedRecord = {};
      const title = item.querySelector(".ContentItem-title > div > a");
      if (title) {
        record.name = title.textContent;
        record.url = title.getAttribute("href");
      }
      const status = item.querySelector(".ContentItem-status");
      if (status) {
        record.status = status.textContent;
      }
      return record;
    });
  };

  /**
   * In-page extractor: title, URL, vote text, comment text, body and timestamp
   * of every answer (or post) in the current list page.
   */
  private static readonly extractAnswers = (): any[] => {
    const items = Array.from(document.querySelectorAll("div.List-item"));
    return items.map((item) => {
      const link = item.querySelector("h2 > div > a");
      const votes = item.querySelector("div.ContentItem-meta > div.AnswerItem-extraInfo");
      const commentButton = item.querySelector("div.ContentItem-actions > button");

      // Expand THIS item's collapsed body; a document-wide lookup would keep
      // clicking the first button on the page for every item.
      const expand = item.querySelector("div.RichContent-inner > button") as HTMLElement | null;
      if (expand) {
        expand.click();
      }
      const body = item.querySelector("div.RichContent-inner");
      const time = item.querySelector("div.ContentItem-time");

      const answer = {
        title: link ? link.textContent : "",
        url: link ? link.getAttribute("href") : "",
        upvoter: votes ? votes.textContent || "" : "",
        comment: commentButton ? commentButton.textContent || "" : "",
        body: body ? body.textContent : undefined,
        time: time ? time.textContent : undefined,
      };
      // Forwarded to the Node console by the 'console' listener set in createPage.
      console.log(answer.title);
      return answer;
    });
  };

  /** Placeholder entry point; intentionally does nothing. */
  static async runTest(): Promise<any> {
    // let exePath = "/usr/bin/chromium";
  }

  /**
   * Connects to the browser at ws://127.0.0.1:9222, scrapes the topic cards on
   * https://www.zhihu.com/topics, logs them as JSON and closes the browser.
   */
  static async runtTopicList(): Promise<any> {
    const url = "https://www.zhihu.com/topics";
    // let brower = await puppeteer.launch({ executablePath: Main.exePath });
    const browser = await puppeteer.connect({ browserWSEndpoint: "ws://127.0.0.1:9222" });
    try {
      const page = await browser.newPage();
      await page.goto(url);
      const topics = await page.evaluate(() => {
        const cards = Array.from(document.querySelectorAll("div.item > div:nth-child(1)"));
        return cards.map((card) => {
          const link = card.querySelector("a:nth-child(1)");
          const image = card.querySelector("img");
          const blurb = card.querySelector("p");
          // A card missing one of its parts yields null for that field
          // instead of aborting the whole scrape.
          return {
            title: link ? link.textContent : null,
            href: link ? link.getAttribute("href") : null,
            img: image ? image.getAttribute("src") : null,
            desc: blurb ? blurb.textContent : null,
          };
        });
      });
      console.log(JSON.stringify(topics));
    } finally {
      await browser.close();
    }
  }

  /**
   * Launches Chrome from `Main.exePath`, scrapes the counters, the question
   * titles and the intro card of topic 19555513, and logs the result as JSON.
   */
  static async runTopic(): Promise<any> {
    const url = "https://www.zhihu.com/topic/19555513/hot";
    const browser = await puppeteer.launch({ executablePath: Main.exePath });
    try {
      const page = await browser.newPage();
      await page.goto(url);
      const counts = await page.evaluate(() =>
        Array.from(document.querySelectorAll("strong.NumberBoard-itemValue"))
          .map((counter) => counter.getAttribute("title")));
      const questions = await page.evaluate(() =>
        Array.from(document.querySelectorAll("#TopicMain > div.ListShortcut > div > div > div > div > div > h2 > div"))
          .map((heading) => heading.textContent));
      console.log(questions);
      await page.click("#TopicMain > div.Topic-bar.Topic-bar--borderBottom > ul > li:nth-child(1) > a");
      const intro = await page.evaluate(() => {
        const card = document.querySelector("#TopicMain > div.Card > div");
        return card ? card.textContent : null;
      });
      const result = {
        user: counts[0],
        questionCount: counts[1],
        intro: intro,
        questions: questions
      };
      console.log(JSON.stringify(result));
    } finally {
      // Awaited so the process does not exit (or continue) with Chrome half closed.
      await browser.close();
    }
  }

  /**
   * Scrapes the profile header, detail fields, achievements, follow counters
   * and side-column counters of a user's activity page.
   *
   * @param userName zhihu user slug (the part after `/people/`).
   * @returns the scraped profile object, or `undefined` when in-page
   *          extraction failed (the failure is logged).
   */
  async parseUserPage(userName: string): Promise<any> {
    // let keyList = ["activities", "answers", "asks", "posts", "columns", "pins", "collections", "following"];
    const url = "https://www.zhihu.com/people/" + userName + "/activities";
    const page = await this.createPage();
    try {
      await page.goto(url);
      let profile: unknown;
      try {
        profile = await page.evaluate(() => {
          const textOf = (node: Element | null | undefined) => (node ? node.textContent : null);

          // Click the first button in the header footer (presumably the
          // "show details" toggle) before reading the detail rows.
          const footer = document.querySelector("div.ProfileHeader-contentFooter");
          const toggle = footer ? footer.querySelector("button") : null;
          if (toggle) {
            toggle.click();
          }

          const personal: Record<string, unknown> = {};
          const name = document.querySelector("span.ProfileHeader-name");
          if (name) {
            personal.name = name.textContent;
          }
          const headline = document.querySelector("span.ProfileHeader-headline");
          if (headline) {
            personal.subName = headline.textContent;
          }

          // Labels and values are parallel lists; a label without a value maps to null.
          const info: Record<string, string | null> = {};
          const values = document.querySelectorAll("div.ProfileHeader-detailValue");
          Array.from(document.querySelectorAll("span.ProfileHeader-detailLabel")).forEach((label, i) => {
            info[label.textContent || ""] = textOf(values[i]);
          });
          personal.info = info;

          personal.achievement = Array.from(document.querySelectorAll("div.Profile-sideColumnItem"))
            .map((entry) => entry.textContent);

          const follow = document.querySelectorAll("strong.NumberBoard-itemValue");
          personal.following = textOf(follow[0]);
          personal.follower = textOf(follow[1]);

          Array.from(document.querySelectorAll("a.Profile-lightItem")).forEach((entry) => {
            const spans = entry.querySelectorAll("span");
            if (spans.length >= 2) {
              personal[spans[0].textContent || ""] = spans[1].textContent;
            }
          });
          return personal;
        });
      } catch (reason) {
        console.log(reason);
      }
      console.log(JSON.stringify(profile));
      return profile;
    } finally {
      await this.releasePage(page);
    }
  }

  /** Scrapes all pages of the user's answer list. */
  async parseUserAnswers(userName: string): Promise<any> {
    const url = "https://www.zhihu.com/people/" + userName + "/answers";
    return this.doParseUserAnswers(url);
  }

  /** Scrapes every page of an answer-style list (answers or posts) at `url`. */
  private async doParseUserAnswers(url: string): Promise<any> {
    const results = await this.collectPaged(url, Main.extractAnswers, { scroll: true, settleMs: 3000 });
    console.log(JSON.stringify(results));
    return results;
  }

  /** Scrapes all pages of the questions the user asked. */
  async parseUserAsks(userName: string): Promise<any> {
    const url = "https://www.zhihu.com/people/" + userName + "/asks";
    return this.doParseUserFollowingQuestionsOrCollections(url);
  }

  /** Scrapes all pages of the user's posts (same layout as answers). */
  async parseUserPosts(userName: string): Promise<any> {
    const url = "https://www.zhihu.com/people/" + userName + "/posts";
    return this.doParseUserAnswers(url);
  }

  /** Scrapes all pages of the columns owned by the user. */
  async parseUserColumns(userName: string): Promise<any> {
    const url = "https://www.zhihu.com/people/" + userName + "/columns";
    return this.collectPaged(url, Main.extractTitledCards, { scroll: true, settleMs: 1000 });
  }

  /** Scrapes all pages of the user's collections. */
  async parseUserCollections(userName: string): Promise<any> {
    const url = "https://www.zhihu.com/people/" + userName + "/collections";
    return this.doParseUserFollowingQuestionsOrCollections(url);
  }

  /** Scrapes all pages of the users this user follows. */
  async ParseUserFollowing(userName: string): Promise<any> {
    const url = "https://www.zhihu.com/people/" + userName + "/following";
    return this.doParseUserFollow(url);
  }

  /** Scrapes all pages of this user's followers. */
  async ParseUserFollowers(userName: string): Promise<any> {
    const url = "https://www.zhihu.com/people/" + userName + "/followers";
    return this.doParseUserFollow(url);
  }

  /** Scrapes every page of a user-card list (following / followers) at `url`. */
  private async doParseUserFollow(url: string): Promise<any> {
    return this.collectPaged(url, Main.extractUserCards, { scroll: true, settleMs: 1000 });
  }

  /** Scrapes all pages of the columns the user follows. */
  async parseUserFollowingColumns(userName: string): Promise<any> {
    const url = "https://www.zhihu.com/people/" + userName + "/following/columns";
    return this.doParseUserFollowingColumnsOrTopics(url);
  }

  /** Scrapes all pages of the topics the user follows. */
  async parseUserFollowingTopic(userName: string): Promise<any> {
    const url = "https://www.zhihu.com/people/" + userName + "/following/topics";
    return this.doParseUserFollowingColumnsOrTopics(url);
  }

  /** Scrapes every page of a followed-columns or followed-topics list at `url`. */
  private async doParseUserFollowingColumnsOrTopics(url: string): Promise<any> {
    const results = await this.collectPaged(url, Main.extractTitledCards, { scroll: false, settleMs: 1000 });
    console.log(JSON.stringify(results));
    return results;
  }

  /** Scrapes all pages of the collections the user follows. */
  async parseUserFolloingCollections(userName: string): Promise<any> {
    const url = "https://www.zhihu.com/people/" + userName + "/following/collections";
    return this.doParseUserFollowingQuestionsOrCollections(url);
  }

  /** Scrapes all pages of the questions the user follows. */
  async parseUserFollowingQuestions(userName: string): Promise<any> {
    const url = "https://www.zhihu.com/people/" + userName + "/following/questions";
    return this.doParseUserFollowingQuestionsOrCollections(url);
  }

  /** Scrapes every page of a question/collection style list at `url`. */
  private async doParseUserFollowingQuestionsOrCollections(url: string): Promise<any> {
    const results = await this.collectPaged(url, Main.extractStatusItems, { scroll: false, settleMs: 1000 });
    console.log(JSON.stringify(results));
    return results;
  }

  /**
   * Shared pagination loop: opens `url`, runs `extract` inside the page once
   * per list page, and follows the "next" button until it can no longer be
   * clicked.
   *
   * @param url      page to open.
   * @param extract  self-contained function evaluated inside the page; returns
   *                 the records of the page currently shown.
   * @param options  `scroll`: scroll to the bottom before extracting (for lists
   *                 that load lazily); `settleMs`: pause before each extraction
   *                 so content requested by navigation or "next" can render.
   * @returns all records in page order. A page whose extraction fails is logged
   *          and contributes no records.
   */
  private async collectPaged<T>(
    url: string,
    extract: () => T[],
    options: { scroll: boolean; settleMs: number }
  ): Promise<T[]> {
    const page = await this.createPage();
    try {
      await page.goto(url);
      let collected: T[] = [];
      while (true) {
        if (options.scroll) {
          await this.scrollToBottom(page);
        }
        await page.waitFor(options.settleMs);

        let batch: T[] = [];
        try {
          batch = await page.evaluate(extract);
        } catch (reason) {
          // Best effort: keep what was gathered so far, never push the error
          // (or undefined) into the result list.
          console.error(reason);
        }
        console.log(JSON.stringify(batch));
        collected = collected.concat(batch);

        try {
          await page.click(Main.nextPageSel);
        } catch (error) {
          // No clickable "next" button: this was the last page.
          break;
        }
      }
      return collected;
    } finally {
      await this.releasePage(page);
    }
  }

  /**
   * Launches Chrome from `Main.exePath`, opens news.baidu.com and writes a PDF
   * before (`b0.pdf`) and after (`b1.pdf`) scrolling to the bottom.
   */
  async scrollBaidu(): Promise<any> {
    const url = "https://news.baidu.com";
    const browser = await puppeteer.launch({ executablePath: Main.exePath });
    try {
      const page = await browser.newPage();
      await page.setViewport({ width: 1920, height: 1080 });
      page.on('console', msg => console.log(msg.text()));
      await page.goto(url);
      await page.pdf({ path: "b0.pdf" });
      await this.scrollToBottom(page);
      await page.waitFor(3000);
      await page.pdf({ path: "b1.pdf" });
    } finally {
      await browser.close();
    }
  }

  /**
   * Scrolls the document down in fixed steps, pausing between steps, until the
   * viewport bottom reaches the document bottom.
   *
   * @param page page to scroll.
   */
  async scrollToBottom(page: Page): Promise<any> {
    const distance = 800; // should be less than or equal to window.innerHeight
    const delay = 1000;
    const hasMoreBelow = () =>
      document.documentElement.scrollTop + window.innerHeight < document.documentElement.scrollHeight;
    while (await page.evaluate(hasMoreBelow)) {
      await page.evaluate((y) => {
        document.documentElement.scrollBy(0, y);
        console.log("position: ", document.documentElement.scrollTop, document.documentElement.scrollHeight);
      }, distance);
      await page.waitFor(delay);
    }
  }

  /**
   * Opens a new tab on the remote browser, configured with the class user
   * agent, a 1920x1080 viewport and page-console forwarding.
   *
   * Each call opens its own DevTools connection; callers that are done with
   * the page should also disconnect `page.browser()`.
   */
  async createPage(): Promise<Page> {
    const browser = await this.createBrowser();
    try {
      const page = await browser.newPage();
      await page.setUserAgent(Main.userAgent);
      await page.setViewport({ width: 1920, height: 1080 });
      page.on('console', msg => console.log(msg.text()));
      return page;
    } catch (error) {
      // Do not leave a dangling DevTools connection behind a failed setup.
      await browser.disconnect();
      throw error;
    }
  }

  /** Closes `page` and drops the DevTools connection it was created on. */
  private async releasePage(page: Page): Promise<void> {
    const browser = page.browser();
    try {
      await page.close();
    } finally {
      // disconnect() detaches from Chrome without closing it; without this the
      // open WebSocket keeps the Node process alive.
      await browser.disconnect();
    }
  }

  /**
   * Connects to the Chrome instance whose DevTools endpoint is advertised on
   * localhost:9222.
   *
   * @throws Error when the endpoint cannot be discovered.
   */
  private async createBrowser(): Promise<Browser> {
    let endpoint: string;
    try {
      endpoint = await this.getWebSocketEndPoint();
    } catch (reason) {
      throw new Error(
        "Cannot discover the Chrome DevTools endpoint at http://localhost:9222/json/version: " + String(reason));
    }
    return puppeteer.connect({ browserWSEndpoint: endpoint });
  }

  /**
   * Reads `webSocketDebuggerUrl` from http://localhost:9222/json/version.
   *
   * @throws Error when the response carries no usable URL.
   */
  private async getWebSocketEndPoint(): Promise<string> {
    const resp = await axios.get("http://localhost:9222/json/version");
    const endpoint: unknown = resp.data ? resp.data.webSocketDebuggerUrl : undefined;
    if (typeof endpoint !== "string" || endpoint === "") {
      throw new Error("response does not contain webSocketDebuggerUrl");
    }
    return endpoint;
  }
}
// Script entry point: run one scraper against a running Chrome started with
// --remote-debugging-port=9222. Swap the active call for one of the commented
// examples to scrape something else; every method takes the user slug (the
// part after /people/ in the profile URL), not a full URL.
let main = new Main();
// main.parseUserAnswers("mao-dou-65-61");
// main.ParseUserFollowers("grace-xu-36");
// main.parseUserFollowingColumns("grace-xu-36");
// main.parseUserPage("crackinterview");
// main.parseUserFollowingColumns("cici");
// main.parseUserFollowingQuestions("cici");
// main.parseUserPage("cici");
// main.parseUserFollowingTopic("cici");
// main.parseUserAnswers("cici");
// main.parseUserColumns("cici");
// Report failures explicitly instead of leaving an unhandled promise rejection.
main.parseUserCollections("cici").catch((reason) => {
  console.error(reason);
});
@QGB
Copy link

QGB commented Aug 2, 2021

document.querySelectorAll('#TopicMain > div.ListShortcut > div > div > div > Div > div > h2 > div > a')
NodeList(13) [a, a, a, a, a, a, a, a, a, a, a, a, a] # 搜索页的 问题
document.querySelectorAll('#TopicMain > div.ListShortcut > div > div > div > Div > div > h2 > a')
NodeList(68) [a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a]## zhuanlan文章

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment