Last active
August 2, 2021 14:47
-
-
Save huiliu/6399321c7cba7798a3ffc02380c559db to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import puppeteer, { Page, Browser } from "puppeteer-core" | |
import axios from "axios"; | |
import { stat } from "fs"; | |
export default class Main { | |
static userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"; | |
static selector = "div.item:nth-child(1) > div" | |
static exePath = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"; | |
/**
 * Placeholder for ad-hoc experiments. The body is intentionally empty, so the
 * returned promise resolves to `undefined`.
 */
static async runTest(): Promise<any> {
    // let exePath = "/usr/bin/chromium";
}
/**
 * Scrapes the topic cards shown on https://www.zhihu.com/topics and logs them
 * to the console as a JSON array of `{ title, href, img, desc }` records.
 *
 * Connects to a browser listening on ws://127.0.0.1:9222 and closes that
 * browser when done, even if navigation or scraping fails.
 *
 * @returns A promise that resolves to `undefined` once the result is logged.
 */
static async runtTopicList(): Promise<any> {
    const url = "https://www.zhihu.com/topics";
    // let brower = await puppeteer.launch({ executablePath: Main.exePath });
    const brower = await puppeteer.connect({ browserWSEndpoint: "ws://127.0.0.1:9222" });
    try {
        const page = await brower.newPage();
        await page.goto(url);
        const topics = await page.evaluate(() => {
            const collected: any[] = [];
            const cards = document.querySelectorAll("div.item > div:nth-child(1)");
            for (const card of cards) {
                const link = card.querySelector("a:nth-child(1)");
                const image = card.querySelector("img");
                const description = card.querySelector("p");
                // A card that lacks one of its parts yields null for that field
                // instead of aborting the whole scrape with a TypeError.
                collected.push({
                    title: link ? link.textContent : null,
                    href: link ? link.getAttribute("href") : null,
                    img: image ? image.getAttribute("src") : null,
                    desc: description ? description.textContent : null
                });
            }
            return collected;
        });
        console.log(JSON.stringify(topics));
    } finally {
        await brower.close();
    }
}
/**
 * Scrapes one hard-coded Zhihu topic page (topic 19555513, "hot" tab) and logs
 * a JSON summary containing the first two number-board values, the topic
 * intro text and the list of question titles.
 *
 * Launches Chrome from `Main.exePath` and always closes it again, even when a
 * step fails.
 *
 * @returns A promise that resolves to `undefined` once the summary is logged.
 */
static async runTopic(): Promise<any> {
    const url = "https://www.zhihu.com/topic/19555513/hot";
    const brower = await puppeteer.launch({ executablePath: Main.exePath });
    try {
        const page = await brower.newPage();
        await page.goto(url);

        const counts = await page.evaluate(() => {
            const values: any[] = [];
            for (const node of document.querySelectorAll("strong.NumberBoard-itemValue")) {
                values.push(node.getAttribute("title"));
            }
            return values;
        });

        const questions = await page.evaluate(() => {
            const titles: any[] = [];
            const nodes = document.querySelectorAll("#TopicMain > div.ListShortcut > div > div > div > div > div > h2 > div");
            for (const node of nodes) {
                titles.push(node.textContent);
            }
            return titles;
        });
        console.log(questions);

        // Switch to the first tab of the topic bar before reading the intro card.
        await page.click("#TopicMain > div.Topic-bar.Topic-bar--borderBottom > ul > li:nth-child(1) > a");
        const intro = await page.evaluate(() => {
            const card = document.querySelector("#TopicMain > div.Card > div");
            // The card may be absent; report null rather than throwing.
            return card ? card.textContent : null;
        });

        const result = {
            user: counts[0],
            questionCount: counts[1],
            intro: intro,
            questions: questions
        };
        console.log(JSON.stringify(result));
    } finally {
        // Awaited so that shutdown errors surface and the process is not left running.
        await brower.close();
    }
}
/**
 * Opens `https://www.zhihu.com/people/<userName>/activities`, extracts the
 * profile header (name, headline, detail key/value pairs), the side-column
 * achievements, the following/follower counts and the "light" profile items,
 * then logs the collected object as JSON.
 *
 * A failure inside the in-page scraping is logged and produces `undefined`.
 * The page is closed in every case.
 *
 * @param userName Zhihu user slug, inserted verbatim into the URL.
 * @returns A promise that resolves to `undefined` once the profile is logged.
 */
async parseUserPage(userName: string): Promise<any> {
    // let keyList = ["activities", "answers", "asks", "posts", "columns", "pins", "collections", "following"];
    const url = "https://www.zhihu.com/people/" + userName + "/activities";
    const page = await this.createPage();
    try {
        await page.goto(url);
        const result = await page.evaluate(() => {
            const h1NameSel = "span.ProfileHeader-name";
            const h2NameSel = "span.ProfileHeader-headline";
            const infoKeySel = "span.ProfileHeader-detailLabel";
            const infoValueSel = "div.ProfileHeader-detailValue";
            const achievementSel = "div.Profile-sideColumnItem";
            const followSel = "strong.NumberBoard-itemValue";
            const profileItemSel = "a.Profile-lightItem";

            // Click the button in the header footer, if there is one, before reading details.
            const footer = document.querySelector("div.ProfileHeader-contentFooter");
            if (footer) {
                const button = footer.querySelector("button");
                if (button) button.click();
            }

            const personal: any = {};
            const h1Name = document.querySelector(h1NameSel);
            if (h1Name) {
                personal.name = h1Name.textContent;
            }
            const subName = document.querySelector(h2NameSel);
            if (subName) {
                personal.subName = subName.textContent;
            }

            const info: any = {};
            const infoKeys = document.querySelectorAll(infoKeySel);
            const infoValues = document.querySelectorAll(infoValueSel);
            for (let i = 0; i < infoKeys.length; ++i) {
                // Labels and values are matched by position; a label without a
                // matching value is recorded as null instead of throwing.
                const value = infoValues[i];
                info[infoKeys[i].textContent || ""] = value ? value.textContent : null;
            }
            personal.info = info;

            const achievement: any[] = [];
            for (const item of document.querySelectorAll(achievementSel)) {
                achievement.push(item.textContent);
            }
            personal.achievement = achievement;

            // Only read the counters that are actually present on the page.
            const follow = document.querySelectorAll(followSel);
            if (follow.length > 0) {
                personal.following = follow[0].textContent;
            }
            if (follow.length > 1) {
                personal.follower = follow[1].textContent;
            }

            for (const item of document.querySelectorAll(profileItemSel)) {
                const spans = item.querySelectorAll("span");
                // Each item is expected to hold a label span and a value span; skip anything else.
                if (spans.length >= 2) {
                    personal[spans[0].textContent || ""] = spans[1].textContent;
                }
            }
            return personal;
        }).catch(reasone => console.log(reasone));
        console.log(JSON.stringify(result));
    } finally {
        await page.close();
    }
}
/**
 * Scrapes the "answers" tab of a user's profile.
 *
 * @param userName Zhihu user slug, inserted verbatim into the URL.
 * @returns Whatever `doParseUserAnswers` resolves to for that tab.
 */
async parseUserAnswers(userName: string): Promise<any> {
    const answersUrl = `https://www.zhihu.com/people/${userName}/answers`;
    return this.doParseUserAnswers(answersUrl);
}
/**
 * Walks every pagination page under `url` and collects one record per
 * `div.List-item`: `{ title, url, upvoter, comment, body, time }`.
 *
 * Paging stops when the "next" pagination button can no longer be clicked.
 * A page whose in-browser scraping fails is logged and contributes no records.
 * The page is closed in every case.
 *
 * @param url Absolute URL of the list to scrape.
 * @returns The records gathered across all pages.
 */
private async doParseUserAnswers(url: string): Promise<any> {
    const page = await this.createPage();
    let results: any[] = [];
    try {
        await page.goto(url);
        // await page.pdf({ path: "x0.pdf" });
        do {
            // Scroll and wait on every page, not just the first: page.click()
            // returns as soon as the click is dispatched, so without this the
            // next evaluate() could read the previous page's items.
            await this.scrollToBottom(page);
            await page.waitFor(3000);

            const ret: any[] = await page.evaluate(() => {
                const getAnswerTitleAndUrl = (el: Element): { text: string, url: string } => {
                    const tmp: any = { text: "", url: "" };
                    const link = el.querySelector("h2 > div > a");
                    if (link) {
                        tmp.text = link.textContent;
                        tmp.url = link.getAttribute("href");
                    }
                    return tmp;
                };

                const getAnswerMetaInfo = (el: Element): { upvoter: any, comment: any } => {
                    // Key is spelled "upvoter" to match the declared return type and the reader below.
                    const tmp: any = { upvoter: "", comment: "" };
                    const voter = el.querySelector("div.ContentItem-meta > div.AnswerItem-extraInfo");
                    if (voter) {
                        tmp.upvoter = voter.textContent || "";
                    }
                    const comment = el.querySelector("div.ContentItem-actions > button");
                    if (comment) {
                        tmp.comment = comment.textContent || "";
                    }
                    return tmp;
                };

                const getAnswer = (el: Element): { content: string, lastTime: string } => {
                    // Click the button inside THIS item's rich content, not the first one in the document.
                    const expand = el.querySelector<HTMLElement>("div.RichContent-inner > button");
                    if (expand) {
                        expand.click();
                    }
                    const tmp: any = {};
                    const content = el.querySelector("div.RichContent-inner");
                    if (content) {
                        tmp.content = content.textContent;
                    }
                    const time = el.querySelector("div.ContentItem-time");
                    if (time) {
                        tmp.lastTime = time.textContent;
                    }
                    return tmp;
                };

                const answers: any[] = [];
                for (const item of document.querySelectorAll("div.List-item")) {
                    const title = getAnswerTitleAndUrl(item);
                    const meta = getAnswerMetaInfo(item);
                    const answer = getAnswer(item);
                    answers.push({
                        title: title.text,
                        url: title.url,
                        upvoter: meta.upvoter,
                        comment: meta.comment,
                        body: answer.content,
                        time: answer.lastTime
                    });
                    console.log(title.text);
                }
                return answers;
            }).catch((reason) => {
                console.error(reason);
                // Contribute nothing for a failed page rather than mixing the error object into the results.
                return [];
            });

            console.log(JSON.stringify(ret));
            results = results.concat(ret);

            try {
                const nextSel = "div.Pagination > button.Button.PaginationButton.PaginationButton-next.Button--plain";
                await page.click(nextSel);
                console.log("move next");
            } catch (error) {
                // A failed click on the "next" button is treated as the end of the list.
                break;
            }
        } while (true);
    } finally {
        await page.close();
    }
    console.log(JSON.stringify(results));
    return results;
}
/**
 * Scrapes the "asks" tab of a user's profile.
 *
 * @param userName Zhihu user slug, inserted verbatim into the URL.
 * @returns Whatever `doParseUserFollowingQuestionsOrCollections` resolves to for that tab.
 */
async parseUserAsks(userName: string): Promise<any> {
    const asksUrl = `https://www.zhihu.com/people/${userName}/asks`;
    return this.doParseUserFollowingQuestionsOrCollections(asksUrl);
}
/**
 * Scrapes the "posts" tab of a user's profile using the same list parser as
 * the answers tab.
 *
 * @param userName Zhihu user slug, inserted verbatim into the URL.
 * @returns Whatever `doParseUserAnswers` resolves to for that tab.
 */
async parseUserPosts(userName: string): Promise<any> {
    const postsUrl = `https://www.zhihu.com/people/${userName}/posts`;
    return this.doParseUserAnswers(postsUrl);
}
/**
 * Collects the entries on a user's "columns" tab across all pagination pages.
 * Each `div.ContentItem-main` becomes a record with optional `name`, `url`
 * and `meta` fields. Paging ends when the "next" button cannot be clicked.
 *
 * @param userName Zhihu user slug, inserted verbatim into the URL.
 * @returns The records gathered across all pages.
 */
async parseUserColumns(userName: string): Promise<any> {
    const columnsUrl = `https://www.zhihu.com/people/${userName}/columns`;
    const page = await this.createPage();
    await page.goto(columnsUrl);

    let collected: any[] = [];
    for (;;) {
        await this.scrollToBottom(page);
        await page.waitFor(1000);

        const pageItems = await page.evaluate(() => {
            const nodes = Array.from(document.querySelectorAll("div.ContentItem-main"));
            return nodes.map((node) => {
                const entry: any = {};
                const link = node.querySelector(".ContentItem-title > a");
                if (link) {
                    entry.name = link.textContent;
                    entry.url = link.getAttribute("href");
                }
                const metaNode = node.querySelector(".ContentItem-meta");
                if (metaNode) {
                    entry.meta = metaNode.textContent;
                }
                return entry;
            });
        });
        console.log(JSON.stringify(pageItems));
        collected = collected.concat(pageItems);

        try {
            await page.click("div.Pagination > button.Button.PaginationButton.PaginationButton-next.Button--plain");
        } catch (error) {
            // A failed click on the "next" button ends the paging loop.
            break;
        }
    }

    await page.close();
    return collected;
}
/**
 * Scrapes the "collections" tab of a user's profile.
 *
 * @param userName Zhihu user slug, inserted verbatim into the URL.
 * @returns Whatever `doParseUserFollowingQuestionsOrCollections` resolves to for that tab.
 */
async parseUserCollections(userName: string): Promise<any> {
    const collectionsUrl = `https://www.zhihu.com/people/${userName}/collections`;
    return this.doParseUserFollowingQuestionsOrCollections(collectionsUrl);
}
/**
 * Scrapes the list of accounts a user is following.
 *
 * @param userName Zhihu user slug, inserted verbatim into the URL.
 * @returns Whatever `doParseUserFollow` resolves to for that list.
 */
async ParseUserFollowing(userName: string): Promise<any> {
    const followingUrl = `https://www.zhihu.com/people/${userName}/following`;
    return this.doParseUserFollow(followingUrl);
}
/**
 * Scrapes the list of a user's followers.
 *
 * @param userName Zhihu user slug, inserted verbatim into the URL.
 * @returns Whatever `doParseUserFollow` resolves to for that list.
 */
async ParseUserFollowers(userName: string): Promise<any> {
    const followersUrl = `https://www.zhihu.com/people/${userName}/followers`;
    return this.doParseUserFollow(followersUrl);
}
/**
 * Collects user entries from a following/followers list at `url`, across all
 * pagination pages. Each `div.ContentItem-main` becomes a record with
 * optional `name`, `url`, `ztext` and `status` fields. Paging ends when the
 * "next" button cannot be clicked.
 *
 * @param url Absolute URL of the list to scrape.
 * @returns The records gathered across all pages.
 */
private async doParseUserFollow(url: string): Promise<any> {
    const page = await this.createPage();
    await page.goto(url);

    let collected: any[] = [];
    for (;;) {
        await this.scrollToBottom(page);
        await page.waitFor(1000);

        const pageItems = await page.evaluate(() => {
            const nodes = Array.from(document.querySelectorAll("div.ContentItem-main"));
            return nodes.map((node) => {
                const entry: any = {};
                const link = node.querySelector("div.UserItem-title > span > div.Popover > div > a");
                if (link) {
                    entry.name = link.textContent;
                    entry.url = link.getAttribute("href");
                }
                const textNode = node.querySelector("div.ztext");
                if (textNode) {
                    entry.ztext = textNode.textContent;
                }
                const statusNode = node.querySelector("div.ContentItem-status");
                if (statusNode) {
                    entry.status = statusNode.textContent;
                }
                return entry;
            });
        });
        console.log(JSON.stringify(pageItems));
        collected = collected.concat(pageItems);

        try {
            await page.click("div.Pagination > button.Button.PaginationButton.PaginationButton-next.Button--plain");
        } catch (error) {
            // A failed click on the "next" button ends the paging loop.
            break;
        }
    }

    await page.close();
    return collected;
}
/**
 * Scrapes the columns a user is following.
 *
 * @param userName Zhihu user slug, inserted verbatim into the URL.
 * @returns Whatever `doParseUserFollowingColumnsOrTopics` resolves to for that list.
 */
async parseUserFollowingColumns(userName: string): Promise<any> {
    const followingColumnsUrl = `https://www.zhihu.com/people/${userName}/following/columns`;
    return this.doParseUserFollowingColumnsOrTopics(followingColumnsUrl);
}
/**
 * Scrapes the topics a user is following.
 *
 * @param userName Zhihu user slug, inserted verbatim into the URL.
 * @returns Whatever `doParseUserFollowingColumnsOrTopics` resolves to for that list.
 */
async parseUserFollowingTopic(userName: string): Promise<any> {
    const followingTopicsUrl = `https://www.zhihu.com/people/${userName}/following/topics`;
    return this.doParseUserFollowingColumnsOrTopics(followingTopicsUrl);
}
/**
 * Collects followed columns or topics from the list at `url`, across all
 * pagination pages. Each `div.ContentItem-main` becomes a record with
 * optional `name`, `url` and `meta` fields. Paging ends when the "next"
 * button cannot be clicked. The page is closed in every case.
 *
 * @param url Absolute URL of the list to scrape.
 * @returns The records gathered across all pages.
 */
private async doParseUserFollowingColumnsOrTopics(url: string): Promise<any> {
    const page = await this.createPage();
    let results: any[] = [];
    try {
        await page.goto(url);
        do {
            // page.click() returns once the click is dispatched, so scroll and
            // pause before every read, as doParseUserFollow does, to give the
            // next page's items time to render.
            await this.scrollToBottom(page);
            await page.waitFor(1000);

            const ret = await page.evaluate(() => {
                const columnListSel = "div.ContentItem-main";
                const columnsTitleSel = ".ContentItem-title > a";
                const columnsMetaSel = ".ContentItem-meta";
                const records: any[] = [];
                for (const column of document.querySelectorAll(columnListSel)) {
                    const record: any = {};
                    const title = column.querySelector(columnsTitleSel);
                    if (title) {
                        record.name = title.textContent;
                        record.url = title.getAttribute("href");
                    }
                    const meta = column.querySelector(columnsMetaSel);
                    if (meta) {
                        record.meta = meta.textContent;
                    }
                    records.push(record);
                }
                return records;
            });
            console.log(JSON.stringify(ret));
            results = results.concat(ret);

            try {
                const nextSel = "div.Pagination > button.Button.PaginationButton.PaginationButton-next.Button--plain";
                await page.click(nextSel);
            } catch (error) {
                // A failed click on the "next" button is treated as the end of the list.
                break;
            }
        } while (true);
    } finally {
        await page.close();
    }
    console.log(JSON.stringify(results));
    return results;
}
/**
 * Scrapes the collections a user is following.
 *
 * NOTE(review): the method name is missing a "w" ("Folloing"); it is kept
 * unchanged so existing callers continue to work.
 *
 * @param userName Zhihu user slug, inserted verbatim into the URL.
 * @returns Whatever `doParseUserFollowingQuestionsOrCollections` resolves to for that list.
 */
async parseUserFolloingCollections(userName: string): Promise<any> {
    const followingCollectionsUrl = `https://www.zhihu.com/people/${userName}/following/collections`;
    return this.doParseUserFollowingQuestionsOrCollections(followingCollectionsUrl);
}
/**
 * Scrapes the questions a user is following.
 *
 * @param userName Zhihu user slug, inserted verbatim into the URL.
 * @returns Whatever `doParseUserFollowingQuestionsOrCollections` resolves to for that list.
 */
async parseUserFollowingQuestions(userName: string): Promise<any> {
    const followingQuestionsUrl = `https://www.zhihu.com/people/${userName}/following/questions`;
    return this.doParseUserFollowingQuestionsOrCollections(followingQuestionsUrl);
}
/**
 * Collects question or collection entries from the list at `url`, across all
 * pagination pages. Each `div.ContentItem` becomes a record with optional
 * `name`, `url` and `status` fields. Paging ends when the "next" button
 * cannot be clicked.
 *
 * A page whose in-browser scraping fails is logged and contributes no
 * records. The page is closed in every case.
 *
 * @param url Absolute URL of the list to scrape.
 * @returns The records gathered across all pages.
 */
private async doParseUserFollowingQuestionsOrCollections(url: string): Promise<any> {
    const page = await this.createPage();
    let results: any[] = [];
    try {
        await page.goto(url);
        do {
            // page.click() returns once the click is dispatched, so scroll and
            // pause before every read, as doParseUserFollow does, to give the
            // next page's items time to render.
            await this.scrollToBottom(page);
            await page.waitFor(1000);

            const ret: any[] = await page.evaluate(() => {
                const itemListSel = "div.ContentItem";
                const itemTitleSel = ".ContentItem-title > div > a";
                const itemStatusSel = ".ContentItem-status";
                const records: any[] = [];
                for (const item of document.querySelectorAll(itemListSel)) {
                    const record: any = {};
                    const title = item.querySelector(itemTitleSel);
                    if (title) {
                        record.name = title.textContent;
                        record.url = title.getAttribute("href");
                    }
                    const status = item.querySelector(itemStatusSel);
                    if (status) {
                        record.status = status.textContent;
                    }
                    records.push(record);
                }
                return records;
            }).catch((reason) => {
                console.log(reason);
                // Return an empty page so that `undefined` is never appended to the results.
                return [];
            });
            console.log(JSON.stringify(ret));
            results = results.concat(ret);

            try {
                const nextSel = "div.Pagination > button.Button.PaginationButton.PaginationButton-next.Button--plain";
                await page.click(nextSel);
            } catch (error) {
                // A failed click on the "next" button is treated as the end of the list.
                break;
            }
        } while (true);
    } finally {
        await page.close();
    }
    console.log(JSON.stringify(results));
    return results;
}
/**
 * Demo: opens https://news.baidu.com in a freshly launched Chrome, writes
 * `b0.pdf`, scrolls to the bottom of the page, waits three seconds and writes
 * `b1.pdf`. Browser console messages are forwarded to the Node console.
 *
 * The launched browser is closed even if a step fails.
 *
 * @returns A promise that resolves to `undefined` when both PDFs are written.
 */
async scrollBaidu(): Promise<any> {
    const url = "https://news.baidu.com";
    const brower = await puppeteer.launch({ executablePath: Main.exePath });
    try {
        const page = await brower.newPage();
        await page.setViewport({ width: 1920, height: 1080 });
        page.on('console', msg => console.log(msg.text()));
        await page.goto(url);
        await page.pdf({ path: "b0.pdf" });
        await this.scrollToBottom(page);
        await page.waitFor(3000);
        await page.pdf({ path: "b1.pdf" });
    } finally {
        // Without this, a failure above would leave the Chrome process running.
        await brower.close();
    }
}
/**
 * Scrolls the page down in fixed steps, pausing after each step, until the
 * bottom edge of the viewport reaches the document's scroll height. The
 * current position is logged from inside the page after every step.
 *
 * @param page The page to scroll.
 * @returns A promise that resolves to `undefined` when the bottom is reached.
 */
async scrollToBottom(page: Page): Promise<any> {
    const stepPx = 800; // should be less than or equal to window.innerHeight
    const pauseMs = 1000;

    // True while there is still content below the visible viewport.
    const hasMoreBelow = () => page.evaluate(() => {
        const root = document.documentElement;
        return root.scrollTop + window.innerHeight < root.scrollHeight;
    });

    while (await hasMoreBelow()) {
        await page.evaluate((y) => {
            const root = document.documentElement;
            root.scrollBy(0, y);
            console.log("position: ", root.scrollTop, root.scrollHeight);
        }, stepPx);
        await page.waitFor(pauseMs);
    }
}
/**
 * Opens a new page on the browser returned by `createBrowser`, applies the
 * class-level user agent and a 1920x1080 viewport, and forwards the page's
 * console messages to the Node console.
 *
 * @returns The configured page.
 */
async createPage(): Promise<Page> {
    const browser = await this.createBrowser();
    const freshPage = await browser.newPage();
    await freshPage.setUserAgent(Main.userAgent);
    await freshPage.setViewport({ width: 1920, height: 1080 });
    freshPage.on('console', (message) => console.log(message.text()));
    return freshPage;
}
/**
 * Connects to an already running browser through the WebSocket endpoint
 * reported by `getWebSocketEndPoint`.
 *
 * @returns The connected browser.
 * @throws Error when the endpoint cannot be obtained. The underlying reason
 *         is logged first, so the failure is not hidden behind an
 *         empty-endpoint error from `puppeteer.connect`.
 */
private async createBrowser(): Promise<Browser> {
    let endpoint: string;
    try {
        endpoint = await this.getWebSocketEndPoint();
    } catch (reason) {
        console.log(reason);
        throw new Error("Could not obtain the browser WebSocket debugger endpoint");
    }
    if (!endpoint) {
        // The lookup succeeded but did not yield a usable endpoint.
        throw new Error("The browser did not report a WebSocket debugger endpoint");
    }
    return puppeteer.connect({ browserWSEndpoint: endpoint });
}
/**
 * Requests http://localhost:9222/json/version and returns the
 * `webSocketDebuggerUrl` field of the response body.
 *
 * @returns The value of `webSocketDebuggerUrl` from the response.
 */
private async getWebSocketEndPoint(): Promise<string> {
    const { data } = await axios.get("http://localhost:9222/json/version");
    return data.webSocketDebuggerUrl;
}
} | |
// Script entry point: create a scraper and run one of the parsers.
// The commented-out calls are alternative invocations kept for manual runs.
const main = new Main();
// main.ParseUserAnswers("https://www.zhihu.com/people/mao-dou-65-61/answers");
// main.ParseUserFollowers("grace-xu-36");
// main.parseUserFollowingColumns("grace-xu-36");
// main.parseUserPage("https://www.zhihu.com/people/crackinterview/activities");
// main.parseUserFollowingColumns("cici");
// main.parseUserFollowingQuestions("cici");
// main.parseUserPage("cici");
// main.parseUserFollowingTopic("cici");
// main.parseUserAnswers("cici");
// main.parseUserColumns("cici");
// Handle rejection explicitly so a scraping failure is reported instead of
// surfacing as an unhandled promise rejection.
main.parseUserCollections("cici").catch((reason) => {
    console.error(reason);
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
document.querySelectorAll('#TopicMain > div.ListShortcut > div > div > div > Div > div > h2 > div > a')
NodeList(13) [a, a, a, a, a, a, a, a, a, a, a, a, a] # 搜索页的 问题
document.querySelectorAll('#TopicMain > div.ListShortcut > div > div > div > Div > div > h2 > a')
NodeList(68) [a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a]## zhuanlan文章