raw.json 文件太大, 移步 https://gist.github.com/spacemeowx2/851d2ebe73ff475da1faf02e89e973b1
Last active
March 15, 2020 09:11
-
-
Save spacemeowx2/a7a7799abd05fd82aaf4e91c7366943a to your computer and use it in GitHub Desktop.
抽奖微博爬虫
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Pause, in milliseconds, inserted between consecutive page requests.
const DELAY_TIME = 1000
/**
 * Fetches one page of the repost timeline of a Weibo post.
 *
 * Network errors and malformed responses are reported as `false` (instead of
 * rejecting) so that the caller's retry logic also covers them.
 *
 * @param {number} page Page number to request.
 * @param {string|number} [id] Post id; defaults to the post this script was written for.
 * @returns {Promise<*|false>} The `data.data` payload of the response, or
 *   `false` when the API answers with `ok !== 1` or the request fails.
 */
async function getRepost (page, id = '4211254174873514') {
  try {
    const res = await fetch(`https://m.weibo.cn/api/statuses/repostTimeline?id=${id}&page=${page}`)
    const obj = await res.json()
    if (obj.ok !== 1) {
      return false
    }
    return obj.data.data
  } catch (err) {
    // Do not swallow silently: surface the cause, then let the caller retry.
    console.warn(`repost page ${page} request failed:`, err)
    return false
  }
}
/**
 * Fetches one page of the comments of a Weibo post.
 *
 * Network errors and malformed responses are reported as `false` (instead of
 * rejecting) so that the caller's retry logic also covers them.
 *
 * @param {number} page Page number to request.
 * @param {string|number} [id] Post id; defaults to the post this script was written for.
 * @returns {Promise<*|false>} The `data.data` payload of the response, or
 *   `false` when the API answers with `ok !== 1` or the request fails.
 */
async function getComment (page, id = '4211254174873514') {
  try {
    const res = await fetch(`https://m.weibo.cn/api/comments/show?id=${id}&page=${page}`)
    const obj = await res.json()
    if (obj.ok !== 1) {
      return false
    }
    return obj.data.data
  } catch (err) {
    // Do not swallow silently: surface the cause, then let the caller retry.
    console.warn(`comment page ${page} request failed:`, err)
    return false
  }
}
/**
 * Returns a promise that resolves after the given time.
 *
 * @param {number} ms Time to wait, in milliseconds.
 * @returns {Promise<void>}
 */
function delay (ms) {
  return new Promise(resolve => setTimeout(resolve, ms))
}
/**
 * Downloads consecutive pages, starting at page 1, and appends their items
 * to `sink` until one page keeps failing.
 *
 * A page is retried up to `maxRetries` times; the retry counter is reset
 * after every successful page so that failures on earlier pages do not use
 * up the retry budget of later ones.
 *
 * @param {function(number): Promise<*>} fetchPage Resolves to the items of a
 *   page, or to a falsy value on failure.
 * @param {Array} sink Array receiving the items of every successful page.
 * @param {number} [maxRetries=4] Retries allowed per page before giving up.
 * @returns {Promise<void>}
 */
async function collectPages (fetchPage, sink, maxRetries = 4) {
  let page = 1
  let retry = 0
  while (true) {
    const items = await fetchPage(page)
    if (items) {
      sink.push(...items)
      console.log(`page ${page} done`)
      page++
      retry = 0
    } else if (retry < maxRetries) {
      retry++
      console.log(`page ${page} failed, retry: ${retry}`)
    } else {
      console.log(`failed on page ${page}, exit`)
      break
    }
    await delay(DELAY_TIME)
  }
}

/**
 * Crawls all reposts and then all comments.
 *
 * The result arrays are published on `window.debug` before crawling starts,
 * so partial results stay reachable from the console while the crawl runs
 * or if it is interrupted.
 *
 * @returns {Promise<{reposts: Array, comments: Array}>} The collected items
 *   (the same arrays that are exposed on `window.debug`).
 */
async function getAll () {
  const reposts = []
  const comments = []
  window.debug = {
    reposts,
    comments
  }
  console.log('repost begin')
  await collectPages(getRepost, reposts)
  console.log('comment begin')
  await collectPages(getComment, comments)
  return { reposts, comments }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Node.js built-ins; the callback-style fs functions are promisified so they
// can be awaited.
const fs = require('fs')
const util = require('util')
const readFile = util.promisify(fs.readFile)
const writeFile = util.promisify(fs.writeFile)
/**
 * Reduces a raw repost item to the flat record written to the CSV.
 *
 * @param {object} r Raw repost item; reads `id`, `raw_text`, `created_at`,
 *   `user.id` and `user.screen_name`.
 * @returns {{id: *, user_id: *, text: *, user: *, time: *, from: string}}
 *   Flat record tagged with `from: 'repost'`.
 */
function repostSimplify (r) {
  const { id, raw_text: text, created_at: time, user } = r
  return {
    id,
    user_id: user.id,
    text,
    user: user.screen_name,
    time,
    from: 'repost'
  }
}
/**
 * Reduces a raw comment item to the flat record written to the CSV.
 *
 * @param {object} r Raw comment item; reads `id`, `text`, `created_at`,
 *   `user.id` and `user.screen_name`.
 * @returns {{id: *, user_id: *, text: *, user: *, time: *, from: string}}
 *   Flat record tagged with `from: 'comment'`.
 */
function commentSimplify (r) {
  const { id, text, created_at: time, user } = r
  return {
    id,
    user_id: user.id,
    text,
    user: user.screen_name,
    time,
    from: 'comment'
  }
}
/**
 * Keeps only the first record for every `id`.
 *
 * @param {Array<{id: *}>} ary Records to filter; the input array is not modified.
 * @param {Set} [ids] Ids that already count as seen. Every kept id is added
 *   to this set, so passing the same set to several calls de-duplicates
 *   across them.
 * @returns {Array} New array without records whose `id` was already seen.
 */
function removeSameId (ary, ids = new Set()) {
  return ary.filter(({ id }) => {
    if (ids.has(id)) {
      return false
    }
    ids.add(id)
    return true
  })
}
/**
 * Keeps only the first record for every `user_id`.
 *
 * @param {Array<{user_id: *}>} ary Records to filter; the input array is not modified.
 * @param {Set} [ids] User ids that already count as seen. Every kept user id
 *   is added to this set, so passing the same set to several calls
 *   de-duplicates across them.
 * @returns {Array} New array without records whose `user_id` was already seen.
 */
function removeSameUserId (ary, ids = new Set()) {
  return ary.filter(({ user_id: userId }) => {
    if (ids.has(userId)) {
      return false
    }
    ids.add(userId)
    return true
  })
}
/**
 * Reducer that appends one record to a CSV document as a new line.
 *
 * Every cell keeps the leading tab used by the original format.
 * NOTE(review): the tab presumably stops spreadsheet applications from
 * converting the long numeric ids to scientific notation - confirm.
 * Cells containing a comma, a double quote or a line break are wrapped in
 * double quotes with embedded quotes doubled, so free-form text cannot break
 * the column layout. Missing values (`null`/`undefined`) become empty cells.
 *
 * @param {string} str CSV text accumulated so far.
 * @param {{id: *, user_id: *, text: *, user: *, time: *, from: *}} i Record to append.
 * @returns {string} `str` followed by a line break and the new row.
 */
function toCSV (str, i) {
  const toCell = value => {
    const cell = '\t' + (value == null ? '' : String(value))
    return /[",\r\n]/.test(cell) ? `"${cell.replace(/"/g, '""')}"` : cell
  }
  const row = [i.id, i.user_id, i.text, i.user, i.time, i.from].map(toCell).join(',')
  return `${str}\n${row}`
}
/**
 * Turns the crawled data in `raw.json` into `result.csv`.
 *
 * Steps: simplify every repost and comment, drop records with a duplicate
 * id within each list, then keep only one record per user across both lists
 * (reposts are processed first, so a user's repost wins over their comment).
 * The record counts are logged after each step. The CSV is written with a
 * leading byte order mark (`\ufeff`).
 *
 * @returns {Promise<void>} Resolves once `result.csv` has been written;
 *   rejects if reading, parsing or writing fails.
 */
async function main () {
  // Decode explicitly instead of relying on JSON.parse coercing a Buffer.
  const raw = JSON.parse(await readFile('raw.json', 'utf8'))
  let reposts = raw.reposts.map(repostSimplify)
  let comments = raw.comments.map(commentSimplify)
  console.log(`reposts: ${reposts.length} comments: ${comments.length}`)

  reposts = removeSameId(reposts)
  comments = removeSameId(comments)
  console.log(`reposts: ${reposts.length} comments: ${comments.length}`)

  // One shared set so a user counted among the reposts is skipped in the comments.
  const userIds = new Set()
  reposts = removeSameUserId(reposts, userIds)
  comments = removeSameUserId(comments, userIds)
  console.log(`reposts: ${reposts.length} comments: ${comments.length}`)

  const fields = 'id,user_id,text,user,time,from'
  const csv = [...reposts, ...comments].reduce(toCSV, fields)
  await writeFile('result.csv', '\ufeff' + csv)
}
// Script entry point: run the pipeline; on failure print the error and make
// the process exit with a non-zero status so callers can detect it.
main().catch(e => {
  console.error(e)
  process.exitCode = 1
})
/**
Sample raw items as found in raw.json.

repost:
{
  id: 4211990370709412,
  raw_text: "哇这个也心动//@双极性蓝火火:哇//@eleholic:[跪了]//@颓废中的九零:[哆啦A梦吃惊]//@咕噜咕噜小摩卡:再给偶像转一次[并不简单]//@八百比海琴:竟然这么多人转[允悲],那就http://t.cn/REbLG46 ,大概会从基础的一些知识和操作到大局观一步步教吧,新人尽量领到S+,S+尽量胜率变高",
  user: {
    id: 1812259204,
    screen_name: "月島蛍太太",
    profile_image_url: "https://tvax4.sinaimg.cn/crop.0.0.480.480.180/6c04e184ly8fkxx1anwezj20dc0dcjsc.jpg",
    verified: true,
    verified_type: 0,
    verified_type_ext: 0,
    mbtype: 13,
    profile_url: "https://m.weibo.cn/u/1812259204?uid=1812259204",
    remark: "",
    following: false,
    follow_me: false
  },
  like_counts: 0,
  liked: false
}

comment:
{
  id: 4211254635439074,
  created_at: "02-25",
  source: "iPhone",
  user: {
    id: 1081455044,
    screen_name: "大帅比二卷酱",
    profile_image_url: "https://tva1.sinaimg.cn/crop.0.0.640.640.180/4075b1c4jw8eax6e7ay83j20hs0hs0ue.jpg",
    verified: false,
    verified_type: 220,
    mbtype: 0,
    profile_url: "https://m.weibo.cn/u/1081455044?uid=1081455044&featurecode=20000320",
    remark: "",
    following: false,
    follow_me: false
  },
  text: "有有有!oct老师看我看我!",
  like_counts: 0,
  liked: false
}
*/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment