Skip to content

Instantly share code, notes, and snippets.

@spacemeowx2
Last active March 15, 2020 09:11
Show Gist options
  • Save spacemeowx2/a7a7799abd05fd82aaf4e91c7366943a to your computer and use it in GitHub Desktop.
Save spacemeowx2/a7a7799abd05fd82aaf4e91c7366943a to your computer and use it in GitHub Desktop.
抽奖微博爬虫
// Pause, in milliseconds, inserted after each page request made by getAll().
const DELAY_TIME = 1000
/**
 * Fetches one page of the repost timeline of a Weibo post.
 *
 * Request failures (network error, non-JSON body) are logged and reported as
 * `false`, the same value used for an API-level failure, so that a caller's
 * retry loop can handle both cases uniformly instead of being aborted by a
 * rejected promise.
 *
 * @param {number} page - Page number to request.
 * @param {string|number} [id] - Post id; defaults to the post this script was written for.
 * @returns {Promise<Array|false>} `data.data` of the response, or `false` when
 *   the request failed or the response's `ok` field is not 1.
 */
async function getRepost(page, id = '4211254174873514') {
  let obj
  try {
    const res = await fetch(`https://m.weibo.cn/api/statuses/repostTimeline?id=${id}&page=${page}`)
    obj = await res.json()
  } catch (err) {
    console.warn(`repost page ${page} request failed:`, err)
    return false
  }
  if (obj.ok !== 1) {
    return false
  }
  return obj.data.data
}
/**
 * Fetches one page of the comments of a Weibo post.
 *
 * Request failures (network error, non-JSON body) are logged and reported as
 * `false`, the same value used for an API-level failure, so that a caller's
 * retry loop can handle both cases uniformly instead of being aborted by a
 * rejected promise.
 *
 * @param {number} page - Page number to request.
 * @param {string|number} [id] - Post id; defaults to the post this script was written for.
 * @returns {Promise<Array|false>} `data.data` of the response, or `false` when
 *   the request failed or the response's `ok` field is not 1.
 */
async function getComment(page, id = '4211254174873514') {
  let obj
  try {
    const res = await fetch(`https://m.weibo.cn/api/comments/show?id=${id}&page=${page}`)
    obj = await res.json()
  } catch (err) {
    console.warn(`comment page ${page} request failed:`, err)
    return false
  }
  if (obj.ok !== 1) {
    return false
  }
  return obj.data.data
}
/**
 * Returns a promise that settles once the given time has elapsed.
 *
 * @param {number} ms - Time to wait, in milliseconds (passed to setTimeout).
 * @returns {Promise<undefined>} Resolves with no value when the timer fires.
 */
function delay (ms) {
  return new Promise(function (wake) {
    setTimeout(wake, ms)
  })
}
/**
 * Downloads every repost page and then every comment page, waiting
 * DELAY_TIME milliseconds after each request.
 *
 * The two result arrays are published on `window.debug` before crawling
 * starts, so partial results stay reachable even if the crawl is interrupted.
 *
 * @returns {Promise<{reposts: Array, comments: Array}>} The same arrays that
 *   are exposed on `window.debug`.
 */
async function getAll() {
  const reposts = []
  const comments = []
  window.debug = {
    reposts,
    comments
  }
  console.log('repost begin')
  await collectPages(getRepost, reposts)
  console.log('comment begin')
  await collectPages(getComment, comments)
  return { reposts, comments }
}

/**
 * Appends consecutive pages (starting at page 1) to `sink` until one page
 * keeps failing.
 *
 * @param {function(number): Promise<Array|false>} fetchPage - Page loader; a
 *   falsy result counts as a failed attempt.
 * @param {Array} sink - Array that receives the items of every loaded page.
 * @param {number} [maxRetries=4] - Extra attempts allowed for a single page.
 * @returns {Promise<void>}
 */
async function collectPages (fetchPage, sink, maxRetries = 4) {
  let page = 1
  let retry = 0
  while (true) {
    const items = await fetchPage(page)
    if (items) {
      sink.push(...items)
      console.log(`page ${page} done`)
      page++
      // The retry budget is per page: failures on earlier pages must not
      // shorten the crawl once a later page is reached.
      retry = 0
    } else if (retry < maxRetries) {
      retry++
      console.log(`page ${page} failed, retry: ${retry}`)
    } else {
      console.log(`failed on page ${page}, exit`)
      return
    }
    await delay(DELAY_TIME)
  }
}
const fs = require('fs')
const util = require('util')
const readFile = util.promisify(fs.readFile)
const writeFile = util.promisify(fs.writeFile)
/**
 * Reduces a raw repost record to the flat row used for the CSV export.
 *
 * @param {Object} r - Raw repost; `id`, `raw_text`, `created_at`,
 *   `user.id` and `user.screen_name` are read.
 * @returns {{id: *, user_id: *, text: *, user: *, time: *, from: string}}
 *   Flat row whose `from` is always 'repost'.
 */
function repostSimplify (r) {
  const { id, user: author, raw_text: text, created_at: time } = r
  const { id: user_id, screen_name: user } = author
  return { id, user_id, text, user, time, from: 'repost' }
}
/**
 * Reduces a raw comment record to the flat row used for the CSV export.
 *
 * @param {Object} r - Raw comment; `id`, `text`, `created_at`,
 *   `user.id` and `user.screen_name` are read.
 * @returns {{id: *, user_id: *, text: *, user: *, time: *, from: string}}
 *   Flat row whose `from` is always 'comment'.
 */
function commentSimplify (r) {
  const { id, user: author, text, created_at: time } = r
  const { id: user_id, screen_name: user } = author
  return { id, user_id, text, user, time, from: 'comment' }
}
/**
 * Keeps only the first entry seen for each `id`.
 *
 * @param {Array<Object>} ary - Entries to de-duplicate; not modified.
 * @param {Set} [ids] - Ids already seen. Every kept id is added to it, so
 *   passing the same set to several calls de-duplicates across them.
 * @returns {Array<Object>} New array with the surviving entries in their
 *   original order.
 */
function removeSameId (ary, ids = new Set()) {
  const unique = []
  for (const entry of ary) {
    const key = entry.id
    if (!ids.has(key)) {
      ids.add(key)
      unique.push(entry)
    }
  }
  return unique
}
/**
 * Keeps only the first entry seen for each `user_id`.
 *
 * @param {Array<Object>} ary - Entries to de-duplicate; not modified.
 * @param {Set} [ids] - User ids already seen. Every kept user id is added to
 *   it, so passing the same set to several calls de-duplicates across them.
 * @returns {Array<Object>} New array with the surviving entries in their
 *   original order.
 */
function removeSameUserId (ary, ids = new Set()) {
  const unique = []
  for (const entry of ary) {
    const key = entry.user_id
    if (!ids.has(key)) {
      ids.add(key)
      unique.push(entry)
    }
  }
  return unique
}
/**
 * Reducer that appends one entry to a CSV document as a new line.
 *
 * Every field keeps its leading tab character. A field that contains a comma,
 * a double quote or a line break is wrapped in double quotes with embedded
 * quotes doubled, so free text cannot shift columns or split rows. Fields
 * without such characters are written exactly as before.
 *
 * @param {string} str - CSV text accumulated so far.
 * @param {{id: *, user_id: *, text: *, user: *, time: *, from: *}} i - Entry to append.
 * @returns {string} `str` followed by a newline and the new row.
 */
function toCSV (str, i) {
  const row = [i.id, i.user_id, i.text, i.user, i.time, i.from].map(csvField).join(',')
  return str + '\n' + row
}

/**
 * Formats one value as a tab-prefixed CSV field, quoting it only when needed.
 *
 * @param {*} value - Value to write; converted with template-literal rules.
 * @returns {string} The field text, quoted if it contains `"`, `,`, CR or LF.
 */
function csvField (value) {
  const cell = `\t${value}`
  if (!/[",\r\n]/.test(cell)) {
    return cell
  }
  return `"${cell.replace(/"/g, '""')}"`
}
/**
 * Turns the crawled data in `raw.json` into `result.csv`.
 *
 * Steps: simplify every repost and comment, drop entries with a repeated id
 * within each list, then drop entries whose user already appeared (reposts
 * are processed first and share the seen-user set with comments). The list
 * sizes are logged after each step. The CSV is written with a leading BOM.
 *
 * @returns {Promise<void>} Settles when the CSV file has been written.
 */
async function main () {
  const report = (r, c) => console.log(`reposts: ${r.length} comments: ${c.length}`)

  const raw = JSON.parse(await readFile('raw.json'))
  const simpleReposts = raw.reposts.map(repostSimplify)
  const simpleComments = raw.comments.map(commentSimplify)
  report(simpleReposts, simpleComments)

  const uniqueReposts = removeSameId(simpleReposts)
  const uniqueComments = removeSameId(simpleComments)
  report(uniqueReposts, uniqueComments)

  // One shared set: a user who reposted is not listed again as a commenter.
  const seenUsers = new Set()
  const repostRows = removeSameUserId(uniqueReposts, seenUsers)
  const commentRows = removeSameUserId(uniqueComments, seenUsers)
  report(repostRows, commentRows)

  const header = 'id,user_id,text,user,time,from'
  const csv = repostRows.concat(commentRows).reduce(toCSV, header)
  await writeFile('result.csv', '\ufeff' + csv)
}
// Entry point: run the post-processing and log any rejection with console.error.
main().catch(e => console.error(e))
/**
repost:
{
id: 4211990370709412,
raw_text: "哇这个也心动//@双极性蓝火火:哇//@eleholic:[跪了]//@颓废中的九零:[哆啦A梦吃惊]//@咕噜咕噜小摩卡:再给偶像转一次[并不简单]//@八百比海琴:竟然这么多人转[允悲],那就http://t.cn/REbLG46 ,大概会从基础的一些知识和操作到大局观一步步教吧,新人尽量领到S+,S+尽量胜率变高",
user: {
id: 1812259204,
screen_name: "月島蛍太太",
profile_image_url: "https://tvax4.sinaimg.cn/crop.0.0.480.480.180/6c04e184ly8fkxx1anwezj20dc0dcjsc.jpg",
verified: true,
verified_type: 0,
verified_type_ext: 0,
mbtype: 13,
profile_url: "https://m.weibo.cn/u/1812259204?uid=1812259204",
remark: "",
following: false,
follow_me: false
},
like_counts: 0,
liked: false
}
comment:
{
id: 4211254635439074,
created_at: "02-25",
source: "iPhone",
user: {
id: 1081455044,
screen_name: "大帅比二卷酱",
profile_image_url: "https://tva1.sinaimg.cn/crop.0.0.640.640.180/4075b1c4jw8eax6e7ay83j20hs0hs0ue.jpg",
verified: false,
verified_type: 220,
mbtype: 0,
profile_url: "https://m.weibo.cn/u/1081455044?uid=1081455044&featurecode=20000320",
remark: "",
following: false,
follow_me: false
},
text: "有有有!oct老师看我看我!",
like_counts: 0,
liked: false
}
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment