Created
June 23, 2020 10:36
-
-
Save tedshd/4100b1cbefa2610c6dfdbff6cd5617e1 to your computer and use it in GitHub Desktop.
parse
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer'); | |
const url = require('url'); | |
const pLimit = require('p-limit'); | |
const limit = pLimit(1); | |
// Kick off one crawl for a sample account as soon as this file is loaded.
// The returned promise is given a rejection handler so a failed crawl is
// reported instead of surfacing as an unhandled promise rejection.
crawlerIG(['joanne_722']).catch((err) => {
  console.error('crawlerIG: crawl failed', err);
});
/**
 * Crawl Instagram profile pages with Puppeteer and collect basic profile data.
 *
 * Every id is turned into `https://www.instagram.com/<id>/`; the URLs are then
 * visited one after another on a single browser page. The browser is always
 * closed, even when a page fails to load or parse.
 *
 * @param {string|string[]} ids A single account name or a list of account names,
 * example: 'joanne_722' or ['joanne_722']
 * @returns {Promise<Object|undefined>} `undefined` (after logging an error) when
 * `ids` is missing, empty, or neither a string nor an array. Otherwise an object
 * keyed by account name:
 * {
 *   '<social_name>': {
 *     avatar: '',
 *     post_count: '',
 *     follower_count: '',
 *     recent: [''],
 *     social_name: '',
 *     private_account: false / true,
 *     social_name_exist: true,
 *   }
 * }
 * When neither the profile link nor the header button is found on the page, the
 * entry is only `{ social_name, social_name_exist: false }`.
 */
async function crawlerIG (ids) {
  if (!ids) {
    // TODO put error log
    console.error('crawlerIG: not set IG ids');
    return;
  }

  let idList = ids;
  if (typeof ids === 'string') {
    console.log('crawlerIG: ids trans to array');
    idList = [ids];
  }
  if (!Array.isArray(idList)) {
    // Anything that is neither a string nor an array cannot be turned into URLs.
    console.error('crawlerIG: ids must be a string or an array');
    return;
  }
  if (!idList.length) {
    // TODO put error log
    console.error('crawlerIG: ids is empty');
    return;
  }

  const urlArray = idList.map((id) => 'https://www.instagram.com/' + id + '/');
  const parseResult = {};

  // Plain timer-based pause. Used instead of page.waitFor(ms), which is
  // deprecated in later Puppeteer releases.
  const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  /**
   * Visit one profile URL and store what was found in `parseResult`, keyed by
   * the account name taken from the URL path.
   *
   * @param {*} page Puppeteer page used for navigation
   * @param {string} parseUrl Profile URL of the form https://www.instagram.com/<id>/
   */
  async function fetchData (page, parseUrl) {
    const path = new URL(parseUrl).pathname;
    // '/<id>/' -> '<id>'
    const user = path.slice(1, path.length - 1);
    let privateAccount = false;
    let socialNameExist = true;

    await page.goto(parseUrl, { waitUntil: 'networkidle2' });

    // A username input means a login form was served instead of the profile.
    if (await page.$('input[name=username]') !== null) {
      console.log('login');
      await page.focus('input[name=username]');
      await page.keyboard.type('');
      await page.focus('input[name=password]');
      await page.keyboard.type('');
      await delay(1000);
      await page.click('button[type=submit]');
      await delay(2000);
      if (await page.$('section > main > div > div > div > div > button') !== null) {
        await page.click('button[type=button]'); // first login check save account data in browser
      }
    }

    await delay(2000);
    if (await page.$('a[href="' + path + '"]') == null) {
      console.log('not profile');
      const headerButton = 'section > main > div > header > section > div > button';
      if (await page.$(headerButton) !== null) {
        // NOTE(review): this clicks the follow button on the profile - confirm
        // that sending a follow request is really intended here.
        await page.click(headerButton); // click follow button
        privateAccount = true;
      } else {
        socialNameExist = false;
      }
    }

    if (!socialNameExist) {
      parseResult[user] = {
        social_name: user,
        social_name_exist: false,
      };
      // Was `continue`, which is a syntax error outside of a loop.
      return;
    }

    await delay(3000);
    console.log('parse data');

    // Runs inside the browser page. Every lookup is guarded so a profile that
    // lacks the header image, the counters or the post grid yields empty
    // values instead of throwing.
    const data = await page.evaluate(() => {
      const firstWord = (el) => (el ? el.innerText.split(' ')[0] || '' : '');
      const avatarEl = document.querySelector('header img');
      const stats = document.querySelectorAll('header section ul li');
      const grid = document.querySelectorAll('article div')[0];
      const images = grid ? Array.from(grid.querySelectorAll('img')) : [];
      return {
        avatar: (avatarEl && avatarEl.src) || '',
        post_count: firstWord(stats[0]),
        follower_count: firstWord(stats[1]),
        // NOTE(review): the previous loop stopped after 9 images although its
        // limit variable was 10; the cap of 9 is kept - confirm which is wanted.
        recent: images.slice(0, 9).map((img) => img.src),
      };
    });

    data['social_name'] = user;
    data['private_account'] = privateAccount;
    data['social_name_exist'] = socialNameExist;
    parseResult[user] = data;
  }

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // UA
    await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko/20100101 Firefox/64.0');
    // set cookie
    // NOTE(review): a session id is hard-coded here and printed below; it
    // should come from configuration rather than live in source control.
    await page.setCookie({
      'name': 'sessionid',
      'value': '37549417879%3AK12RRPXVjcjTz0%3A5',
      'domain': '.instagram.com'
    });
    const cookiesSet = await page.cookies(urlArray[0]);
    console.log('IG cookie: ' + JSON.stringify(cookiesSet));
    console.log('urlArray: ', urlArray);

    // One shared page, so the profiles are visited strictly one after another.
    for (const profileUrl of urlArray) {
      await fetchData(page, profileUrl);
    }
  } finally {
    // Always release the browser process, even if a page failed.
    await browser.close();
  }

  console.log('IG: ', parseResult);
  return parseResult;
}
exports.crawlerIG = crawlerIG; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment