Created
August 31, 2022 16:50
-
-
Save fawazahmed0/849045624ebdbc15b221eb285fe95d8a to your computer and use it in GitHub Desktop.
Youtube comment scrape
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const { firefox } = require('playwright'); | |
const fs = require('fs') | |
const path = require('path') | |
/**
 * Visits a fixed list of YouTube videos in a visible Firefox window, scrolls
 * each page to trigger lazy loading, and appends every short `#content` text
 * containing "univ" to `content.txt` next to this script.
 *
 * If a video fails, the error is logged, the current browser is closed, and a
 * fresh browser is launched before moving on to the next video.
 *
 * @returns {Promise<void>} Resolves once every video was attempted and the
 *   browser has been closed.
 */
async function begin() {
  const viewport = { width: 400, height: 700 };
  const outputFile = path.join(__dirname, 'content.txt');

  // Launches a headed Firefox with a narrow viewport and opens one page.
  const launchPage = async () => {
    const browser = await firefox.launch({ headless: false });
    try {
      const context = await browser.newContext({ viewport });
      const page = await context.newPage();
      return { browser, page };
    } catch (error) {
      // Do not leave a half-initialised browser process behind.
      await closeQuietly(browser);
      throw error;
    }
  };

  // Best-effort close: a browser that already crashed may fail to close, and
  // that must not mask the original error or stop the scrape.
  const closeQuietly = async (browser) => {
    try {
      await browser.close();
    } catch (closeError) {
      console.error('failed to close browser', closeError);
    }
  };

  const videoPaths = ["/watch?v=VikfS15Ymmw","/watch?v=ndaEzlg5xcg","/watch?v=XexTm02fumM","/watch?v=sRIB9kVzjYM","/watch?v=TSNNi7qSpCc","/watch?v=rbdVDwMbqlE","/watch?v=rfrg-Oo13GI","/watch?v=U1s48KT2KME","/watch?v=KdA-9wK_wVw","/watch?v=KrjvEvYGFUI","/watch?v=LgDKD87Cwqk","/watch?v=ck7bzakwmnQ","/watch?v=lkJ_an81sbI","/watch?v=72N0pRsSIFQ","/watch?v=QzsfZU1q-SQ","/watch?v=c8y5UTwMuyY","/watch?v=pI46I7pSc4g","/watch?v=RzYlGaLGn2o","/watch?v=bTE2wZeiVz4","/watch?v=doIfZm2iGnA","/watch?v=N-9CWbNo9Fg","/watch?v=XJpmfW60OuE","/watch?v=rh3ohhR1iYI","/watch?v=3XMPLN3cyxs","/watch?v=p-VvH1KDvAM","/watch?v=TLnmBLqIMqs","/watch?v=WHvJuEA00V0","/watch?v=3j_9p1VOTPI","/watch?v=waXGqCwoBLY","/watch?v=zHRDtD8t1v0","/watch?v=G6BQqubJPqM","/watch?v=M2UIWc40w4U","/watch?v=ffB-9qLajf4","/watch?v=npPszXXC5Tg","/watch?v=retcp_lRl0Q","/watch?v=4dWc7Edkc8Y","/watch?v=p1TUPVgK-ro","/watch?v=b1U-Lxlhfn4"];
  // Guard against visiting the same video twice if the list has duplicates.
  const values = [...new Set(videoPaths)];

  let { browser, page } = await launchPage();
  try {
    await page.goto('https://youtube.com');
    // NOTE(review): long fixed pause before scraping — presumably to let the
    // operator dismiss consent/login dialogs by hand; confirm.
    await page.waitForTimeout(50000);

    for (const vid of values) {
      try {
        await page.goto(`https://www.youtube.com${vid}`);
        await page.waitForTimeout(5000);
        // Two passes: the first pass makes the page load more content, the
        // second scrolls through what the first pass caused to load.
        await scrollFullPage(page);
        await scrollFullPage(page);
        console.log('at ', vid);

        const valArr = await page.evaluate(() =>
          Array.from(document.querySelectorAll('#content')).map((e) => e.textContent));
        const matches = valArr
          .map((text) => text.replace(/\s+/g, ' ').toLowerCase().trim())
          .filter((text) => text.includes('univ') && text.length < 1000);
        const comments = [...new Set(matches)];
        fs.appendFileSync(outputFile, comments.join('\n') + '\n');
      } catch (error) {
        console.error(error);
        console.log('error at ', vid);
        // Close the failed browser before replacing it; otherwise every
        // error leaks a running Firefox process.
        await closeQuietly(browser);
        ({ browser, page } = await launchPage());
      }
    }
  } finally {
    // Runs on success and on failure so the browser never outlives the run.
    await closeQuietly(browser);
  }
}
// Entry point: run the scrape and surface any failure instead of leaving an
// unhandled promise rejection.
begin().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
/**
 * Scrolls the page down in fixed steps, one step per timer tick, until the
 * total distance scrolled reaches the document height.
 *
 * The document height is re-measured on every tick, so content that is
 * appended while scrolling extends the scroll.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)`; the callback
 *   uses `document` and `window`, so it is expected to run in a page context.
 * @param {object} [options]
 * @param {number} [options.distance=100] - Pixels scrolled per tick; must be > 0.
 * @param {number} [options.intervalMs=1000] - Milliseconds between ticks.
 * @returns {Promise<void>} Resolves when the scroll has finished.
 * @throws {RangeError} If `distance` is not a positive number (the scroll
 *   would otherwise never terminate).
 */
async function scrollFullPage(page, { distance = 100, intervalMs = 1000 } = {}) {
  if (!(distance > 0)) {
    throw new RangeError(`distance must be a positive number, got ${distance}`);
  }

  // The settings are passed as an argument rather than captured by closure,
  // because the callback is handed to page.evaluate and may run elsewhere.
  await page.evaluate(
    ({ step, delay }) =>
      new Promise((resolve) => {
        let totalHeight = 0;
        const timer = setInterval(() => {
          const { body, documentElement } = document;
          // Take the largest of the height measurements on both elements.
          const scrollHeight = Math.max(
            body.scrollHeight, documentElement.scrollHeight,
            body.offsetHeight, documentElement.offsetHeight,
            body.clientHeight, documentElement.clientHeight,
          );
          window.scrollBy(0, step);
          totalHeight += step;
          if (totalHeight >= scrollHeight) {
            clearInterval(timer);
            resolve();
          }
        }, delay);
      }),
    { step: distance, delay: intervalMs },
  );
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment