Skip to content

Instantly share code, notes, and snippets.

@fawazahmed0
Created August 31, 2022 16:50
Show Gist options
  • Save fawazahmed0/849045624ebdbc15b221eb285fe95d8a to your computer and use it in GitHub Desktop.
Save fawazahmed0/849045624ebdbc15b221eb285fe95d8a to your computer and use it in GitHub Desktop.
YouTube comment scraper
const { firefox } = require('playwright');
const fs = require('fs')
const path = require('path')
/**
 * Opens a headed Firefox window with a 400x700 viewport and a single tab.
 *
 * @returns {Promise<{browser: object, page: object}>} The launched browser and its page.
 */
async function launchSession() {
  const browser = await firefox.launch({ headless: false });
  const context = await browser.newContext({ viewport: { width: 400, height: 700 } });
  const page = await context.newPage();
  return { browser, page };
}

/**
 * Visits a fixed list of YouTube videos, scrolls each page twice, collects the
 * text of every `#content` element, keeps the entries that contain "univ" and
 * are shorter than 1000 characters, and appends them to `content.txt` next to
 * this script.
 *
 * A failure on one video is logged and the browser is replaced before moving
 * on to the next video. The browser is always closed when the run ends.
 *
 * @returns {Promise<void>}
 */
async function begin() {
  let { browser, page } = await launchSession();
  try {
    // De-duplicate so the same video is never scraped twice.
    const values = [...new Set(["/watch?v=VikfS15Ymmw","/watch?v=ndaEzlg5xcg","/watch?v=XexTm02fumM","/watch?v=sRIB9kVzjYM","/watch?v=TSNNi7qSpCc","/watch?v=rbdVDwMbqlE","/watch?v=rfrg-Oo13GI","/watch?v=U1s48KT2KME","/watch?v=KdA-9wK_wVw","/watch?v=KrjvEvYGFUI","/watch?v=LgDKD87Cwqk","/watch?v=ck7bzakwmnQ","/watch?v=lkJ_an81sbI","/watch?v=72N0pRsSIFQ","/watch?v=QzsfZU1q-SQ","/watch?v=c8y5UTwMuyY","/watch?v=pI46I7pSc4g","/watch?v=RzYlGaLGn2o","/watch?v=bTE2wZeiVz4","/watch?v=doIfZm2iGnA","/watch?v=N-9CWbNo9Fg","/watch?v=XJpmfW60OuE","/watch?v=rh3ohhR1iYI","/watch?v=3XMPLN3cyxs","/watch?v=p-VvH1KDvAM","/watch?v=TLnmBLqIMqs","/watch?v=WHvJuEA00V0","/watch?v=3j_9p1VOTPI","/watch?v=waXGqCwoBLY","/watch?v=zHRDtD8t1v0","/watch?v=G6BQqubJPqM","/watch?v=M2UIWc40w4U","/watch?v=ffB-9qLajf4","/watch?v=npPszXXC5Tg","/watch?v=retcp_lRl0Q","/watch?v=4dWc7Edkc8Y","/watch?v=p1TUPVgK-ro","/watch?v=b1U-Lxlhfn4"])];

    await page.goto('https://youtube.com');
    // NOTE(review): the 50 s pause presumably leaves time to dismiss consent
    // or sign-in dialogs by hand in the headed window — confirm.
    await page.waitForTimeout(50000);

    for (const vid of values) {
      try {
        await page.goto(`https://www.youtube.com${vid}`);
        await page.waitForTimeout(5000);
        // Two scroll passes: the second one covers content that was appended
        // to the page while the first pass was running.
        await scrollFullPage(page);
        await scrollFullPage(page);
        console.log('at ', vid);

        const valArr = await page.evaluate(() =>
          Array.from(document.querySelectorAll('#content')).map((e) => e.textContent),
        );
        const matches = valArr
          .map((e) => e.replace(/\s+/g, ' ').toLowerCase().trim())
          .filter((e) => e.includes('univ') && e.length < 1000);
        const comments = [...new Set(matches)];
        fs.appendFileSync(path.join(__dirname, 'content.txt'), comments.join('\n') + '\n');
      } catch (error) {
        console.error(error);
        console.log('error at ', vid);
        // Shut the failed browser down before replacing it; otherwise every
        // failed video leaves an orphaned Firefox process behind. Closing is
        // best-effort because the browser may already be gone.
        await browser.close().catch((closeError) => console.error(closeError));
        ({ browser, page } = await launchSession());
      }
    }
  } finally {
    // Runs on success and on any unexpected error so no browser is left open.
    await browser.close();
  }
}
// Entry point: report a failed run instead of leaving the promise unhandled.
begin().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
/**
 * Scrolls the page down in fixed steps, one step per timer tick, until the
 * total distance scrolled reaches the document height measured on that tick
 * (or until the optional step cap is hit).
 *
 * The document height is re-measured on every tick, so content that is added
 * while scrolling extends the run.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)`, which runs `fn(arg)`
 *   in the page (a Playwright page in this script).
 * @param {object} [options]
 * @param {number} [options.distance=100] - Pixels scrolled per tick; must be > 0.
 * @param {number} [options.intervalMs=1000] - Delay between ticks in milliseconds.
 * @param {?number} [options.maxSteps=null] - Upper bound on the number of ticks;
 *   `null` means no bound.
 * @returns {Promise<void>} Resolves once scrolling has stopped.
 * @throws {RangeError} If `distance` is not a positive finite number, since the
 *   scrolled total could then never reach the document height.
 */
async function scrollFullPage(page, { distance = 100, intervalMs = 1000, maxSteps = null } = {}) {
  if (!Number.isFinite(distance) || distance <= 0) {
    throw new RangeError('distance must be a positive finite number');
  }

  // The callback runs inside the page, so it cannot see variables from this
  // scope; everything it needs is passed in through the argument object.
  await page.evaluate(
    (opts) =>
      new Promise((resolve) => {
        let totalHeight = 0;
        let steps = 0;
        const timer = setInterval(() => {
          const { body, documentElement } = document;
          const scrollHeight = Math.max(
            body.scrollHeight, documentElement.scrollHeight,
            body.offsetHeight, documentElement.offsetHeight,
            body.clientHeight, documentElement.clientHeight,
          );
          window.scrollBy(0, opts.distance);
          totalHeight += opts.distance;
          steps += 1;

          const reachedBottom = totalHeight >= scrollHeight;
          const hitStepCap = opts.maxSteps !== null && steps >= opts.maxSteps;
          if (reachedBottom || hitStepCap) {
            clearInterval(timer);
            resolve();
          }
        }, opts.intervalMs);
      }),
    { distance, intervalMs, maxSteps },
  );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment