Last active
February 1, 2024 22:49
-
-
Save dereckmezquita/07699bea5566a645c222edaddea6987e to your computer and use it in GitHub Desktop.
Scripts for scraping photos and videos from VK with error tracking.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import puppeteer from 'puppeteer'; | |
import fs from 'fs'; | |
import path from 'path'; | |
/*
// Scroll the album page, then scrape the photo links manually with:
const rows = document.querySelectorAll('.photos_row[aria-label="Photo"] a');
const hrefs = Array.from(rows).map(row => row.href);
hrefs.forEach((e) => {
    document.write(e + '<br>');
});
*/
// Directory (relative to this script) that receives the downloaded images.
const outDir: string = './outdir';

// `recursive: true` makes the call idempotent: it is a no-op when the
// directory already exists and also creates missing parent directories, so no
// separate existence check (and no check-then-create race) is needed.
fs.mkdirSync(path.resolve(__dirname, outDir), { recursive: true });
/**
 * Outcome of a single download attempt, as kept in the status list that is
 * written to the status JSON file.
 */
interface DownloadStatus {
    /** Part of the link that follows `photo-`, used to identify the photo. */
    fileName: string;
    /** The link the attempt was made for. */
    link: string;
    /** Whether the attempt succeeded or failed. */
    status: 'fail' | 'success';
    /** Whatever was caught on failure (any union containing `unknown` is just `unknown`). */
    error?: unknown;
}
// Links to process, one per line of the input file.
const links: string[] = fs.readFileSync(
    path.resolve(__dirname, './input/combined-links.txt'), 'utf8'
).split('\n');

// Status records written by an earlier run; they let main() skip photos that
// were already downloaded and retry the ones that failed.
const previousStatusPath: string = path.resolve(__dirname, "./photos-status-4lqlmv.json");

// On a first run there is no earlier status file yet: start with an empty list
// instead of crashing with ENOENT.
const status: DownloadStatus[] = fs.existsSync(previousStatusPath)
    ? JSON.parse(fs.readFileSync(previousStatusPath, 'utf8'))
    : [];

// JSON.parse returns whatever the file holds; the rest of the script calls
// array methods on `status`, so reject anything that is not an array up front.
if (!Array.isArray(status)) {
    throw new Error(`Expected a JSON array in ${previousStatusPath}`);
}

console.table(status);
/**
 * Turns a caught value into a string for the status file.
 *
 * `JSON.stringify` serialises an `Error` instance as `{}` because its fields
 * are not enumerable, so the stack (or name and message) is extracted here.
 *
 * @param err - The value received by a `catch` clause.
 * @returns A human-readable description of the failure.
 */
function describeError(err: unknown): string {
    if (err instanceof Error) {
        return err.stack ?? `${err.name}: ${err.message}`;
    }
    return String(err);
}

/**
 * Walks through `links`, opens each page in a browser, reads the `src` of the
 * image inside `#pv_photo`, fetches that image and writes it to `outDir`.
 *
 * Links whose file name already has a `success` record in `status` are
 * skipped; earlier `fail` records are dropped and the link is retried. Every
 * attempt appends a record to `status`, and `saveStatus()` is called after
 * each one so progress is kept even if the run is interrupted.
 *
 * @returns A promise that resolves once every link was processed and the
 *          browser has been closed.
 */
async function main(): Promise<void> {
    // we want maximum window size; this affects the quality of the image
    const browser = await puppeteer.launch({
        headless: false,
        args: [`--window-size=1920,1080`],
        defaultViewport: {
            width: 1920,
            height: 1080
        }
    });

    try {
        const page = await browser.newPage();

        for (let i = 0; i < links.length; i++) {
            // trim() drops a trailing '\r' left over from CRLF input files
            const link: string = links[i].trim();

            // blank lines (e.g. the one after a trailing newline) hold no link
            if (link === '') {
                continue;
            }

            const fileName: string = link.split('photo-')[1]; // 14323456_456456456
            const progress: string = `(${i + 1}/${links.length})`;

            // if in success status then skip
            if (status.some((e) => e.fileName === fileName && e.status === 'success')) {
                console.log(`Already succesfully downloaded; skipping ${fileName} ${progress}`);
                continue;
            }

            // if failed status then remove ALL stale failure records for this
            // file (iterating backwards keeps indices valid while splicing)
            // and try again
            let retrying = false;
            for (let j = status.length - 1; j >= 0; j--) {
                if (status[j].fileName === fileName && status[j].status === 'fail') {
                    status.splice(j, 1);
                    retrying = true;
                }
            }
            if (retrying) {
                console.log(`Retrying ${fileName} ${progress}`);
            }

            console.log(`Downloading ${fileName} ${progress}`);

            try {
                // Set the Referer header
                await page.setExtraHTTPHeaders({ 'Referer': link });
                await page.goto(link, { waitUntil: 'networkidle0' });

                // id="pv_photo" && child is img
                const photo = await page.$('#pv_photo img');
                const src = await photo?.evaluate((node) => node.getAttribute('src'));

                if (!src) {
                    throw new Error('src is null');
                }

                // 'https://sun9-41.userapi.com/impf/c6234534/v62534543539/4bd32/5mFSAdK_kSf4U.jpg?size=610x912&quality=96&sign=da4a6fc2fc6&type=album'
                const sunName: string = src.split('?')[0].split('/').pop()!; // 5moSK_kRY4U.jpg

                // navigate straight to the image and keep the raw response body
                const imgResponse = await page.goto(src, { timeout: 0, waitUntil: 'networkidle0' });
                const imgBuffer = await imgResponse?.buffer();

                if (!imgBuffer) {
                    throw new Error('imgBuffer is null.');
                }

                const saveTo: string = path.resolve(__dirname, outDir, `${i}_${fileName}_${sunName}`);
                console.log(`Saving to ${saveTo}`);
                await fs.promises.writeFile(saveTo, imgBuffer);

                // wait a little between downloads
                await pause(1250);

                status.push({
                    fileName,
                    link,
                    status: 'success',
                    error: ''
                });
            } catch (err) {
                console.error(err);
                status.push({
                    fileName,
                    link,
                    status: 'fail',
                    // stored as text so the reason survives JSON.stringify
                    error: describeError(err)
                });
            } finally {
                saveStatus();
            }
        }
    } finally {
        // close the browser even when something outside the per-link try throws
        await browser.close();
    }
}
// Entry point: run the scraper, log any error that escapes it, and always
// write the status file once more at the very end.
void (async () => {
    try {
        await main();
    } catch (failure) {
        console.error(failure);
    } finally {
        console.log('Done.');
        saveStatus();
    }
})();
/**
 * Returns a promise that resolves after the given delay.
 *
 * @param ms - Delay in milliseconds, passed to `setTimeout`.
 * @returns A promise that resolves with no value once the timer fires.
 */
function pause(ms: number): Promise<void> {
    return new Promise<void>((done) => {
        setTimeout(done, ms);
    });
}
// Random suffix for this run's status file name. Characters are taken right
// after the "0." prefix of the base-36 expansion and padded, so the suffix is
// always exactly 6 characters; `substring(7)` could yield a shorter or even
// empty string when the expansion happened to be short.
const uid: string = Math.random().toString(36).slice(2, 8).padEnd(6, '0');
/**
 * Writes the current `status` list, pretty-printed with 4-space indentation,
 * to `photos-status-<uid>.json` next to this script, overwriting the file
 * synchronously on every call.
 */
function saveStatus() {
    console.log('Saving status...');

    const target = path.resolve(__dirname, `./photos-status-${uid}.json`);
    const payload = JSON.stringify(status, null, 4);

    fs.writeFileSync(target, payload);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Bonus: to scrape videos from VK, install yt-dlp first.