Skip to content

Instantly share code, notes, and snippets.

@dereckmezquita
Last active February 1, 2024 22:49
Show Gist options
  • Save dereckmezquita/07699bea5566a645c222edaddea6987e to your computer and use it in GitHub Desktop.
Save dereckmezquita/07699bea5566a645c222edaddea6987e to your computer and use it in GitHub Desktop.
Scripts for scraping photos and videos from VK with error tracking.
import puppeteer from 'puppeteer';
import fs from 'fs';
import path from 'path';
/*
// Scroll through the album page, then run this in the browser console to collect the photo links manually:
const rows = document.querySelectorAll('.photos_row[aria-label="Photo"] a');
const hrefs = Array.from(rows).map(row => row.href);
hrefs.forEach((e) => {
document.write(e + '</br>');
});
*/
// Output directory, relative to this script. It is created when it does not
// exist yet; an existing directory is left untouched.
const outDir: string = './outdir';
{
    const outDirPath = path.resolve(__dirname, outDir);
    if (fs.existsSync(outDirPath) === false) {
        fs.mkdirSync(outDirPath);
    }
}
/**
 * One entry of the download log that is persisted as JSON between runs.
 */
interface DownloadStatus {
    /** Identifier of the photo this entry belongs to. */
    fileName: string;
    /** Link the photo was requested from. */
    link: string;
    /** Outcome of the attempt. */
    status: 'success' | 'fail';
    /**
     * Whatever was caught when the attempt failed. `unknown` already covers
     * `Error` and `string`, so no wider union is needed.
     */
    error?: unknown;
}
// Links to process, one per line. Splitting on /\r?\n/ keeps the line indexes
// unchanged while preventing a trailing '\r' (CRLF files) from leaking into
// the links and the file names derived from them.
const links: string[] = fs.readFileSync(
    path.resolve(__dirname, './input/combined-links.txt'), 'utf8'
).split(/\r?\n/);

// Status log written by a previous run. On a first run the file does not exist
// yet, so start from an empty log instead of crashing on the read.
const status: DownloadStatus[] = (() => {
    const previousRunFile = path.resolve(__dirname, "./photos-status-4lqlmv.json");
    if (!fs.existsSync(previousRunFile)) {
        return [];
    }
    const parsed: unknown = JSON.parse(fs.readFileSync(previousRunFile, 'utf8'));
    // Fail here with a clear message rather than later on the first array call.
    if (!Array.isArray(parsed)) {
        throw new Error(`Expected a JSON array in ${previousRunFile}`);
    }
    return parsed;
})();
console.table(status);
/**
 * Turns a caught value into text that survives `JSON.stringify`.
 *
 * An `Error` serialises to `{}` because its `message` and `stack` are not
 * enumerable, which would leave the saved status log without the failure
 * reason.
 */
function describeError(err: unknown): string {
    return err instanceof Error ? `${err.name}: ${err.message}` : String(err);
}

/**
 * Walks through `links`, opens each photo page in a puppeteer-controlled
 * browser, reads the image URL from `#pv_photo img`, downloads it and writes
 * it to `outDir` as `<index>_<fileName>_<image file name>`.
 *
 * Links already logged as `success` in `status` are skipped; links logged as
 * `fail` have their stale entries removed and are attempted again. The status
 * log is saved after every attempt, and the browser is closed even when
 * something outside the per-link error handling throws.
 */
async function main(): Promise<void> {
    // we want maximum window size this affects the quality of the image
    const browser = await puppeteer.launch({
        headless: false,
        args: [`--window-size=1920,1080`],
        defaultViewport: {
            width: 1920,
            height: 1080
        }
    });

    try {
        const page = await browser.newPage();

        for (let i = 0; i < links.length; i++) {
            const link: string = links[i].trim();
            const progress = `(${i + 1}/${links.length})`;

            // Blank lines (typically the trailing newline of the input file)
            // carry nothing to download.
            if (link === '') {
                continue;
            }

            const fileName: string | undefined = link.split('photo-')[1]; // 14323456_456456456
            if (!fileName) {
                console.warn(`Skipping link without a photo id: ${link} ${progress}`);
                continue;
            }

            // if in success status then skip
            if (status.some((e) => e.fileName === fileName && e.status === 'success')) {
                console.log(`Already succesfully downloaded; skipping ${fileName} ${progress}`);
                continue;
            }

            // if failed status then remove every stale entry and try again;
            // iterate backwards so splicing does not shift unvisited indexes
            let staleFailures = 0;
            for (let j = status.length - 1; j >= 0; j--) {
                if (status[j].fileName === fileName && status[j].status === 'fail') {
                    status.splice(j, 1);
                    staleFailures++;
                }
            }
            if (staleFailures > 0) {
                console.log(`Retrying ${fileName} ${progress}`);
            }

            console.log(`Downloading ${fileName} ${progress}`);

            try {
                // Set the Referer header
                await page.setExtraHTTPHeaders({ 'Referer': link });
                await page.goto(link, { waitUntil: 'networkidle0' });

                // id="pv_photo" && child is img
                const photo = await page.$('#pv_photo img');
                const src = await photo?.evaluate((node) => node.getAttribute('src'));
                if (!src) {
                    throw new Error('src is null');
                }

                // 'https://sun9-41.userapi.com/impf/c6234534/v62534543539/4bd32/5mFSAdK_kSf4U.jpg?size=610x912&quality=96&sign=da4a6fc2fc6&type=album'
                const sunName: string = src.split('?')[0].split('/').pop() ?? ''; // 5moSK_kRY4U.jpg

                // Navigate straight to the image so its bytes can be read from the response.
                const imgResponse = await page.goto(src, { timeout: 0, waitUntil: 'networkidle0' });
                const imgBuffer = await imgResponse?.buffer();
                if (!imgBuffer) {
                    throw new Error('imgBuffer is null.');
                }

                const saveTo: string = path.resolve(__dirname, outDir, `${i}_${fileName}_${sunName}`);
                console.log(`Saving to ${saveTo}`);
                await fs.promises.writeFile(saveTo, imgBuffer);

                // Wait between successful downloads before moving to the next link.
                await pause(1250);

                status.push({
                    fileName,
                    link,
                    status: 'success',
                    error: ''
                });
            } catch (err) {
                console.error(err);
                status.push({
                    fileName,
                    link,
                    status: 'fail',
                    // Stored as text: a raw Error would be written as `{}`.
                    error: describeError(err)
                });
            } finally {
                // Persist after every attempt so an interrupted run keeps its progress.
                saveStatus();
            }
        }
    } finally {
        await browser.close();
    }
}
// Entry point: run the scraper, report any error that escapes main(), and
// always save the status log one final time.
void (async (): Promise<void> => {
    try {
        await main();
    } catch (failure) {
        console.error(failure);
    } finally {
        console.log('Done.');
        saveStatus();
    }
})();
/**
 * Waits for the given delay.
 *
 * @param ms - Delay in milliseconds, passed to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
async function pause(ms: number): Promise<void> {
    await new Promise<void>((wake) => {
        setTimeout(wake, ms);
    });
}
// Random 6-character base-36 tag for this run. slice(2, 8) drops the leading
// "0." and padEnd covers short expansions (e.g. 0.5 -> "0.i"), for which
// substring(7) produced an empty tag.
const uid: string = Math.random().toString(36).slice(2, 8).padEnd(6, '0');
/**
 * Synchronously writes the `status` log, pretty-printed with 4-space
 * indentation, to `photos-status-<uid>.json` next to this script.
 */
function saveStatus() {
    console.log('Saving status...');
    const target = path.resolve(__dirname, `./photos-status-${uid}.json`);
    const serialised = JSON.stringify(status, null, 4);
    fs.writeFileSync(target, serialised);
}
@dereckmezquita
Copy link
Author

Bonus: to scrape videos from VK, install yt-dlp first.

#!/usr/bin/env bash

# Downloads every video listed in input/video-links-unique.txt with yt-dlp into
# ./videos and records a success/fail row per URL in videos/download_status.csv.

# scroll and scrape links manually from album page
# const hrefs = document.querySelectorAll(".VideoCard__thumbLink.video_item__thumb_link");
# hrefs.forEach((e) => {
#     document.write(e.href + '</br>');
# });

# -p: do not fail when the directory is left over from a previous run.
mkdir -p videos
# Stop here rather than download (and write the CSV) into the wrong directory.
cd videos || exit 1

echo 'Scraping VK videos...'

# List of videos, one URL per line
links_file="../input/video-links-unique.txt"

if [ ! -r "$links_file" ]; then
    echo "Cannot read $links_file" >&2
    exit 1
fi

# File to save the results
output_file="./download_status.csv"

# Create or clear the output file
echo "Video URL,Status" > "$output_file"

# Loop through each video link. The list is read line by line on file
# descriptor 3, so URLs are never word-split or glob-expanded (they may contain
# '?') and yt-dlp cannot consume the list through stdin. The `|| [ -n ... ]`
# part keeps a last line that has no trailing newline.
while IFS= read -r -u 3 video || [ -n "$video" ]
do
    # Drop a trailing carriage return from CRLF files and skip blank lines.
    video=${video%$'\r'}
    [ -z "$video" ] && continue

    echo 'Scraping video: ' "$video"
    # Download the video and check the exit status
    if yt-dlp "$video"; then
        # Download successful
        echo "$video,success" >> "$output_file"
    else
        # Download failed
        echo "$video,fail" >> "$output_file"
    fi
done 3< "$links_file"

echo 'All videos processed. Check the download status in' "$output_file"

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment