Skip to content

Instantly share code, notes, and snippets.

@atomiomi
Created March 26, 2025 13:56
Show Gist options
  • Save atomiomi/be46d4fbdbe1070de23d9cb7d1b8b5e3 to your computer and use it in GitHub Desktop.
import * as cheerio from 'cheerio';
import normalizeUrl from 'normalize-url';
import { AppError } from '../error';
import { z } from 'zod';
import { debugLog } from '../utils';
import { requestTimeout, viewport } from './config';
import { getCluster } from './cluster';
// Zod validator accepting only well-formed absolute URL strings.
const linkSchema = z.string().url();
/**
 * Canonicalize a URL so that equivalent links compare equal:
 * drops the query string, trailing slash, "www." prefix, hash,
 * and text fragment.
 *
 * @param url absolute URL to normalize
 * @returns the canonical form of `url`
 */
function cleanUrl(url: string): string {
  const canonicalizeOptions = {
    removeQueryParameters: true,
    removeTrailingSlash: true,
    stripWWW: true,
    stripHash: true,
    stripTextFragment: true,
  };
  return normalizeUrl(url, canonicalizeOptions);
}
/**
 * Resolve a possibly-relative link against the page's base URL and
 * return its canonical form, or null when the link is unusable.
 *
 * Handles root-relative ("/path"), document-relative ("page.html",
 * "../x", "?q=1") and protocol-relative ("//host/path") hrefs — the
 * previous version only resolved the "/"-prefixed case, so the crawler
 * silently dropped every other kind of relative link. Non-HTTP(S)
 * schemes (mailto:, javascript:, tel:, data:, …) are rejected so they
 * never reach the crawl queue.
 *
 * @param url raw href attribute value (absolute or relative)
 * @param baseUrl absolute URL of the page the link was found on
 * @returns canonical absolute URL, or null if the link is malformed
 *          or not an http/https URL
 */
function validateAndNormalizeUrl(url: string, baseUrl: string): string | null {
  try {
    // new URL() both resolves relative references and validates syntax.
    const resolved = new URL(url, baseUrl);
    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
      return null;
    }
    return cleanUrl(resolved.toString());
  } catch {
    // Malformed href, or normalize-url rejected it — not crawlable.
    return null;
  }
}
/**
 * Render `url` in a headless browser via the shared puppeteer cluster
 * and return the page's HTML once DOMContentLoaded has fired.
 *
 * Image requests are aborted through request interception to save
 * bandwidth; every other resource type is allowed through.
 *
 * @param url absolute URL to load
 * @returns the rendered page HTML
 * @throws AppError('failed_to_fetch_web_page', 500) wrapping any
 *         cluster or navigation failure as its `cause`
 */
export async function fetchWebPage(url: string) {
  try {
    const cluster = await getCluster();
    return await cluster.execute(url, async ({ page }) => {
      await page.setViewport(viewport);
      page.setDefaultTimeout(requestTimeout);
      await page.setRequestInterception(true);
      // Abort image requests to cut bandwidth; pass everything else.
      page.on('request', (request) => {
        if (request.resourceType() === 'image') {
          request.abort();
        } else {
          request.continue();
        }
      });
      await page.goto(url, { waitUntil: 'domcontentloaded' });
      return page.content();
    });
  } catch (err) {
    throw new AppError('failed_to_fetch_web_page', 500, { cause: err });
  }
}
/**
 * Breadth-first crawl starting at `url`, restricted to pages under the
 * cleaned root URL, collecting up to 100 successfully fetched page URLs.
 *
 * Fixes over the previous version:
 * - Scope check: a bare `startsWith(rootUrl)` let the crawl escape to
 *   prefix-sharing hosts (root "https://foo.com" matched
 *   "https://foo.company.io"); a candidate must now equal the root or
 *   continue with "/".
 * - Relative hrefs are resolved against the page they appear on
 *   (`currentUrl`), not the root, so links on nested pages resolve
 *   correctly.
 * - URLs already enqueued are not pushed again, bounding queue growth
 *   on link-dense sites.
 *
 * @param url entry-point URL; its cleaned form defines the crawl scope
 * @returns unique page URLs that were fetched successfully
 */
export async function crawlWebPage(url: string): Promise<string[]> {
  const rootUrl = cleanUrl(url);
  const queue = [rootUrl];
  const queued = new Set<string>([rootUrl]); // everything ever enqueued
  const visited = new Set<string>();
  const links = new Set<string>();
  // Same-site test with a path boundary, so sibling hosts sharing the
  // root as a string prefix are excluded.
  const inScope = (candidate: string): boolean =>
    candidate === rootUrl || candidate.startsWith(rootUrl + '/');
  while (queue.length > 0 && links.size < 100) {
    const currentUrl = queue.shift()!;
    if (visited.has(currentUrl)) {
      continue;
    }
    debugLog(`Visiting ${currentUrl}`);
    visited.add(currentUrl);
    try {
      const html = await fetchWebPage(currentUrl);
      // Record only pages we actually managed to fetch.
      links.add(currentUrl);
      const $ = cheerio.load(html);
      $('a').each((_, el) => {
        const href = $(el).attr('href');
        if (!href) {
          return;
        }
        const absoluteUrl = validateAndNormalizeUrl(href, currentUrl);
        if (absoluteUrl && inScope(absoluteUrl) && !queued.has(absoluteUrl)) {
          queued.add(absoluteUrl);
          queue.push(absoluteUrl);
        }
      });
    } catch (_) {
      // Unreachable page: skip it and keep crawling.
    }
  }
  return Array.from(links);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment