Created
March 26, 2025 13:56
-
-
Save atomiomi/be46d4fbdbe1070de23d9cb7d1b8b5e3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import * as cheerio from 'cheerio'; | |
import normalizeUrl from 'normalize-url'; | |
import { AppError } from '../error'; | |
import { z } from 'zod'; | |
import { debugLog } from '../utils'; | |
import { requestTimeout, viewport } from './config'; | |
import { getCluster } from './cluster'; | |
// Zod schema that accepts only strings which are well-formed URLs. It is used
// to vet href values that are not resolved against a base URL.
const linkSchema = z.string().url(); | |
/**
 * Reduces a URL to the canonical form used to compare and de-duplicate links.
 *
 * The normalizer is asked to drop the query string, a trailing slash, a
 * leading `www.`, the hash and any text fragment.
 *
 * @param url URL to canonicalize.
 * @returns The normalized URL string.
 */
function cleanUrl(url: string): string {
  const canonicalForm = {
    stripWWW: true,
    stripHash: true,
    stripTextFragment: true,
    removeTrailingSlash: true,
    removeQueryParameters: true,
  };

  return normalizeUrl(url, canonicalForm);
}
/**
 * Converts a raw `href` value into a cleaned absolute URL.
 *
 * Two input forms are accepted:
 * - values starting with `/` (root-relative or protocol-relative), which are
 *   resolved against `baseUrl`;
 * - values that already pass the absolute-URL check of `linkSchema`.
 *
 * Anything else (for example `page.html`, `../up` or `#section`) does not pass
 * the absolute-URL check and yields `null`.
 *
 * @param url Raw href value taken from a document.
 * @param baseUrl Absolute URL used to resolve `/`-prefixed values.
 * @returns The cleaned absolute URL, or `null` when the value cannot be used.
 */
function validateAndNormalizeUrl(url: string, baseUrl: string): string | null {
  try {
    if (url.startsWith('/')) {
      return cleanUrl(new URL(url, baseUrl).toString());
    }

    return linkSchema.safeParse(url).success ? cleanUrl(url) : null;
  } catch {
    // `new URL` throws a TypeError for inputs it cannot resolve (e.g. a bare
    // `//`), and the normalizer may reject a URL as well. This function signals
    // "unusable" with `null`, so a single bad href must not throw to the caller.
    return null;
  }
}
/**
 * Loads `url` in a browser page provided by the cluster and returns the page's
 * HTML as it stands once `domcontentloaded` has fired.
 *
 * Image requests are aborted so they are never downloaded; every other request
 * is allowed through.
 *
 * @param url Address of the page to load.
 * @returns The serialized HTML of the loaded page.
 * @throws AppError `failed_to_fetch_web_page` (500) wrapping whatever error the
 *   cluster or the page raised.
 */
export async function fetchWebPage(url: string) {
  try {
    const cluster = await getCluster();

    // `await` is required here so that a rejected task is caught below.
    return await cluster.execute(url, async ({ page }) => {
      await page.setViewport(viewport);
      page.setDefaultTimeout(requestTimeout);
      await page.setRequestInterception(true);

      // Block loading images to save on bandwidth
      page.on('request', (req) => {
        if (req.resourceType() !== 'image') {
          req.continue();
          return;
        }
        req.abort();
      });

      await page.goto(url, { waitUntil: 'domcontentloaded' });
      return page.content();
    });
  } catch (cause) {
    throw new AppError('failed_to_fetch_web_page', 500, { cause });
  }
}
/** Upper bound on the number of successfully fetched pages a crawl collects. */
const MAX_CRAWLED_PAGES = 100;

/**
 * Reports whether `candidate` is the crawl root itself or lives beneath it.
 *
 * A bare `startsWith(rootUrl)` would also accept look-alike hosts and sibling
 * paths (`https://example.com.evil.test/` for the root `https://example.com`,
 * or `/docs-old` for the root `/docs`), so the prefix has to end on a `/`.
 */
function isWithinRoot(candidate: string, rootUrl: string): boolean {
  const root = rootUrl.endsWith('/') ? rootUrl.slice(0, -1) : rootUrl;
  return candidate === root || candidate.startsWith(`${root}/`);
}

/**
 * Crawls a site breadth-first starting at `url` and collects the pages that
 * could be fetched.
 *
 * Only links that normalize to the root URL or to a URL beneath it are
 * followed. Pages are fetched one at a time; a page that fails to load is
 * skipped and does not appear in the result. The crawl stops when the queue is
 * empty or `MAX_CRAWLED_PAGES` pages have been collected.
 *
 * @param url Start page; its cleaned form defines the crawl scope.
 * @returns The cleaned URLs of all fetched pages, in visiting order.
 */
export async function crawlWebPage(url: string): Promise<string[]> {
  const rootUrl = cleanUrl(url);
  const queue = [rootUrl];
  // Every URL that has ever been queued. De-duplicating on enqueue keeps the
  // queue from filling up with the same navigation links from every page.
  const seen = new Set<string>(queue);
  const links = new Set<string>();

  while (links.size < MAX_CRAWLED_PAGES) {
    const currentUrl = queue.shift();
    if (currentUrl === undefined) {
      break;
    }

    debugLog(`Visiting ${currentUrl}`);

    try {
      const html = await fetchWebPage(currentUrl);
      // Add only if we successfully fetched the page
      links.add(currentUrl);

      const $ = cheerio.load(html);
      $('a').each((_index, el) => {
        const href = $(el).attr('href');
        if (!href) {
          return;
        }

        let absoluteUrl: string | null;
        try {
          // Resolve against the page the link was found on.
          absoluteUrl = validateAndNormalizeUrl(href, currentUrl);
        } catch {
          // A malformed href must only cost this anchor, not the rest of the page.
          return;
        }

        if (absoluteUrl && isWithinRoot(absoluteUrl, rootUrl) && !seen.has(absoluteUrl)) {
          seen.add(absoluteUrl);
          queue.push(absoluteUrl);
        }
      });
    } catch (err) {
      // Best effort: a page that cannot be fetched or parsed is skipped.
      const reason = err instanceof Error ? err.message : String(err);
      debugLog(`Skipping ${currentUrl}: ${reason}`);
    }
  }

  return Array.from(links);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment