Skip to content

Instantly share code, notes, and snippets.

@gbertb
Last active February 17, 2026 02:50
Show Gist options
  • Select an option

  • Save gbertb/8a8dbbdb82f9b0190e124ed17da48b16 to your computer and use it in GitHub Desktop.

Select an option

Save gbertb/8a8dbbdb82f9b0190e124ed17da48b16 to your computer and use it in GitHub Desktop.
Spider Browser — XY Planning Network advisor scraper (TypeScript)
/**
* Spider Browser — XY Planning Network advisor scraper (TypeScript).
*
* Scrapes advisor cards from the "Find an Advisor" directory,
* paginating through the first MAX_PAGES pages. Exports JSON + CSV.
*
* Usage:
* 1. Copy .env.example to .env and add your SPIDER_API_KEY
* 2. Run: node xy_planning_network_advistor.ts
*/
import 'dotenv/config';
import fs from 'node:fs';
import path from 'node:path';
import { SpiderBrowser } from 'spider-browser';
// ── Configuration ──────────────────────────────────────────────────────
// Where advisors.json / advisors.csv are written, next to this script.
// NOTE(review): import.meta.dirname requires Node 20.11+ running as ESM — confirm runtime.
const OUTPUT_DIR = path.join(import.meta.dirname, 'output');
// Directory listing page containing the advisor cards.
const START_URL = 'https://connect.xyplanningnetwork.com/find-an-advisor';
// Origin used to absolutize the relative profile links found in cards.
const BASE_URL = 'https://connect.xyplanningnetwork.com';
// Hard cap on how many directory pages to paginate through.
const MAX_PAGES = 3;
// ── Types ──────────────────────────────────────────────────────────────
/** Structured data extracted from one advisor card in the directory listing. */
interface AdvisorInfo {
  /** Advisor's display name (from the card's h1). */
  name: string;
  /** Firm name (from the card's h2); empty string when absent. */
  firm: string;
  /** Credential badges, e.g. CFP, CPA (from .xy-badge-blue spans). */
  credentials: string[];
  /** Specialties / ideal-client tags (from .text-xy-green spans). */
  specialties: string[];
  /** Bio snippet (from the .line-clamp-6 paragraph); may be empty. */
  bio: string;
  /** Absolute URL of the advisor's profile page; empty if no link found. */
  profileUrl: string;
  /** Headshot image URL as found in the card markup; empty if no image. */
  headshotUrl: string;
}
// ── Utilities ──────────────────────────────────────────────────────────
/** Pause for `ms` milliseconds; resolves once the timer fires. */
const sleep = (ms: number) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
/** Strip every HTML tag from a fragment and trim surrounding whitespace. */
function parseText(html: string): string {
  const withoutTags = html.replace(/<[^>]+>/g, '');
  return withoutTags.trim();
}
/**
 * Run a global regex over `html` and return, for every match, the
 * tag-stripped and trimmed text of capture group 1. An optional `transform`
 * is applied to each value; empty results are dropped.
 */
function collectMatches(html: string, pattern: RegExp, transform?: (s: string) => string): string[] {
  const collected: string[] = [];
  for (const match of html.matchAll(pattern)) {
    // Inline tag-strip + trim (equivalent to parseText).
    let text = match[1].replace(/<[^>]+>/g, '').trim();
    if (transform) text = transform(text);
    if (text) collected.push(text);
  }
  return collected;
}
/** Quote a value for CSV output, doubling embedded quotes (RFC 4180 style). */
function csvEscape(value: string): string {
  const doubled = value.split('"').join('""');
  return '"' + doubled + '"';
}
// ── Card Parsing ───────────────────────────────────────────────────────
/**
 * Parse a single advisor card's outerHTML into structured data.
 *
 * Card structure:
 *   h1                          → advisor name
 *   h2                          → firm name
 *   .xy-badge-blue              → credentials (CFP, CPA, etc.)
 *   .text-xy-green              → specialties / ideal clients
 *   .line-clamp-6               → bio snippet
 *   .xy-btn-secondary           → profile link
 *   img[alt="profile headshot"] → headshot URL
 *
 * Returns null when the card has no <h1> name (e.g. filler <li> items).
 */
function parseAdvisorCard(html: string): AdvisorInfo | null {
  // Decode common HTML entities. '&amp;' must be decoded LAST so sequences
  // such as '&amp;lt;' are not double-decoded into '<'.
  // Fix: the original decoded only '&amp;', and only for specialties/bio;
  // names, firms and credentials received no entity decoding at all.
  const decode = (s: string) =>
    s
      .replace(/&nbsp;/g, ' ')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#(?:39|x27);/g, "'")
      .replace(/&amp;/g, '&');

  const nameMatch = html.match(/<h1[^>]*>(.*?)<\/h1>/s);
  const name = nameMatch ? decode(parseText(nameMatch[1])) : '';
  if (!name) return null;

  const firmMatch = html.match(/<h2[^>]*>(.*?)<\/h2>/s);
  const bioMatch = html.match(/<p class="line-clamp-6[^"]*">(.*?)<\/p>/s);
  const profileMatch = html.match(/<a class="xy-btn-secondary" href="([^"]*)">/);
  const headshotMatch = html.match(/<img[^>]*src="([^"]*)"[^>]*alt="profile headshot"/);

  return {
    name,
    firm: firmMatch ? decode(parseText(firmMatch[1])) : '',
    credentials: collectMatches(html, /<span class="xy-badge-blue">(.*?)<\/span>/gs, decode),
    specialties: collectMatches(
      html,
      /<span class="text-xy-green">[^<]*<\/span>\s*(.*?)<\/span>/gs,
      decode,
    ),
    bio: bioMatch ? decode(parseText(bioMatch[1])) : '',
    // Profile links in the card are relative; absolutize against the site origin.
    profileUrl: profileMatch ? `${BASE_URL}${profileMatch[1]}` : '',
    headshotUrl: headshotMatch ? headshotMatch[1] : '',
  };
}
// ── Page Scraping ──────────────────────────────────────────────────────
/**
 * Extract advisor data from every card (`ul.grid > li`) on the currently
 * loaded page. Cards that fail to parse (no name) are dropped.
 */
async function scrapeCurrentPage(page: any): Promise<AdvisorInfo[]> {
  const cardHtml = await page.querySelectorAll('ul.grid > li');
  const parsed: AdvisorInfo[] = [];
  for (const outerHtml of cardHtml) {
    const advisor = parseAdvisorCard(outerHtml);
    if (advisor !== null) parsed.push(advisor);
  }
  return parsed;
}
/**
 * Advance to the next directory page, if possible.
 *
 * Scans the pagination anchors for one whose visible text contains "Next";
 * returns false when none exists or the control is rendered disabled,
 * true after clicking the next-page control.
 *
 * NOTE(review): the click selector ('.w-0.flex-1.flex.justify-end a') is more
 * specific than the scan selector ('.flex.justify-end a'), so the element
 * clicked is not necessarily the anchor whose HTML matched "Next" — verify
 * both selectors resolve to the same control on the live page.
 */
async function clickNextPage(page: any): Promise<boolean> {
  const links = await page.querySelectorAll('.flex.justify-end a');
  for (const linkHtml of links) {
    // Skip anchors that are not the "Next" control.
    if (!parseText(linkHtml).includes('Next')) continue;
    // Disabled state is conveyed via CSS classes in the anchor markup.
    if (linkHtml.includes('cursor-not-allowed') || linkHtml.includes('pointer-events-none')) {
      return false;
    }
    await page.click('.w-0.flex-1.flex.justify-end a');
    return true;
  }
  // No "Next" anchor present at all — treat as last page.
  return false;
}
/** Wait for in-flight requests to settle, then pad for client-side rendering. */
async function waitForPage(page: any): Promise<void> {
  const NETWORK_IDLE_TIMEOUT_MS = 15000;
  const RENDER_SETTLE_MS = 3000;
  await page.waitForNetworkIdle(NETWORK_IDLE_TIMEOUT_MS);
  await sleep(RENDER_SETTLE_MS);
}
// ── Debug ──────────────────────────────────────────────────────────────
/**
 * Diagnose a page that yielded zero advisors: log the HTML size and match
 * counts for a few candidate selectors, scroll to trigger any lazy loading,
 * then retry the scrape once.
 */
async function debugEmptyPage(page: any): Promise<AdvisorInfo[]> {
  console.log(' No advisors found — running diagnostics...');
  const html = await page.content(5000, 100);
  console.log(` HTML length: ${html.length} chars`);

  const probeSelectors = [
    'article',
    '.bg-white.shadow',
    'a.xy-btn-secondary',
    'a[href*="find-an-advisor"]',
  ];
  for (const selector of probeSelectors) {
    const matches = await page.querySelectorAll(selector);
    console.log(` ${selector}: ${matches.length} matches`);
  }

  // Lazy-loaded cards may only render after a scroll.
  await page.scrollY(800);
  await sleep(5000);
  return scrapeCurrentPage(page);
}
// ── Export ──────────────────────────────────────────────────────────────
/** Write the collected advisors to OUTPUT_DIR as both JSON and CSV. */
function exportResults(advisors: AdvisorInfo[]): void {
  const jsonPath = path.join(OUTPUT_DIR, 'advisors.json');
  fs.writeFileSync(jsonPath, JSON.stringify(advisors, null, 2));
  console.log(` JSON: ${jsonPath}`);

  const csvPath = path.join(OUTPUT_DIR, 'advisors.csv');
  const header = 'Name,Firm,Credentials,Specialties,Bio,Profile URL,Headshot URL';
  // One CSV row per advisor; list fields joined with '; ', all cells quoted.
  const toRow = (a: AdvisorInfo): string =>
    [a.name, a.firm, a.credentials.join('; '), a.specialties.join('; '), a.bio, a.profileUrl, a.headshotUrl]
      .map(csvEscape)
      .join(',');
  fs.writeFileSync(csvPath, [header, ...advisors.map(toRow)].join('\n') + '\n');
  console.log(` CSV: ${csvPath}`);
}
// ── Main ───────────────────────────────────────────────────────────────
/**
 * Entry point: launch a Spider Browser session, paginate through up to
 * MAX_PAGES of the advisor directory, collect card data, print a summary
 * table, and export JSON + CSV via exportResults().
 */
async function main(): Promise<void> {
  const apiKey = process.env.SPIDER_API_KEY;
  if (!apiKey) {
    console.error('Error: SPIDER_API_KEY not set. Copy .env.example to .env and add your key.');
    return;
  }
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  // NOTE(review): option semantics (captcha/stealth/maxRetries) come from the
  // spider-browser SDK, which is not visible here — confirm against its docs.
  const browser = new SpiderBrowser({
    apiKey,
    captcha: 'solve',
    stealth: 0,
    maxRetries: 12,
    url: START_URL,
    logLevel: 'info',
  });
  // Progress logging for captcha handling and automatic retries.
  browser.on('captcha.detected', (e) => console.log(` [captcha] detected: ${JSON.stringify(e.types)}`));
  browser.on('captcha.solved', () => console.log(' [captcha] solved'));
  browser.on('retry.attempt', (e) => console.log(` [retry] ${e.attempt}/${e.maxRetries}: ${e.error}`));
  const allAdvisors: AdvisorInfo[] = [];
  let pageNum = 1;
  try {
    await browser.init();
    const page = browser.page;
    await page.setViewport(1920, 1080);
    console.log(`Navigating to ${START_URL}`);
    await browser.goto(START_URL);
    await waitForPage(page);
    while (true) {
      console.log(`\n[Page ${pageNum}/${MAX_PAGES}] Scraping...`);
      let advisors = await scrapeCurrentPage(page);
      // First-page fallback: run diagnostics if nothing was found
      if (advisors.length === 0 && pageNum === 1) {
        advisors = await debugEmptyPage(page);
        if (advisors.length === 0) {
          console.log(' Diagnostics failed — page structure may have changed.');
          break;
        }
      }
      allAdvisors.push(...advisors);
      console.log(` ${advisors.length} advisors found`);
      for (const a of advisors) {
        console.log(` ${a.name} — ${a.firm}`);
      }
      // Stop at the page cap, or when no clickable "Next" control remains.
      if (pageNum >= MAX_PAGES) break;
      const hasNext = await clickNextPage(page);
      if (!hasNext) break;
      pageNum++;
      await waitForPage(page);
    }
    // ── Summary & Export ──────────────────────────────────────────────
    console.log('\n' + '═'.repeat(70));
    console.log(' SCRAPING COMPLETE');
    console.log('═'.repeat(70));
    console.log(` Pages: ${pageNum} | Advisors: ${allAdvisors.length}`);
    console.log('═'.repeat(70));
    console.log('\n # Name Firm Credentials');
    console.log(' ' + '─'.repeat(90));
    // Fixed-width summary table: name/firm columns truncated to 33 chars.
    for (let i = 0; i < allAdvisors.length; i++) {
      const a = allAdvisors[i];
      console.log(
        ` ${String(i + 1).padStart(3)} ${a.name.slice(0, 33).padEnd(33)} ${a.firm.slice(0, 33).padEnd(33)} ${a.credentials.join(', ')}`,
      );
    }
    console.log('\n Exported:');
    exportResults(allAdvisors);
    console.log('\nDone!');
  } finally {
    // Always tear down the remote browser session, even on failure.
    await browser.close();
  }
}
// Top-level entry: any unhandled rejection in main() is fatal.
void main().catch((err) => {
  console.error('Fatal error:', err);
  process.exit(1);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment