Last active
February 17, 2026 02:50
-
-
Save gbertb/8a8dbbdb82f9b0190e124ed17da48b16 to your computer and use it in GitHub Desktop.
Spider Browser — XY Planning Network advisor scraper (TypeScript)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| * Spider Browser — XY Planning Network advisor scraper (TypeScript). | |
| * | |
| * Scrapes advisor cards from the "Find an Advisor" directory, | |
| * paginating through the first MAX_PAGES pages. Exports JSON + CSV. | |
| * | |
| * Usage: | |
| * 1. Copy .env.example to .env and add your SPIDER_API_KEY | |
| * 2. Run: node xy_planning_network_advistor.ts | |
| */ | |
| import 'dotenv/config'; | |
| import fs from 'node:fs'; | |
| import path from 'node:path'; | |
| import { SpiderBrowser } from 'spider-browser'; | |
| // ── Configuration ────────────────────────────────────────────────────── | |
// Directory where JSON/CSV exports are written (created on demand in main()).
const OUTPUT_DIR = path.join(import.meta.dirname, 'output');
// Entry page of the "Find an Advisor" directory.
const START_URL = 'https://connect.xyplanningnetwork.com/find-an-advisor';
// Origin used to absolutize relative profile links found in cards.
const BASE_URL = 'https://connect.xyplanningnetwork.com';
// Hard cap on pagination; the loop in main() stops after this many pages.
const MAX_PAGES = 3;
| // ── Types ────────────────────────────────────────────────────────────── | |
/** Structured data extracted from one advisor card in the directory grid. */
interface AdvisorInfo {
  /** Advisor's full name (card `<h1>`); parsing fails without it. */
  name: string;
  /** Firm name (card `<h2>`); empty string when absent. */
  firm: string;
  /** Credential badges, e.g. CFP / CPA (`.xy-badge-blue` spans). */
  credentials: string[];
  /** Specialties / ideal clients (`.text-xy-green` spans). */
  specialties: string[];
  /** Bio snippet (`.line-clamp-6` paragraph); empty string when absent. */
  bio: string;
  /** Absolute profile URL (BASE_URL + card link); empty string when absent. */
  profileUrl: string;
  /** Headshot image URL as found in the card; empty string when absent. */
  headshotUrl: string;
}
| // ── Utilities ────────────────────────────────────────────────────────── | |
| const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms)); | |
| function parseText(html: string): string { | |
| return html.replace(/<[^>]+>/g, '').trim(); | |
| } | |
| function collectMatches(html: string, pattern: RegExp, transform?: (s: string) => string): string[] { | |
| const results: string[] = []; | |
| for (const m of html.matchAll(pattern)) { | |
| const value = transform ? transform(parseText(m[1])) : parseText(m[1]); | |
| if (value) results.push(value); | |
| } | |
| return results; | |
| } | |
| function csvEscape(s: string): string { | |
| return `"${s.replace(/"/g, '""')}"`; | |
| } | |
| // ── Card Parsing ─────────────────────────────────────────────────────── | |
| /** | |
| * Parse a single advisor card's outerHTML into structured data. | |
| * | |
| * Card structure: | |
| * h1 → advisor name | |
| * h2 → firm name | |
| * .xy-badge-blue → credentials (CFP, CPA, etc.) | |
| * .text-xy-green → specialties / ideal clients | |
| * .line-clamp-6 → bio snippet | |
| * .xy-btn-secondary → profile link | |
| * img[alt="profile headshot"] → headshot URL | |
| */ | |
| function parseAdvisorCard(html: string): AdvisorInfo | null { | |
| const nameMatch = html.match(/<h1[^>]*>(.*?)<\/h1>/s); | |
| const name = nameMatch ? parseText(nameMatch[1]) : ''; | |
| if (!name) return null; | |
| const firmMatch = html.match(/<h2[^>]*>(.*?)<\/h2>/s); | |
| const bioMatch = html.match(/<p class="line-clamp-6[^"]*">(.*?)<\/p>/s); | |
| const profileMatch = html.match(/<a class="xy-btn-secondary" href="([^"]*)">/); | |
| const headshotMatch = html.match(/<img[^>]*src="([^"]*)"[^>]*alt="profile headshot"/); | |
| const decode = (s: string) => s.replace(/&/g, '&'); | |
| return { | |
| name, | |
| firm: firmMatch ? parseText(firmMatch[1]) : '', | |
| credentials: collectMatches(html, /<span class="xy-badge-blue">(.*?)<\/span>/gs), | |
| specialties: collectMatches( | |
| html, | |
| /<span class="text-xy-green">[^<]*<\/span>\s*(.*?)<\/span>/gs, | |
| decode, | |
| ), | |
| bio: bioMatch ? decode(parseText(bioMatch[1])) : '', | |
| profileUrl: profileMatch ? `${BASE_URL}${profileMatch[1]}` : '', | |
| headshotUrl: headshotMatch ? headshotMatch[1] : '', | |
| }; | |
| } | |
| // ── Page Scraping ────────────────────────────────────────────────────── | |
| async function scrapeCurrentPage(page: any): Promise<AdvisorInfo[]> { | |
| const cards = await page.querySelectorAll('ul.grid > li'); | |
| return cards.map(parseAdvisorCard).filter((a: AdvisorInfo | null): a is AdvisorInfo => a !== null); | |
| } | |
| async function clickNextPage(page: any): Promise<boolean> { | |
| const links = await page.querySelectorAll('.flex.justify-end a'); | |
| for (const linkHtml of links) { | |
| if (!parseText(linkHtml).includes('Next')) continue; | |
| if (linkHtml.includes('cursor-not-allowed') || linkHtml.includes('pointer-events-none')) { | |
| return false; | |
| } | |
| await page.click('.w-0.flex-1.flex.justify-end a'); | |
| return true; | |
| } | |
| return false; | |
| } | |
| async function waitForPage(page: any): Promise<void> { | |
| await page.waitForNetworkIdle(15000); | |
| await sleep(3000); | |
| } | |
| // ── Debug ────────────────────────────────────────────────────────────── | |
| async function debugEmptyPage(page: any): Promise<AdvisorInfo[]> { | |
| console.log(' No advisors found — running diagnostics...'); | |
| const html = await page.content(5000, 100); | |
| console.log(` HTML length: ${html.length} chars`); | |
| const selectors = ['article', '.bg-white.shadow', 'a.xy-btn-secondary', 'a[href*="find-an-advisor"]']; | |
| for (const sel of selectors) { | |
| const els = await page.querySelectorAll(sel); | |
| console.log(` ${sel}: ${els.length} matches`); | |
| } | |
| // Retry after scroll in case of lazy loading | |
| await page.scrollY(800); | |
| await sleep(5000); | |
| return scrapeCurrentPage(page); | |
| } | |
| // ── Export ────────────────────────────────────────────────────────────── | |
| function exportResults(advisors: AdvisorInfo[]): void { | |
| const jsonPath = path.join(OUTPUT_DIR, 'advisors.json'); | |
| fs.writeFileSync(jsonPath, JSON.stringify(advisors, null, 2)); | |
| console.log(` JSON: ${jsonPath}`); | |
| const csvPath = path.join(OUTPUT_DIR, 'advisors.csv'); | |
| const header = 'Name,Firm,Credentials,Specialties,Bio,Profile URL,Headshot URL'; | |
| const rows = advisors.map((a) => | |
| [a.name, a.firm, a.credentials.join('; '), a.specialties.join('; '), a.bio, a.profileUrl, a.headshotUrl] | |
| .map(csvEscape) | |
| .join(','), | |
| ); | |
| fs.writeFileSync(csvPath, [header, ...rows].join('\n') + '\n'); | |
| console.log(` CSV: ${csvPath}`); | |
| } | |
| // ── Main ─────────────────────────────────────────────────────────────── | |
/**
 * Entry point: start a SpiderBrowser session, scrape up to MAX_PAGES of
 * advisor cards, print a summary table, and export JSON + CSV.
 *
 * Exits early (without throwing) when SPIDER_API_KEY is missing; always
 * closes the browser via the finally block.
 */
async function main(): Promise<void> {
  const apiKey = process.env.SPIDER_API_KEY;
  if (!apiKey) {
    console.error('Error: SPIDER_API_KEY not set. Copy .env.example to .env and add your key.');
    return;
  }
  // Ensure the export directory exists before any writes.
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  const browser = new SpiderBrowser({
    apiKey,
    captcha: 'solve',
    stealth: 0,
    maxRetries: 12,
    url: START_URL,
    logLevel: 'info',
  });
  // Progress logging for captcha solving and retry attempts.
  browser.on('captcha.detected', (e) => console.log(` [captcha] detected: ${JSON.stringify(e.types)}`));
  browser.on('captcha.solved', () => console.log(' [captcha] solved'));
  browser.on('retry.attempt', (e) => console.log(` [retry] ${e.attempt}/${e.maxRetries}: ${e.error}`));
  const allAdvisors: AdvisorInfo[] = [];
  let pageNum = 1;
  try {
    await browser.init();
    const page = browser.page;
    await page.setViewport(1920, 1080);
    console.log(`Navigating to ${START_URL}`);
    await browser.goto(START_URL);
    await waitForPage(page);
    // Scrape → paginate loop; bounded by MAX_PAGES or a disabled Next link.
    while (true) {
      console.log(`\n[Page ${pageNum}/${MAX_PAGES}] Scraping...`);
      let advisors = await scrapeCurrentPage(page);
      // First-page fallback: run diagnostics if nothing was found
      if (advisors.length === 0 && pageNum === 1) {
        advisors = await debugEmptyPage(page);
        if (advisors.length === 0) {
          console.log(' Diagnostics failed — page structure may have changed.');
          break;
        }
      }
      allAdvisors.push(...advisors);
      console.log(` ${advisors.length} advisors found`);
      for (const a of advisors) {
        console.log(` ${a.name} — ${a.firm}`);
      }
      if (pageNum >= MAX_PAGES) break;
      const hasNext = await clickNextPage(page);
      if (!hasNext) break;
      pageNum++;
      await waitForPage(page);
    }
    // ── Summary & Export ──────────────────────────────────────────────
    console.log('\n' + '═'.repeat(70));
    console.log(' SCRAPING COMPLETE');
    console.log('═'.repeat(70));
    console.log(` Pages: ${pageNum} | Advisors: ${allAdvisors.length}`);
    console.log('═'.repeat(70));
    console.log('\n # Name Firm Credentials');
    console.log(' ' + '─'.repeat(90));
    // Fixed-width table: name/firm truncated and padded to 33 chars.
    for (let i = 0; i < allAdvisors.length; i++) {
      const a = allAdvisors[i];
      console.log(
        ` ${String(i + 1).padStart(3)} ${a.name.slice(0, 33).padEnd(33)} ${a.firm.slice(0, 33).padEnd(33)} ${a.credentials.join(', ')}`,
      );
    }
    console.log('\n Exported:');
    exportResults(allAdvisors);
    console.log('\nDone!');
  } finally {
    // Always release the remote browser session, even on errors.
    await browser.close();
  }
}
| main().catch((err) => { | |
| console.error('Fatal error:', err); | |
| process.exit(1); | |
| }); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment