Skip to content

Instantly share code, notes, and snippets.

@gbertb
Last active February 17, 2026 02:50
Show Gist options
  • Select an option

  • Save gbertb/8a8dbbdb82f9b0190e124ed17da48b16 to your computer and use it in GitHub Desktop.

Select an option

Save gbertb/8a8dbbdb82f9b0190e124ed17da48b16 to your computer and use it in GitHub Desktop.
Spider Browser — XY Planning Network advisor scraper (TypeScript)
/**
* Spider Browser — XY Planning Network advisor scraper (TypeScript).
*
* Scrapes advisor cards from the "Find an Advisor" directory,
* paginating through the first MAX_PAGES pages. Exports JSON + CSV.
*
* Usage:
* 1. Copy .env.example to .env and add your SPIDER_API_KEY
* 2. Run: node xy_planning_network_advistor.ts
*/
import 'dotenv/config';
import fs from 'node:fs';
import path from 'node:path';
import { SpiderBrowser } from 'spider-browser';
// ── Configuration ──────────────────────────────────────────────────────
// Where advisors.json / advisors.csv are written, next to this script.
// NOTE(review): import.meta.dirname requires Node 20.11+ running as ESM — confirm runtime.
const OUTPUT_DIR = path.join(import.meta.dirname, 'output');
// Directory listing page containing the advisor cards.
const START_URL = 'https://connect.xyplanningnetwork.com/find-an-advisor';
// Origin used to absolutize the relative profile links found in cards.
const BASE_URL = 'https://connect.xyplanningnetwork.com';
// Hard cap on how many directory pages to paginate through.
const MAX_PAGES = 3;
// ── Types ──────────────────────────────────────────────────────────────
/** Structured data extracted from one advisor card in the directory listing. */
interface AdvisorInfo {
  /** Advisor's display name (from the card's h1). */
  name: string;
  /** Firm name (from the card's h2); empty string when absent. */
  firm: string;
  /** Credential badges, e.g. CFP, CPA (from .xy-badge-blue spans). */
  credentials: string[];
  /** Specialties / ideal-client tags (from .text-xy-green spans). */
  specialties: string[];
  /** Bio snippet (from the .line-clamp-6 paragraph); may be empty. */
  bio: string;
  /** Absolute URL of the advisor's profile page; empty if no link found. */
  profileUrl: string;
  /** Headshot image URL as found in the card markup; empty if no image. */
  headshotUrl: string;
}
// ── Utilities ──────────────────────────────────────────────────────────
/** Pause for `ms` milliseconds; resolves once the timer fires. */
const sleep = (ms: number) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
/** Strip every HTML tag from a fragment and trim surrounding whitespace. */
function parseText(html: string): string {
  const withoutTags = html.replace(/<[^>]+>/g, '');
  return withoutTags.trim();
}
/**
 * Run a global regex over `html` and return, for every match, the
 * tag-stripped and trimmed text of capture group 1. An optional `transform`
 * is applied to each value; empty results are dropped.
 */
function collectMatches(html: string, pattern: RegExp, transform?: (s: string) => string): string[] {
  const collected: string[] = [];
  for (const match of html.matchAll(pattern)) {
    // Inline tag-strip + trim (equivalent to parseText).
    let text = match[1].replace(/<[^>]+>/g, '').trim();
    if (transform) text = transform(text);
    if (text) collected.push(text);
  }
  return collected;
}
/** Quote a value for CSV output, doubling embedded quotes (RFC 4180 style). */
function csvEscape(value: string): string {
  const doubled = value.split('"').join('""');
  return '"' + doubled + '"';
}
// ── Card Parsing ───────────────────────────────────────────────────────
/**
 * Parse a single advisor card's outerHTML into structured data.
 *
 * Card structure:
 *   h1                          → advisor name
 *   h2                          → firm name
 *   .xy-badge-blue              → credentials (CFP, CPA, etc.)
 *   .text-xy-green              → specialties / ideal clients
 *   .line-clamp-6               → bio snippet
 *   .xy-btn-secondary           → profile link
 *   img[alt="profile headshot"] → headshot URL
 *
 * Returns null when the card has no <h1> name (e.g. filler <li> items).
 */
function parseAdvisorCard(html: string): AdvisorInfo | null {
  // Decode common HTML entities. '&amp;' must be decoded LAST so sequences
  // such as '&amp;lt;' are not double-decoded into '<'.
  // Fix: the original decoded only '&amp;', and only for specialties/bio;
  // names, firms and credentials received no entity decoding at all.
  const decode = (s: string) =>
    s
      .replace(/&nbsp;/g, ' ')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#(?:39|x27);/g, "'")
      .replace(/&amp;/g, '&');

  const nameMatch = html.match(/<h1[^>]*>(.*?)<\/h1>/s);
  const name = nameMatch ? decode(parseText(nameMatch[1])) : '';
  if (!name) return null;

  const firmMatch = html.match(/<h2[^>]*>(.*?)<\/h2>/s);
  const bioMatch = html.match(/<p class="line-clamp-6[^"]*">(.*?)<\/p>/s);
  const profileMatch = html.match(/<a class="xy-btn-secondary" href="([^"]*)">/);
  const headshotMatch = html.match(/<img[^>]*src="([^"]*)"[^>]*alt="profile headshot"/);

  return {
    name,
    firm: firmMatch ? decode(parseText(firmMatch[1])) : '',
    credentials: collectMatches(html, /<span class="xy-badge-blue">(.*?)<\/span>/gs, decode),
    specialties: collectMatches(
      html,
      /<span class="text-xy-green">[^<]*<\/span>\s*(.*?)<\/span>/gs,
      decode,
    ),
    bio: bioMatch ? decode(parseText(bioMatch[1])) : '',
    // Profile links in the card are relative; absolutize against the site origin.
    profileUrl: profileMatch ? `${BASE_URL}${profileMatch[1]}` : '',
    headshotUrl: headshotMatch ? headshotMatch[1] : '',
  };
}
// ── Page Scraping ──────────────────────────────────────────────────────
/**
 * Extract advisor data from every card (`ul.grid > li`) on the currently
 * loaded page. Cards that fail to parse (no name) are dropped.
 */
async function scrapeCurrentPage(page: any): Promise<AdvisorInfo[]> {
  const cardHtml = await page.querySelectorAll('ul.grid > li');
  const parsed: AdvisorInfo[] = [];
  for (const outerHtml of cardHtml) {
    const advisor = parseAdvisorCard(outerHtml);
    if (advisor !== null) parsed.push(advisor);
  }
  return parsed;
}
/**
 * Advance to the next directory page, if possible.
 *
 * Scans the pagination anchors for one whose visible text contains "Next";
 * returns false when none exists or the control is rendered disabled,
 * true after clicking the next-page control.
 *
 * NOTE(review): the click selector ('.w-0.flex-1.flex.justify-end a') is more
 * specific than the scan selector ('.flex.justify-end a'), so the element
 * clicked is not necessarily the anchor whose HTML matched "Next" — verify
 * both selectors resolve to the same control on the live page.
 */
async function clickNextPage(page: any): Promise<boolean> {
  const links = await page.querySelectorAll('.flex.justify-end a');
  for (const linkHtml of links) {
    // Skip anchors that are not the "Next" control.
    if (!parseText(linkHtml).includes('Next')) continue;
    // Disabled state is conveyed via CSS classes in the anchor markup.
    if (linkHtml.includes('cursor-not-allowed') || linkHtml.includes('pointer-events-none')) {
      return false;
    }
    await page.click('.w-0.flex-1.flex.justify-end a');
    return true;
  }
  // No "Next" anchor present at all — treat as last page.
  return false;
}
/** Wait for in-flight requests to settle, then pad for client-side rendering. */
async function waitForPage(page: any): Promise<void> {
  const NETWORK_IDLE_TIMEOUT_MS = 15000;
  const RENDER_SETTLE_MS = 3000;
  await page.waitForNetworkIdle(NETWORK_IDLE_TIMEOUT_MS);
  await sleep(RENDER_SETTLE_MS);
}
// ── Debug ──────────────────────────────────────────────────────────────
/**
 * Diagnose a page that yielded zero advisors: log the HTML size and match
 * counts for a few candidate selectors, scroll to trigger any lazy loading,
 * then retry the scrape once.
 */
async function debugEmptyPage(page: any): Promise<AdvisorInfo[]> {
  console.log(' No advisors found — running diagnostics...');
  const html = await page.content(5000, 100);
  console.log(` HTML length: ${html.length} chars`);

  const probeSelectors = [
    'article',
    '.bg-white.shadow',
    'a.xy-btn-secondary',
    'a[href*="find-an-advisor"]',
  ];
  for (const selector of probeSelectors) {
    const matches = await page.querySelectorAll(selector);
    console.log(` ${selector}: ${matches.length} matches`);
  }

  // Lazy-loaded cards may only render after a scroll.
  await page.scrollY(800);
  await sleep(5000);
  return scrapeCurrentPage(page);
}
// ── Export ──────────────────────────────────────────────────────────────
/** Write the collected advisors to OUTPUT_DIR as both JSON and CSV. */
function exportResults(advisors: AdvisorInfo[]): void {
  const jsonPath = path.join(OUTPUT_DIR, 'advisors.json');
  fs.writeFileSync(jsonPath, JSON.stringify(advisors, null, 2));
  console.log(` JSON: ${jsonPath}`);

  const csvPath = path.join(OUTPUT_DIR, 'advisors.csv');
  const header = 'Name,Firm,Credentials,Specialties,Bio,Profile URL,Headshot URL';
  // One CSV row per advisor; list fields joined with '; ', all cells quoted.
  const toRow = (a: AdvisorInfo): string =>
    [a.name, a.firm, a.credentials.join('; '), a.specialties.join('; '), a.bio, a.profileUrl, a.headshotUrl]
      .map(csvEscape)
      .join(',');
  fs.writeFileSync(csvPath, [header, ...advisors.map(toRow)].join('\n') + '\n');
  console.log(` CSV: ${csvPath}`);
}
// ── Main ───────────────────────────────────────────────────────────────
/**
 * Entry point: launch a Spider Browser session, paginate through up to
 * MAX_PAGES of the advisor directory, collect card data, print a summary
 * table, and export JSON + CSV via exportResults().
 */
async function main(): Promise<void> {
  const apiKey = process.env.SPIDER_API_KEY;
  if (!apiKey) {
    console.error('Error: SPIDER_API_KEY not set. Copy .env.example to .env and add your key.');
    return;
  }
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  // NOTE(review): option semantics (captcha/stealth/maxRetries) come from the
  // spider-browser SDK, which is not visible here — confirm against its docs.
  const browser = new SpiderBrowser({
    apiKey,
    captcha: 'solve',
    stealth: 0,
    maxRetries: 12,
    url: START_URL,
    logLevel: 'info',
  });
  // Progress logging for captcha handling and automatic retries.
  browser.on('captcha.detected', (e) => console.log(` [captcha] detected: ${JSON.stringify(e.types)}`));
  browser.on('captcha.solved', () => console.log(' [captcha] solved'));
  browser.on('retry.attempt', (e) => console.log(` [retry] ${e.attempt}/${e.maxRetries}: ${e.error}`));
  const allAdvisors: AdvisorInfo[] = [];
  let pageNum = 1;
  try {
    await browser.init();
    const page = browser.page;
    await page.setViewport(1920, 1080);
    console.log(`Navigating to ${START_URL}`);
    await browser.goto(START_URL);
    await waitForPage(page);
    while (true) {
      console.log(`\n[Page ${pageNum}/${MAX_PAGES}] Scraping...`);
      let advisors = await scrapeCurrentPage(page);
      // First-page fallback: run diagnostics if nothing was found
      if (advisors.length === 0 && pageNum === 1) {
        advisors = await debugEmptyPage(page);
        if (advisors.length === 0) {
          console.log(' Diagnostics failed — page structure may have changed.');
          break;
        }
      }
      allAdvisors.push(...advisors);
      console.log(` ${advisors.length} advisors found`);
      for (const a of advisors) {
        console.log(` ${a.name} — ${a.firm}`);
      }
      // Stop at the page cap, or when no clickable "Next" control remains.
      if (pageNum >= MAX_PAGES) break;
      const hasNext = await clickNextPage(page);
      if (!hasNext) break;
      pageNum++;
      await waitForPage(page);
    }
    // ── Summary & Export ──────────────────────────────────────────────
    console.log('\n' + '═'.repeat(70));
    console.log(' SCRAPING COMPLETE');
    console.log('═'.repeat(70));
    console.log(` Pages: ${pageNum} | Advisors: ${allAdvisors.length}`);
    console.log('═'.repeat(70));
    console.log('\n # Name Firm Credentials');
    console.log(' ' + '─'.repeat(90));
    // Fixed-width summary table: name/firm columns truncated to 33 chars.
    for (let i = 0; i < allAdvisors.length; i++) {
      const a = allAdvisors[i];
      console.log(
        ` ${String(i + 1).padStart(3)} ${a.name.slice(0, 33).padEnd(33)} ${a.firm.slice(0, 33).padEnd(33)} ${a.credentials.join(', ')}`,
      );
    }
    console.log('\n Exported:');
    exportResults(allAdvisors);
    console.log('\nDone!');
  } finally {
    // Always tear down the remote browser session, even on failure.
    await browser.close();
  }
}
// Top-level entry: any unhandled rejection in main() is fatal.
void main().catch((err) => {
  console.error('Fatal error:', err);
  process.exit(1);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment