domderen · September 11, 2023 19:06
diff --git a/hackernews_articles.ts b/hackernews_articles.ts
 import * as playwright from 'playwright';

 /**
 * Describes how a single value is located and read from the page.
 *
 * Both fields are optional. The object extractors in this file use the
 * container element itself when `selector` is absent, and read the trimmed
 * text content when `attributeName` is absent.
 */
 type CompositeSelector = {
  /** Selector locating the node that holds the value. */
  selector?: string;
  /** Name of the attribute to read from the located node. */
  attributeName?: string;
 }

 /**
 * Reads one value from the first element matching the selector.
 *
 * The selector is first passed through `parseXpathSelector`. When that yields an
 * attribute name, the attribute's value is returned; otherwise the element's
 * trimmed text content is returned.
 *
 * E.g. "div.something" -> text content of the div with class "something",
 *      "//div[contains(@class, 'something')]/@data-id" -> value of the "data-id" attribute of that div.
 *
 * @param page Playwright Page to query.
 * @param sel Playwright selector, optionally an xpath ending in "/@attribute".
 * @returns The attribute value or trimmed text content.
 */
 async function getValue(page: playwright.Page, sel: string): Promise<string> {
  const parsed = parseXpathSelector(sel);

  const value = await page.$eval(
    parsed.selector,
    (node, attr) => (attr ? node.getAttribute(attr) : node.textContent.trim()),
    parsed.attributeName,
  );
  return value;
 }

 /**
 * Reads one value from every element matching the selector.
 *
 * The selector is first passed through `parseXpathSelector`. When that yields an
 * attribute name, each element contributes that attribute's value; otherwise each
 * element contributes its trimmed text content.
 *
 * E.g. "div.something" -> text content of all divs with class "something",
 *      "//div[contains(@class, 'something')]/@data-id" -> "data-id" values of those divs.
 *
 * @param page Playwright Page to query.
 * @param sel Playwright selector, optionally an xpath ending in "/@attribute".
 * @returns One entry per matching element, in the order Playwright reports them.
 */
 async function getValues(page: playwright.Page, sel: string): Promise<string[]> {
  const parsed = parseXpathSelector(sel);

  return await page.$$eval(
    parsed.selector,
    (nodes, attr) => nodes.map(node => (attr ? node.getAttribute(attr) : node.textContent.trim())),
    parsed.attributeName,
  );
 }

 /**
 * Extracts a single object of values from the page. The returned object has the same keys as
 * "propertySelectors"; each value is the text (or attribute) read for the corresponding selector.
 *
 * String entries are normalised with `parseXpathSelector`; `CompositeSelector` entries are used as given.
 * Inside the page each property selector is resolved with `querySelector` relative to the container
 * element, so it must be a CSS selector. An entry without a selector reads from the container itself.
 * A property whose selector matches nothing is set to `null`.
 *
 * @param page Playwright Page class
 * @param propertySelectors Object defining the structure of the return object and the selectors for its values.
 * @param objectSelector Selector of the container element. Defaults to the document's root element.
 * @returns Object keyed like "propertySelectors" (values may be `null` at runtime).
 */
 async function getObject(page: playwright.Page, propertySelectors: {[key: string]: string | CompositeSelector}, objectSelector: string | undefined = undefined): Promise<{[key: string]: string}> {
  const compositePropertySelectors: {[key: string]: CompositeSelector} = {};
  for (const [key, value] of Object.entries(propertySelectors)) {
    compositePropertySelectors[key] = typeof value === 'string' ? parseXpathSelector(value) : value;
  }

  // ':root' selects the document element. The previous default 'document' is a tag
  // selector that matches no element, so omitting objectSelector always failed.
  // The callback is handed to Playwright together with its argument, so everything
  // it needs is passed in explicitly rather than captured from this scope.
  return await page.$eval(objectSelector || ':root', (el, compositePropertySelectors) => {
    const response: {[key: string]: string} = {};

    for (const key of Object.keys(compositePropertySelectors)) {
      const value = compositePropertySelectors[key];
      const node = value.selector ? el.querySelector(value.selector) : el;
      if (!node) {
        response[key] = null;
        continue;
      }
      response[key] = value.attributeName ? node.getAttribute(value.attributeName) : node.textContent.trim();
    }

    return response;
  }, compositePropertySelectors)
 }

 /**
 * Extracts an array of objects from the page, one per element matching "objectSelector".
 * Each returned object has the same keys as "propertySelectors"; each value is the text
 * (or attribute) read for the corresponding selector inside that element.
 *
 * String entries are normalised with `parseXpathSelector`; `CompositeSelector` entries are used as given.
 * Inside the page each property selector is resolved with `querySelector` relative to the container
 * element, so it must be a CSS selector. An entry without a selector reads from the container itself.
 * A property whose selector matches nothing is set to `null`.
 *
 * @param page Playwright Page class
 * @param propertySelectors Object defining the structure of the return objects and the selectors for their values.
 * @param objectSelector Selector of the container elements. Defaults to the document's root element.
 * @returns One object per container element (values may be `null` at runtime).
 */
 async function getObjects(page: playwright.Page, propertySelectors: {[key: string]: string | CompositeSelector}, objectSelector: string | undefined = undefined): Promise<{[key: string]: string}[]> {
  const compositePropertySelectors: {[key: string]: CompositeSelector} = {};
  for (const [key, value] of Object.entries(propertySelectors)) {
    compositePropertySelectors[key] = typeof value === 'string' ? parseXpathSelector(value) : value;
  }

  // ':root' selects the document element. The previous default 'document' is a tag
  // selector that matches no element, so omitting objectSelector silently produced [].
  // The callback is handed to Playwright together with its argument, so everything
  // it needs is passed in explicitly rather than captured from this scope.
  return await page.$$eval(objectSelector || ':root', (els, compositePropertySelectors) => {
    const keys = Object.keys(compositePropertySelectors);

    return els.map(el => {
      const response: {[key: string]: string} = {};

      for (const key of keys) {
        const value = compositePropertySelectors[key];
        const node = value.selector ? el.querySelector(value.selector) : el;
        if (!node) {
          response[key] = null;
          continue;
        }
        response[key] = value.attributeName ? node.getAttribute(value.attributeName) : node.textContent.trim();
      }

      return response;
    });
  }, compositePropertySelectors)
 }

 /**
 * Splits a trailing "/@attribute" step off an xpath selector.
 *
 * A selector is treated as xpath when it starts with "//", ".." or "xpath=". Any other
 * selector is returned unchanged with no attribute name.
 *
 * E.g. "//div/@data-id" -> {selector: "//div", attributeName: "data-id"},
 *      "//a[./@href]"   -> {selector: "//a[./@href]", attributeName: undefined}.
 *
 * @param selector a Playwright Selector.
 * @returns The selector without its final attribute step, plus the attribute name if one was present.
 */
 function parseXpathSelector(selector: string): CompositeSelector {
  let attributeName: string | undefined;
  const isXpath = selector.startsWith('//') || selector.startsWith('..') || selector.startsWith('xpath=');

  if (isXpath) {
    // Anchored to the end and not allowed to cross '/', '[' or ']', so only the final
    // attribute step is captured, never an "/@attr" inside an earlier predicate.
    const match = /\/@([^\/\[\]]+)$/.exec(selector);
    if (match) {
      attributeName = match[1];
      // Cut at the match position instead of replacing by text, which could remove
      // an identical "/@attr" occurring earlier in the selector.
      selector = selector.slice(0, match.index);
    }
  }

  return {attributeName, selector};
 }

 /**
 * Launches a Chromium browser, opens the Hacker News front page and demonstrates the
 * extraction helpers: a single value, a list of values, a single object and a list of objects.
 *
 * The browser is always closed before the function settles. On failure the error is logged
 * and the process exit code is set to 1.
 */
 async function main () {
  let browser: playwright.Browser | undefined;
  try {
    browser = await playwright.chromium.launch();
    const page = await browser.newPage();
    await page.goto('https://news.ycombinator.com/');

    // NOTE(review): the selectors below depend on the page using the `athing`, `title`,
    // `storylink` and `sitestr` classes - confirm they still match the live markup.
    const headline = await getValue(page, "//tbody/tr[contains(@class, 'athing')]/td[contains(@class, 'title')]/a[contains(@class, 'storylink')]");

    console.log(headline);
    // "Some headline"

    const articleLinks = await getValues(page, "//tbody/tr[contains(@class, 'athing')]/td[contains(@class, 'title')]/a[contains(@class, 'storylink')]/@href");

    console.log(articleLinks);
    // ["https://github.com/whatever", "https://github.com/whatever2", ...]

    const selectors = {
      source: {selector: "span.sitestr"},
      headline: "a.storylink",
      link: {selector: "a.storylink", attributeName: "href"}
    }

    const article = await getObject(page, selectors, "//tbody/tr[contains(@class, 'athing')]");

    console.log(article);
    // {source: "github.com", headline: "Some headline", link: "https://github.com/whatever"}

    const articles = await getObjects(page, selectors, "//tbody/tr[contains(@class, 'athing')]");

    console.log(articles);
    // [{source: "github.com", headline: "Some headline", link: "https://github.com/whatever"}, ...]
  } catch (error) {
    console.log('UNEXPECTED ERROR', error);
    // Set the exit code instead of calling process.exit(1) so the finally block
    // below still gets to close the browser.
    process.exitCode = 1;
  } finally {
    // Without this the launched browser is never closed.
    if (browser) {
      await browser.close();
    }
  }
 }

 main();
	import * as playwright from 'playwright';

	/**
	* Describes how a single value is located and read from the page.
	*
	* Both fields are optional. The object extractors in this file use the
	* container element itself when `selector` is absent, and read the trimmed
	* text content when `attributeName` is absent.
	*/
	type CompositeSelector = {
	/** Selector locating the node that holds the value. */
	selector?: string;
	/** Name of the attribute to read from the located node. */
	attributeName?: string;
	}

	/**
	* Reads one value from the first element matching the selector.
	*
	* The selector is first passed through `parseXpathSelector`. When that yields an
	* attribute name, the attribute's value is returned; otherwise the element's
	* trimmed text content is returned.
	*
	* E.g. "div.something" -> text content of the div with class "something",
	* "//div[contains(@class, 'something')]/@data-id" -> value of the "data-id" attribute of that div.
	*
	* @param page Playwright Page to query.
	* @param sel Playwright selector, optionally an xpath ending in "/@attribute".
	* @returns The attribute value or trimmed text content.
	*/
	async function getValue(page: playwright.Page, sel: string): Promise<string> {
	  const parsed = parseXpathSelector(sel);

	  const value = await page.$eval(
	    parsed.selector,
	    (node, attr) => (attr ? node.getAttribute(attr) : node.textContent.trim()),
	    parsed.attributeName,
	  );
	  return value;
	}

	/**
	* Reads one value from every element matching the selector.
	*
	* The selector is first passed through `parseXpathSelector`. When that yields an
	* attribute name, each element contributes that attribute's value; otherwise each
	* element contributes its trimmed text content.
	*
	* E.g. "div.something" -> text content of all divs with class "something",
	* "//div[contains(@class, 'something')]/@data-id" -> "data-id" values of those divs.
	*
	* @param page Playwright Page to query.
	* @param sel Playwright selector, optionally an xpath ending in "/@attribute".
	* @returns One entry per matching element, in the order Playwright reports them.
	*/
	async function getValues(page: playwright.Page, sel: string): Promise<string[]> {
	  const parsed = parseXpathSelector(sel);

	  return await page.$$eval(
	    parsed.selector,
	    (nodes, attr) => nodes.map(node => (attr ? node.getAttribute(attr) : node.textContent.trim())),
	    parsed.attributeName,
	  );
	}

	/**
	* Extracts a single object of values from the page. The returned object has the same keys as
	* "propertySelectors"; each value is the text (or attribute) read for the corresponding selector.
	*
	* String entries are normalised with `parseXpathSelector`; `CompositeSelector` entries are used as given.
	* Inside the page each property selector is resolved with `querySelector` relative to the container
	* element, so it must be a CSS selector. An entry without a selector reads from the container itself.
	* A property whose selector matches nothing is set to `null`.
	*
	* @param page Playwright Page class
	* @param propertySelectors Object defining the structure of the return object and the selectors for its values.
	* @param objectSelector Selector of the container element. Defaults to the document's root element.
	* @returns Object keyed like "propertySelectors" (values may be `null` at runtime).
	*/
	async function getObject(page: playwright.Page, propertySelectors: {[key: string]: string | CompositeSelector}, objectSelector: string | undefined = undefined): Promise<{[key: string]: string}> {
	  const compositePropertySelectors: {[key: string]: CompositeSelector} = {};
	  for (const [key, value] of Object.entries(propertySelectors)) {
	    compositePropertySelectors[key] = typeof value === 'string' ? parseXpathSelector(value) : value;
	  }

	  // ':root' selects the document element. The previous default 'document' is a tag
	  // selector that matches no element, so omitting objectSelector always failed.
	  // The callback is handed to Playwright together with its argument, so everything
	  // it needs is passed in explicitly rather than captured from this scope.
	  return await page.$eval(objectSelector || ':root', (el, compositePropertySelectors) => {
	    const response: {[key: string]: string} = {};

	    for (const key of Object.keys(compositePropertySelectors)) {
	      const value = compositePropertySelectors[key];
	      const node = value.selector ? el.querySelector(value.selector) : el;
	      if (!node) {
	        response[key] = null;
	        continue;
	      }
	      response[key] = value.attributeName ? node.getAttribute(value.attributeName) : node.textContent.trim();
	    }

	    return response;
	  }, compositePropertySelectors)
	}

	/**
	* Extracts an array of objects from the page, one per element matching "objectSelector".
	* Each returned object has the same keys as "propertySelectors"; each value is the text
	* (or attribute) read for the corresponding selector inside that element.
	*
	* String entries are normalised with `parseXpathSelector`; `CompositeSelector` entries are used as given.
	* Inside the page each property selector is resolved with `querySelector` relative to the container
	* element, so it must be a CSS selector. An entry without a selector reads from the container itself.
	* A property whose selector matches nothing is set to `null`.
	*
	* @param page Playwright Page class
	* @param propertySelectors Object defining the structure of the return objects and the selectors for their values.
	* @param objectSelector Selector of the container elements. Defaults to the document's root element.
	* @returns One object per container element (values may be `null` at runtime).
	*/
	async function getObjects(page: playwright.Page, propertySelectors: {[key: string]: string | CompositeSelector}, objectSelector: string | undefined = undefined): Promise<{[key: string]: string}[]> {
	  const compositePropertySelectors: {[key: string]: CompositeSelector} = {};
	  for (const [key, value] of Object.entries(propertySelectors)) {
	    compositePropertySelectors[key] = typeof value === 'string' ? parseXpathSelector(value) : value;
	  }

	  // ':root' selects the document element. The previous default 'document' is a tag
	  // selector that matches no element, so omitting objectSelector silently produced [].
	  // The callback is handed to Playwright together with its argument, so everything
	  // it needs is passed in explicitly rather than captured from this scope.
	  return await page.$$eval(objectSelector || ':root', (els, compositePropertySelectors) => {
	    const keys = Object.keys(compositePropertySelectors);

	    return els.map(el => {
	      const response: {[key: string]: string} = {};

	      for (const key of keys) {
	        const value = compositePropertySelectors[key];
	        const node = value.selector ? el.querySelector(value.selector) : el;
	        if (!node) {
	          response[key] = null;
	          continue;
	        }
	        response[key] = value.attributeName ? node.getAttribute(value.attributeName) : node.textContent.trim();
	      }

	      return response;
	    });
	  }, compositePropertySelectors)
	}

	/**
	* Splits a trailing "/@attribute" step off an xpath selector.
	*
	* A selector is treated as xpath when it starts with "//", ".." or "xpath=". Any other
	* selector is returned unchanged with no attribute name.
	*
	* E.g. "//div/@data-id" -> {selector: "//div", attributeName: "data-id"},
	* "//a[./@href]" -> {selector: "//a[./@href]", attributeName: undefined}.
	*
	* @param selector a Playwright Selector.
	* @returns The selector without its final attribute step, plus the attribute name if one was present.
	*/
	function parseXpathSelector(selector: string): CompositeSelector {
	  let attributeName: string | undefined;
	  const isXpath = selector.startsWith('//') || selector.startsWith('..') || selector.startsWith('xpath=');

	  if (isXpath) {
	    // Anchored to the end and not allowed to cross '/', '[' or ']', so only the final
	    // attribute step is captured, never an "/@attr" inside an earlier predicate.
	    const match = /\/@([^\/\[\]]+)$/.exec(selector);
	    if (match) {
	      attributeName = match[1];
	      // Cut at the match position instead of replacing by text, which could remove
	      // an identical "/@attr" occurring earlier in the selector.
	      selector = selector.slice(0, match.index);
	    }
	  }

	  return {attributeName, selector};
	}

	/**
	* Launches a Chromium browser, opens the Hacker News front page and demonstrates the
	* extraction helpers: a single value, a list of values, a single object and a list of objects.
	*
	* The browser is always closed before the function settles. On failure the error is logged
	* and the process exit code is set to 1.
	*/
	async function main () {
	  let browser: playwright.Browser | undefined;
	  try {
	    browser = await playwright.chromium.launch();
	    const page = await browser.newPage();
	    await page.goto('https://news.ycombinator.com/');

	    // NOTE(review): the selectors below depend on the page using the `athing`, `title`,
	    // `storylink` and `sitestr` classes - confirm they still match the live markup.
	    const headline = await getValue(page, "//tbody/tr[contains(@class, 'athing')]/td[contains(@class, 'title')]/a[contains(@class, 'storylink')]");

	    console.log(headline);
	    // "Some headline"

	    const articleLinks = await getValues(page, "//tbody/tr[contains(@class, 'athing')]/td[contains(@class, 'title')]/a[contains(@class, 'storylink')]/@href");

	    console.log(articleLinks);
	    // ["https://github.com/whatever", "https://github.com/whatever2", ...]

	    const selectors = {
	      source: {selector: "span.sitestr"},
	      headline: "a.storylink",
	      link: {selector: "a.storylink", attributeName: "href"}
	    }

	    const article = await getObject(page, selectors, "//tbody/tr[contains(@class, 'athing')]");

	    console.log(article);
	    // {source: "github.com", headline: "Some headline", link: "https://github.com/whatever"}

	    const articles = await getObjects(page, selectors, "//tbody/tr[contains(@class, 'athing')]");

	    console.log(articles);
	    // [{source: "github.com", headline: "Some headline", link: "https://github.com/whatever"}, ...]
	  } catch (error) {
	    console.log('UNEXPECTED ERROR', error);
	    // Set the exit code instead of calling process.exit(1) so the finally block
	    // below still gets to close the browser.
	    process.exitCode = 1;
	  } finally {
	    // Without this the launched browser is never closed.
	    if (browser) {
	      await browser.close();
	    }
	  }
	}

	main();
No results found