Last active
September 11, 2023 19:06
-
-
Save domderen/a12ba5804b5cc5b41f2dd69c0cd11c2b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import * as playwright from 'playwright'; | |
/**
 * Describes how a single value is pulled out of the page: which node to look at,
 * and whether to read one of its attributes rather than its text.
 */
type CompositeSelector = {
  /** Selector locating the node to read from. */
  selector?: string;
  /** Name of the attribute to read; when absent, the node's trimmed text content is used. */
  attributeName?: string;
};
/**
 * Looks up a node by selector and returns either its trimmed text content or,
 * when an xpath selector ends in an attribute step, the value of that attribute.
 *
 * E.g. "div.something" -> extracts the text content of a div with class "something",
 * "//div[contains(@class, 'something')]/@data-id" -> extracts the value of the "data-id" attribute of that same element.
 *
 * @param page Playwright Page class.
 * @param sel Playwright selector identifying the value to read.
 */
async function getValue(page: playwright.Page, sel: string): Promise<string> {
  const parsed = parseXpathSelector(sel);
  // The callback is executed by Playwright against the matched element; the
  // attribute name is handed over as the evaluation argument.
  const extracted = await page.$eval(
    parsed.selector,
    (node, attr) => (attr ? node.getAttribute(attr) : node.textContent.trim()),
    parsed.attributeName
  );
  return extracted;
}
/**
 * Looks up every node matching a selector and returns, for each one, either its trimmed
 * text content or, when an xpath selector ends in an attribute step, the value of that attribute.
 *
 * E.g. "div.something" -> extracts the text content of all divs with class "something",
 * "//div[contains(@class, 'something')]/@data-id" -> extracts the "data-id" attribute values of those same elements.
 *
 * @param page Playwright Page class.
 * @param sel Playwright selector identifying the values to read.
 */
async function getValues(page: playwright.Page, sel: string): Promise<string[]> {
  const parsed = parseXpathSelector(sel);
  // The callback is executed by Playwright against the matched elements; the
  // attribute name is handed over as the evaluation argument.
  const extracted = await page.$$eval(
    parsed.selector,
    (nodes, attr) => nodes.map(node => (attr ? node.getAttribute(attr) : node.textContent.trim())),
    parsed.attributeName
  );
  return extracted;
}
/**
 * Extracts an object of values from the page. The returned object has the same keys as the
 * "propertySelectors" input parameter, and each value is the text (or attribute value)
 * extracted for the corresponding selector.
 *
 * Property selectors are resolved with `querySelector` relative to the parent element, so they
 * have to be selectors `querySelector` understands. A property whose node cannot be found is
 * reported as `null`. A property without a selector reads from the parent element itself.
 *
 * @param page Playwright Page class
 * @param propertySelectors Object defining the structure of the return object and the selectors for extracting values.
 * @param objectSelector Selector defining the parent element containing the properties to extract.
 *   When omitted, the document's root element is used.
 */
async function getObject(page: playwright.Page, propertySelectors: {[key: string]: string | CompositeSelector}, objectSelector: string | undefined = undefined): Promise<{[key: string]: string}> {
  // Normalise every entry to the composite form so the in-page callback deals with one shape only.
  const compositePropertySelectors: {[key: string]: CompositeSelector} = {};
  for (const [key, value] of Object.entries(propertySelectors)) {
    compositePropertySelectors[key] = typeof value === 'string' ? parseXpathSelector(value) : value;
  }

  // ':root' matches the document element. The previous fallback, 'document', is read as a CSS
  // tag selector for a <document> element, which never matches and makes $eval throw.
  return await page.$eval(objectSelector || ':root', (el, selectors) => {
    // This callback is evaluated inside the page, so it must not reference anything
    // from the enclosing scope; everything it needs arrives through `selectors`.
    const response: {[key: string]: string} = {};
    Object.keys(selectors).forEach(key => {
      const value = selectors[key];
      const node = value.selector ? el.querySelector(value.selector) : el;
      if (!node) {
        response[key] = null;
        return;
      }
      response[key] = value.attributeName ? node.getAttribute(value.attributeName) : node.textContent.trim();
    });
    return response;
  }, compositePropertySelectors);
}
/**
 * Extracts an array of objects from the page, one per element matching "objectSelector".
 * Each returned object has the same keys as the "propertySelectors" input parameter, and each
 * value is the text (or attribute value) extracted for the corresponding selector.
 *
 * Property selectors are resolved with `querySelector` relative to each parent element, so they
 * have to be selectors `querySelector` understands. A property whose node cannot be found is
 * reported as `null`. A property without a selector reads from the parent element itself.
 *
 * @param page Playwright Page class
 * @param propertySelectors Object defining the structure of the return objects and the selectors for extracting values.
 * @param objectSelector Selector defining the parent elements containing the properties to extract.
 *   When omitted, the document's root element is used as the single parent.
 */
async function getObjects(page: playwright.Page, propertySelectors: {[key: string]: string | CompositeSelector}, objectSelector: string | undefined = undefined): Promise<{[key: string]: string}[]> {
  // Normalise every entry to the composite form so the in-page callback deals with one shape only.
  const compositePropertySelectors: {[key: string]: CompositeSelector} = {};
  for (const [key, value] of Object.entries(propertySelectors)) {
    compositePropertySelectors[key] = typeof value === 'string' ? parseXpathSelector(value) : value;
  }

  // ':root' matches the document element. The previous fallback, 'document', is read as a CSS
  // tag selector for a <document> element, which never matches, so the result was always empty.
  return await page.$$eval(objectSelector || ':root', (els, selectors) => {
    // This callback is evaluated inside the page, so it must not reference anything
    // from the enclosing scope; everything it needs arrives through `selectors`.
    return els.map(el => {
      const response: {[key: string]: string} = {};
      Object.keys(selectors).forEach(key => {
        const value = selectors[key];
        const node = value.selector ? el.querySelector(value.selector) : el;
        if (!node) {
          response[key] = null;
          return;
        }
        response[key] = value.attributeName ? node.getAttribute(value.attributeName) : node.textContent.trim();
      });
      return response;
    });
  }, compositePropertySelectors);
}
/**
 * Checks whether the provided selector is an xpath one and, if so, splits off a trailing
 * attribute step ("/@name") so the attribute can be read from the matched element.
 *
 * Only a final "/@name" step is treated as the attribute to extract. A "/@" that appears
 * inside a predicate or a string literal earlier in the expression is left alone.
 * Non-xpath selectors are returned unchanged with no attribute name.
 *
 * @param selector a Playwright Selector.
 * @returns the selector without the trailing attribute step, plus the attribute name (if any).
 */
function parseXpathSelector(selector: string): CompositeSelector {
  let attributeName: string | undefined;
  let elementSelector = selector;

  const isXpath = selector.startsWith('//') || selector.startsWith('..') || selector.startsWith('xpath=');
  if (isXpath) {
    // The name may not contain characters that would mean we are still inside a
    // predicate, function call, string literal or a further path step.
    const match = /\/@([^\/\[\]()'"=\s@]+)$/.exec(selector);
    if (match) {
      attributeName = match[1];
      // Cut at the match position instead of string-replacing, so an identical
      // substring earlier in the expression can never be removed by mistake.
      elementSelector = selector.slice(0, match.index);
    }
  }

  return { attributeName, selector: elementSelector };
}
/**
 * This function launches a browser, goes to the provided URL,
 * and attempts to extract values from the page, either as simple labels, or complex objects.
 *
 * The browser is always closed before the function returns. On failure the error is logged
 * and the process exit code is set to 1.
 */
async function main () {
  let browser: playwright.Browser | undefined;
  try {
    browser = await playwright.chromium.launch();
    const page = await browser.newPage();
    await page.goto('https://news.ycombinator.com/');

    const headline = await getValue(page, "//tbody/tr[contains(@class, 'athing')]/td[contains(@class, 'title')]/a[contains(@class, 'storylink')]");
    console.log(headline);
    // "Some headline"

    const articleLinks = await getValues(page, "//tbody/tr[contains(@class, 'athing')]/td[contains(@class, 'title')]/a[contains(@class, 'storylink')]/@href");
    console.log(articleLinks);
    // ["https://github.com/whatever", "https://github.com/whatever2", ...]

    const selectors = {
      source: {selector: "span.sitestr"},
      headline: "a.storylink",
      link: {selector: "a.storylink", attributeName: "href"}
    };

    const article = await getObject(page, selectors, "//tbody/tr[contains(@class, 'athing')]");
    console.log(article);
    // {source: "github.com", headline: "Some headline", link: "https://github.com/whatever"}

    const articles = await getObjects(page, selectors, "//tbody/tr[contains(@class, 'athing')]");
    console.log(articles);
    // [{source: "github.com", headline: "Some headline", link: "https://github.com/whatever"}, ...]
  } catch (error) {
    console.log('UNEXPECTED ERROR', error);
    // Set the exit code instead of calling process.exit(1) so the `finally`
    // block below still gets to shut the browser down.
    process.exitCode = 1;
  } finally {
    // Without this the launched browser outlives the script and keeps the Node process alive.
    if (browser) {
      await browser.close();
    }
  }
}
main();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment