Last active
November 18, 2024 08:35
-
-
Save zakhar-kogan/6ca4ee7428f7b35557017ffb0f5397b2 to your computer and use it in GitHub Desktop.
Allows scraping web pages from Google Sheets using Browserless (browserless.io). Outputs [selector, text] rows, or just the text values if noHeaders is TRUE. Supports several comma-separated selectors.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Scrapes web content using the specified selectors, handling multiple results.
 *
 * Delegates the HTTP call to wScrape() and flattens the API response into one
 * row per matched element. Any error is returned as an "Error: ..." string
 * instead of being thrown, so the message is displayed in the cell.
 *
 * @param {string} url The URL to scrape
 * @param {string} selectors CSS selectors (comma separated)
 * @param {string} apiUrl API endpoint URL incl. /scrape and ?token= if set
 * @param {number} timeout (Optional) Timeout in milliseconds; missing,
 *     non-numeric or non-positive values fall back to 30000
 * @param {boolean} noHeaders (Optional) If true, only returns values without selectors
 * @return {Array} Scraped content: [selector, text] rows, or just the text
 *     values when noHeaders is true
 * @customfunction
 */
function WEBSCRAPE(url, selectors, apiUrl, timeout, noHeaders = false) {
  const DEFAULT_API_URL = "YOUR_DEFAULT_API_URL";
  const DEFAULT_TIMEOUT = 30000;

  try {
    const finalApiUrl = apiUrl || DEFAULT_API_URL;

    // Explicit radix; NaN, zero and negative values all fall back to the default.
    const parsedTimeout = Number.parseInt(timeout, 10);
    const finalTimeout = parsedTimeout > 0 ? parsedTimeout : DEFAULT_TIMEOUT;

    const results = wScrape(url, finalApiUrl, selectors, finalTimeout);

    // One [selector, text] row per matched element. A selector entry without
    // a results array contributes no rows instead of failing the whole call.
    const rows = results.flatMap((item) => {
      const matches = Array.isArray(item.results) ? item.results : [];
      return matches.map((match) => [item.selector, match.text]);
    });

    return noHeaders ? rows.map((row) => row[1]) : rows;
  } catch (error) {
    return `Error: ${error.message}`;
  }
}
/**
 * Sends a scrape request to the API endpoint and returns the `data` array
 * from its JSON response.
 *
 * @param {string} url The URL to scrape
 * @param {string} apiURL API endpoint URL the request is POSTed to
 * @param {string|Array} selectors Comma-separated CSS selectors. An array
 *     (possibly nested, e.g. a spreadsheet range) of such strings is also
 *     accepted; blank entries are ignored.
 * @param {number} timeout (Optional) Timeout in milliseconds sent as
 *     gotoOptions.timeout; defaults to 30000 when falsy
 * @return {Array} The `data` array of the parsed API response
 * @throws {Error} If url or selectors are missing, no valid selector remains,
 *     the API answers with a non-200 status, or the response body is not JSON
 *     containing a `data` array
 */
function wScrape(url, apiURL, selectors, timeout) {
  if (!url) throw new Error("URL is required");
  if (!selectors) throw new Error("Selectors are required");

  // Normalize to a flat list first so non-string input (a number, or a 2D
  // array coming from a cell range) does not fail on a missing .split().
  const selectorArray = [selectors]
    .flat(Infinity)
    .filter((entry) => entry != null)
    .flatMap((entry) => String(entry).split(","))
    .map((selector) => selector.trim())
    .filter((selector) => selector)
    .map((selector) => ({ "selector": selector }));

  if (selectorArray.length === 0) {
    throw new Error("At least one valid selector is required");
  }

  const payload = {
    "url": url,
    "elements": selectorArray,
    "gotoOptions": {
      "timeout": timeout || 30000
    }
  };

  const options = {
    method: "post",
    headers: {
      "Content-Type": "application/json"
    },
    payload: JSON.stringify(payload),
    // Keep non-2xx responses from throwing inside fetch so the status code
    // can be reported explicitly below.
    muteHttpExceptions: true
  };

  const response = UrlFetchApp.fetch(apiURL, options);
  const responseCode = response.getResponseCode();
  if (responseCode !== 200) {
    throw new Error(`API returned status code ${responseCode}`);
  }

  let resp;
  try {
    resp = JSON.parse(response.getContentText());
  } catch (error) {
    throw new Error(`API returned invalid JSON: ${error.message}`);
  }

  // `resp` may legitimately parse to null or a primitive; guard before reading .data.
  if (!resp || !Array.isArray(resp.data)) {
    throw new Error('Invalid API response structure');
  }
  return resp.data;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment