Skip to content

Instantly share code, notes, and snippets.

@zakhar-kogan
Last active November 18, 2024 08:35
Show Gist options
  • Save zakhar-kogan/6ca4ee7428f7b35557017ffb0f5397b2 to your computer and use it in GitHub Desktop.
Save zakhar-kogan/6ca4ee7428f7b35557017ffb0f5397b2 to your computer and use it in GitHub Desktop.
Allows scraping web pages from Google Sheets via Browserless (browserless.io). Outputs one [selector, text] row per matched element, or just the text if noHeaders is TRUE. Several comma-separated selectors may be given.
/**
 * Scrapes web content using the given CSS selectors and returns the text of
 * every matched element.
 *
 * Each match becomes one row of the form [selector, text]; when noHeaders is
 * true only the text values are returned. Any failure is returned as an
 * "Error: <message>" string rather than thrown.
 *
 * @param {string} url The URL to scrape
 * @param {string} selectors CSS selectors (comma separated)
 * @param {string} apiUrl API endpoint URL incl. /scrape and ?token= if set
 * @param {number} timeout (Optional) Timeout in milliseconds; defaults to 30000
 * @param {boolean} noHeaders (Optional) If true, only returns values without selectors
 * @return {Array} Scraped content
 * @customfunction
 */
function WEBSCRAPE(url, selectors, apiUrl, timeout, noHeaders = false) {
  // Placeholder endpoint used when apiUrl is omitted or empty.
  const DEFAULT_API_URL = "YOUR_DEFAULT_API_URL";
  const DEFAULT_TIMEOUT = 30000;

  try {
    const finalApiUrl = apiUrl || DEFAULT_API_URL;

    // Explicit radix so inputs such as "0x10" are not read as hexadecimal;
    // missing, non-numeric, zero or negative values use the default.
    const parsedTimeout = Number.parseInt(timeout, 10);
    const finalTimeout = parsedTimeout > 0 ? parsedTimeout : DEFAULT_TIMEOUT;

    const results = wScrape(url, finalApiUrl, selectors, finalTimeout);

    // One output row per matched element. Entries without a `results` array
    // are skipped so a single malformed entry does not abort the whole call.
    const rows = results.flatMap((item) => {
      const matches = item && Array.isArray(item.results) ? item.results : [];
      return matches.map((match) => [item.selector, match.text]);
    });

    return noHeaders ? rows.map(([, text]) => text) : rows;
  } catch (error) {
    return `Error: ${error.message}`;
  }
}
/**
 * POSTs a scrape request to the API endpoint and returns the `data` array
 * from its JSON response.
 *
 * @param {string} url The URL to scrape.
 * @param {string} apiURL Endpoint the JSON payload is POSTed to.
 * @param {string|Array} selectors Comma-separated CSS selectors, or a
 *     (possibly nested) array of such strings, e.g. a range of cells.
 * @param {number} timeout Sent as gotoOptions.timeout; 30000 when falsy.
 * @return {Array} The `data` array of the parsed API response.
 * @throws {Error} If url or selectors are missing, no usable selector
 *     remains, the API does not answer with status 200, or the response body
 *     is not JSON containing a `data` array.
 */
function wScrape(url, apiURL, selectors, timeout) {
  if (!url) throw new Error("URL is required");
  if (!selectors) throw new Error("Selectors are required");

  // A range argument arrives as nested arrays; flatten it so every cell can
  // contribute one or more comma-separated selectors.
  const rawSelectors = Array.isArray(selectors)
    ? selectors
        .flat(Infinity)
        .filter((cell) => cell !== null && cell !== undefined)
        .join(",")
    : String(selectors);

  const selectorArray = rawSelectors
    .split(",")
    .map((selector) => selector.trim())
    .filter((selector) => selector)
    .map((selector) => ({ "selector": selector }));

  if (selectorArray.length === 0) {
    throw new Error("At least one valid selector is required");
  }

  const payload = {
    "url": url,
    "elements": selectorArray,
    "gotoOptions": {
      "timeout": timeout || 30000
    }
  };

  const options = {
    method: "post",
    headers: {
      "Content-Type": "application/json"
    },
    payload: JSON.stringify(payload),
    // Keep non-2xx responses from throwing so the status code can be reported.
    muteHttpExceptions: true
  };

  const response = UrlFetchApp.fetch(apiURL, options);
  const responseCode = response.getResponseCode();
  if (responseCode !== 200) {
    throw new Error(`API returned status code ${responseCode}`);
  }

  let resp;
  try {
    resp = JSON.parse(response.getContentText());
  } catch (parseError) {
    // A non-JSON body is reported the same way as a wrongly shaped one.
    throw new Error(`Invalid API response structure (${parseError.message})`);
  }

  if (!resp || !Array.isArray(resp.data)) {
    throw new Error('Invalid API response structure');
  }
  return resp.data;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment