Last active
September 9, 2024 17:49
-
-
Save moorage/8e689ac7e903e3cd34e0e5b9a93fcdaa to your computer and use it in GitHub Desktop.
Google Apps Script to fetch a url, use a query selector to get to a value
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Fetches and caches the fully redirected URL for one day (24 hours). | |
* | |
* This function follows all redirects for a given URL and stores the final | |
* redirected URL in the cache for 24 hours. If the URL has already been | |
* processed within the last 24 hours, the cached result is returned to | |
* improve performance and limit network requests. | |
* | |
* @param {string} url - The original URL to fetch and follow redirects from. | |
* @return {string} The fully redirected URL after following all redirects. | |
* @throws Will throw an error if there is a problem fetching the URL. | |
* @customfunction | |
*/ | |
function FULL_REDIRECT_URL(url) { | |
// Get the cache service for the current script | |
var cache = CacheService.getScriptCache(); | |
// Attempt to retrieve the cached URL | |
const cacheKey = `redirect:${url}`; | |
var cachedUrl = cache.get(cacheKey); | |
// If the URL is found in the cache, return it immediately | |
if (cachedUrl) { | |
return cachedUrl; | |
} | |
try { | |
// Fetch the URL and follow redirects (default behavior) | |
var response = UrlFetchApp.fetch(url, { followRedirects: true }); | |
// The fully redirected URL can be retrieved from either the 'Content-Location' header or the request URL | |
var finalUrl = response.getHeaders()['Content-Location'] || url; | |
// Cache the final redirected URL for 24 hours (86400 seconds) | |
cache.put(cacheKey, finalUrl, 86400); // Cache duration: 1 day (86400 seconds) | |
// Return the fully redirected URL | |
return finalUrl; | |
} catch (e) { | |
cache.put(cacheKey, null, 24 * 60 * 60); | |
// If there's an error during the fetch process, throw an error with details | |
throw new Error('Error fetching URL: ' + e.message); | |
} | |
} | |
/** | |
* Fetches the content of a URL and returns either an attribute value or text content based on a query selector. | |
* | |
* @param {String} url The URL to fetch the content from. | |
* @param {String} querySelector The CSS selector to query the content. | |
* @param {String} [attributeName] Optional. The attribute name to fetch the value of. | |
* @return {String} The value of the attribute or text content based on the query selector. | |
* @customfunction | |
*/ | |
function IMPORT_URL_WITH_SELECTOR(url, querySelector, attributeName) { | |
'use strict'; | |
if (!url || !querySelector) return null; | |
const cache = CacheService.getScriptCache(); | |
const docCacheKey = `doc:${url}`; | |
const valCacheKey = `html-val:${url}:${querySelector}:${attributeName || 'textContent'}`; | |
const cachedResult = cache.get(valCacheKey); | |
if (cachedResult) return cachedResult; | |
let htmlContent = ""; | |
try { | |
const cachedHtmlContent = cache.get(docCacheKey); | |
htmlContent = cachedHtmlContent ? cachedHtmlContent : UrlFetchApp.fetch(url, { 'followRedirects': true, 'muteHttpExceptions': true }).getContentText(); | |
if (!cachedHtmlContent) { | |
// Cache the entire HTML content for 1 day (24 hours) | |
cache.put(url, htmlContent, 24 * 60 * 60); | |
} | |
}catch (error) { | |
cache.put(docCacheKey, null, 24 * 60 * 60); | |
cache.put(valCacheKey, null, 24 * 60 * 60); | |
throw error; | |
} | |
// Replace all variations of <br> with <br>\n if not followed by optional whitespace and then a newline | |
// to work around cheerio's not handling of text() <BR>s as newlines | |
htmlContent = htmlContent.replace(/<br\s*\/?>(?!\s*\n)/gi, '<br/>\n'); | |
try { | |
const $ = Cheerio.load(htmlContent); | |
const result = attributeName ? $(querySelector).first().attr(attributeName) : $(querySelector).first().text().trim(); | |
// Cache the result based on parameters for 1 day (24 hours) | |
cache.put(valCacheKey, result, 24 * 60 * 60); | |
return result; | |
} catch (error) { | |
cache.put(valCacheKey, null, 24 * 60 * 60); | |
throw error; | |
} | |
} | |
/** | |
* Fetches the content of a URL and returns a value based on a JSON query selector. | |
* Example: To extract the WebsiteURL of the first array element, you would call the function as follows: | |
* `=IMPORT_JSON_WITH_QUERY("https://example.com/data.json", "0.WebsiteURL")` | |
* | |
* @param {String} url The URL to fetch the content from. | |
* @param {String} jsonQuerySelector The JSON query selector to extract the value. | |
* @return {String} The value extracted based on the JSON query selector. | |
* @customfunction | |
*/ | |
function IMPORT_JSON_WITH_QUERY(url, jsonQuerySelector) { | |
'use strict'; | |
if (!url || !jsonQuerySelector) return null; | |
const cache = CacheService.getScriptCache(); | |
const docCacheKey = `doc:${url}`; | |
const valCacheKey = `json-val:${url}:${jsonQuerySelector}`; | |
const cachedResult = cache.get(valCacheKey); | |
if (cachedResult) return cachedResult; | |
let jsonContent = ""; | |
try { | |
const cachedJsonContent = cache.get(docCacheKey); | |
jsonContent = cachedJsonContent ? cachedJsonContent : UrlFetchApp.fetch(url, { 'followRedirects': true, 'muteHttpExceptions': true }).getContentText(); | |
if (!cachedJsonContent) { | |
// Cache the entire JSON content for 1 day (24 hours) | |
cache.put(docCacheKey, jsonContent, 24 * 60 * 60); | |
} | |
} catch (error) { | |
cache.put(docCacheKey, null, 24 * 60 * 60); | |
cache.put(valCacheKey, null, 24 * 60 * 60); | |
throw error; | |
} | |
try { | |
const jsonObject = JSON.parse(jsonContent); | |
const result = getJsonValue(jsonObject, jsonQuerySelector); | |
// Cache the result based on parameters for 1 day (24 hours) | |
cache.put(valCacheKey, result, 24 * 60 * 60); | |
return result; | |
} catch (error) { | |
cache.put(valCacheKey, null, 24 * 60 * 60); | |
throw error; | |
} | |
} | |
/** | |
* Helper function to get a value from a JSON object based on a simple query selector. | |
* | |
* @param {Object} jsonObject The JSON object to query. | |
* @param {String} query The query selector (e.g., "0.WebsiteURL" to access the first element's WebsiteURL). | |
* @return {String} The value extracted based on the query selector. | |
*/ | |
function getJsonValue(jsonObject, query) { | |
try { | |
const properties = query.split('.'); | |
let value = jsonObject; | |
for (const prop of properties) { | |
if (value === undefined || value === null) { | |
return null; | |
} | |
// Handle array index | |
if (!isNaN(prop)) { | |
value = value[parseInt(prop, 10)]; | |
} else { | |
value = value[prop]; | |
} | |
} | |
return value !== undefined ? value : null; | |
} catch (error) { | |
return null; | |
} | |
} | |
/** | |
* Fetches the content of a URL and returns either an attribute value or text content based on an XPath selector. | |
* | |
* @param {String} url The URL to fetch the content from. | |
* @param {String} xpath The XPath selector to query the content. | |
* @param {String} [attributeName] Optional. The attribute name to fetch the value of. | |
* @return {String} The value of the attribute or text content based on the XPath selector. | |
* @customfunction | |
*/ | |
function IMPORT_XML_WITH_XPATH(url, xpath, attributeName) { | |
'use strict'; | |
if (!url || !xpath) return null; | |
const cache = CacheService.getScriptCache(); | |
const docCacheKey = `doc:${url}`; | |
const valCacheKey = `xml-val:${url}:${xpath}:${attributeName || 'textContent'}`; | |
const cachedResult = cache.get(valCacheKey); | |
if (cachedResult) return cachedResult; | |
let htmlContent = ""; | |
try { | |
const cachedHtmlContent = cache.get(docCacheKey); | |
htmlContent = cachedHtmlContent ? cachedHtmlContent : UrlFetchApp.fetch(url, { 'followRedirects': true, 'muteHttpExceptions': true }).getContentText(); | |
if (!cachedHtmlContent) { | |
// Cache the entire HTML content for 1 day (24 hours) | |
cache.put(docCacheKey, htmlContent, 24 * 60 * 60); | |
} | |
} catch (error) { | |
cache.put(docCacheKey, null, 24 * 60 * 60); | |
cache.put(valCacheKey, null, 24 * 60 * 60); | |
throw error; | |
} | |
try { | |
const xmlDocument = XmlService.parse(htmlContent); | |
const xpathResult = XmlService.getXPathEvaluator().evaluate(xpath, xmlDocument, XmlService.NameSpace.XMLNS, XmlService.XPathResult.FIRST_ORDERED_NODE_TYPE, null); | |
let result; | |
if (xpathResult.getNode()) { | |
if (attributeName) { | |
result = xpathResult.getNode().getAttribute(attributeName); | |
} else { | |
result = xpathResult.getNode().getValue(); | |
} | |
} else { | |
result = null; | |
} | |
// Cache the result based on parameters for 1 day (24 hours) | |
cache.put(valCacheKey, result, 24 * 60 * 60); | |
return result; | |
} catch (error) { | |
cache.put(valCacheKey, null, 24 * 60 * 60); | |
throw error; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment