Skip to content

Instantly share code, notes, and snippets.

@moorage
Last active September 9, 2024 17:49
Show Gist options
  • Save moorage/8e689ac7e903e3cd34e0e5b9a93fcdaa to your computer and use it in GitHub Desktop.
Save moorage/8e689ac7e903e3cd34e0e5b9a93fcdaa to your computer and use it in GitHub Desktop.
Google Apps Script to fetch a url, use a query selector to get to a value
/**
* Fetches and caches the fully redirected URL for one day (24 hours).
*
* This function follows all redirects for a given URL and stores the final
* redirected URL in the cache for 24 hours. If the URL has already been
* processed within the last 24 hours, the cached result is returned to
* improve performance and limit network requests.
*
* @param {string} url - The original URL to fetch and follow redirects from.
* @return {string} The fully redirected URL after following all redirects.
* @throws Will throw an error if there is a problem fetching the URL.
* @customfunction
*/
function FULL_REDIRECT_URL(url) {
// Get the cache service for the current script
var cache = CacheService.getScriptCache();
// Attempt to retrieve the cached URL
const cacheKey = `redirect:${url}`;
var cachedUrl = cache.get(cacheKey);
// If the URL is found in the cache, return it immediately
if (cachedUrl) {
return cachedUrl;
}
try {
// Fetch the URL and follow redirects (default behavior)
var response = UrlFetchApp.fetch(url, { followRedirects: true });
// The fully redirected URL can be retrieved from either the 'Content-Location' header or the request URL
var finalUrl = response.getHeaders()['Content-Location'] || url;
// Cache the final redirected URL for 24 hours (86400 seconds)
cache.put(cacheKey, finalUrl, 86400); // Cache duration: 1 day (86400 seconds)
// Return the fully redirected URL
return finalUrl;
} catch (e) {
cache.put(cacheKey, null, 24 * 60 * 60);
// If there's an error during the fetch process, throw an error with details
throw new Error('Error fetching URL: ' + e.message);
}
}
/**
* Fetches the content of a URL and returns either an attribute value or text content based on a query selector.
*
* @param {String} url The URL to fetch the content from.
* @param {String} querySelector The CSS selector to query the content.
* @param {String} [attributeName] Optional. The attribute name to fetch the value of.
* @return {String} The value of the attribute or text content based on the query selector.
* @customfunction
*/
function IMPORT_URL_WITH_SELECTOR(url, querySelector, attributeName) {
'use strict';
if (!url || !querySelector) return null;
const cache = CacheService.getScriptCache();
const docCacheKey = `doc:${url}`;
const valCacheKey = `html-val:${url}:${querySelector}:${attributeName || 'textContent'}`;
const cachedResult = cache.get(valCacheKey);
if (cachedResult) return cachedResult;
let htmlContent = "";
try {
const cachedHtmlContent = cache.get(docCacheKey);
htmlContent = cachedHtmlContent ? cachedHtmlContent : UrlFetchApp.fetch(url, { 'followRedirects': true, 'muteHttpExceptions': true }).getContentText();
if (!cachedHtmlContent) {
// Cache the entire HTML content for 1 day (24 hours)
cache.put(url, htmlContent, 24 * 60 * 60);
}
}catch (error) {
cache.put(docCacheKey, null, 24 * 60 * 60);
cache.put(valCacheKey, null, 24 * 60 * 60);
throw error;
}
// Replace all variations of <br> with <br>\n if not followed by optional whitespace and then a newline
// to work around cheerio's not handling of text() <BR>s as newlines
htmlContent = htmlContent.replace(/<br\s*\/?>(?!\s*\n)/gi, '<br/>\n');
try {
const $ = Cheerio.load(htmlContent);
const result = attributeName ? $(querySelector).first().attr(attributeName) : $(querySelector).first().text().trim();
// Cache the result based on parameters for 1 day (24 hours)
cache.put(valCacheKey, result, 24 * 60 * 60);
return result;
} catch (error) {
cache.put(valCacheKey, null, 24 * 60 * 60);
throw error;
}
}
/**
* Fetches the content of a URL and returns a value based on a JSON query selector.
* Example: To extract the WebsiteURL of the first array element, you would call the function as follows:
* `=IMPORT_JSON_WITH_QUERY("https://example.com/data.json", "0.WebsiteURL")`
*
* @param {String} url The URL to fetch the content from.
* @param {String} jsonQuerySelector The JSON query selector to extract the value.
* @return {String} The value extracted based on the JSON query selector.
* @customfunction
*/
function IMPORT_JSON_WITH_QUERY(url, jsonQuerySelector) {
'use strict';
if (!url || !jsonQuerySelector) return null;
const cache = CacheService.getScriptCache();
const docCacheKey = `doc:${url}`;
const valCacheKey = `json-val:${url}:${jsonQuerySelector}`;
const cachedResult = cache.get(valCacheKey);
if (cachedResult) return cachedResult;
let jsonContent = "";
try {
const cachedJsonContent = cache.get(docCacheKey);
jsonContent = cachedJsonContent ? cachedJsonContent : UrlFetchApp.fetch(url, { 'followRedirects': true, 'muteHttpExceptions': true }).getContentText();
if (!cachedJsonContent) {
// Cache the entire JSON content for 1 day (24 hours)
cache.put(docCacheKey, jsonContent, 24 * 60 * 60);
}
} catch (error) {
cache.put(docCacheKey, null, 24 * 60 * 60);
cache.put(valCacheKey, null, 24 * 60 * 60);
throw error;
}
try {
const jsonObject = JSON.parse(jsonContent);
const result = getJsonValue(jsonObject, jsonQuerySelector);
// Cache the result based on parameters for 1 day (24 hours)
cache.put(valCacheKey, result, 24 * 60 * 60);
return result;
} catch (error) {
cache.put(valCacheKey, null, 24 * 60 * 60);
throw error;
}
}
/**
* Helper function to get a value from a JSON object based on a simple query selector.
*
* @param {Object} jsonObject The JSON object to query.
* @param {String} query The query selector (e.g., "0.WebsiteURL" to access the first element's WebsiteURL).
* @return {String} The value extracted based on the query selector.
*/
function getJsonValue(jsonObject, query) {
try {
const properties = query.split('.');
let value = jsonObject;
for (const prop of properties) {
if (value === undefined || value === null) {
return null;
}
// Handle array index
if (!isNaN(prop)) {
value = value[parseInt(prop, 10)];
} else {
value = value[prop];
}
}
return value !== undefined ? value : null;
} catch (error) {
return null;
}
}
/**
* Fetches the content of a URL and returns either an attribute value or text content based on an XPath selector.
*
* @param {String} url The URL to fetch the content from.
* @param {String} xpath The XPath selector to query the content.
* @param {String} [attributeName] Optional. The attribute name to fetch the value of.
* @return {String} The value of the attribute or text content based on the XPath selector.
* @customfunction
*/
function IMPORT_XML_WITH_XPATH(url, xpath, attributeName) {
'use strict';
if (!url || !xpath) return null;
const cache = CacheService.getScriptCache();
const docCacheKey = `doc:${url}`;
const valCacheKey = `xml-val:${url}:${xpath}:${attributeName || 'textContent'}`;
const cachedResult = cache.get(valCacheKey);
if (cachedResult) return cachedResult;
let htmlContent = "";
try {
const cachedHtmlContent = cache.get(docCacheKey);
htmlContent = cachedHtmlContent ? cachedHtmlContent : UrlFetchApp.fetch(url, { 'followRedirects': true, 'muteHttpExceptions': true }).getContentText();
if (!cachedHtmlContent) {
// Cache the entire HTML content for 1 day (24 hours)
cache.put(docCacheKey, htmlContent, 24 * 60 * 60);
}
} catch (error) {
cache.put(docCacheKey, null, 24 * 60 * 60);
cache.put(valCacheKey, null, 24 * 60 * 60);
throw error;
}
try {
const xmlDocument = XmlService.parse(htmlContent);
const xpathResult = XmlService.getXPathEvaluator().evaluate(xpath, xmlDocument, XmlService.NameSpace.XMLNS, XmlService.XPathResult.FIRST_ORDERED_NODE_TYPE, null);
let result;
if (xpathResult.getNode()) {
if (attributeName) {
result = xpathResult.getNode().getAttribute(attributeName);
} else {
result = xpathResult.getNode().getValue();
}
} else {
result = null;
}
// Cache the result based on parameters for 1 day (24 hours)
cache.put(valCacheKey, result, 24 * 60 * 60);
return result;
} catch (error) {
cache.put(valCacheKey, null, 24 * 60 * 60);
throw error;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment