moorage · September 9, 2024 17:49
diff --git a/IMPORT_URL_WITH_SELECTOR.gs b/IMPORT_URL_WITH_SELECTOR.gs
 /**
 * Fetches and caches the fully redirected URL for one day (24 hours).
 * 
 * This function follows all redirects for a given URL and stores the final 
 * redirected URL in the cache for 24 hours. If the URL has already been 
 * processed within the last 24 hours, the cached result is returned to 
 * improve performance and limit network requests.
 * 
 * @param {string} url - The original URL to fetch and follow redirects from.
 * @return {string} The fully redirected URL after following all redirects.
 * @throws Will throw an error if there is a problem fetching the URL.
 * @customfunction
 */
 function FULL_REDIRECT_URL(url) {
  // Get the cache service for the current script
  var cache = CacheService.getScriptCache();
  
  // Attempt to retrieve the cached URL
  const cacheKey = `redirect:${url}`;
  var cachedUrl = cache.get(cacheKey);
  
  // If the URL is found in the cache, return it immediately
  if (cachedUrl) {
    return cachedUrl;
  }

  try {
    // Fetch the URL and follow redirects (default behavior)
    var response = UrlFetchApp.fetch(url, { followRedirects: true });

    // The fully redirected URL can be retrieved from either the 'Content-Location' header or the request URL
    var finalUrl = response.getHeaders()['Content-Location'] || url;

    // Cache the final redirected URL for 24 hours (86400 seconds)
    cache.put(cacheKey, finalUrl, 86400); // Cache duration: 1 day (86400 seconds)

    // Return the fully redirected URL
    return finalUrl;
  } catch (e) {
    cache.put(cacheKey, null, 24 * 60 * 60);
    // If there's an error during the fetch process, throw an error with details
    throw new Error('Error fetching URL: ' + e.message);
  }
 }

 /**
 * Fetches the content of a URL and returns either an attribute value or text content based on a query selector.
 *
 * @param {String} url The URL to fetch the content from.
 * @param {String} querySelector The CSS selector to query the content.
 * @param {String} [attributeName] Optional. The attribute name to fetch the value of.
 * @return {String} The value of the attribute or text content based on the query selector.
 * @customfunction
 */
 function IMPORT_URL_WITH_SELECTOR(url, querySelector, attributeName) {
  'use strict';
  if (!url || !querySelector) return null;
  
  const cache = CacheService.getScriptCache();
  const docCacheKey = `doc:${url}`;
  const valCacheKey = `html-val:${url}:${querySelector}:${attributeName || 'textContent'}`;
  const cachedResult = cache.get(valCacheKey);
  if (cachedResult) return cachedResult;

  let htmlContent = "";
  try {
    const cachedHtmlContent = cache.get(docCacheKey);
    htmlContent = cachedHtmlContent ? cachedHtmlContent : UrlFetchApp.fetch(url, { 'followRedirects': true, 'muteHttpExceptions': true }).getContentText();
    
    if (!cachedHtmlContent) {
      // Cache the entire HTML content for 1 day (24 hours)
      cache.put(url, htmlContent, 24 * 60 * 60);
    }
  }catch (error) {
    cache.put(docCacheKey, null, 24 * 60 * 60);
    cache.put(valCacheKey, null, 24 * 60 * 60);
    throw error;
  }

  // Replace all variations of <br> with <br>\n if not followed by optional whitespace and then a newline
  // to work around cheerio's not handling of text() <BR>s as newlines
  htmlContent = htmlContent.replace(/<br\s*\/?>(?!\s*\n)/gi, '<br/>\n');

  try {
    const $ = Cheerio.load(htmlContent);
    const result = attributeName ? $(querySelector).first().attr(attributeName) : $(querySelector).first().text().trim();
    // Cache the result based on parameters for 1 day (24 hours)
    cache.put(valCacheKey, result, 24 * 60 * 60);
    return result;
  } catch (error) {
    cache.put(valCacheKey, null, 24 * 60 * 60);
    throw error;
  }
 }

 /**
 * Fetches the content of a URL and returns a value based on a JSON query selector.
 * Example: To extract the WebsiteURL of the first array element, you would call the function as follows:
 * `=IMPORT_JSON_WITH_QUERY("https://example.com/data.json", "0.WebsiteURL")`
 *
 * @param {String} url The URL to fetch the content from.
 * @param {String} jsonQuerySelector The JSON query selector to extract the value.
 * @return {String} The value extracted based on the JSON query selector.
 * @customfunction
 */
 function IMPORT_JSON_WITH_QUERY(url, jsonQuerySelector) {
  'use strict';
  if (!url || !jsonQuerySelector) return null;

  const cache = CacheService.getScriptCache();
  const docCacheKey = `doc:${url}`;
  const valCacheKey = `json-val:${url}:${jsonQuerySelector}`;
  const cachedResult = cache.get(valCacheKey);
  if (cachedResult) return cachedResult;

  let jsonContent = "";
  try {
    const cachedJsonContent = cache.get(docCacheKey);
    jsonContent = cachedJsonContent ? cachedJsonContent : UrlFetchApp.fetch(url, { 'followRedirects': true, 'muteHttpExceptions': true }).getContentText();

    if (!cachedJsonContent) {
      // Cache the entire JSON content for 1 day (24 hours)
      cache.put(docCacheKey, jsonContent, 24 * 60 * 60);
    }
  } catch (error) {
    cache.put(docCacheKey, null, 24 * 60 * 60);
    cache.put(valCacheKey, null, 24 * 60 * 60);
    throw error;
  }

  try {
    const jsonObject = JSON.parse(jsonContent);
    const result = getJsonValue(jsonObject, jsonQuerySelector);

    // Cache the result based on parameters for 1 day (24 hours)
    cache.put(valCacheKey, result, 24 * 60 * 60);
    return result;
  } catch (error) {
    cache.put(valCacheKey, null, 24 * 60 * 60);
    throw error;
  }
 }

 /**
 * Helper function to get a value from a JSON object based on a simple query selector.
 *
 * @param {Object} jsonObject The JSON object to query.
 * @param {String} query The query selector (e.g., "0.WebsiteURL" to access the first element's WebsiteURL).
 * @return {String} The value extracted based on the query selector.
 */
 function getJsonValue(jsonObject, query) {
  try {
    const properties = query.split('.');
    let value = jsonObject;

    for (const prop of properties) {
      if (value === undefined || value === null) {
        return null;
      }

      // Handle array index
      if (!isNaN(prop)) {
        value = value[parseInt(prop, 10)];
      } else {
        value = value[prop];
      }
    }

    return value !== undefined ? value : null;
  } catch (error) {
    return null;
  }
 }

 /**
 * Fetches the content of a URL and returns either an attribute value or text content based on an XPath selector.
 *
 * @param {String} url The URL to fetch the content from.
 * @param {String} xpath The XPath selector to query the content.
 * @param {String} [attributeName] Optional. The attribute name to fetch the value of.
 * @return {String} The value of the attribute or text content based on the XPath selector.
 * @customfunction
 */
 function IMPORT_XML_WITH_XPATH(url, xpath, attributeName) {
  'use strict';
  if (!url || !xpath) return null;
  
  const cache = CacheService.getScriptCache();
  const docCacheKey = `doc:${url}`;
  const valCacheKey = `xml-val:${url}:${xpath}:${attributeName || 'textContent'}`;
  const cachedResult = cache.get(valCacheKey);
  if (cachedResult) return cachedResult;

  let htmlContent = "";
  try {
    const cachedHtmlContent = cache.get(docCacheKey);
    htmlContent = cachedHtmlContent ? cachedHtmlContent : UrlFetchApp.fetch(url, { 'followRedirects': true, 'muteHttpExceptions': true }).getContentText();
    
    if (!cachedHtmlContent) {
      // Cache the entire HTML content for 1 day (24 hours)
      cache.put(docCacheKey, htmlContent, 24 * 60 * 60);
    }
  } catch (error) {
    cache.put(docCacheKey, null, 24 * 60 * 60);
    cache.put(valCacheKey, null, 24 * 60 * 60);
    throw error;
  }

  try {
    const xmlDocument = XmlService.parse(htmlContent);
    const xpathResult = XmlService.getXPathEvaluator().evaluate(xpath, xmlDocument, XmlService.NameSpace.XMLNS, XmlService.XPathResult.FIRST_ORDERED_NODE_TYPE, null);
    let result;

    if (xpathResult.getNode()) {
      if (attributeName) {
        result = xpathResult.getNode().getAttribute(attributeName);
      } else {
        result = xpathResult.getNode().getValue();
      }
    } else {
      result = null;
    }
    
    // Cache the result based on parameters for 1 day (24 hours)
    cache.put(valCacheKey, result, 24 * 60 * 60);
    return result;
  } catch (error) {
    cache.put(valCacheKey, null, 24 * 60 * 60);
    throw error;
  }
 }
	/**
	* Fetches and caches the fully redirected URL for one day (24 hours).
	*
	* This function follows all redirects for a given URL and stores the final
	* redirected URL in the cache for 24 hours. If the URL has already been
	* processed within the last 24 hours, the cached result is returned to
	* improve performance and limit network requests.
	*
	* @param {string} url - The original URL to fetch and follow redirects from.
	* @return {string} The fully redirected URL after following all redirects.
	* @throws Will throw an error if there is a problem fetching the URL.
	* @customfunction
	*/
	function FULL_REDIRECT_URL(url) {
	// Get the cache service for the current script
	var cache = CacheService.getScriptCache();

	// Attempt to retrieve the cached URL
	const cacheKey = `redirect:${url}`;
	var cachedUrl = cache.get(cacheKey);

	// If the URL is found in the cache, return it immediately
	if (cachedUrl) {
	return cachedUrl;
	}

	try {
	// Fetch the URL and follow redirects (default behavior)
	var response = UrlFetchApp.fetch(url, { followRedirects: true });

	// The fully redirected URL can be retrieved from either the 'Content-Location' header or the request URL
	var finalUrl = response.getHeaders()['Content-Location'] \|\| url;

	// Cache the final redirected URL for 24 hours (86400 seconds)
	cache.put(cacheKey, finalUrl, 86400); // Cache duration: 1 day (86400 seconds)

	// Return the fully redirected URL
	return finalUrl;
	} catch (e) {
	cache.put(cacheKey, null, 24 * 60 * 60);
	// If there's an error during the fetch process, throw an error with details
	throw new Error('Error fetching URL: ' + e.message);
	}
	}

	/**
	* Fetches the content of a URL and returns either an attribute value or text content based on a query selector.
	*
	* @param {String} url The URL to fetch the content from.
	* @param {String} querySelector The CSS selector to query the content.
	* @param {String} [attributeName] Optional. The attribute name to fetch the value of.
	* @return {String} The value of the attribute or text content based on the query selector.
	* @customfunction
	*/
	function IMPORT_URL_WITH_SELECTOR(url, querySelector, attributeName) {
	'use strict';
	if (!url \|\| !querySelector) return null;

	const cache = CacheService.getScriptCache();
	const docCacheKey = `doc:${url}`;
	const valCacheKey = `html-val:${url}:${querySelector}:${attributeName \|\| 'textContent'}`;
	const cachedResult = cache.get(valCacheKey);
	if (cachedResult) return cachedResult;

	let htmlContent = "";
	try {
	const cachedHtmlContent = cache.get(docCacheKey);
	htmlContent = cachedHtmlContent ? cachedHtmlContent : UrlFetchApp.fetch(url, { 'followRedirects': true, 'muteHttpExceptions': true }).getContentText();

	if (!cachedHtmlContent) {
	// Cache the entire HTML content for 1 day (24 hours)
	cache.put(url, htmlContent, 24 * 60 * 60);
	}
	}catch (error) {
	cache.put(docCacheKey, null, 24 * 60 * 60);
	cache.put(valCacheKey, null, 24 * 60 * 60);
	throw error;
	}

	// Replace all variations of <br> with <br>\n if not followed by optional whitespace and then a newline
	// to work around cheerio's not handling of text() <BR>s as newlines
	htmlContent = htmlContent.replace(/<br\s\/?>(?!\s\n)/gi, '<br/>\n');

	try {
	const $ = Cheerio.load(htmlContent);
	const result = attributeName ? $(querySelector).first().attr(attributeName) : $(querySelector).first().text().trim();
	// Cache the result based on parameters for 1 day (24 hours)
	cache.put(valCacheKey, result, 24 * 60 * 60);
	return result;
	} catch (error) {
	cache.put(valCacheKey, null, 24 * 60 * 60);
	throw error;
	}
	}

	/**
	* Fetches the content of a URL and returns a value based on a JSON query selector.
	* Example: To extract the WebsiteURL of the first array element, you would call the function as follows:
	* `=IMPORT_JSON_WITH_QUERY("https://example.com/data.json", "0.WebsiteURL")`
	*
	* @param {String} url The URL to fetch the content from.
	* @param {String} jsonQuerySelector The JSON query selector to extract the value.
	* @return {String} The value extracted based on the JSON query selector.
	* @customfunction
	*/
	function IMPORT_JSON_WITH_QUERY(url, jsonQuerySelector) {
	'use strict';
	if (!url \|\| !jsonQuerySelector) return null;

	const cache = CacheService.getScriptCache();
	const docCacheKey = `doc:${url}`;
	const valCacheKey = `json-val:${url}:${jsonQuerySelector}`;
	const cachedResult = cache.get(valCacheKey);
	if (cachedResult) return cachedResult;

	let jsonContent = "";
	try {
	const cachedJsonContent = cache.get(docCacheKey);
	jsonContent = cachedJsonContent ? cachedJsonContent : UrlFetchApp.fetch(url, { 'followRedirects': true, 'muteHttpExceptions': true }).getContentText();

	if (!cachedJsonContent) {
	// Cache the entire JSON content for 1 day (24 hours)
	cache.put(docCacheKey, jsonContent, 24 * 60 * 60);
	}
	} catch (error) {
	cache.put(docCacheKey, null, 24 * 60 * 60);
	cache.put(valCacheKey, null, 24 * 60 * 60);
	throw error;
	}

	try {
	const jsonObject = JSON.parse(jsonContent);
	const result = getJsonValue(jsonObject, jsonQuerySelector);

	// Cache the result based on parameters for 1 day (24 hours)
	cache.put(valCacheKey, result, 24 * 60 * 60);
	return result;
	} catch (error) {
	cache.put(valCacheKey, null, 24 * 60 * 60);
	throw error;
	}
	}

	/**
	* Helper function to get a value from a JSON object based on a simple query selector.
	*
	* @param {Object} jsonObject The JSON object to query.
	* @param {String} query The query selector (e.g., "0.WebsiteURL" to access the first element's WebsiteURL).
	* @return {String} The value extracted based on the query selector.
	*/
	function getJsonValue(jsonObject, query) {
	try {
	const properties = query.split('.');
	let value = jsonObject;

	for (const prop of properties) {
	if (value === undefined \|\| value === null) {
	return null;
	}

	// Handle array index
	if (!isNaN(prop)) {
	value = value[parseInt(prop, 10)];
	} else {
	value = value[prop];
	}
	}

	return value !== undefined ? value : null;
	} catch (error) {
	return null;
	}
	}

	/**
	* Fetches the content of a URL and returns either an attribute value or text content based on an XPath selector.
	*
	* @param {String} url The URL to fetch the content from.
	* @param {String} xpath The XPath selector to query the content.
	* @param {String} [attributeName] Optional. The attribute name to fetch the value of.
	* @return {String} The value of the attribute or text content based on the XPath selector.
	* @customfunction
	*/
	function IMPORT_XML_WITH_XPATH(url, xpath, attributeName) {
	'use strict';
	if (!url \|\| !xpath) return null;

	const cache = CacheService.getScriptCache();
	const docCacheKey = `doc:${url}`;
	const valCacheKey = `xml-val:${url}:${xpath}:${attributeName \|\| 'textContent'}`;
	const cachedResult = cache.get(valCacheKey);
	if (cachedResult) return cachedResult;

	let htmlContent = "";
	try {
	const cachedHtmlContent = cache.get(docCacheKey);
	htmlContent = cachedHtmlContent ? cachedHtmlContent : UrlFetchApp.fetch(url, { 'followRedirects': true, 'muteHttpExceptions': true }).getContentText();

	if (!cachedHtmlContent) {
	// Cache the entire HTML content for 1 day (24 hours)
	cache.put(docCacheKey, htmlContent, 24 * 60 * 60);
	}
	} catch (error) {
	cache.put(docCacheKey, null, 24 * 60 * 60);
	cache.put(valCacheKey, null, 24 * 60 * 60);
	throw error;
	}

	try {
	const xmlDocument = XmlService.parse(htmlContent);
	const xpathResult = XmlService.getXPathEvaluator().evaluate(xpath, xmlDocument, XmlService.NameSpace.XMLNS, XmlService.XPathResult.FIRST_ORDERED_NODE_TYPE, null);
	let result;

	if (xpathResult.getNode()) {
	if (attributeName) {
	result = xpathResult.getNode().getAttribute(attributeName);
	} else {
	result = xpathResult.getNode().getValue();
	}
	} else {
	result = null;
	}

	// Cache the result based on parameters for 1 day (24 hours)
	cache.put(valCacheKey, result, 24 * 60 * 60);
	return result;
	} catch (error) {
	cache.put(valCacheKey, null, 24 * 60 * 60);
	throw error;
	}
	}