Created
September 14, 2023 19:01
-
-
Save PropelDMS/167eb82dc264a197bf5853c945c4cd5e to your computer and use it in GitHub Desktop.
Hoseman Snippet - Get Internal Links During Crawl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * This code first retrieves all the anchor elements in the page using document.querySelectorAll('a').
 * Then it filters out the anchor elements that are inside <head>, <header>, or <footer> tags.
 * After that, it extracts the "href" attribute of each remaining anchor element.
 * Next, it keeps only the URLs that point to pages on the same domain and drops any URL containing a "#".
 * Finally, it strips the protocol from each URL and returns the list as a JSON string.
 */
/**
 * Reports whether an element sits inside a <head>, <header>, or <footer>.
 *
 * Only ancestors are inspected; the element's own tag is never checked.
 *
 * @param {Element} element - Element whose ancestor chain is walked via `parentElement`.
 * @returns {boolean} `true` if any ancestor's tag name is head, header, or footer
 *   (compared case-insensitively), otherwise `false`.
 */
const isNestedInTags = (element) => {
  // Tags whose descendants should be excluded from the crawl results.
  const excludedTags = ['head', 'header', 'footer'];
  for (let ancestor = element.parentElement; ancestor; ancestor = ancestor.parentElement) {
    // tagName is upper-case in HTML documents, so normalise before comparing.
    if (excludedTags.includes(ancestor.tagName.toLowerCase())) {
      return true;
    }
  }
  return false;
};
// Get all anchor elements in the page | |
const anchorElements = document.querySelectorAll('a'); | |
// Filter out anchor elements nested inside <head>, <header>, or <footer> tags | |
const filteredAnchorElements = Array.from(anchorElements).filter((el) => !isNestedInTags(el)); | |
// Extract the "href" attribute of each anchor element | |
const urls = Array.from(filteredAnchorElements).map((el) => el.href); | |
// Get the current domain without protocol | |
const currentDomain = window.location.host; | |
// Filter out URLs pointing to other pages on the same domain | |
const sameDomainUrls = urls.filter((url) => { | |
const urlWithoutProtocol = url.replace(/^https?:\/\//, ''); | |
const urlDomain = urlWithoutProtocol.split('/')[0]; | |
return urlDomain === currentDomain; | |
}); | |
// Filter out URLs with final URI parts starting with a hash mark | |
const validUrls = sameDomainUrls.filter((url) => { | |
const hashIndex = url.indexOf('#'); | |
return hashIndex === -1; | |
}); | |
// Remove the protocol from each URL | |
const finalUrls = validUrls.map((url) => url.replace(/^https?:\/\//, '')); | |
return JSON.stringify(finalUrls); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment