|
import xmldom from 'xmldom'; |
|
|
|
/**
 * Fetches an XML sitemap (or sitemap index) and collects the locations it
 * lists.
 *
 * Links accumulate on the instance in `this.links`: `parseURLSet` and
 * `getLinks` append to that array and return it, so calling them repeatedly
 * on the same instance keeps adding to the same list.
 */
export default class SitemapParser {
    /**
     * @param {string} sitemap location of the sitemap, passed as-is to `fetch`
     */
    constructor(sitemap) {
        this.sitemap = sitemap;
        this.links = [];
    }

    /**
     * Get the JSON version of the sitemap
     *
     * Uses the global `fetch`; a rejected fetch is not caught here and
     * rejects the returned promise.
     *
     * @return {Promise} resolves to the converted document, or `false` when
     *                   the response status is not 200 or nothing was parsed
     */
    async jsonSitemap() {
        const response = await fetch(this.sitemap);

        // Only continue if it's safe
        if (response.status !== 200) {
            return false;
        }

        // parseFromString is synchronous, so only the body read is awaited.
        const parsed = new xmldom.DOMParser()
            .parseFromString(await response.text(), 'text/xml');

        // Without a document there is nothing to convert; report it the same
        // way as a failed request so getLinks() yields an empty list.
        if (!parsed) {
            return false;
        }

        return this.constructor.xmlToJson(parsed);
    }

    /**
     * Parse all the urls in a json urlset
     *
     * The locations found are appended to `this.links`. A urlset holding a
     * single `<url>` is converted by `xmlToJson` to a plain object rather
     * than an array; both shapes are accepted.
     *
     * @param {object} jsonData converted document, as produced by xmlToJson
     * @return {array} `this.links`, including the newly added locations
     */
    parseURLSet(jsonData) {
        const urlset = jsonData == null ? undefined : jsonData.urlset;
        const entries = urlset == null ? undefined : urlset.url;

        for (const location of this.constructor.collectLocations(entries)) {
            this.links.push(location);
        }

        return this.links;
    }

    /**
     * Parse all the urls for sitemaps in a json sitemap index
     *
     * Unlike `parseURLSet`, this does not touch `this.links`. An index
     * holding a single `<sitemap>` (converted to a plain object rather than
     * an array) is accepted as well.
     *
     * @param {object} jsonData converted document, as produced by xmlToJson
     * @return {array} locations of the listed sitemaps
     */
    parseSitemapIndex(jsonData) {
        const index = jsonData == null ? undefined : jsonData.sitemapindex;
        const entries = index == null ? undefined : index.sitemap;

        return this.constructor.collectLocations(entries);
    }

    /**
     * Get all the links in all the sitemaps
     *
     * @return {Promise} resolves to `this.links`, or to a new empty array
     *                   when the sitemap could not be loaded
     */
    async getLinks() {
        const xmlData = await this.jsonSitemap();

        // If the jsonSitemap function has not successfully run, return an
        // empty array
        if (!xmlData) {
            return [];
        }

        // There can be a sitemap of sitemaps, wordpress's yoast seo does
        // this to cope with custom post types, so recursively get links
        if (
            typeof xmlData.sitemapindex !== 'undefined'
            && typeof xmlData.sitemapindex.sitemap !== 'undefined'
        ) {
            // Child sitemaps are fetched one after another, in listed order.
            for (const sitemap of this.parseSitemapIndex(xmlData)) {
                const parser = new SitemapParser(sitemap);
                const childLinks = await parser.getLinks();

                // Appended one by one: spreading a very large array into
                // push() can exceed the engine's argument limit.
                for (const link of childLinks) {
                    this.links.push(link);
                }
            }
        }

        // The urlset is what is wanted to get the links from, so if it
        // exists get them!
        if (
            typeof xmlData.urlset !== 'undefined'
            && typeof xmlData.urlset.url !== 'undefined'
        ) {
            this.parseURLSet(xmlData);
        }

        return this.links;
    }

    /**
     * Collect the `<loc>` text of one or many converted entries.
     *
     * @param {object|array} entries a single converted entry, an array of
     *                               them, or null/undefined
     * @return {array} non-empty location strings, in input order
     */
    static collectLocations(entries) {
        if (entries == null) {
            return [];
        }

        // xmlToJson only builds an array once an element name repeats, so a
        // lone entry arrives as a bare object.
        const list = Array.isArray(entries) ? entries : [entries];
        const locations = [];

        for (const entry of list) {
            const location = this.locationOf(entry);

            if (location !== '') {
                locations.push(location);
            }
        }

        return locations;
    }

    /**
     * Extract the trimmed text of an entry's `<loc>` element.
     *
     * @param {object} entry converted `<url>` or `<sitemap>` element
     * @return {string} the location, or '' when the entry has none
     */
    static locationOf(entry) {
        if (entry == null || typeof entry !== 'object') {
            return '';
        }

        // A repeated <loc> is converted to an array; use the first one.
        const loc = Array.isArray(entry.loc) ? entry.loc[0] : entry.loc;

        if (typeof loc === 'string') {
            return loc.trim();
        }

        if (loc == null) {
            return '';
        }

        // Text may sit in text nodes, CDATA sections, or several of either
        // (in which case xmlToJson stores an array under the key).
        const pieces = [];

        for (const key of ['#text', '#cdata-section']) {
            const value = loc[key];

            if (typeof value === 'string') {
                pieces.push(value);
            } else if (Array.isArray(value)) {
                for (const part of value) {
                    if (typeof part === 'string') {
                        pieces.push(part);
                    }
                }
            }
        }

        return pieces.join('').trim();
    }

    /**
     * Convert an xml node to a json object
     *
     * Element attributes are stored under "@attributes", child nodes under
     * their node name (repeated names become arrays), and text or CDATA
     * nodes are converted to their string value.
     *
     * @param {object} xml DOM node (document, element, text, ...)
     * @return {object|string} converted node; a string for text/CDATA nodes
     */
    static xmlToJson(xml) {
        let jsonData = {};

        if (xml.nodeType === 1) {
            // Element node: copy its attributes, if any.
            if (xml.attributes.length > 0) {
                jsonData['@attributes'] = {};

                for (let j = 0; j < xml.attributes.length; j++) {
                    const attribute = xml.attributes.item(j);
                    jsonData['@attributes'][attribute.nodeName] = attribute.nodeValue;
                }
            }
        } else if (xml.nodeType === 3 || xml.nodeType === 4) {
            // Text (3) and CDATA (4) nodes both carry their content in
            // nodeValue.
            jsonData = xml.nodeValue;
        }

        if (xml.hasChildNodes()) {
            for (let i = 0; i < xml.childNodes.length; i++) {
                const item = xml.childNodes.item(i);
                const nodeName = item.nodeName;
                const converted = this.xmlToJson(item);

                // Own-property check: an element named e.g. "toString" must
                // not be mistaken for an already-seen child.
                if (!Object.prototype.hasOwnProperty.call(jsonData, nodeName)) {
                    jsonData[nodeName] = converted;
                } else {
                    // Second occurrence of a name: promote the stored value
                    // to an array. Array.isArray is used because a converted
                    // child may itself own a "push" key.
                    if (!Array.isArray(jsonData[nodeName])) {
                        jsonData[nodeName] = [jsonData[nodeName]];
                    }

                    jsonData[nodeName].push(converted);
                }
            }
        }

        return jsonData;
    }
}