scrape-text-from-sitemap
"use strict"; | |
const https = require('https'); | |
const Sitemapper = require('sitemapper'); | |
const jsdom = require('jsdom'); | |
const fs = require('fs'); | |
const { JSDOM } = jsdom; | |
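
// CLI arguments: argv[2] is the sitemap URL (required), argv[3] is a CSS
// selector for the container element to scrape (defaults to 'body'), and
// argv[4], when set to anything truthy, collapses runs of blank lines in
// the output.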
if (!process.argv[2]) {
  console.error('No path to sitemap given');
  process.exit(1); // bail out instead of continuing with an undefined URL
}

const sitemapUrl = process.argv[2];
const elementSelector = process.argv[3] || 'body';
const removeMultipleNewlines = process.argv[4] || false;
// Fetch the raw HTML of a page over HTTPS.
const get = async (url) => {
  console.log('Fetching html from webpage (' + url + ')');
  return new Promise((resolve, reject) => {
    https.get(url, res => {
      res.setEncoding('utf8');
      let response = '';
      res.on('data', data => { response += data; });
      res.on('end', () => { resolve(response); });
    }).on('error', error => reject(error));
  });
};
// Resolve a sitemap URL to the list of page URLs it contains.
const getUrlsFromSitemap = async (sitemapUrl) => {
  const sitemap = new Sitemapper();
  return await sitemap.fetch(sitemapUrl);
};
const scrapeUrl = async (url) => {
  console.log('Scraping webpage');
  const html = await get(url);
  return getTextFromHtml(html);
};
// Recursively extract readable text from a DOM node, emitting light
// Markdown for headings and list items.
const getTextFromHtmlNode = (htmlNode) => {
  // Skip style/script contents and comment nodes (nodeType 8). jsdom
  // uppercases tagName for HTML elements, so only the uppercase checks
  // are needed.
  if (
    htmlNode.tagName === 'STYLE' ||
    htmlNode.tagName === 'SCRIPT' ||
    htmlNode.nodeType === 8
  ) {
    return '';
  }
  if (htmlNode.tagName === 'BR') {
    return "\n";
  }
  // Text node (nodeType 3): return its trimmed contents.
  if (htmlNode.nodeType === 3) {
    return htmlNode.textContent.trim();
  }
  let output = '';
  for (let i = 0; i < htmlNode.childNodes.length; i++) {
    output += getTextFromHtmlNode(htmlNode.childNodes[i]);
  }
  if (output) {
    const markdownMap = {
      'H1': '# ',
      'H2': '## ',
      'H3': '### ',
      'H4': '#### ',
      'H5': '##### ',
      'H6': '###### ',
      'LI': '* ',
      'OPTION': '* ',
    };
    if (markdownMap[htmlNode.tagName]) {
      return "\n" + markdownMap[htmlNode.tagName] + output.trim();
    }
    return "\n" + output.trim();
  }
  return '';
};
// Parse the HTML with jsdom and walk the selected container's children.
const getTextFromHtml = (html) => {
  console.log('Getting text from webpage');
  const dom = new JSDOM(html);
  const containerEl = dom.window.document.querySelector(elementSelector);
  if (!containerEl) {
    // Selector not found on this page; return nothing instead of crashing.
    return '';
  }
  let output = '';
  for (let i = 0; i < containerEl.childNodes.length; i++) {
    output += getTextFromHtmlNode(containerEl.childNodes[i]);
  }
  return output;
};
// Collapse runs of blank lines into a single newline.
const removeNewlines = (string) => {
  return string.replace(/[\r\n]{2,}/g, "\n");
};
// Scrape every URL in sequence, prefixing each page's text with its URL.
const scrapeUrls = async (urls) => {
  let output = '';
  for (let i = 0; i < urls.length; i++) {
    output += "\n";
    output += '# ' + urls[i];
    output += "\n";
    output += await scrapeUrl(urls[i]);
  }
  return output;
};
const writeFile = (filename, content) => {
  console.log('Writing all text to ' + filename);
  fs.writeFile(filename, content, (err) => {
    if (err) {
      return console.error(err);
    }
    console.log('The file was saved!');
  });
};
// IIFE: fetch the sitemap, scrape every listed page, write the result.
(async function () {
  const urls = await getUrlsFromSitemap(sitemapUrl);
  const content = await scrapeUrls(urls.sites);
  writeFile('output.md', (removeMultipleNewlines ? removeNewlines(content) : content));
})();
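
Usage sketch (the script filename here is assumed from the gist title; any sitemap URL, selector, and truthy third argument will do):

node scrape-text-from-sitemap.js https://example.com/sitemap.xml '#content' 1

This scrapes the #content element of every page listed in the sitemap, collapses repeated blank lines, and writes the combined text to output.md.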