Extract text from a website - make sure you adjust the content filter (the '.content-body' selector in the saveResource handler) to match your site.
import scrape from 'website-scraper';
import TurndownService from 'turndown';
import { JSDOM } from 'jsdom';
import path from 'path';
import fs from 'fs-extra';

const turndownService = new TurndownService();

// website-scraper plugin that converts each downloaded page to a plain-text file.
class MyPlugin {
  apply(registerAction) {
    registerAction('error', async ({ error }) => { console.error(error); });
    registerAction('onResourceSaved', ({ resource }) => console.log(`Resource ${resource.url} saved!`));
    registerAction('onResourceError', ({ resource, error }) => console.log(`Resource ${resource.url} has error ${error}`));
    registerAction('saveResource', async ({ resource }) => {
      // options is the module-level config object defined below.
      const absoluteDirectoryPath = path.resolve(process.cwd(), options.directory);
      // Swap the 'html' extension for 'txt' (assumes the resource filename ends in .html).
      const filename = path.join(absoluteDirectoryPath, resource.getFilename()).slice(0, -4) + 'txt';
      const originalText = resource.getText();
      const dom = new JSDOM(originalText);
      // Adjust this selector to match the element that wraps the content you want to keep.
      const contentBody = dom.window.document.querySelector('.content-body');
      let text = '';
      if (contentBody) {
        // turndown() is fed the bare text here; pass contentBody.innerHTML instead to keep markup as Markdown.
        text = turndownService.turndown(contentBody.textContent);
      }
      console.log(`Saving resource ${filename} with content ${text.substring(0, 12)}...`);
      await fs.outputFile(filename, text, { encoding: resource.getEncoding() });
    });
  }
}

const options = {
  urls: ['http://localhost:3001/docs'],
  // Only follow links that stay under the docs root.
  urlFilter: function (url) {
    return url.indexOf('http://localhost:3001/docs') === 0;
  },
  directory: 'output' + new Date().getTime(),
  recursive: true,
  maxRecursiveDepth: 5,
  plugins: [new MyPlugin()],
  // filenameGenerator: 'bySiteStructure',
};

// with async/await (top-level await, so this file must run as an ES module)
const result = await scrape(options);

// Now condense the per-page .txt files into one file.
const files = await fs.readdir(options.directory);
const output = files
  .filter(file => file.endsWith('.txt'))
  .map(file => fs.readFileSync(path.join(options.directory, file), { encoding: 'utf8' }))
  .join('\n');
await fs.outputFile(path.join(options.directory, 'output.txt'), output);
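Because the script uses ESM imports and top-level await, it has to run as an ES module (Node 14.8 or later): give the file an .mjs extension or set "type": "module" in package.json. A minimal run sketch, assuming the script is saved as extract.mjs (a hypothetical name):

npm install website-scraper turndown jsdom fs-extra
node extract.mjs

The combined text ends up in output<timestamp>/output.txt, alongside one .txt file per scraped page.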