Extract text from a website - make sure you adjust the content filter (the '.content-body' selector in the saveResource handler) to match your site.
import scrape from 'website-scraper';
import TurndownService from 'turndown';
import { JSDOM } from 'jsdom';
import path from 'path';
import fs from 'fs-extra';

const turndownService = new TurndownService();

// website-scraper plugin that converts each downloaded page to a plain-text file.
class MyPlugin {
  apply(registerAction) {
    registerAction('error', async ({ error }) => { console.error(error); });
    registerAction('onResourceSaved', ({ resource }) => console.log(`Resource ${resource.url} saved!`));
    registerAction('onResourceError', ({ resource, error }) => console.log(`Resource ${resource.url} has error ${error}`));
    registerAction('saveResource', async ({ resource }) => {
      // options is the module-level config object defined below.
      const absoluteDirectoryPath = path.resolve(process.cwd(), options.directory);
      // Swap the 'html' extension for 'txt' (assumes the resource filename ends in .html).
      const filename = path.join(absoluteDirectoryPath, resource.getFilename()).slice(0, -4) + 'txt';
      const originalText = resource.getText();
      const dom = new JSDOM(originalText);
      // Adjust this selector to match the element that wraps the content you want to keep.
      const contentBody = dom.window.document.querySelector('.content-body');
      let text = '';
      if (contentBody) {
        // turndown() is fed the bare text here; pass contentBody.innerHTML instead to keep markup as Markdown.
        text = turndownService.turndown(contentBody.textContent);
      }
      console.log(`Saving resource ${filename} with content ${text.substring(0, 12)}...`);
      await fs.outputFile(filename, text, { encoding: resource.getEncoding() });
    });
  }
}

const options = {
  urls: ['http://localhost:3001/docs'],
  // Only follow links that stay under the docs root.
  urlFilter: function (url) {
    return url.indexOf('http://localhost:3001/docs') === 0;
  },
  directory: 'output' + new Date().getTime(),
  recursive: true,
  maxRecursiveDepth: 5,
  plugins: [new MyPlugin()],
  // filenameGenerator: 'bySiteStructure',
};

// with async/await (top-level await, so this file must run as an ES module)
const result = await scrape(options);

// Now condense the per-page .txt files into one file.
const files = await fs.readdir(options.directory);
const output = files
  .filter(file => file.endsWith('.txt'))
  .map(file => fs.readFileSync(path.join(options.directory, file), { encoding: 'utf8' }))
  .join('\n');
await fs.outputFile(path.join(options.directory, 'output.txt'), output);
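Because the script uses ESM imports and top-level await, it has to run as an ES module (Node 14.8 or later): give the file an .mjs extension or set "type": "module" in package.json. A minimal run sketch, assuming the script is saved as extract.mjs (a hypothetical name):

npm install website-scraper turndown jsdom fs-extra
node extract.mjs

The combined text ends up in output<timestamp>/output.txt, alongside one .txt file per scraped page.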