Created
April 27, 2024 02:07
-
-
Save cheeseonamonkey/752b4f17ec0df6e80959d1526d1f9b21 to your computer and use it in GitHub Desktop.
Wikipedia dumps download script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
npm install axios cheerio fs-extra | |
*/ | |
// Import the necessary libraries | |
const axios = require('axios'); | |
const cheerio = require('cheerio'); | |
const fs = require('fs-extra'); | |
const path = require('path'); | |
// Base URL for Wikimedia dumps: the listing of English-Wikipedia dump runs.
// All page fetches and (after stripping 'enwiki/') all file downloads are
// built from this root.
const baseUrl = 'https://dumps.wikimedia.org/enwiki/';
// Function to download a file.
// Streams the resource at `url` into `outputLocationPath` on disk.
// Resolves when the file has been fully written; rejects on request,
// download-stream, or filesystem errors.
const downloadFile = async (url, outputLocationPath) => {
  const response = await axios({
    url,
    method: 'GET',
    responseType: 'stream', // Stream so large dump files are never buffered in memory
  });

  // Open the destination only AFTER the request succeeds. The original
  // opened it first, so a rejected request leaked the descriptor and left
  // an empty file behind.
  const writer = fs.createWriteStream(outputLocationPath);

  // Pipe the response data into the writer stream
  response.data.pipe(writer);

  // Return a promise that settles when the write completes or either
  // stream fails.
  return new Promise((resolve, reject) => {
    // Without this handler a mid-download network error never fires the
    // writer's 'finish' event and the promise would hang forever.
    response.data.on('error', (err) => {
      writer.destroy();
      reject(err);
    });
    writer.on('finish', resolve);
    writer.on('error', reject);
  });
};
// Fetch the dump listing page and return it as a cheerio document
// ready for link extraction.
const fetchMainPage = async () => {
  const { data } = await axios.get(baseUrl);
  return cheerio.load(data);
};
// Given a cheerio document handle `$`, return the href of the third
// anchor element on the page (index 2, zero-based).
const getThirdPageLink = ($) => {
  const anchors = $('a');
  const thirdAnchor = anchors.get(2); // zero-indexed: element #2 is the third link
  return $(thirdAnchor).attr('href');
};
// Fetch the page at `link` (relative to baseUrl) and return it as a
// cheerio document for further parsing.
const fetchThirdPage = async (link) => {
  const { data } = await axios.get(`${baseUrl}${link}`);
  return cheerio.load(data);
};
// Extract absolute URLs for every 'multistream' dump file on the page,
// excluding the companion '...index...' files.
const getMultistreamLinks = ($) => {
  // Hrefs on the dump page are server-relative, so prepend the host root
  // (baseUrl minus the 'enwiki/' path segment) to form full URLs.
  const origin = baseUrl.replace('enwiki/', '');
  return $('a')
    .map((_, a) => $(a).attr('href'))
    .get()
    // Guard against anchors with no href: .attr() yields undefined there,
    // and the original `link.includes(...)` would throw a TypeError.
    .filter((link) => link && link.includes('multistream') && !link.includes('index'))
    .map((link) => `${origin}${link}`);
};
// Orchestrate the pipeline: scrape the listing, resolve the dump page,
// collect the multistream links, and download each file to ./downloads.
const main = async () => {
  console.log(`Fetching main page: ${baseUrl}`);
  const listingDoc = await fetchMainPage();

  const dumpPageLink = getThirdPageLink(listingDoc);
  console.log(`Extracted index URL: ${dumpPageLink}`);

  const dumpDoc = await fetchThirdPage(dumpPageLink);
  const downloadLinks = getMultistreamLinks(dumpDoc);
  console.log(`Extracted ${downloadLinks.length} download links`);

  // Place downloads next to this script and make sure the folder exists.
  const downloadDir = path.resolve(__dirname, 'downloads');
  await fs.ensureDir(downloadDir);

  console.log('Downloading files...');
  // Files are fetched one at a time, in listing order.
  for (const url of downloadLinks) {
    const destination = path.join(downloadDir, path.basename(url));
    console.log(`Downloading ${url} to ${destination}`);
    await downloadFile(url, destination);
  }

  console.log('Download complete.');
};
// Execute the main function and report any error from the pipeline
// (failed request, parse miss, or filesystem problem) on stderr.
main().catch((error) => {
  console.error('Error:', error); // Output any errors that occur
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment