Created
April 27, 2024 02:07
-
-
Save cheeseonamonkey/752b4f17ec0df6e80959d1526d1f9b21 to your computer and use it in GitHub Desktop.
Wikipedia dumps download script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
npm install axios cheerio fs-extra | |
*/ | |
// Import the necessary libraries | |
const axios = require('axios'); | |
const cheerio = require('cheerio'); | |
const fs = require('fs-extra'); | |
const path = require('path'); | |
// Base URL for Wikimedia dumps: the listing of English-Wikipedia dump runs.
// All page fetches and (after stripping 'enwiki/') all file downloads are
// built from this root.
const baseUrl = 'https://dumps.wikimedia.org/enwiki/';
// Function to download a file.
// Streams the resource at `url` into `outputLocationPath` on disk.
// Resolves when the file has been fully written; rejects on request,
// download-stream, or filesystem errors.
const downloadFile = async (url, outputLocationPath) => {
  const response = await axios({
    url,
    method: 'GET',
    responseType: 'stream', // Stream so large dump files are never buffered in memory
  });

  // Open the destination only AFTER the request succeeds. The original
  // opened it first, so a rejected request leaked the descriptor and left
  // an empty file behind.
  const writer = fs.createWriteStream(outputLocationPath);

  // Pipe the response data into the writer stream
  response.data.pipe(writer);

  // Return a promise that settles when the write completes or either
  // stream fails.
  return new Promise((resolve, reject) => {
    // Without this handler a mid-download network error never fires the
    // writer's 'finish' event and the promise would hang forever.
    response.data.on('error', (err) => {
      writer.destroy();
      reject(err);
    });
    writer.on('finish', resolve);
    writer.on('error', reject);
  });
};
// Fetch the dump listing page and return it as a cheerio document
// ready for link extraction.
const fetchMainPage = async () => {
  const { data } = await axios.get(baseUrl);
  return cheerio.load(data);
};
// Given a cheerio document handle `$`, return the href of the third
// anchor element on the page (index 2, zero-based).
const getThirdPageLink = ($) => {
  const anchors = $('a');
  const thirdAnchor = anchors.get(2); // zero-indexed: element #2 is the third link
  return $(thirdAnchor).attr('href');
};
// Fetch the page at `link` (relative to baseUrl) and return it as a
// cheerio document for further parsing.
const fetchThirdPage = async (link) => {
  const { data } = await axios.get(`${baseUrl}${link}`);
  return cheerio.load(data);
};
// Extract absolute URLs for every 'multistream' dump file on the page,
// excluding the companion '...index...' files.
const getMultistreamLinks = ($) => {
  // Hrefs on the dump page are server-relative, so prepend the host root
  // (baseUrl minus the 'enwiki/' path segment) to form full URLs.
  const origin = baseUrl.replace('enwiki/', '');
  return $('a')
    .map((_, a) => $(a).attr('href'))
    .get()
    // Guard against anchors with no href: .attr() yields undefined there,
    // and the original `link.includes(...)` would throw a TypeError.
    .filter((link) => link && link.includes('multistream') && !link.includes('index'))
    .map((link) => `${origin}${link}`);
};
// Orchestrate the pipeline: scrape the listing, resolve the dump page,
// collect the multistream links, and download each file to ./downloads.
const main = async () => {
  console.log(`Fetching main page: ${baseUrl}`);
  const listingDoc = await fetchMainPage();

  const dumpPageLink = getThirdPageLink(listingDoc);
  console.log(`Extracted index URL: ${dumpPageLink}`);

  const dumpDoc = await fetchThirdPage(dumpPageLink);
  const downloadLinks = getMultistreamLinks(dumpDoc);
  console.log(`Extracted ${downloadLinks.length} download links`);

  // Place downloads next to this script and make sure the folder exists.
  const downloadDir = path.resolve(__dirname, 'downloads');
  await fs.ensureDir(downloadDir);

  console.log('Downloading files...');
  // Files are fetched one at a time, in listing order.
  for (const url of downloadLinks) {
    const destination = path.join(downloadDir, path.basename(url));
    console.log(`Downloading ${url} to ${destination}`);
    await downloadFile(url, destination);
  }

  console.log('Download complete.');
};
// Execute the main function and report any error from the pipeline
// (failed request, parse miss, or filesystem problem) on stderr.
main().catch((error) => {
  console.error('Error:', error); // Output any errors that occur
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment