Last active
January 18, 2025 11:47
-
-
Save pjmagee/6b658fe580e8dacea5d81403c4d01347 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Net.Http; | |
using System.Text.Json; | |
using System.IO; | |
using System.Collections.Generic; | |
using System.Threading.Tasks; | |
// Endpoint of the MediaWiki API that is crawled.
var wikiApiUrl = "https://www.poe2wiki.net/w/api.php";

// Folder that receives one text file per wiki page.
var outputDirectory = @"D:\poe2dump\";

// CreateDirectory does nothing when the folder is already present,
// so no separate existence check is required.
Directory.CreateDirectory(outputDirectory);
// Walks the wiki's page list (500 titles per request, following `apcontinue`),
// fetches the rendered HTML of every non-redirect page through `action=parse`,
// converts it to plain text and writes it to
// `<outputDirectory>/pageid_<id>_<sanitized title>.txt`.
// Pages whose file already exists are skipped, so an interrupted run can be restarted.
// HTTP, JSON and file-system exceptions are not caught here and propagate to the caller.
async Task FetchAndSaveRenderedPagesAsync()
{
    string apContinue = null; // Pagination cursor; null/empty means "start from the first page".

    using var client = new HttpClient();

    do
    {
        // `apfilterredir=nonredirects` makes the server leave redirects out of the listing.
        // NOTE(review): the plain `allpages` list is documented to return only pageid/ns/title,
        // so the `redirect` check below appears never to trigger on its own - confirm against the wiki.
        var query = "?action=query&list=allpages&aplimit=500&apfilterredir=nonredirects&format=json";
        if (!string.IsNullOrEmpty(apContinue))
        {
            query += $"&apcontinue={Uri.EscapeDataString(apContinue)}";
        }

        var listResponse = await client.GetStringAsync(wikiApiUrl + query);

        // Copy everything needed out of the document, then dispose it right away:
        // JsonDocument is IDisposable and should not be left undisposed.
        var pages = new List<(string Title, int PageId, bool IsRedirect)>();
        using (var listJson = JsonDocument.Parse(listResponse))
        {
            var root = listJson.RootElement;

            foreach (var page in root.GetProperty("query").GetProperty("allpages").EnumerateArray())
            {
                var pageId = page.GetProperty("pageid").GetInt32();
                var title = page.GetProperty("title").GetString() ?? string.Empty;
                var isRedirect = page.TryGetProperty("redirect", out _);
                pages.Add((title, pageId, isRedirect));
            }

            // A missing `continue.apcontinue` marks the last batch.
            apContinue =
                root.TryGetProperty("continue", out var continueElement) &&
                continueElement.TryGetProperty("apcontinue", out var apContinueElement)
                    ? apContinueElement.GetString()
                    : null;
        }

        foreach (var (title, pageId, isRedirect) in pages)
        {
            // Fallback for responses that still flag a page as a redirect.
            if (isRedirect)
            {
                Console.WriteLine($"Skipping redirect page: {title}");
                continue;
            }

            // Replace every character that is illegal in a file name with '_'.
            string sanitizedTitle = string.Join("_", title.Split(Path.GetInvalidFileNameChars()));
            string fileName = $"pageid_{pageId}_{sanitizedTitle}.txt";
            string filePath = Path.Combine(outputDirectory, fileName);

            if (File.Exists(filePath))
            {
                Console.WriteLine($"Skipping: {fileName} (already exists)");
                continue;
            }

            // Fetch the rendered content of the page.
            var parseQuery = $"?action=parse&pageid={pageId}&format=json";
            var parseResponse = await client.GetStringAsync(wikiApiUrl + parseQuery);
            using var parseJson = JsonDocument.Parse(parseResponse);

            // The rendered HTML is read from parse.text["*"]; a response without
            // that path is silently skipped.
            if (!parseJson.RootElement.TryGetProperty("parse", out var parseElement) ||
                !parseElement.TryGetProperty("text", out var textElement) ||
                !textElement.TryGetProperty("*", out var htmlContent))
            {
                continue;
            }

            var htmlString = htmlContent.GetString();
            if (string.IsNullOrWhiteSpace(htmlString))
            {
                continue;
            }

            var plainText = HtmlToPlainText(htmlString);
            File.WriteAllText(filePath, plainText);
            Console.WriteLine($"Saved: {fileName}");
        }
    } while (!string.IsNullOrEmpty(apContinue));

    Console.WriteLine("All pages have been fetched and saved as plain text.");
}
// Converts an HTML fragment to plain text.
//   htmlContent: the HTML markup to convert; null or empty yields an empty string.
//   returns:     the document's inner text with HTML entities decoded, blank
//                (empty or whitespace-only) lines removed and trailing whitespace
//                stripped from each remaining line, joined with '\n'.
string HtmlToPlainText(string htmlContent)
{
    if (string.IsNullOrEmpty(htmlContent))
    {
        return string.Empty;
    }

    var htmlDoc = new HtmlAgilityPack.HtmlDocument();
    htmlDoc.LoadHtml(htmlContent);

    // InnerText keeps character references such as &amp; or &#160; encoded,
    // so decode them to get real plain text.
    var text = HtmlAgilityPack.HtmlEntity.DeEntitize(htmlDoc.DocumentNode.InnerText);

    var lines = new List<string>();
    foreach (var line in text.Split('\n'))
    {
        // Whitespace-only lines (including a lone '\r') carry no content.
        if (!string.IsNullOrWhiteSpace(line))
        {
            lines.Add(line.TrimEnd());
        }
    }

    return string.Join("\n", lines);
}
// Entry point: start the crawl and wait for it to complete before the program exits.
await FetchAndSaveRenderedPagesAsync();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment