using System.Net.Http;
using System.Text.Json;
using System.IO;
using System.Collections.Generic;
using System.Threading.Tasks;
// Base URL of the MediaWiki API endpoint (for example "https://<wiki-host>/api.php").
// Every request below is built by appending a query string to this value, so it
// must be filled in before the script is run.
string wikiApiUrl = "";
// Directory that receives one .txt file per wiki page.
string outputDirectory = @"D:\poe2dump\";
// Ensure the output directory exists. CreateDirectory does nothing when the
// directory is already present, so no separate Directory.Exists check is needed.
Directory.CreateDirectory(outputDirectory);
// Walks the wiki's page list (action=query&list=allpages, up to 500 entries per
// request) and, for each page that is not flagged as a redirect and has no
// output file yet, fetches the rendered HTML (action=parse), converts it to
// plain text with HtmlToPlainText and writes it to
// "<outputDirectory>pageid_<id>_<sanitized title>.txt".
// Re-running the script resumes where it left off, because existing files are skipped.
// Network and JSON errors are not caught here; they propagate to the caller.
async Task FetchAndSaveRenderedPagesAsync()
{
    string apContinue = null; // Continuation token; null until the API hands one back

    using (HttpClient client = new HttpClient())
    {
        do
        {
            // Construct query parameters for listing all pages
            var query = "?action=query&list=allpages&aplimit=500&format=json";
            if (!string.IsNullOrEmpty(apContinue))
            {
                query += $"&apcontinue={Uri.EscapeDataString(apContinue)}";
            }

            // Fetch one batch of the page list
            var response = await client.GetStringAsync(wikiApiUrl + query);
            using (var json = JsonDocument.Parse(response))
            {
                // Copy out what we need so the loop below works on plain values
                var pages = new List<(string Title, int PageId, bool IsRedirect)>();
                foreach (var page in json.RootElement.GetProperty("query").GetProperty("allpages").EnumerateArray())
                {
                    var pageId = page.GetProperty("pageid").GetInt32();
                    var title = page.GetProperty("title").GetString() ?? string.Empty;
                    // NOTE(review): list=allpages may not emit a per-entry "redirect"
                    // marker; if redirects still get saved, consider adding
                    // apfilterredir=nonredirects to the query - confirm against the API docs.
                    var isRedirect = page.TryGetProperty("redirect", out _);
                    pages.Add((title, pageId, isRedirect));
                }

                // Save each page's rendered text to a file
                foreach (var (title, pageId, isRedirect) in pages)
                {
                    if (isRedirect)
                    {
                        Console.WriteLine($"Skipping redirect page: {title}");
                        continue;
                    }

                    // Titles may contain characters that are illegal in file names;
                    // each such character is replaced with an underscore.
                    string sanitizedTitle = string.Join("_", title.Split(Path.GetInvalidFileNameChars()));
                    string fileName = $"pageid_{pageId}_{sanitizedTitle}.txt";
                    string filePath = Path.Combine(outputDirectory, fileName);

                    // Skip pages already saved by an earlier run
                    if (File.Exists(filePath))
                    {
                        Console.WriteLine($"Skipping: {fileName} (already exists)");
                        continue;
                    }

                    // Fetch the rendered content of the page
                    var parseQuery = $"?action=parse&pageid={pageId}&format=json";
                    var parseResponse = await client.GetStringAsync(wikiApiUrl + parseQuery);
                    using var parseJson = JsonDocument.Parse(parseResponse);

                    // The rendered HTML is expected at parse.text["*"]
                    if (parseJson.RootElement.TryGetProperty("parse", out var parseElement) &&
                        parseElement.TryGetProperty("text", out var textElement) &&
                        textElement.TryGetProperty("*", out var htmlContent))
                    {
                        var htmlString = htmlContent.GetString();
                        var plainText = HtmlToPlainText(htmlString);
                        File.WriteAllText(filePath, plainText);
                        Console.WriteLine($"Saved: {fileName}");
                    }
                    else
                    {
                        // Make the gap visible instead of silently producing no file
                        Console.WriteLine($"No rendered text returned for page id {pageId} ({title})");
                    }
                }

                // Find the next `apcontinue` for pagination; its absence ends the loop
                if (json.RootElement.TryGetProperty("continue", out var continueElement) &&
                    continueElement.TryGetProperty("apcontinue", out var apContinueElement))
                {
                    apContinue = apContinueElement.GetString();
                }
                else
                {
                    apContinue = null;
                }
            }
        } while (!string.IsNullOrEmpty(apContinue));
    }

    Console.WriteLine("All pages have been fetched and saved as plain text.");
}
// Converts an HTML fragment to plain text.
//   htmlContent: the HTML markup to convert; null or empty yields an empty string.
//   returns:     the document's inner text with HTML entities decoded and
//                empty lines removed, lines joined with "\n".
string HtmlToPlainText(string htmlContent)
{
    if (string.IsNullOrEmpty(htmlContent))
    {
        return string.Empty;
    }

    var htmlDoc = new HtmlAgilityPack.HtmlDocument();
    // The document must be loaded with the markup first; a freshly constructed
    // HtmlDocument has no content, so its InnerText would always be empty.
    htmlDoc.LoadHtml(htmlContent);

    // InnerText keeps entities such as "&amp;" encoded; decode them for plain text.
    var text = HtmlAgilityPack.HtmlEntity.DeEntitize(htmlDoc.DocumentNode.InnerText);

    // Drop empty lines left behind by the removed markup
    return string.Join("\n", text.Split('\n', StringSplitOptions.RemoveEmptyEntries));
}
// Script entry point: start fetching and saving the pages and wait for completion
await FetchAndSaveRenderedPagesAsync();
