// Source: GitHub Gist pjmagee/6b658fe580e8dacea5d81403c4d01347 by @pjmagee
// (last active January 18, 2025 11:47).
using System.Net.Http;
using System.Text.Json;
using System.IO;
using System.Collections.Generic;
using System.Threading.Tasks;
// Endpoint of the wiki's MediaWiki API that every request is sent to
string wikiApiUrl = "https://www.poe2wiki.net/w/api.php";

// Folder that receives one text file per dumped page
string outputDirectory = @"D:\poe2dump\";

// Make sure the folder is there; CreateDirectory leaves an existing directory untouched
Directory.CreateDirectory(outputDirectory);
// Fetches every non-redirect page of the wiki and saves its rendered text.
//
// Pages are listed in batches of up to 500 through the MediaWiki `allpages` list and
// followed with the `apcontinue` token until the API stops returning one. Each page is
// rendered with `action=parse`, converted with HtmlToPlainText and written to
// "pageid_<id>_<sanitized title>.txt" inside outputDirectory. Files that already exist
// are skipped, so an interrupted run can simply be started again.
//
// A failure while listing pages still propagates to the caller; a failure while
// fetching a single page is logged and that page is skipped.
async Task FetchAndSaveRenderedPagesAsync()
{
    string apContinue = null; // Pagination token; empty until the API hands one back

    using var client = new HttpClient();

    do
    {
        // Entries returned by `allpages` carry only pageid/ns/title, so redirects are
        // excluded on the server with apfilterredir instead of being detected afterwards.
        var query = "?action=query&list=allpages&aplimit=500&apfilterredir=nonredirects&format=json";
        if (!string.IsNullOrEmpty(apContinue))
        {
            query += $"&apcontinue={Uri.EscapeDataString(apContinue)}";
        }

        // Fetch the current batch of pages
        var response = await client.GetStringAsync(wikiApiUrl + query);

        // JsonDocument uses pooled memory; dispose it at the end of every batch.
        using var json = JsonDocument.Parse(response);

        foreach (var page in json.RootElement.GetProperty("query").GetProperty("allpages").EnumerateArray())
        {
            var pageId = page.GetProperty("pageid").GetInt32();
            var title = page.GetProperty("title").GetString() ?? string.Empty;

            // Defensive fallback in case the server still flags an entry as a redirect
            if (page.TryGetProperty("redirect", out _))
            {
                Console.WriteLine($"Skipping redirect page: {title}");
                continue;
            }

            await SavePageAsync(client, title, pageId);
        }

        // Find the next `apcontinue` for pagination; null ends the loop
        apContinue = json.RootElement.TryGetProperty("continue", out var continueElement) &&
                     continueElement.TryGetProperty("apcontinue", out var apContinueElement)
            ? apContinueElement.GetString()
            : null;
    } while (!string.IsNullOrEmpty(apContinue));

    Console.WriteLine("All pages have been fetched and saved as plain text.");

    // Renders a single page and writes its plain text to disk unless the file already exists.
    async Task SavePageAsync(HttpClient http, string title, int pageId)
    {
        // Replace characters that are not allowed in file names with underscores
        string sanitizedTitle = string.Join("_", title.Split(Path.GetInvalidFileNameChars()));
        string fileName = $"pageid_{pageId}_{sanitizedTitle}.txt";
        string filePath = Path.Combine(outputDirectory, fileName);

        if (File.Exists(filePath))
        {
            Console.WriteLine($"Skipping: {fileName} (already exists)");
            return;
        }

        string htmlString;
        try
        {
            // prop=text limits the response to the rendered HTML, the only part used here
            var parseQuery = $"?action=parse&pageid={pageId}&prop=text&format=json";
            var parseResponse = await http.GetStringAsync(wikiApiUrl + parseQuery);

            using var parseJson = JsonDocument.Parse(parseResponse);

            // The rendered HTML lives at parse.text["*"]; error responses have no "parse" member
            htmlString = parseJson.RootElement.TryGetProperty("parse", out var parseElement) &&
                         parseElement.TryGetProperty("text", out var textElement) &&
                         textElement.TryGetProperty("*", out var htmlContent)
                ? htmlContent.GetString()
                : null;
        }
        catch (Exception ex) when (ex is HttpRequestException or JsonException)
        {
            // One bad page must not abort the whole dump; a later run retries it
            // because no file has been written for it.
            Console.WriteLine($"Failed: {fileName} ({ex.Message})");
            return;
        }

        if (string.IsNullOrWhiteSpace(htmlString))
        {
            Console.WriteLine($"No rendered text: {fileName}");
            return;
        }

        await File.WriteAllTextAsync(filePath, HtmlToPlainText(htmlString));
        Console.WriteLine($"Saved: {fileName}");
    }
}
// Helper function that turns rendered HTML into plain text.
//
// htmlContent: the HTML markup to convert.
// Returns the document's text with tags removed, HTML entities decoded and
// empty lines dropped; the remaining lines are joined with '\n'.
string HtmlToPlainText(string htmlContent)
{
    var htmlDoc = new HtmlAgilityPack.HtmlDocument();
    htmlDoc.LoadHtml(htmlContent);

    // InnerText leaves entities such as &amp; or &#160; encoded, so decode them
    // explicitly. Decoding happens before splitting so that newlines produced by
    // decoded entities are treated like any other line break.
    var text = System.Net.WebUtility.HtmlDecode(htmlDoc.DocumentNode.InnerText);

    return string.Join("\n", text.Split('\n', StringSplitOptions.RemoveEmptyEntries));
}
// Script entry point: start the dump and await its completion
await FetchAndSaveRenderedPagesAsync();
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment