Created
November 7, 2016 20:11
-
-
Save HristoKolev/a4fe84c6c04ee82a3baefaa9a2700e18 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace HtmlScrubber | |
{ | |
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Text; | |
using System.Text.RegularExpressions; | |
using Fizzler.Systems.HtmlAgilityPack; | |
using HtmlAgilityPack; | |
/// <summary>
/// Pairs the text of a heading with the anchor id generated for it.
/// </summary>
public class LinkObject
{
    /// <summary>Gets or sets the value used for the heading's <c>id</c> attribute.</summary>
    public string HeaderId { get; set; }

    /// <summary>Gets or sets the heading text the id was derived from.</summary>
    public string Title { get; set; }
}
// Requires the NuGet package Fizzler.Systems.HtmlAgilityPack.
// Requires the NuGet package HtmlAgilityPack, version 1.4.9.
/// <summary>
/// Console tool that loads an HTML document, assigns every <c>h1</c> element an
/// <c>id</c> derived from its text, prints each title/id pair and writes the
/// modified markup to a target file.
/// </summary>
internal class Program
{
    // Used when no paths are supplied on the command line (the original hard-coded locations).
    private const string DefaultSourcePath = @"C:\Users\hristo.kolev\Desktop\source.txt";
    private const string DefaultTargetPath = @"C:\Users\hristo.kolev\Desktop\target.txt";

    // Id used when a heading contains no characters that survive sanitising.
    private const string FallbackId = "header";

    // The letter ranges are spelled out separately: a single "A-z" range spans ASCII 65-122
    // and therefore also lets '[', '\', ']', '^' and '`' through. '_' is kept on purpose,
    // it is a word character and was accepted before.
    private static readonly Regex EverythingExceptWordCharactersRegex = new Regex("[^A-Za-z0-9_-]+");

    private static readonly Regex WhiteSpaceRegex = new Regex(@"\s+");

    /// <summary>
    /// Encodes the UTF-8 bytes of <paramref name="plainText"/> as a Base64 string.
    /// </summary>
    /// <param name="plainText">Text to encode.</param>
    /// <returns>The Base64 representation of the UTF-8 encoded text.</returns>
    public static string Base64Encode(string plainText)
    {
        var plainTextBytes = Encoding.UTF8.GetBytes(plainText);
        return Convert.ToBase64String(plainTextBytes);
    }

    /// <summary>
    /// Turns heading text into a lower-case, dash-separated identifier.
    /// </summary>
    /// <param name="header">Raw heading text.</param>
    /// <returns>
    /// The text with surrounding whitespace removed, inner whitespace runs replaced by
    /// a single '-', every character other than ASCII letters, digits, '_' and '-'
    /// dropped, and the result lower-cased. May be empty.
    /// </returns>
    private static string ConvertToValidId(string header)
    {
        // Trim first so leading/trailing whitespace does not become stray dashes.
        string value = WhiteSpaceRegex.Replace(header.Trim(), "-");
        value = EverythingExceptWordCharactersRegex.Replace(value, string.Empty);

        // Invariant casing keeps ids stable regardless of the machine's culture.
        return value.ToLowerInvariant();
    }

    /// <summary>
    /// Returns an id that has not been handed out yet and records it in <paramref name="usedIds"/>.
    /// </summary>
    /// <param name="baseId">Preferred id; an empty value is replaced by a fallback.</param>
    /// <param name="usedIds">Ids already assigned during this run.</param>
    /// <returns><paramref name="baseId"/> itself, or the same stem with "-2", "-3", ... appended.</returns>
    private static string MakeUnique(string baseId, ISet<string> usedIds)
    {
        string stem = baseId.Length == 0 ? FallbackId : baseId;
        string candidate = stem;

        // ISet.Add returns false when the value is already present.
        for (int suffix = 2; !usedIds.Add(candidate); suffix++)
        {
            candidate = stem + "-" + suffix;
        }

        return candidate;
    }

    /// <summary>
    /// Entry point. Optional arguments: <c>args[0]</c> is the source file and
    /// <c>args[1]</c> the target file; missing arguments fall back to the default paths.
    /// </summary>
    /// <param name="args">Command-line arguments.</param>
    private static void Main(string[] args)
    {
        string sourcePath = args != null && args.Length > 0 ? args[0] : DefaultSourcePath;
        string targetPath = args != null && args.Length > 1 ? args[1] : DefaultTargetPath;

        string content = File.ReadAllText(sourcePath);

        var document = new HtmlDocument();
        document.LoadHtml2(content);
        var root = document.DocumentNode;

        var links = new List<LinkObject>();
        var usedIds = new HashSet<string>(StringComparer.Ordinal);

        foreach (var header in root.QuerySelectorAll("h1"))
        {
            var link = new LinkObject
            {
                Title = header.InnerText,
                // Repeated or symbol-only headings would otherwise yield duplicate or empty ids.
                HeaderId = MakeUnique(ConvertToValidId(header.InnerText), usedIds)
            };

            links.Add(link);
            header.SetAttributeValue("id", link.HeaderId);
        }

        foreach (var link in links)
        {
            Console.WriteLine($"{link.Title}, {link.HeaderId}");
        }

        File.WriteAllText(targetPath, root.InnerHtml);
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment