Skip to content

Instantly share code, notes, and snippets.

@baio
Created October 25, 2014 09:30
Show Gist options
  • Save baio/b0f86096f64bddc5ba63 to your computer and use it in GitHub Desktop.
Save baio/b0f86096f64bddc5ba63 to your computer and use it in GitHub Desktop.
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace h2x
{
/// <summary>
/// Console tool that converts an article HTML file into a simplified, XML-like
/// document: known HTML elements are renamed to custom tag names, h3 headers are
/// removed, and layout-only tags (br, p, nobr, div) are stripped from the output.
/// </summary>
class Program
{
    // Paths used when none are supplied on the command line.
    private const string DefaultInputPath = "../../WithQuote.html";
    private const string DefaultOutputPath = "../../converted.html";

    // Matches opening and closing br, p, nobr and div tags, including attributes.
    // The lookahead requires the tag name to end right there, so tags that merely
    // start with the same letters (<pre>, <param>, ...) are left alone, and
    // [^>]* (unlike .*?) also covers attributes that wrap onto another line.
    private static readonly Regex LayoutTagRegex =
        new Regex("</?(?:br|p|nobr|div)(?=[\\s/>])[^>]*>");

    /// <summary>
    /// Reads the input HTML, applies the tag conversions and writes the result
    /// wrapped in a <c>document</c> root element.
    /// </summary>
    /// <param name="args">
    /// Optional: <c>args[0]</c> is the input path and <c>args[1]</c> the output path.
    /// Missing values fall back to the default paths.
    /// </param>
    static void Main(string[] args)
    {
        string inputPath = args.Length > 0 ? args[0] : DefaultInputPath;
        string outputPath = args.Length > 1 ? args[1] : DefaultOutputPath;

        // Encoding.Unicode is UTF-16 little endian; this is the same encoding the
        // previous spelling "UTF8Encoding.Unicode" resolved to (inherited static).
        string str = System.IO.File.ReadAllText(inputPath, Encoding.Unicode);
        str = HtmlEntity.DeEntitize(str);

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(str);

        RenameNodes(doc, "//div[@class=\"Lead\"]", "lead");
        RenameNodes(doc, "//div[@class=\"Body\"]", "body");
        RemoveNodes(doc, "//h3");
        // The name is unchanged here; kept so the list mirrors every conversion step.
        RenameNodes(doc, "//figure", "figure");
        RenameNodes(doc, "//img", "image");
        RenameNodes(doc, "//div[@class=\"Figure-title\"]", "figureTitle");
        RenameNodes(doc, "//div[@class=\"Figure-credits\"]", "figureCredits");

        // Source quotes are restructured before the generic blockquote rename below.
        ConvertSourceQuotes(doc);
        RenameNodes(doc, "//blockquote", "quote");
        RenameNodes(doc, "//a", "link");

        // Remove unused layout tags from the serialized markup.
        string docStr = LayoutTagRegex.Replace(doc.DocumentNode.InnerHtml, string.Empty);
        docStr = "<document>" + docStr + "</document>";
        System.IO.File.WriteAllText(outputPath, docStr, Encoding.Unicode);
    }

    /// <summary>Renames every node matched by <paramref name="xpath"/> to <paramref name="newName"/>.</summary>
    private static void RenameNodes(HtmlDocument doc, string xpath, string newName)
    {
        // The result is null-checked because no match is reported as null here.
        var matches = doc.DocumentNode.SelectNodes(xpath);
        if (matches == null)
        {
            return;
        }

        foreach (HtmlNode node in matches)
        {
            node.Name = newName;
        }
    }

    /// <summary>Removes every node matched by <paramref name="xpath"/> from the document.</summary>
    private static void RemoveNodes(HtmlDocument doc, string xpath)
    {
        var matches = doc.DocumentNode.SelectNodes(xpath);
        if (matches == null)
        {
            return;
        }

        foreach (HtmlNode node in matches)
        {
            node.Remove();
        }
    }

    /// <summary>
    /// Rewrites each <c>blockquote[@class="SourceQuote"]</c> as a <c>quote</c> element
    /// containing a <c>quoteText</c> and/or <c>quoteSource</c> child built from the
    /// quote's own text paragraph and origin span.
    /// </summary>
    private static void ConvertSourceQuotes(HtmlDocument doc)
    {
        var quotes = doc.DocumentNode.SelectNodes("//blockquote[@class=\"SourceQuote\"]");
        if (quotes == null)
        {
            return;
        }

        foreach (HtmlNode quote in quotes)
        {
            // The leading "." keeps the search inside this blockquote; a bare "//"
            // is evaluated against the whole document and can pick up another
            // quote's text or origin.
            HtmlNode textNode = quote.SelectSingleNode(".//p[@class=\"SourceQuote-text\"]");
            HtmlNode sourceNode = quote.SelectSingleNode(".//span[@class=\"SourceQuote-origin\"]");

            if (textNode == null && sourceNode == null)
            {
                // Nothing recognizable inside: keep the content untouched and let the
                // generic blockquote rename handle this node.
                continue;
            }

            quote.RemoveAllChildren();
            quote.Name = "quote";

            if (textNode != null)
            {
                AppendElement(quote, "quoteText", textNode.InnerText);
            }

            if (sourceNode != null)
            {
                AppendElement(quote, "quoteSource", sourceNode.InnerText);
            }
        }
    }

    /// <summary>Appends a new element named <paramref name="name"/> with the given inner markup to <paramref name="parent"/>.</summary>
    private static void AppendElement(HtmlNode parent, string name, string content)
    {
        HtmlNode child = parent.OwnerDocument.CreateElement(name);
        child.InnerHtml = content;
        parent.AppendChild(child);
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment