Created
October 25, 2014 09:30
-
-
Save baio/b0f86096f64bddc5ba63 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using HtmlAgilityPack; | |
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Text; | |
using System.Text.RegularExpressions; | |
using System.Threading.Tasks; | |
namespace h2x | |
{ | |
/// <summary>
/// Console tool that reads an HTML article, renames or strips selected elements so the
/// markup uses custom tag names (lead, body, image, quote, link, ...), removes layout-only
/// tags, and writes the result wrapped in a &lt;document&gt; root element.
/// </summary>
class Program
{
    // Default file locations (relative to the working directory), used when no
    // command-line paths are supplied.
    private const string DefaultInputPath = "../../WithQuote.html";
    private const string DefaultOutputPath = "../../converted.html";

    // Matches <br>, and opening/closing <p>, <nobr>, <div> tags including their attributes.
    // The lookahead requires the tag name to end right there, so "<p" no longer also
    // matches longer names such as <pre> or <param>; [^>]* (unlike .*?) also covers
    // attributes that wrap onto another line.
    private static readonly Regex UnusedTagPattern =
        new Regex(@"<(?:br|/?(?:p|nobr|div))(?=[\s/>])[^>]*>");

    /// <summary>
    /// Converts the input HTML file and writes the converted markup to the output file.
    /// </summary>
    /// <param name="args">
    /// Optional: args[0] is the input path, args[1] is the output path. Missing values
    /// fall back to <see cref="DefaultInputPath"/> and <see cref="DefaultOutputPath"/>.
    /// </param>
    static void Main(string[] args)
    {
        string inputPath = (args != null && args.Length > 0) ? args[0] : DefaultInputPath;
        string outputPath = (args != null && args.Length > 1) ? args[1] : DefaultOutputPath;

        // Encoding.Unicode is UTF-16 little-endian. (It was previously spelled
        // UTF8Encoding.Unicode, which resolves to this same inherited static property.)
        string str = System.IO.File.ReadAllText(inputPath, Encoding.Unicode);

        // NOTE(review): entities are decoded before parsing, so an escaped "&lt;" in the
        // input reaches the parser as a literal "<" - confirm this is intended.
        str = HtmlEntity.DeEntitize(str);

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(str);

        RenameNodes(doc, "//div[@class=\"Lead\"]", "lead");
        RenameNodes(doc, "//div[@class=\"Body\"]", "body");
        RemoveNodes(doc, "//h3");
        RenameNodes(doc, "//figure", "figure");
        RenameNodes(doc, "//img", "image");
        RenameNodes(doc, "//div[@class=\"Figure-title\"]", "figureTitle");
        RenameNodes(doc, "//div[@class=\"Figure-credits\"]", "figureCredits");

        // Source quotes must be rebuilt before the generic blockquote rename below,
        // because that rename would make them unreachable via the "blockquote" XPath.
        ConvertSourceQuotes(doc);
        RenameNodes(doc, "//blockquote", "quote");
        RenameNodes(doc, "//a", "link");

        // Strip the remaining layout-only tags, keeping their inner content.
        string docStr = UnusedTagPattern.Replace(doc.DocumentNode.InnerHtml, string.Empty);
        docStr = "<document>" + docStr + "</document>";
        System.IO.File.WriteAllText(outputPath, docStr, Encoding.Unicode);
    }

    /// <summary>Renames every node matched by <paramref name="xpath"/> to <paramref name="newName"/>.</summary>
    private static void RenameNodes(HtmlDocument doc, string xpath, string newName)
    {
        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection matches = doc.DocumentNode.SelectNodes(xpath);
        if (matches == null)
        {
            return;
        }

        foreach (HtmlNode node in matches)
        {
            node.Name = newName;
        }
    }

    /// <summary>Removes every node matched by <paramref name="xpath"/> from the document.</summary>
    private static void RemoveNodes(HtmlDocument doc, string xpath)
    {
        HtmlNodeCollection matches = doc.DocumentNode.SelectNodes(xpath);
        if (matches == null)
        {
            return;
        }

        foreach (HtmlNode node in matches)
        {
            node.Remove();
        }
    }

    /// <summary>
    /// Rewrites each blockquote with class "SourceQuote" into a quote element that
    /// contains only a quoteText child and a quoteSource child, built from the plain
    /// text of the quote's own text paragraph and origin span.
    /// </summary>
    private static void ConvertSourceQuotes(HtmlDocument doc)
    {
        HtmlNodeCollection quotes = doc.DocumentNode.SelectNodes("//blockquote[@class=\"SourceQuote\"]");
        if (quotes == null)
        {
            return;
        }

        foreach (HtmlNode quote in quotes)
        {
            // ".//" keeps the search inside this blockquote; a leading "//" would be
            // evaluated from the document root and could pick up another quote's nodes.
            HtmlNode textNode = quote.SelectSingleNode(".//p[@class=\"SourceQuote-text\"]");
            HtmlNode sourceNode = quote.SelectSingleNode(".//span[@class=\"SourceQuote-origin\"]");

            // Capture the text before the children are detached.
            string text = textNode != null ? textNode.InnerText : null;
            string source = sourceNode != null ? sourceNode.InnerText : null;

            quote.RemoveAllChildren();
            quote.Name = "quote";

            // A quote missing its text or origin simply gets no corresponding child
            // instead of aborting the whole conversion.
            if (text != null)
            {
                AppendElement(quote, "quoteText", text);
            }

            if (source != null)
            {
                AppendElement(quote, "quoteSource", source);
            }
        }
    }

    /// <summary>
    /// Creates a new element named <paramref name="name"/> whose inner HTML is
    /// <paramref name="content"/> and appends it to <paramref name="parent"/>.
    /// </summary>
    private static void AppendElement(HtmlNode parent, string name, string content)
    {
        // CreateElement is the document's public factory for new nodes; constructing
        // HtmlNode directly with a fixed index is avoided.
        HtmlNode child = parent.OwnerDocument.CreateElement(name);
        child.InnerHtml = content;
        parent.AppendChild(child);
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment