Created
January 30, 2019 00:14
-
-
Save russcam/6ec15f171d299898903ef9da78791f12 to your computer and use it in GitHub Desktop.
Checks for broken links in HTML pages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<Query Kind="Program">
<Reference><RuntimeDirectory>\System.Net.Http.dll</Reference>
<NuGetReference>HtmlAgilityPack</NuGetReference>
<Namespace>HtmlAgilityPack</Namespace>
<Namespace>System.Net.Http</Namespace>
<Namespace>System.Threading.Tasks</Namespace>
<Namespace>System.Collections.Concurrent</Namespace>
</Query>
/// <summary>
/// Scans every *.html file beneath the input directory (recursively) and dumps,
/// per file, the set of absolute links that look broken.
/// </summary>
/// <remarks>
/// A link is reported when a HEAD request to it answers 404 Not Found, or when the
/// request cannot be made or completed at all (malformed URI, DNS failure, timeout, ...).
/// Only hrefs starting with "http" are checked; relative links and other schemes are skipped,
/// as is anything listed in <c>excludeLinks</c>.
/// </remarks>
void Main()
{
    var directory = "INPUT DIRECTORY HERE";
    var excludeLinks = new HashSet<string>();
    var badLinks = new ConcurrentDictionary<string, HashSet<string>>();
    var files = Directory.EnumerateFiles(directory, "*.html", SearchOption.AllDirectories).ToList();

    // One shared client for all requests; it is disposed once every file has been processed.
    using (var client = new HttpClient())
    {
        Parallel.ForEach(files, html =>
        {
            var document = new HtmlDocument();
            document.Load(html);

            // SelectNodes returns null (not an empty collection) when nothing matches,
            // so a page without any anchors must be skipped explicitly.
            var anchors = document.DocumentNode.SelectNodes("//a");
            if (anchors == null)
            {
                return;
            }

            // Both sets are local to this file, so no cross-thread synchronization is needed.
            var checkedLinks = new HashSet<string>();
            var brokenLinks = new HashSet<string>();

            foreach (var anchor in anchors)
            {
                var href = anchor.GetAttributeValue("href", string.Empty);
                if (string.IsNullOrEmpty(href)
                    || !href.StartsWith("http", StringComparison.Ordinal)
                    || excludeLinks.Contains(href)
                    || !checkedLinks.Add(href)) // already requested for this file
                {
                    continue;
                }

                try
                {
                    using (var request = new HttpRequestMessage(HttpMethod.Head, new Uri(href)))
                    using (var response = client.SendAsync(request).Result)
                    {
                        if (response.StatusCode == System.Net.HttpStatusCode.NotFound)
                        {
                            brokenLinks.Add(href);
                        }
                    }
                }
                catch (Exception)
                {
                    // Deliberate best-effort: any failure to build or send the request
                    // is treated as a broken link rather than aborting the scan.
                    brokenLinks.Add(href);
                }
            }

            // Publish the result only for files that actually contain broken links.
            if (brokenLinks.Count > 0)
            {
                badLinks.TryAdd(html, brokenLinks);
            }
        });
    }

    badLinks.Dump();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment