Skip to content

Instantly share code, notes, and snippets.

@lukepothier
Created August 14, 2018 19:54
Show Gist options
  • Save lukepothier/9857e1f6342ccf61753eceef1f7f0661 to your computer and use it in GitHub Desktop.
Save lukepothier/9857e1f6342ccf61753eceef1f7f0661 to your computer and use it in GitHub Desktop.
Get valid URLs from sitemap (no spidering because I'm lazy)
/// <summary>
/// Downloads the sitemap at <paramref name="sitemapUrl"/> and yields every
/// URL found in its text that can be parsed as an absolute <see cref="Uri"/>.
/// </summary>
/// <param name="sitemapUrl">Address of the sitemap document to download.</param>
/// <returns>
/// A lazily evaluated sequence of absolute URIs; the download happens when
/// enumeration starts. The sequence is empty when the sitemap text is null,
/// empty, or whitespace.
/// </returns>
public IEnumerable<Uri> GetSitemapUrls(Uri sitemapUrl)
{
    var sitemapText = GetSitemapText(sitemapUrl);
    if (string.IsNullOrWhiteSpace(sitemapText))
    {
        yield break;
    }

    var urlRegex = new Regex(@"\b(?:https?://|www\.)[^ \f\n\r\t\v\]]+\b", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    foreach (Match m in urlRegex.Matches(sitemapText))
    {
        var candidate = CleanUriString(m.Value);

        // The pattern also accepts scheme-less "www." hosts, which Uri cannot
        // parse as absolute; give them an explicit scheme first.
        if (candidate.StartsWith("www.", StringComparison.OrdinalIgnoreCase))
        {
            candidate = "http://" + candidate;
        }

        // Skip anything that still is not a well-formed absolute URI instead of
        // letting a single bad match throw and abort the whole enumeration.
        Uri parsed;
        if (Uri.TryCreate(candidate, UriKind.Absolute, out parsed))
        {
            yield return parsed;
        }
    }
}
/// <summary>
/// Downloads the resource at <paramref name="sitemapUri"/> and returns its
/// content as a string, decoded as UTF-8.
/// </summary>
/// <param name="sitemapUri">Address of the resource to download.</param>
/// <returns>The downloaded text.</returns>
string GetSitemapText(Uri sitemapUri)
{
    // WebClient is IDisposable; the using statement releases it
    // even when the download throws.
    using (var wc = new WebClient { Encoding = System.Text.Encoding.UTF8 })
    {
        return wc.DownloadString(sitemapUri);
    }
}
/// <summary>
/// Truncates <paramref name="dirtyUriString"/> at the first character that is
/// not allowed in a URI (for example the '&lt;' of a trailing XML tag).
/// </summary>
/// <param name="dirtyUriString">Raw text that starts with a URL.</param>
/// <returns>
/// The leading part of the input up to, but not including, the first illegal
/// character; the input unchanged when it is null, empty, or has no illegal
/// character after its first position.
/// </returns>
string CleanUriString(string dirtyUriString)
{
    if (string.IsNullOrEmpty(dirtyUriString))
    {
        return dirtyUriString;
    }

    // RFC 3986 unreserved + reserved characters, plus '%' so that
    // percent-encoded sequences such as "%20" are not cut off.
    const string legalCharacters = @"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;=%";

    // Start at index 1: the first character is never used as a cut point, so
    // the result is never an empty string.
    for (var i = 1; i < dirtyUriString.Length; i++)
    {
        if (legalCharacters.IndexOf(dirtyUriString[i]) < 0)
        {
            // Stop at the FIRST illegal character so nothing illegal survives.
            return dirtyUriString.Substring(0, i);
        }
    }

    return dirtyUriString;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment