Created
August 14, 2018 19:54
-
-
Save lukepothier/9857e1f6342ccf61753eceef1f7f0661 to your computer and use it in GitHub Desktop.
Get valid URLs from sitemap (no spidering because I'm lazy)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Downloads the sitemap at <paramref name="sitemapUrl"/> and lazily yields every
/// URL-looking token found in its text as an absolute <see cref="Uri"/>.
/// </summary>
/// <param name="sitemapUrl">Location of the sitemap document to scan.</param>
/// <returns>
/// The URLs found in the sitemap text, in order of appearance. Tokens that cannot be
/// parsed as an absolute URI are skipped. Nothing is yielded when the downloaded
/// text is null, empty or whitespace.
/// </returns>
/// <remarks>
/// This is an iterator: the download and the scan only happen once the result is enumerated.
/// </remarks>
public IEnumerable<Uri> GetSitemapUrls(Uri sitemapUrl)
{
    var sitemapText = GetSitemapText(sitemapUrl);
    if (string.IsNullOrWhiteSpace(sitemapText))
    {
        yield break;
    }

    // RegexOptions.Compiled is intentionally not used: the regex is built on every call,
    // so the compilation cost would be paid each time without being amortized.
    var urlRegex = new Regex(@"\b(?:https?://|www\.)[^ \f\n\r\t\v\]]+\b", RegexOptions.IgnoreCase);

    foreach (Match m in urlRegex.Matches(sitemapText))
    {
        var candidate = CleanUriString(m.Value);

        // The pattern also accepts scheme-less "www." hosts. Those are not absolute URIs,
        // so give them a scheme before parsing instead of letting the Uri constructor throw.
        if (candidate.StartsWith("www.", StringComparison.OrdinalIgnoreCase))
        {
            candidate = "http://" + candidate;
        }

        // Skip anything that still does not parse rather than aborting the whole enumeration.
        Uri uri;
        if (Uri.TryCreate(candidate, UriKind.Absolute, out uri))
        {
            yield return uri;
        }
    }
}
/// <summary>
/// Downloads the resource at <paramref name="sitemapUri"/> and returns its content as a string.
/// </summary>
/// <param name="sitemapUri">Location of the sitemap document to download.</param>
/// <returns>The downloaded text, decoded as UTF-8.</returns>
string GetSitemapText(Uri sitemapUri)
{
    // WebClient is IDisposable; dispose it as soon as the download has completed.
    using (var wc = new WebClient { Encoding = System.Text.Encoding.UTF8 })
    {
        return wc.DownloadString(sitemapUri);
    }
}
/// <summary>
/// Truncates a URL candidate at the first character that is not in the set of
/// characters accepted as legal in a URI, dropping trailing residue such as markup.
/// </summary>
/// <param name="dirtyUriString">The raw candidate text to clean.</param>
/// <returns>
/// The part of <paramref name="dirtyUriString"/> that precedes the first illegal character,
/// or the input unchanged when it is null, empty, or contains no illegal character after
/// its first position.
/// </returns>
string CleanUriString(string dirtyUriString)
{
    const string legalCharacters = @"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;=";

    if (string.IsNullOrEmpty(dirtyUriString))
    {
        return dirtyUriString;
    }

    // Start at index 1: an illegal character in the very first position is left in place,
    // so the result is never reduced to an empty string.
    for (var i = 1; i < dirtyUriString.Length; i++)
    {
        if (legalCharacters.IndexOf(dirtyUriString[i]) < 0)
        {
            // Cut at the FIRST illegal character; everything after it is not part of the URI.
            return dirtyUriString.Substring(0, i);
        }
    }

    return dirtyUriString;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment