Created
January 18, 2014 20:15
-
-
Save Deathspike/8495642 to your computer and use it in GitHub Desktop.
Super simple web crawler example (C#) for random people on Stack Overflow (needs error checking; that is your job).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| using HtmlAgilityPack; | |
| using System; | |
| using System.Collections.Generic; | |
| using System.Net; | |
| using System.Threading.Tasks; | |
| namespace Crawl { | |
/// <summary>
/// Minimal recursive web crawler: downloads a page, extracts every
/// <c>&lt;a href&gt;</c> target and follows those that live under the root URI.
/// </summary>
class Crawlie {
    // Absolute URIs (fragment removed) of pages that have already been requested.
    private readonly HashSet<string> _crawled = new HashSet<string>();
    // Distinct link targets in the order they were first discovered.
    private readonly List<string> _matches = new List<string>();
    // Mirrors _matches so duplicate detection is O(1) instead of a list scan.
    private readonly HashSet<string> _seen = new HashSet<string>();
    private readonly Uri _root;

    /// <summary>Creates a crawler restricted to pages under <paramref name="root"/>.</summary>
    /// <param name="root">Absolute URI that relative links are resolved against and that bounds the crawl.</param>
    /// <exception cref="ArgumentNullException"><paramref name="root"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="root"/> is not absolute.</exception>
    public Crawlie(Uri root) {
        if (root == null) {
            throw new ArgumentNullException("root");
        }
        if (!root.IsAbsoluteUri) {
            throw new ArgumentException("The root must be an absolute URI.", "root");
        }
        _root = root;
    }

    /// <summary>Every distinct link target found so far (on-site and off-site), in discovery order.</summary>
    public IReadOnlyList<string> Matches {
        get { return _matches; }
    }

    /// <summary>
    /// Resolves <paramref name="url"/> (relative or absolute) against the root and strips any fragment.
    /// </summary>
    /// <returns>The absolute URI, or null when <paramref name="url"/> cannot be parsed.</returns>
    private Uri _Create(string url) {
        Uri resolved;
        if (!Uri.TryCreate(_root, url, out resolved)) {
            return null;
        }
        if (resolved.Fragment.Length == 0) {
            return resolved;
        }
        // "page#a" and "page#b" are the same document; drop the fragment so it is fetched once.
        Uri withoutFragment;
        return Uri.TryCreate(resolved.GetLeftPart(UriPartial.Query), UriKind.Absolute, out withoutFragment)
            ? withoutFragment
            : resolved;
    }

    /// <summary>
    /// Crawls <paramref name="url"/> and, recursively, every not-yet-visited link under the root.
    /// </summary>
    /// <param name="url">Relative or absolute URL of the page to start from.</param>
    /// <returns>A task that completes when the reachable pages have been processed.</returns>
    /// <remarks>
    /// Unparseable URLs, off-site URLs, already-visited URLs and pages whose download fails
    /// are skipped silently so that one bad link does not abort the whole crawl.
    /// </remarks>
    public async Task Add(string url) {
        var uri = _Create(url);
        if (uri == null || !_root.IsBaseOf(uri)) {
            return;
        }
        // HashSet.Add returns false when the page was already requested.
        if (!_crawled.Add(uri.AbsoluteUri)) {
            return;
        }

        string html;
        try {
            // Dispose the client before recursing so only one is alive at a time.
            using (var wc = new WebClient()) {
                html = await wc.DownloadStringTaskAsync(uri);
            }
        } catch (WebException) {
            // Dead link, timeout, HTTP error status, ...: skip this page.
            return;
        }

        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        foreach (var el in doc.DocumentNode.Descendants("a")) {
            var href = el.GetAttributeValue("href", string.Empty);
            if (string.IsNullOrWhiteSpace(href)) {
                continue;
            }

            var targetUri = _Create(href);
            if (targetUri == null) {
                continue;
            }

            var target = targetUri.AbsoluteUri;
            if (_crawled.Contains(target) || !_seen.Add(target)) {
                continue;
            }

            _matches.Add(target);
            await Add(target);
        }
    }
}
/// <summary>Console entry point that crawls a fixed site starting at its root page.</summary>
class Program {
    /// <summary>Runs the crawl to completion before the process exits.</summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args) {
        var crawlie = new Crawlie(new Uri("http://www.roelvanuden.nl"));
        // Main cannot be async here, so block until the crawl finishes.
        // GetAwaiter().GetResult() rethrows the original exception, whereas
        // Wait() would wrap it in an AggregateException.
        crawlie.Add("/").GetAwaiter().GetResult();
        // todo: do something with result.
    }
}
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment