Skip to content

Instantly share code, notes, and snippets.

@Deathspike
Created January 18, 2014 20:15
Show Gist options
  • Select an option

  • Save Deathspike/8495642 to your computer and use it in GitHub Desktop.

Select an option

Save Deathspike/8495642 to your computer and use it in GitHub Desktop.
Super simple web crawler example (C#) for people on Stack Overflow (needs error checking; that part is your job).
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Net;
using System.Threading.Tasks;
namespace Crawl {
/// <summary>
/// Minimal recursive crawler: starting from a URL under a root address, it
/// downloads each page, collects the targets of its anchor elements and
/// follows those that are still located under the root.
/// </summary>
/// <remarks>
/// Download failures (for example <see cref="WebException"/>) are not caught
/// and fault the task returned by <see cref="Add"/>. The class keeps no locks
/// around its collections, so a single instance should run one crawl at a time.
/// </remarks>
class Crawlie {
    // Fragment-free absolute URLs that were already fetched (or are being fetched).
    private readonly HashSet<string> _crawled;
    // Distinct link targets in the order they were discovered.
    private readonly List<string> _matches;
    // Membership index for _matches so each target is recorded only once.
    private readonly HashSet<string> _matched;
    // Address that bounds the crawl and resolves relative start URLs.
    private readonly Uri _root;

    /// <summary>Creates a crawler restricted to addresses under <paramref name="root"/>.</summary>
    /// <param name="root">Absolute base address of the site to crawl.</param>
    /// <exception cref="ArgumentNullException"><paramref name="root"/> is null.</exception>
    public Crawlie(Uri root) {
        if (root == null) throw new ArgumentNullException("root");
        _crawled = new HashSet<string>();
        _matches = new List<string>();
        _matched = new HashSet<string>();
        _root = root;
    }

    /// <summary>
    /// Distinct link targets found so far (fragment removed), including links
    /// that point outside the root and were therefore not followed.
    /// </summary>
    public IReadOnlyList<string> Matches {
        get { return _matches; }
    }

    // Parses url, resolving a relative value against baseUri; returns null when it cannot be parsed.
    private static Uri _Create(string url, Uri baseUri) {
        Uri uri;
        if (!Uri.TryCreate(url, UriKind.RelativeOrAbsolute, out uri)) return null;
        if (uri.IsAbsoluteUri) return uri;
        return Uri.TryCreate(baseUri, url, out uri) ? uri : null;
    }

    // Returns the absolute URL without its fragment, so "page#a" and "page#b" share one key.
    // A literal '#' inside the path or query is escaped as %23 in AbsoluteUri, so the first
    // '#' always marks the start of the fragment.
    private static string _Key(Uri uri) {
        var absolute = uri.AbsoluteUri;
        var hash = absolute.IndexOf('#');
        return hash < 0 ? absolute : absolute.Substring(0, hash);
    }

    /// <summary>
    /// Downloads <paramref name="url"/> (resolved against the root when relative)
    /// and recursively follows every anchor on the page that stays under the root.
    /// URLs outside the root and URLs that were already crawled are ignored.
    /// </summary>
    /// <param name="url">Absolute or root-relative address of the page to crawl.</param>
    /// <returns>A task that completes when the page and all pages reachable from it were processed.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    /// <exception cref="UriFormatException"><paramref name="url"/> cannot be parsed.</exception>
    public async Task Add(string url) {
        if (url == null) throw new ArgumentNullException("url");
        var uri = _Create(url, _root);
        if (uri == null) throw new UriFormatException("Invalid URL: " + url);
        if (!_root.IsBaseOf(uri)) return;

        // HashSet.Add reports false for a known key, which makes repeat calls a no-op.
        var address = _Key(uri);
        if (!_crawled.Add(address)) return;

        // Release the client before descending, otherwise one client stays alive
        // for every level of the recursion.
        string html;
        using (var wc = new WebClient()) {
            html = await wc.DownloadStringTaskAsync(address);
        }

        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        foreach (var el in doc.DocumentNode.Descendants("a")) {
            var href = el.GetAttributeValue("href", string.Empty);
            if (string.IsNullOrWhiteSpace(href)) continue;

            // Relative links are relative to the page they appear on, not to the root.
            // A malformed href is skipped instead of aborting the whole crawl.
            var targetUri = _Create(href, uri);
            if (targetUri == null) continue;

            var target = _Key(targetUri);
            if (_crawled.Contains(target)) continue;
            if (_matched.Add(target)) _matches.Add(target);
            await Add(target);
        }
    }
}
/// <summary>Console entry point that runs one crawl of the sample site.</summary>
class Program {
    /// <summary>Crawls the site from its root page and blocks until the crawl has finished.</summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args) {
        var siteRoot = new Uri("http://www.roelvanuden.nl");
        var crawler = new Crawlie(siteRoot);

        // Start at the site's root page and wait synchronously for completion.
        Task crawl = crawler.Add("/");
        crawl.Wait();

        // todo: do something with result.
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment