Skip to content

Instantly share code, notes, and snippets.

@JeffreyZhao
Created July 25, 2009 15:00
Show Gist options
  • Save JeffreyZhao/154815 to your computer and use it in GitHub Desktop.
Save JeffreyZhao/154815 to your computer and use it in GitHub Desktop.
Action-based message with ActorLite
namespace StrongCrawling
{
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
using ActorLite;
/// <summary>
/// Message contract for requests sent to a crawler. Messages are delivered
/// as <c>Action&lt;ICrawlRequestHandler&gt;</c> delegates that invoke one of
/// these members on the receiving actor.
/// </summary>
public interface ICrawlRequestHandler
{
/// <summary>Requests that <paramref name="url"/> be crawled.</summary>
/// <param name="monitor">The monitor that the outcome is reported back to.</param>
/// <param name="url">The address of the page to crawl.</param>
void Crawl(Monitor monitor, string url);
}
/// <summary>
/// Message contract for crawl results sent back to a monitor. Messages are
/// delivered as <c>Action&lt;ICrawlResponseHandler&gt;</c> delegates that
/// invoke one of these members on the receiving actor.
/// </summary>
public interface ICrawlResponseHandler
{
/// <summary>Reports that <paramref name="url"/> was crawled successfully.</summary>
/// <param name="crawler">The crawler that performed the work.</param>
/// <param name="url">The address that was crawled.</param>
/// <param name="links">The links extracted from the crawled page.</param>
void Succeeded(Crawler crawler, string url, List<string> links);
/// <summary>Reports that crawling <paramref name="url"/> failed.</summary>
/// <param name="crawler">The crawler that attempted the work.</param>
/// <param name="url">The address that could not be crawled.</param>
/// <param name="ex">The exception that caused the failure.</param>
void Failed(Crawler crawler, string url, Exception ex);
}
/// <summary>
/// Coordinates a pool of <see cref="Crawler"/> actors: it remembers every URL
/// it has seen, queues newly discovered URLs, and hands them out to crawlers
/// while keeping at most <see cref="MaxCrawlerCount"/> of them busy.
/// </summary>
public class Monitor : Actor<Action<ICrawlResponseHandler>>, ICrawlResponseHandler
{
    /// <summary>Every URL seen so far (the start URL plus all queued links); used to avoid re-queuing.</summary>
    private readonly HashSet<string> m_allUrls;

    /// <summary>URLs that have been discovered but not yet handed to a crawler.</summary>
    private readonly Queue<string> m_readyToCrawl;

    /// <summary>Upper bound on the number of crawlers working at the same time.</summary>
    public int MaxCrawlerCount { get; private set; }

    /// <summary>Number of crawlers that currently have a crawl request outstanding.</summary>
    public int WorkingCrawlerCount { get; private set; }

    /// <summary>Creates a monitor that runs at most <paramref name="crawlerCount"/> crawlers at once.</summary>
    /// <param name="crawlerCount">Maximum number of simultaneously working crawlers.</param>
    public Monitor(int crawlerCount)
    {
        this.m_allUrls = new HashSet<string>();
        this.m_readyToCrawl = new Queue<string>();
        this.MaxCrawlerCount = crawlerCount;
        this.WorkingCrawlerCount = 0;
    }

    /// <summary>Runs a received message by invoking it against this monitor.</summary>
    /// <param name="message">A delegate that calls one of the <see cref="ICrawlResponseHandler"/> members.</param>
    protected override void Receive(Action<ICrawlResponseHandler> message)
    {
        message(this);
    }

    #region ICrawlResponseHandler Members

    /// <summary>
    /// Logs the result, queues every link that has not been seen before, and
    /// then gives the reporting crawler (and possibly new ones) more work.
    /// </summary>
    void ICrawlResponseHandler.Succeeded(Crawler crawler, string url, List<string> links)
    {
        Console.WriteLine("{0} crawled, {1} link(s).", url, links.Count);

        foreach (var newUrl in links)
        {
            // HashSet.Add returns false for an element that is already present,
            // so one call both tests and records the URL.
            if (this.m_allUrls.Add(newUrl))
            {
                this.m_readyToCrawl.Enqueue(newUrl);
            }
        }

        this.DispatchCrawlingTasks(crawler);
    }

    /// <summary>Logs the failure and then gives the reporting crawler more work, if any.</summary>
    void ICrawlResponseHandler.Failed(Crawler crawler, string url, Exception ex)
    {
        Console.WriteLine("{0} error occurred: {1}.", url, ex.Message);
        this.DispatchCrawlingTasks(crawler);
    }

    #endregion

    /// <summary>
    /// Hands queued URLs to crawlers. The crawler that just finished is reused
    /// first; additional crawlers are created while URLs remain and fewer than
    /// <see cref="MaxCrawlerCount"/> crawlers are working.
    /// </summary>
    /// <param name="reusableCrawler">The crawler that has just reported a result and is free again.</param>
    private void DispatchCrawlingTasks(Crawler reusableCrawler)
    {
        if (this.m_readyToCrawl.Count <= 0)
        {
            // Nothing left for this crawler: it stops counting as working.
            // Return here; falling through would call Dequeue on an empty
            // queue, which throws InvalidOperationException.
            this.WorkingCrawlerCount--;
            return;
        }

        // The finished crawler stays "working", so the count is unchanged.
        var url = this.m_readyToCrawl.Dequeue();
        reusableCrawler.Post(c => c.Crawl(this, url));

        while (this.m_readyToCrawl.Count > 0 &&
               this.WorkingCrawlerCount < this.MaxCrawlerCount)
        {
            // Declared inside the loop so each lambda captures its own URL.
            var newUrl = this.m_readyToCrawl.Dequeue();
            new Crawler().Post(c => c.Crawl(this, newUrl));
            this.WorkingCrawlerCount++;
        }
    }

    /// <summary>Begins crawling at <paramref name="url"/> with a single new crawler.</summary>
    /// <param name="url">The first address to crawl.</param>
    /// <remarks>
    /// NOTE(review): this updates the monitor's state directly on the caller's
    /// thread rather than through a posted message; presumably it is meant to
    /// be called once, before any crawl results arrive - confirm against
    /// ActorLite's threading model.
    /// </remarks>
    public void Start(string url)
    {
        this.m_allUrls.Add(url);
        this.WorkingCrawlerCount++;
        new Crawler().Post(c => c.Crawl(this, url));
    }
}
/// <summary>
/// Actor that downloads a page, extracts the absolute <c>http://</c> links
/// found in <c>href</c> attributes, and reports the outcome to a
/// <see cref="Monitor"/>.
/// </summary>
public class Crawler : Actor<Action<ICrawlRequestHandler>>, ICrawlRequestHandler
{
    /// <summary>Runs a received message by invoking it against this crawler.</summary>
    /// <param name="message">A delegate that calls one of the <see cref="ICrawlRequestHandler"/> members.</param>
    protected override void Receive(Action<ICrawlRequestHandler> message)
    {
        message(this);
    }

    #region ICrawlRequestHandler Members

    /// <summary>
    /// Downloads <paramref name="url"/>, collects the distinct links it contains,
    /// and posts either a success or a failure message to <paramref name="monitor"/>.
    /// </summary>
    /// <param name="monitor">The monitor that receives the result.</param>
    /// <param name="url">The address of the page to download.</param>
    void ICrawlRequestHandler.Crawl(Monitor monitor, string url)
    {
        try
        {
            string content;

            // WebClient is IDisposable; release it as soon as the download is
            // done instead of leaving one undisposed client per crawled page.
            using (var client = new WebClient())
            {
                content = client.DownloadString(url);
            }

            var matches = Regex.Matches(content, @"href=""(http://[^""]+)""").Cast<Match>();
            var links = matches.Select(m => m.Groups[1].Value).Distinct().ToList();
            monitor.Post(m => m.Succeeded(this, url, links));
        }
        catch (Exception ex)
        {
            // Any failure (download or parsing) is reported to the monitor so
            // that it can reassign this crawler.
            monitor.Post(m => m.Failed(this, url, ex));
        }
    }

    #endregion
}
/// <summary>Console entry point that starts two independent crawls.</summary>
static class Program
{
    /// <summary>
    /// Starts one monitor per seed site and then blocks on console input so
    /// the process stays alive while the crawls run.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // First crawl: up to 5 concurrent crawlers.
        var cnblogsMonitor = new Monitor(5);
        cnblogsMonitor.Start("http://www.cnblogs.com/");

        // Second crawl: up to 10 concurrent crawlers.
        var csdnMonitor = new Monitor(10);
        csdnMonitor.Start("http://www.csdn.net/");

        Console.ReadLine();
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment