Skip to content

Instantly share code, notes, and snippets.

@JeffreyZhao
Created August 2, 2009 12:00
Show Gist options
  • Save JeffreyZhao/160043 to your computer and use it in GitHub Desktop.
Save JeffreyZhao/160043 to your computer and use it in GitHub Desktop.
namespace Crawling
{
using System;
using System.Linq;
using System.Collections.Generic;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using ActorLite;
/// <summary>
/// Console entry point: starts a crawl and lets the user query progress.
/// </summary>
static class Program
{
    /// <summary>
    /// Creates a monitor limited to 5 crawlers, seeds it with the start URL,
    /// then hands control to the interactive statistics loop.
    /// </summary>
    static void Main(string[] args)
    {
        Monitor crawlMonitor = new Monitor(5);
        crawlMonitor.Post(m => m.Crawl("http://www.cnblogs.com/"));

        new TestStatisticPort(crawlMonitor).Start();
    }

    /// <summary>
    /// Console-driven statistics client. It is both the port that replies are
    /// posted to and the handler that processes those replies.
    /// </summary>
    public class TestStatisticPort : IPort<IStatisticResponseHandler>, IStatisticResponseHandler
    {
        // Port that statistic requests are sent to.
        private readonly IPort<IStatisticRequestHandelr> m_requestPort;

        /// <param name="statisticPort">Port that will receive the statistic requests.</param>
        public TestStatisticPort(IPort<IStatisticRequestHandelr> statisticPort)
        {
            m_requestPort = statisticPort;
        }

        /// <summary>
        /// Loops forever: each line read from the console triggers one
        /// crawled-count request. The text of the line itself is ignored.
        /// </summary>
        public void Start()
        {
            for (; ; )
            {
                Console.ReadLine();
                m_requestPort.Post(handler => handler.GetCrawledCount(this));
            }
        }

        #region IPort<IStatisticResponseHandler> Members

        /// <summary>
        /// Runs the posted message immediately against this instance, on the
        /// caller's thread (no queueing is done here).
        /// </summary>
        void IPort<IStatisticResponseHandler>.Post(Action<IStatisticResponseHandler> message)
        {
            message(this);
        }

        #endregion

        #region IStatisticResponseHandler Members

        /// <summary>Prints the reported crawled-page count to the console.</summary>
        void IStatisticResponseHandler.ReplyCrawledCount(int count)
        {
            Console.WriteLine("Crawled: {0}", count);
        }

        /// <summary>Not supported by this client; always throws.</summary>
        /// <exception cref="NotImplementedException">Always.</exception>
        void IStatisticResponseHandler.ReplyContent(string url, string content)
        {
            throw new NotImplementedException();
        }

        #endregion
    }
}
/// <summary>
/// A one-way message endpoint. A message is a delegate that is meant to be
/// invoked with the receiver as its argument.
/// </summary>
/// <typeparam name="T">
/// The view of the receiver that messages are written against. Declared
/// covariant, which is what lets this file assign, for example, a
/// <c>Crawler</c> (an <c>IPort&lt;Crawler&gt;</c>) to an
/// <c>IPort&lt;ICrawlRequestHandler&gt;</c> variable.
/// </typeparam>
public interface IPort<out T>
{
/// <summary>Sends <paramref name="message"/> to the receiver behind this port.</summary>
/// <param name="message">Delegate to be invoked with the receiver.</param>
// NOTE(review): whether delivery is synchronous or queued depends on the
// implementation; TestStatisticPort invokes inline, actors presumably queue — confirm in ActorLite.
void Post(Action<T> message);
}
/// <summary>
/// Requests that a crawler accepts.
/// </summary>
internal interface ICrawlRequestHandler
{
/// <summary>
/// Asks the handler to crawl <paramref name="url"/>.
/// </summary>
/// <param name="collector">Port the outcome is posted to (see <c>ICrawlResponseHandler</c>).</param>
/// <param name="url">Address of the page to crawl.</param>
void Crawl(IPort<ICrawlResponseHandler> collector, string url);
}
/// <summary>
/// Outcome notifications for a crawl request. In this file they are sent by
/// <c>Crawler</c> and handled by <c>Monitor</c>.
/// </summary>
internal interface ICrawlResponseHandler
{
/// <summary>Reports that <paramref name="url"/> was crawled.</summary>
/// <param name="crawler">Port of the crawler that did the work, so the receiver can send it more work.</param>
/// <param name="url">The URL that was crawled.</param>
/// <param name="content">The downloaded page content.</param>
/// <param name="links">Links extracted from <paramref name="content"/>.</param>
void Succeeded(IPort<ICrawlRequestHandler> crawler, string url, string content, List<string> links);
/// <summary>Reports that crawling <paramref name="url"/> failed.</summary>
/// <param name="crawler">Port of the crawler that attempted the work.</param>
/// <param name="url">The URL that could not be crawled.</param>
/// <param name="ex">The error that caused the failure.</param>
void Failed(IPort<ICrawlRequestHandler> crawler, string url, Exception ex);
}
/// <summary>
/// Statistic queries; answers are posted back to the supplied requester port.
/// </summary>
// NOTE(review): the type name is misspelled ("Handelr"); it is left unchanged
// here because renaming it would affect every implementer and caller.
public interface IStatisticRequestHandelr
{
/// <summary>Asks for the number of crawled pages.</summary>
/// <param name="requester">Port that receives the answer via <c>ReplyCrawledCount</c>.</param>
void GetCrawledCount(IPort<IStatisticResponseHandler> requester);
/// <summary>Asks for the stored content of <paramref name="url"/>.</summary>
/// <param name="requester">Port that receives the answer via <c>ReplyContent</c>.</param>
/// <param name="url">The URL whose content is requested.</param>
void GetContent(IPort<IStatisticResponseHandler> requester, string url);
}
/// <summary>
/// Receives the answers to <c>IStatisticRequestHandelr</c> queries.
/// </summary>
public interface IStatisticResponseHandler
{
/// <summary>Delivers the answer to a crawled-count query.</summary>
/// <param name="count">Number of crawled pages reported by the responder.</param>
void ReplyCrawledCount(int count);
/// <summary>Delivers the answer to a content query.</summary>
/// <param name="url">The URL that was asked about.</param>
/// <param name="content">The stored content; <c>Monitor</c> passes null when it has nothing stored for the URL.</param>
void ReplyContent(string url, string content);
}
/// <summary>
/// Actor that downloads a single page per request, extracts the absolute
/// <c>http://</c> links from it and reports the outcome to the collector.
/// </summary>
internal class Crawler : Actor<Action<Crawler>>, IPort<Crawler>, ICrawlRequestHandler
{
    // Matches href="http://..." attributes; group 1 is the URL.
    private static readonly Regex s_linkPattern = new Regex(@"href=""(http://[^""]+)""");

    /// <summary>Executes a received message against this crawler.</summary>
    protected override void Receive(Action<Crawler> message) { message(this); }

    #region ICrawlRequestHandler Members

    /// <summary>
    /// Starts an asynchronous download of <paramref name="url"/>. Exactly one
    /// of <c>Succeeded</c> / <c>Failed</c> is eventually posted to
    /// <paramref name="collector"/>.
    /// </summary>
    /// <param name="collector">Port that receives the outcome.</param>
    /// <param name="url">Absolute URL of the page to download.</param>
    void ICrawlRequestHandler.Crawl(IPort<ICrawlResponseHandler> collector, string url)
    {
        // A malformed URL must be reported as a failure rather than thrown:
        // otherwise the collector never hears back about this request.
        Uri address;
        if (!Uri.TryCreate(url, UriKind.Absolute, out address))
        {
            Exception error = new UriFormatException(
                string.Format("Invalid absolute URL: {0}", url));
            collector.Post(c => c.Failed(this, url, error));
            return;
        }

        WebClient client = new WebClient();
        client.DownloadStringCompleted += (sender, e) =>
        {
            try
            {
                if (e.Error == null)
                {
                    // Read the result before the client is disposed, then hop
                    // back onto this actor to do the link extraction.
                    string content = e.Result;
                    this.Post(c => c.Crawled(collector, url, content));
                }
                else
                {
                    collector.Post(c => c.Failed(this, url, e.Error));
                }
            }
            finally
            {
                // The client is used for exactly one download; release it.
                client.Dispose();
            }
        };
        client.DownloadStringAsync(address);
    }

    #endregion

    /// <summary>
    /// Extracts the distinct links from <paramref name="content"/> and reports
    /// success to <paramref name="collector"/>.
    /// </summary>
    private void Crawled(IPort<ICrawlResponseHandler> collector, string url, string content)
    {
        var matches = s_linkPattern.Matches(content).Cast<Match>();
        var links = matches.Select(m => m.Groups[1].Value).Distinct().ToList();
        collector.Post(c => c.Succeeded(this, url, content, links));
    }
}
/// <summary>
/// Coordinates the crawl: tracks every URL seen so far, queues URLs that are
/// waiting for a crawler, stores downloaded content, keeps at most
/// <see cref="MaxCrawlerCount"/> crawlers busy, and answers statistic queries.
/// </summary>
// NOTE(review): nothing in this class locks; it presumably relies on the
// ActorLite base class delivering messages one at a time — confirm.
public class Monitor : Actor<Action<Monitor>>, IPort<Monitor>,
    ICrawlResponseHandler,
    IStatisticRequestHandelr
{
    // Every URL ever accepted (crawled, in progress, or queued).
    private readonly HashSet<string> m_allUrls;
    // URLs accepted but not yet handed to a crawler.
    private readonly Queue<string> m_readyToCrawl;
    // Downloaded content keyed by URL.
    private readonly Dictionary<string, string> m_urlContent;

    /// <summary>Upper bound on the number of crawlers working at once.</summary>
    public int MaxCrawlerCount { get; private set; }

    /// <summary>Number of crawlers that currently have an outstanding request.</summary>
    public int WorkingCrawlerCount { get; private set; }

    /// <param name="crawlerCount">Maximum number of crawlers allowed to work at once.</param>
    public Monitor(int crawlerCount)
    {
        this.m_allUrls = new HashSet<string>();
        this.m_readyToCrawl = new Queue<string>();
        this.m_urlContent = new Dictionary<string, string>();
        this.MaxCrawlerCount = crawlerCount;
        this.WorkingCrawlerCount = 0;
    }

    /// <summary>Executes a received message against this monitor.</summary>
    protected override void Receive(Action<Monitor> message) { message(this); }

    /// <summary>
    /// Accepts <paramref name="url"/> for crawling unless it has been seen
    /// before. Starts a new crawler if capacity allows, otherwise queues the URL.
    /// </summary>
    public void Crawl(string url)
    {
        // Add returns false when the URL is already known.
        if (!this.m_allUrls.Add(url)) return;

        if (this.WorkingCrawlerCount < this.MaxCrawlerCount)
        {
            this.WorkingCrawlerCount++;
            IPort<ICrawlRequestHandler> crawler = new Crawler();
            crawler.Post(c => c.Crawl(this, url));
        }
        else
        {
            this.m_readyToCrawl.Enqueue(url);
        }
    }

    #region ICrawlResponseHandler Members

    /// <summary>
    /// Stores the downloaded content, queues every link not seen before, and
    /// hands the reporting crawler its next task.
    /// </summary>
    void ICrawlResponseHandler.Succeeded(IPort<ICrawlRequestHandler> crawler, string url, string content, List<string> links)
    {
        this.m_urlContent[url] = content;
        Console.WriteLine("{0} crawled, {1} link(s).", url, links.Count);

        foreach (var newUrl in links)
        {
            if (this.m_allUrls.Add(newUrl))
            {
                this.m_readyToCrawl.Enqueue(newUrl);
            }
        }

        this.DispatchCrawlingTasks(crawler);
    }

    /// <summary>Logs the failure and hands the reporting crawler its next task.</summary>
    void ICrawlResponseHandler.Failed(IPort<ICrawlRequestHandler> crawler, string url, Exception ex)
    {
        Console.WriteLine("{0} error occurred: {1}.", url, ex.Message);
        this.DispatchCrawlingTasks(crawler);
    }

    #endregion

    /// <summary>
    /// Called when a crawler has finished a request. Gives that crawler the next
    /// queued URL, then starts additional crawlers while URLs remain and the
    /// limit has not been reached. With nothing queued, the crawler is retired.
    /// </summary>
    /// <param name="reusableCrawler">The crawler that just reported back.</param>
    private void DispatchCrawlingTasks(IPort<ICrawlRequestHandler> reusableCrawler)
    {
        if (this.m_readyToCrawl.Count <= 0)
        {
            // No work left for this crawler: free its slot and stop here.
            // Without this return, Dequeue below throws on the empty queue.
            this.WorkingCrawlerCount--;
            return;
        }

        var url = this.m_readyToCrawl.Dequeue();
        reusableCrawler.Post(c => c.Crawl(this, url));

        while (this.m_readyToCrawl.Count > 0 &&
            this.WorkingCrawlerCount < this.MaxCrawlerCount)
        {
            var newUrl = this.m_readyToCrawl.Dequeue();
            IPort<ICrawlRequestHandler> crawler = new Crawler();
            crawler.Post(c => c.Crawl(this, newUrl));
            this.WorkingCrawlerCount++;
        }
    }

    #region IStatisticRequestHandelr Members

    /// <summary>Replies with the number of URLs whose content has been stored.</summary>
    void IStatisticRequestHandelr.GetCrawledCount(IPort<IStatisticResponseHandler> requester)
    {
        requester.Post(r => r.ReplyCrawledCount(this.m_urlContent.Count));
    }

    /// <summary>
    /// Replies with the stored content for <paramref name="url"/>, or null when
    /// nothing is stored for it.
    /// </summary>
    void IStatisticRequestHandelr.GetContent(IPort<IStatisticResponseHandler> requester, string url)
    {
        string content;
        if (!this.m_urlContent.TryGetValue(url, out content))
        {
            content = null;
        }
        requester.Post(r => r.ReplyContent(url, content));
    }

    #endregion
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment