Created
August 2, 2009 12:00
-
-
Save JeffreyZhao/160043 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace Crawling | |
{ | |
using System; | |
using System.Linq; | |
using System.Collections.Generic; | |
using System.Net; | |
using System.Text; | |
using System.Text.RegularExpressions; | |
using ActorLite; | |
/// <summary>
/// Console entry point for the crawler demo. Starts a <see cref="Monitor"/>,
/// seeds it with one URL, then runs an interactive statistics loop.
/// </summary>
static class Program
{
    /// <summary>
    /// Creates a monitor constructed with the value 5, posts the seed URL to it
    /// and hands control to a <see cref="TestStatisticPort"/>.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        var monitor = new Monitor(5);
        monitor.Post(mon => mon.Crawl("http://www.cnblogs.com/"));

        var statisticPort = new TestStatisticPort(monitor);
        statisticPort.Start();
    }

    /// <summary>
    /// Console-driven requester: each line read from standard input triggers a
    /// crawled-count request, and the reply is written to the console.
    /// </summary>
    public class TestStatisticPort : IPort<IStatisticResponseHandler>, IStatisticResponseHandler
    {
        // Port that the statistic requests are posted to.
        private IPort<IStatisticRequestHandelr> m_statisticPort;

        /// <summary>Creates a requester bound to the given statistic port.</summary>
        /// <param name="statisticPort">Port that receives the statistic requests.</param>
        public TestStatisticPort(IPort<IStatisticRequestHandelr> statisticPort)
        {
            m_statisticPort = statisticPort;
        }

        /// <summary>
        /// Loops forever: waits for a line on standard input, then posts a
        /// crawled-count request naming this object as the requester.
        /// This method never returns.
        /// </summary>
        public void Start()
        {
            for (;;)
            {
                // The text of the line is ignored; it only acts as a trigger.
                Console.ReadLine();
                m_statisticPort.Post(stats => stats.GetCrawledCount(this));
            }
        }

        #region IPort<IStatisticResponseHandler> Members

        /// <summary>Invokes the message immediately against this instance.</summary>
        /// <param name="message">Action to run with this object as the handler.</param>
        void IPort<IStatisticResponseHandler>.Post(Action<IStatisticResponseHandler> message)
        {
            message(this);
        }

        #endregion

        #region IStatisticResponseHandler Members

        /// <summary>Writes the reported count to the console.</summary>
        /// <param name="count">Count carried by the reply.</param>
        void IStatisticResponseHandler.ReplyCrawledCount(int count)
        {
            Console.WriteLine("Crawled: {0}", count);
        }

        /// <summary>Not supported by this requester.</summary>
        /// <param name="url">URL the reply refers to.</param>
        /// <param name="content">Content carried by the reply.</param>
        /// <exception cref="NotImplementedException">Always thrown.</exception>
        void IStatisticResponseHandler.ReplyContent(string url, string content)
        {
            throw new NotImplementedException();
        }

        #endregion
    }
}
/// <summary>
/// A destination that accepts messages expressed as actions over
/// <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">
/// The handler view the message is written against. The parameter is covariant,
/// so a port of a more derived type can be used where a port of a base or
/// interface type is expected.
/// </typeparam>
public interface IPort<out T>
{
    /// <summary>Submits a message to this port.</summary>
    /// <param name="message">
    /// Action to be invoked with the receiving handler. When and on which thread
    /// it runs is decided by the implementation.
    /// </param>
    void Post(Action<T> message);
}
/// <summary>Requests that a crawler can handle.</summary>
internal interface ICrawlRequestHandler
{
    /// <summary>Asks the handler to crawl a single URL.</summary>
    /// <param name="collector">Port that receives the success or failure reply.</param>
    /// <param name="url">Address to crawl.</param>
    void Crawl(IPort<ICrawlResponseHandler> collector, string url);
}
/// <summary>Replies that a crawler sends back to whoever requested a crawl.</summary>
internal interface ICrawlResponseHandler
{
    /// <summary>Reports that a URL was crawled.</summary>
    /// <param name="crawler">Port of the crawler that did the work.</param>
    /// <param name="url">Address that was crawled.</param>
    /// <param name="content">Content obtained for the address.</param>
    /// <param name="links">Links found in the content.</param>
    void Succeeded(IPort<ICrawlRequestHandler> crawler, string url, string content, List<string> links);

    /// <summary>Reports that crawling a URL failed.</summary>
    /// <param name="crawler">Port of the crawler that attempted the work.</param>
    /// <param name="url">Address that could not be crawled.</param>
    /// <param name="ex">Error describing the failure.</param>
    void Failed(IPort<ICrawlRequestHandler> crawler, string url, Exception ex);
}
/// <summary>
/// Statistic queries that can be posted to a handler. Answers are delivered
/// through the requester port as <see cref="IStatisticResponseHandler"/> calls.
/// </summary>
/// <remarks>
/// NOTE(review): the type name is misspelled ("Handelr"); it is public, so it is
/// kept unchanged here.
/// </remarks>
public interface IStatisticRequestHandelr
{
    /// <summary>Asks for the crawled count.</summary>
    /// <param name="requester">Port that receives the reply.</param>
    void GetCrawledCount(IPort<IStatisticResponseHandler> requester);

    /// <summary>Asks for the content associated with a URL.</summary>
    /// <param name="requester">Port that receives the reply.</param>
    /// <param name="url">Address whose content is requested.</param>
    void GetContent(IPort<IStatisticResponseHandler> requester, string url);
}
/// <summary>Replies to the queries declared by <see cref="IStatisticRequestHandelr"/>.</summary>
public interface IStatisticResponseHandler
{
    /// <summary>Delivers the answer to a crawled-count query.</summary>
    /// <param name="count">The reported count.</param>
    void ReplyCrawledCount(int count);

    /// <summary>Delivers the answer to a content query.</summary>
    /// <param name="url">Address the content belongs to.</param>
    /// <param name="content">The reported content.</param>
    void ReplyContent(string url, string content);
}
/// <summary>
/// Actor that downloads one URL at a time, extracts the absolute
/// <c>http://</c> links from the content and reports the outcome to a collector.
/// </summary>
internal class Crawler : Actor<Action<Crawler>>, IPort<Crawler>, ICrawlRequestHandler
{
    // Matches href="http://..." attributes; group 1 is the link target.
    private static readonly Regex LinkPattern = new Regex(@"href=""(http://[^""]+)""");

    /// <summary>Runs a received message against this crawler.</summary>
    /// <param name="message">Action to invoke with this instance.</param>
    protected override void Receive(Action<Crawler> message) { message(this); }

    #region ICrawlRequestHandler Members

    /// <summary>
    /// Starts an asynchronous download of <paramref name="url"/>. On success the
    /// content is posted back to this crawler for link extraction; on failure
    /// the collector is told via <c>Failed</c>.
    /// </summary>
    /// <param name="collector">Port that receives the success or failure reply.</param>
    /// <param name="url">Address to download.</param>
    void ICrawlRequestHandler.Crawl(IPort<ICrawlResponseHandler> collector, string url)
    {
        Uri address;
        try
        {
            address = new Uri(url);
        }
        catch (UriFormatException ex)
        {
            // A malformed link must still produce a reply; otherwise the
            // collector would wait for this crawler forever.
            collector.Post(c => c.Failed(this, url, ex));
            return;
        }

        WebClient client = new WebClient();
        client.DownloadStringCompleted += (sender, e) =>
        {
            try
            {
                if (e.Error == null)
                {
                    // Read the result now so the queued message does not depend
                    // on the event arguments after the client is released.
                    string content = e.Result;
                    this.Post(c => c.Crawled(collector, url, content));
                }
                else
                {
                    collector.Post(c => c.Failed(this, url, e.Error));
                }
            }
            finally
            {
                // One client per request: release it once the request is over.
                client.Dispose();
            }
        };

        try
        {
            client.DownloadStringAsync(address);
        }
        catch
        {
            // The completion handler will not run, so release the client here.
            client.Dispose();
            throw;
        }
    }

    /// <summary>
    /// Extracts the distinct links from downloaded content and reports success
    /// to the collector.
    /// </summary>
    /// <param name="collector">Port that receives the reply.</param>
    /// <param name="url">Address the content was downloaded from.</param>
    /// <param name="content">Downloaded content.</param>
    private void Crawled(IPort<ICrawlResponseHandler> collector, string url, string content)
    {
        var links = LinkPattern.Matches(content)
            .Cast<Match>()
            .Select(m => m.Groups[1].Value)
            .Distinct()
            .ToList();

        collector.Post(c => c.Succeeded(this, url, content, links));
    }

    #endregion
}
/// <summary>
/// Actor that coordinates the crawl. It records every URL it has seen, queues
/// URLs that cannot be started yet, keeps the number of working crawlers at or
/// below <see cref="MaxCrawlerCount"/>, stores downloaded content and answers
/// statistic queries.
/// </summary>
/// <remarks>
/// NOTE(review): the state below is not synchronized; this presumably relies on
/// the Actor base class delivering one message at a time — confirm against ActorLite.
/// </remarks>
public class Monitor : Actor<Action<Monitor>>, IPort<Monitor>,
    ICrawlResponseHandler,
    IStatisticRequestHandelr
{
    /// <summary>Runs a received message against this monitor.</summary>
    /// <param name="message">Action to invoke with this instance.</param>
    protected override void Receive(Action<Monitor> message) { message(this); }

    // Every URL ever accepted, whether queued, in progress or finished.
    private readonly HashSet<string> m_allUrls;

    // URLs accepted but not yet handed to a crawler.
    private readonly Queue<string> m_readyToCrawl;

    // Downloaded content keyed by URL.
    private readonly Dictionary<string, string> m_urlContent;

    /// <summary>Upper bound on the number of crawlers working at once.</summary>
    public int MaxCrawlerCount { private set; get; }

    /// <summary>Number of crawlers that currently have a URL assigned.</summary>
    public int WorkingCrawlerCount { private set; get; }

    /// <summary>Creates an idle monitor.</summary>
    /// <param name="crawlerCount">Value stored as <see cref="MaxCrawlerCount"/>.</param>
    public Monitor(int crawlerCount)
    {
        this.m_allUrls = new HashSet<string>();
        this.m_readyToCrawl = new Queue<string>();
        this.m_urlContent = new Dictionary<string, string>();
        this.MaxCrawlerCount = crawlerCount;
        this.WorkingCrawlerCount = 0;
    }

    /// <summary>
    /// Accepts a URL for crawling. URLs already seen are ignored; otherwise the
    /// URL is started on a new crawler if capacity allows, or queued.
    /// </summary>
    /// <param name="url">Address to crawl.</param>
    public void Crawl(string url)
    {
        // Add returns false when the URL is already known.
        if (!this.m_allUrls.Add(url))
        {
            return;
        }

        if (this.WorkingCrawlerCount < this.MaxCrawlerCount)
        {
            this.StartNewCrawler(url);
        }
        else
        {
            this.m_readyToCrawl.Enqueue(url);
        }
    }

    #region ICrawlResponseHandler Members

    /// <summary>
    /// Stores the downloaded content, queues the links not seen before and
    /// hands out further work.
    /// </summary>
    /// <param name="crawler">Crawler that finished and can be reused.</param>
    /// <param name="url">Address that was crawled.</param>
    /// <param name="content">Downloaded content.</param>
    /// <param name="links">Links found in the content.</param>
    void ICrawlResponseHandler.Succeeded(IPort<ICrawlRequestHandler> crawler, string url, string content, List<string> links)
    {
        this.m_urlContent[url] = content;
        Console.WriteLine("{0} crawled, {1} link(s).", url, links.Count);

        foreach (var newUrl in links)
        {
            if (this.m_allUrls.Add(newUrl))
            {
                this.m_readyToCrawl.Enqueue(newUrl);
            }
        }

        this.DispatchCrawlingTasks(crawler);
    }

    /// <summary>Logs the failure and hands out further work.</summary>
    /// <param name="crawler">Crawler that failed and can be reused.</param>
    /// <param name="url">Address that could not be crawled.</param>
    /// <param name="ex">Error describing the failure.</param>
    void ICrawlResponseHandler.Failed(IPort<ICrawlRequestHandler> crawler, string url, Exception ex)
    {
        Console.WriteLine("{0} error occurred: {1}.", url, ex.Message);
        this.DispatchCrawlingTasks(crawler);
    }

    #endregion

    /// <summary>
    /// Gives the crawler that just reported the next queued URL, then starts
    /// new crawlers while both queued URLs and spare capacity remain. When the
    /// queue is empty the reporting crawler is retired instead.
    /// </summary>
    /// <param name="reusableCrawler">Crawler that has just finished a request.</param>
    private void DispatchCrawlingTasks(IPort<ICrawlRequestHandler> reusableCrawler)
    {
        if (this.m_readyToCrawl.Count <= 0)
        {
            // Nothing left to assign: this crawler stops working. Returning here
            // is essential, because Dequeue on an empty queue throws.
            this.WorkingCrawlerCount--;
            return;
        }

        var url = this.m_readyToCrawl.Dequeue();
        reusableCrawler.Post(c => c.Crawl(this, url));

        while (this.m_readyToCrawl.Count > 0 &&
            this.WorkingCrawlerCount < this.MaxCrawlerCount)
        {
            this.StartNewCrawler(this.m_readyToCrawl.Dequeue());
        }
    }

    /// <summary>Creates a crawler, assigns it the URL and counts it as working.</summary>
    /// <param name="url">Address the new crawler should crawl.</param>
    private void StartNewCrawler(string url)
    {
        this.WorkingCrawlerCount++;
        IPort<ICrawlRequestHandler> crawler = new Crawler();
        crawler.Post(c => c.Crawl(this, url));
    }

    #region IStatisticRequestHandelr Members

    /// <summary>Replies with the number of URLs whose content has been stored.</summary>
    /// <param name="requester">Port that receives the reply.</param>
    void IStatisticRequestHandelr.GetCrawledCount(IPort<IStatisticResponseHandler> requester)
    {
        // Take the value now: the reply lambda may be executed later by the
        // requester and should not read this monitor's dictionary at that point.
        int count = this.m_urlContent.Count;
        requester.Post(r => r.ReplyCrawledCount(count));
    }

    /// <summary>
    /// Replies with the content stored for a URL, or <c>null</c> when nothing
    /// has been stored for it.
    /// </summary>
    /// <param name="requester">Port that receives the reply.</param>
    /// <param name="url">Address whose content is requested.</param>
    void IStatisticRequestHandelr.GetContent(IPort<IStatisticResponseHandler> requester, string url)
    {
        string content;
        if (!this.m_urlContent.TryGetValue(url, out content))
        {
            content = null;
        }

        requester.Post(r => r.ReplyContent(url, content));
    }

    #endregion
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment