Created
July 23, 2014 14:15
-
-
Save magicdawn/feda871d8174f0b9f525 to your computer and use it in GitHub Desktop.
huaban-board-downloader C# Edition
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="utf-8" ?> | |
<configuration> | |
<appSettings> | |
<!-- Debug mode: when "true" the program downloads the built-in example board instead of the URL given on the command line --> | |
<add key="Debug" value="false"/> | |
<!-- Root folder under which one sub-folder per board is created --> | |
<add key="ImageDir" value="image"/> | |
<!-- How many times a failed image download is retried --> | |
<add key="MaxTryTimes" value="5"/> | |
<!-- Number of download threads (can be overridden by the second command-line argument) --> | |
<add key="ThreadCount" value="5"/> | |
</appSettings> | |
</configuration> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Text; | |
namespace ImageDownLoader | |
{ | |
/// <summary>
/// Application-wide settings. Values are read once, in the static constructor,
/// from the <c>appSettings</c> section of the application configuration file.
/// A missing or malformed value falls back to a built-in default, so that
/// touching this class never fails with a <see cref="TypeInitializationException"/>
/// just because of a typo in the config file.
/// </summary>
class Config
{
    /// <summary>Example board URL; shown in the usage text and used as the target in debug mode.</summary>
    public static readonly string ExampleUrl = "http://huaban.com/boards/13715778/";

    /// <summary>Debug switch (config key "Debug", default false).</summary>
    public static readonly bool Debug;

    /// <summary>Root folder for downloaded images (config key "ImageDir", default "image").</summary>
    public static readonly string ImageDir;

    /// <summary>Number of retries after a failed download (config key "MaxTryTimes", default 5, never negative).</summary>
    public static readonly int MaxTryTimes;

    /// <summary>File name of the failed-download log written inside the board folder.</summary>
    public static readonly string ErrorLog = "下载失败记录.txt";

    /// <summary>
    /// Number of download threads (config key "ThreadCount", default 5, at least 1).
    /// Not readonly because the command line may override it.
    /// </summary>
    public static int ThreadCount;

    /// <summary>
    /// Returns the raw appSettings value for <paramref name="key"/>,
    /// or null when the key (or the whole config file) is missing.
    /// </summary>
    static string AppConfig(string key)
    {
        return System.Configuration.ConfigurationManager.AppSettings[key];
    }

    /// <summary>
    /// Reads an integer setting. Returns <paramref name="fallback"/> when the value is
    /// missing, is not a valid integer, or is smaller than <paramref name="minimum"/>.
    /// </summary>
    static int ReadInt(string key, int fallback, int minimum)
    {
        int value;
        var raw = AppConfig(key);
        var parsed = int.TryParse(
            raw,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out value);

        if(!parsed || value < minimum)
        {
            return fallback;
        }
        return value;
    }

    static Config()
    {
        // TryParse instead of Parse: an exception here would poison the whole type.
        bool debug;
        Debug = bool.TryParse(AppConfig("Debug"), out debug) && debug;

        var dir = AppConfig("ImageDir");
        ImageDir = string.IsNullOrWhiteSpace(dir) ? "image" : dir;

        MaxTryTimes = ReadInt("MaxTryTimes", 5, 0);

        // At least one thread is required, otherwise nothing would ever be downloaded.
        ThreadCount = ReadInt("ThreadCount", 5, 1);
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using System.Text; | |
using System.Threading; | |
using System.Diagnostics; | |
using System.Threading.Tasks; | |
namespace ImageDownLoader | |
{ | |
/// <summary>
/// Command-line entry point: downloads every image of a huaban.com board
/// into <c>{Config.ImageDir}/{board title}/</c> using several worker threads.
/// </summary>
class Program
{
    // Serializes appends to the error log; several workers may fail at the same time.
    static readonly object ErrorLogGate = new object();

    /// <summary>
    /// Runs <paramref name="act"/> and then restores the console foreground colour
    /// that was active before the call, even when the action throws.
    /// </summary>
    static void WriteWithColor(Action act)
    {
        var old = Console.ForegroundColor;
        try
        {
            act();
        }
        finally
        {
            Console.ForegroundColor = old;
        }
    }

    static Program()
    {
    }

    /// <summary>
    /// Usage: <c>program &lt;board url&gt; [thread count]</c>.
    /// In debug mode (<see cref="Config.Debug"/>) the example board is used instead of the argument.
    /// </summary>
    static void Main(string[] args)
    {
        var watch = new Stopwatch();
        watch.Start();

        var argUrl = "";
        if(Config.Debug)
        {
            argUrl = Config.ExampleUrl;
        }
        else if(args.Length > 0)
        {
            argUrl = args[0];
        }
        else
        {
            Console.WriteLine();
            Console.WriteLine(" 请指定图片所在网页(如{0})",Config.ExampleUrl);
            Console.WriteLine(" 后可跟线程数量,如({0} {1})",Config.ExampleUrl,Config.ThreadCount);
            Console.WriteLine();
            WriteWithColor(() => {
                Console.ForegroundColor = ConsoleColor.Cyan;
                Console.WriteLine(" 花瓣网-画板下载-命令行工具 by Magicdawn 2014-7-23");
            });
            return;
        }

        /*
         * 1. Request the board page and read title / author / image count from it.
         */
        var html = Magicdawn.HttpHelper.Request(argUrl);
        var title = HuabanUtil.FindTitle(html);
        var username = HuabanUtil.FindUsername(title);

        // FindCount yields "" when the page is not a board page; do not crash on it.
        int count;
        if(!int.TryParse(HuabanUtil.FindCount(title),out count))
        {
            Console.WriteLine("无法从页面解析画板信息,请检查网址 : {0}",argUrl);
            return;
        }
        var width = count.ToString().Length; // zero-padding width for file names

        /*
         * 2. The title becomes a folder name; ask for a replacement if it is not a legal one.
         */
        if(title.ContainOneOf("/ \\ : * ? \" < > |"))
        {
            Console.WriteLine("title中有不合法内容,不能做文件夹名。");
            Console.Write("请手动指定 : ");
            title = Console.ReadLine();
        }

        // Built only now, so that it points into the folder that is really used.
        var errorPath = "{0}/{1}/{2}".format(Config.ImageDir,title,Config.ErrorLog);

        /*
         * 3. Prepare the target folder.
         */
        Console.WriteLine("系列为 : {0}",title);
        Console.WriteLine("画板共 {0} 张图 , 作者为 : {1}",count,username);
        if(!Directory.Exists(Config.ImageDir))
        {
            Directory.CreateDirectory(Config.ImageDir);
        }

        if(Directory.Exists(Config.ImageDir + "/" + title))
        {
            Console.WriteLine();
            Console.Write("你好像下载过了...要重新下?(y/n) : ");
            watch.Stop(); // do not count the time spent waiting for the user
            if(Console.ReadLine() != "y")
            {
                return;
            }

            // Download again: start with a fresh error log.
            File.Delete(errorPath);
            watch.Start();
        }
        else
        {
            Directory.CreateDirectory(Config.ImageDir + "/" + title);
        }

        /*
         * 4. Collect the pins of all pages.
         */
        var pins = CollectPins(html,argUrl,count);

        /*
         * 5. Optional second argument: number of threads.
         */
        if(args.Length > 1)
        {
            int requested;
            if(int.TryParse(args[1],out requested) && requested > 0)
            {
                Config.ThreadCount = requested;
            }
            else
            {
                Console.WriteLine("线程数量无效,使用默认值 {0}",Config.ThreadCount);
            }
        }

        /*
         * 6. Download and report.
         */
        DownloadAll(pins,title,width,errorPath);

        var elapsed = watch.Elapsed;
        // TotalMinutes, not Minutes: Minutes wraps around after one hour.
        Console.WriteLine("下载完成了...耗时 {0}分{1}秒",(int)elapsed.TotalMinutes,elapsed.Seconds);
    }

    /// <summary>
    /// Collects the pins (id, src, ext) of the first page and of the follow-up pages,
    /// which are requested with <c>?max={last pin id}&amp;limit=100</c>.
    /// </summary>
    /// <param name="firstPageHtml">HTML of the board page that was already requested.</param>
    /// <param name="boardUrl">Board URL the paging query string is appended to.</param>
    /// <param name="count">Image count announced by the board; bounds the number of page requests.</param>
    static Queue<Tuple<string,string,string>> CollectPins(string firstPageHtml,string boardUrl,int count)
    {
        var pins = new Queue<Tuple<string,string,string>>();
        string lastId = null; // id of the most recently queued pin

        foreach(var p in HuabanUtil.FindPins(firstPageHtml))
        {
            pins.Enqueue(p);
            lastId = p.Item1;
        }

        var pageNum = count / 100 + 1;
        // lastId == null means the first page had no pins, so there is nothing to page from.
        for(int page = 0;page < pageNum && lastId != null;page++)
        {
            var url = "{0}?max={1}&limit=100".format(boardUrl,lastId);
            var html = Magicdawn.HttpHelper.Request(url);

            var added = 0;
            foreach(var p in HuabanUtil.FindPins(html))
            {
                pins.Enqueue(p);
                lastId = p.Item1;
                added++;
            }

            if(added == 0)
            {
                // An empty page would make the next request identical to this one.
                break;
            }
        }
        return pins;
    }

    /// <summary>
    /// Drains <paramref name="pins"/> with <see cref="Config.ThreadCount"/> threads (at least one)
    /// and returns when all of them have finished. Files are named by a running,
    /// zero-padded index: <c>{ImageDir}/{title}/{index}.{ext}</c>.
    /// </summary>
    static void DownloadAll(Queue<Tuple<string,string,string>> pins,string title,int width,string errorPath)
    {
        var index = 1; // next image number, shared by all workers, guarded by lock(pins)

        // Plain threads rather than Tasks: the original author measured Tasks to be slower here.
        var threads = new Thread[Math.Max(1,Config.ThreadCount)];
        for(int i = 0;i < threads.Length;i++)
        {
            threads[i] = new Thread(() => {
                using(var client = new System.Net.WebClient())
                {
                    while(true)
                    {
                        Tuple<string,string,string> p;
                        string curIndex;
                        lock(pins)
                        {
                            // The emptiness check must happen under the same lock as Dequeue,
                            // otherwise another worker can take the last item in between.
                            if(pins.Count == 0)
                            {
                                break;
                            }
                            p = pins.Dequeue();
                            curIndex = index.ToString().PadLeft(width,'0');
                            index++;
                        }

                        var src = p.Item2;
                        var ext = p.Item3;
                        var path = "{0}/{1}/{2}.{3}".format(Config.ImageDir,title,curIndex,ext);
                        Console.WriteLine("正在下载第{0}张图 : {1}",curIndex,src);

                        if(!HuabanUtil.Download(client,src,path))
                        {
                            Console.WriteLine("第{0}张图下载失败!",curIndex);
                            LogFailure(errorPath,curIndex,src);
                        }
                    }
                }
            }) { IsBackground = true };
            threads[i].Start();
        }

        foreach(var t in threads)
        {
            t.Join();
        }
    }

    /// <summary>
    /// Appends one line ("time 第NNN张 url") to the error log.
    /// Appends are serialized because several workers may report failures concurrently.
    /// </summary>
    static void LogFailure(string errorPath,string curIndex,string src)
    {
        // e.g. 2014-7-23 20:13:38 第001张 http://xxx
        var line = "{0} 第{1}张 {2}".format(
            DateTime.Now.ToStringX(),
            curIndex,
            src
        ) + Environment.NewLine;

        lock(ErrorLogGate)
        {
            File.AppendAllText(errorPath,line);
        }
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Net; | |
using System.Text; | |
using System.Text.RegularExpressions; | |
namespace ImageDownLoader | |
{ | |
/// <summary>
/// Helpers for scraping a huaban.com board page: board title, author, image count,
/// the list of pins, and downloading a single image with retries.
/// </summary>
public class HuabanUtil
{
    // Page title: everything between "<title>" and the first "收集".
    static readonly Regex regexTitle = new Regex(@"<title>(?<title>[\s\S]*?)收集");//group['title']

    // Inside the title: "(<count>图)_@<username>".
    static readonly Regex regexInTitle = new Regex(@"\((?<count>\d+)图\)_@(?<username>[\s\S]*)");

    // Inside the text of one pin object.
    static readonly Regex regexPinId = new Regex(@"""pin_id""[\s]*?:[\s]*?(?<id>\d+)");
    static readonly Regex regexPinBucket = new Regex(@"""bucket"":""(?<bucket>\w+)""");
    static readonly Regex regexPinKey = new Regex(@"""key"":""(?<key>[\w_-]+)""");
    static readonly Regex regexPinType = new Regex(@"""type"":""image/(?<type>\w+)""");

    /// <summary>Returns the board title found in <paramref name="html"/>, or "" when there is no match.</summary>
    public static string FindTitle(string html)
    {
        return regexTitle.Match(html).Groups["title"].Value;
    }

    /// <summary>Returns the user name embedded in <paramref name="title"/>, or "" when there is no match.</summary>
    internal static string FindUsername(string title)
    {
        return regexInTitle.Match(title).Groups["username"].Value;
    }

    /// <summary>Returns the image count embedded in <paramref name="title"/> as text, or "" when there is no match.</summary>
    internal static string FindCount(string title)
    {
        return regexInTitle.Match(title).Groups["count"].Value;
    }

    /*
    Host tables as found in the site's scripts:
    img_host = {
        "hbimg": "img.hb.aicdn.com",
        "hbfile": "hbfile.b0.upaiyun.com/img/apps"
    }
    hbfile = {
        "hbfile": "hbfile.b0.upaiyun.com",
        "hbimg2": "hbimg2.b0.upaiyun.com"
    }
    */
    // Image servers, keyed by the pin's "bucket" value.
    static readonly Dictionary<string,string> imgHost = new Dictionary<string,string>() {
        { "hbimg", "img.hb.aicdn.com" },
        { "hbfile", "hbfile.b0.upaiyun.com/img/apps" }
    };

    // Not referenced by the code in this class; kept as a record of the second host table.
    static readonly Dictionary<string,string> hbFile = new Dictionary<string,string>() {
        { "hbfile", "hbfile.b0.upaiyun.com" },
        { "hbimg2", "hbimg2.b0.upaiyun.com" }
    };

    /// <summary>
    /// Lazily yields one tuple (pin id, image url, file extension) per pin object found
    /// in the <c>"pins":[...]</c> array of <paramref name="html"/>.
    /// Yields nothing when the page is empty or has no "pins" marker.
    /// </summary>
    /// <exception cref="KeyNotFoundException">
    /// Thrown during enumeration when a pin's bucket is not a key of the image host table.
    /// </exception>
    internal static IEnumerable<Tuple<string,string,string>> FindPins(string html)
    {
        if(string.IsNullOrEmpty(html))
        {
            yield break;
        }

        var pins_index = html.IndexOf("\"pins\""); //"pins":[{"pin_id
        if(pins_index < 0)
        {
            // Without the marker, Substring below would slice from a meaningless offset.
            yield break;
        }

        var remain = html.Substring(pins_index + 7); //[{...
        var end_index = Magicdawn.Util.StringFinder.GetSecondIndex(remain);
        remain = remain.Substring(0,end_index); // [...]

        // Cut the array text into one string per pin object.
        var pins_string = new List<string>();
        int left;
        while((left = remain.IndexOf('{')) > 0)
        {
            var right = Magicdawn.Util.StringFinder.GetSecondIndex(remain,left);
            var content = remain.Substring(left + 1,right - left);
            pins_string.Add(content);
            remain = remain.Substring(right);
        }

        foreach(var p_string in pins_string)
        {
            var id = regexPinId.Match(p_string).Groups["id"].Value;
            var bucket = regexPinBucket.Match(p_string).Groups["bucket"].Value;
            var key = regexPinKey.Match(p_string).Groups["key"].Value;
            var typeMatch = regexPinType.Match(p_string);

            var baseUrl = imgHost[bucket];
            var src = string.Format("http://{0}/{1}",baseUrl,key);

            // The type may be absent. Regex.Match never returns null, so test Success;
            // a failed match must keep the "jpg" default instead of producing an empty extension.
            var ext = "jpg";
            if(typeMatch.Success)
            {
                ext = GetFileExt(typeMatch.Groups["type"].Value);
            }

            yield return Tuple.Create(id,src,ext);
        }
    }

    /// <summary>
    /// Maps the sub-type of an "image/xxx" content type to a file extension:
    /// "jpeg" and "pjpeg" become "jpg", a missing value becomes "jpg",
    /// anything else is returned lower-cased.
    /// </summary>
    internal static string GetFileExt(string type)
    {
        if(string.IsNullOrEmpty(type))
        {
            return "jpg";
        }

        type = type.ToLowerInvariant();
        if(type == "jpeg" || type == "pjpeg")
        {
            return "jpg";
        }
        return type;
    }

    /// <summary>
    /// Downloads <paramref name="src"/> to <paramref name="path"/>. After a
    /// <see cref="WebException"/> the download is retried until the failure count
    /// exceeds <c>Config.MaxTryTimes</c>.
    /// </summary>
    /// <param name="client">Client used for the download.</param>
    /// <param name="src">Image URL.</param>
    /// <param name="path">Target file path.</param>
    /// <param name="times">Number of failures already counted; callers normally leave the default 0.</param>
    /// <returns>true when the file was downloaded, false when all attempts failed.</returns>
    public static bool Download(WebClient client,string src,string path,int times = 0)
    {
        while(true)
        {
            try
            {
                client.DownloadFile(src,path);
                return true;
            }
            catch(WebException)
            {
                times++;
                if(times > Config.MaxTryTimes)
                {
                    return false; // giving up
                }
                // otherwise: try again
            }
        }
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
其中的Magicdawn.Util.xxx 见本人的Magicdawn Library那个仓库