Created
March 29, 2018 06:28
-
-
Save yamasol/f86d243baf1a64330b8a7339f7555d8f to your computer and use it in GitHub Desktop.
PTT crawler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| using HtmlAgilityPack; | |
| using System; | |
| using System.Collections.Generic; | |
| using System.Linq; | |
| using System.Text; | |
| using System.Threading.Tasks; | |
| using NEO_Crawler_PTT.Model; | |
| using ZapLib; | |
| using System.CodeDom; | |
| namespace NEO_Crawler_PTT.Main | |
| { | |
| class Crawler | |
| { | |
/// <summary>
/// Crawls the PTT C_Chat board index page by page. For every article link on a
/// page it prints the title and URL and hands the URL to <see cref="enter"/>;
/// it then follows the second paging button to the next index page.
/// The number of index pages to visit is read from the "page" config entry.
/// </summary>
public void Run()
{
    /* Site root; the hrefs found in the pages are relative and get this prefix */
    string uri = "https://www.ptt.cc";
    int page = Int32.Parse(Config.get("page"));
    HtmlWeb web = new HtmlWeb();
    string next = "https://www.ptt.cc/bbs/C_Chat/index.html";
    for (int a = 0; a < page; a++) // crawl the configured number of index pages
    {
        int s = 1; // article counter, restarted on every index page
        var htmlDoc = web.Load(next);
        if (htmlDoc == null)
        {
            // Nothing to parse, so stop instead of dereferencing a null document.
            Console.WriteLine("fail");
            break;
        }
        Console.WriteLine("成功");

        /* Find elements in the DOM tree by XPath.
           These expressions start with "//", so they search the whole document;
           they are therefore evaluated on the document node directly. */
        ModelPost m_post = new ModelPost();
        var link = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"title\"]//a");
        var nex = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"btn-group btn-group-paging\"]//a[2]");

        // SelectNodes yields null (not an empty collection) when nothing matches.
        if (link != null)
        {
            foreach (var n in link) // article title + link
            {
                string href = n.GetAttributeValue("href", "");
                if (href.Length == 0)
                {
                    continue; // anchor without a target: nothing to visit
                }
                m_post.name = n.InnerText;
                m_post.link = uri + href;
                Console.WriteLine("\n -----第 "+s+" 篇開始-----");
                s++;
                Console.WriteLine(m_post.name + "\n" + m_post.link);
                enter(m_post.link); // process the article itself
            }
        }

        if (nex != null)
        {
            foreach (var n in nex) // link to the next index page to crawl
            {
                // A disabled paging button carries no href; skip it.
                string href = n.GetAttributeValue("href", "");
                if (href.Length == 0)
                {
                    continue;
                }
                m_post.next = uri + href;
                Console.WriteLine("Next: " + m_post.next);
            }
        }

        if (string.IsNullOrEmpty(m_post.next))
        {
            // No further page to load; Load(null) would throw on the next round.
            Console.WriteLine("No next page link found, stopping.");
            break;
        }
        next = m_post.next;
    }
}
/// <summary>
/// Loads a single article page, extracts author / title / date from the
/// "article-metaline" rows and the reader comments from the "push" rows,
/// strips those rows from the article container and prints the metadata,
/// the remaining article text and the collected comments to the console.
/// </summary>
/// <param name="link">Absolute URL of the article page.</param>
/// <returns>Always <c>null</c>; the result is only written to the console.</returns>
public static string enter(string link)
{
    HtmlWeb web = new HtmlWeb();
    var htmlDoc = web.Load(link);
    if (htmlDoc == null)
    {
        Console.WriteLine("fail");
        return null;
    }
    Console.WriteLine(" ");

    var top = htmlDoc.DocumentNode.SelectSingleNode("//div[@id=\"main-content\"]"); // whole article
    if (top == null)
    {
        // Page without an article container: nothing to extract.
        Console.WriteLine("fail");
        return null;
    }

    // These XPaths start with "//" and therefore search the entire document.
    var maker = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"article-metaline\"]");        // header rows
    var titler = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"article-metaline-right\"]"); // board-name row
    var recall = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"push\"]");                   // reader comments

    ModelComment m_comment = new ModelComment();

    if (maker != null)
    {
        foreach (var n in maker)
        {
            string value = SpanText(n, 2);
            switch (SpanText(n, 1))
            {
                case "作者":
                    m_comment.maker = value;
                    break;
                case "標題":
                    m_comment.title = value;
                    break;
                case "時間":
                    m_comment.date = value;
                    break;
            }
            Detach(n); // keep the header row out of the printed article text
        }
    }
    else
    {
        Console.WriteLine("-----此頁為公告 因此為空值-----");
    }

    if (recall != null)
    {
        // Collect every comment; assigning inside the loop would keep only the last one.
        List<string> comments = new List<string>();
        foreach (var n in recall)
        {
            // span[2]..span[4] of each push row are concatenated in document order.
            string entry = SpanText(n, 2) + SpanText(n, 3) + SpanText(n, 4);
            comments.Add(entry.TrimEnd());
            Detach(n); // keep the comment row out of the printed article text
        }
        m_comment.recall = string.Join(Environment.NewLine, comments);
    }

    if (titler != null)
    {
        foreach (var n in titler)
        {
            Detach(n); // drop the board-name row from the printed article text
        }
    }

    Console.WriteLine(" 作者:"+m_comment.maker+"\n 標題:"+m_comment.title+"\n 日期:"+m_comment.date);
    Console.WriteLine(top.InnerText);
    Console.WriteLine(m_comment.recall);
    Console.WriteLine("-----此篇已結束-----");
    return null;
}

/// <summary>Returns the inner text of the position-th direct span child, or "" when it is missing.</summary>
private static string SpanText(HtmlNode parent, int position)
{
    var span = parent.SelectSingleNode("span[" + position + "]");
    return span == null ? "" : span.InnerText;
}

/// <summary>Removes a node from whatever parent it currently has; does nothing if it has none.</summary>
private static void Detach(HtmlNode node)
{
    if (node.ParentNode != null)
    {
        node.Remove();
    }
}
| } | |
| } | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment