Skip to content

Instantly share code, notes, and snippets.

@yamasol
Created March 29, 2018 06:28
Show Gist options
  • Select an option

  • Save yamasol/f86d243baf1a64330b8a7339f7555d8f to your computer and use it in GitHub Desktop.

Select an option

Save yamasol/f86d243baf1a64330b8a7339f7555d8f to your computer and use it in GitHub Desktop.
PTT crawler
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using NEO_Crawler_PTT.Model;
using ZapLib;
using System.CodeDom;
namespace NEO_Crawler_PTT.Main
{
/// <summary>
/// Console crawler for the PTT C_Chat board. Walks a configurable number of
/// listing pages, prints every linked article (header fields, body text and
/// reader comments) and then follows the paging link to the next listing page.
/// </summary>
class Crawler
{
    /// <summary>Scheme and host prepended to the relative hrefs found in the pages.</summary>
    private const string SiteRoot = "https://www.ptt.cc";

    /// <summary>Listing page the crawl starts from.</summary>
    private const string StartPage = "https://www.ptt.cc/bbs/C_Chat/index.html";

    /// <summary>
    /// Crawls as many listing pages as the "page" configuration value says.
    /// For each page it prints every article link, calls <see cref="enter"/>
    /// on it, and then moves on to the page referenced by the second anchor
    /// of the paging button group.
    /// </summary>
    /// <remarks>
    /// The crawl stops early when a listing page cannot be loaded or when no
    /// further paging link is found. <c>Int32.Parse</c> still throws if the
    /// "page" setting is missing or not a number.
    /// </remarks>
    public void Run()
    {
        int page = Int32.Parse(Config.get("page"));
        HtmlWeb web = new HtmlWeb();
        string next = StartPage;

        for (int a = 0; a < page; a++) // crawl "page" listing pages
        {
            HtmlDocument htmlDoc = LoadDocument(web, next);
            if (htmlDoc == null)
            {
                // Without the listing page there is no way to find the next one.
                Console.WriteLine("fail");
                return;
            }
            Console.WriteLine("成功");

            // An XPath starting with "//" is evaluated from the document root,
            // so the query is issued on the document node directly.
            // SelectNodes returns null (not an empty collection) on no match.
            var links = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"title\"]//a");
            int s = 1; // article counter, restarts on every listing page
            if (links != null)
            {
                foreach (var n in links) // article title + link
                {
                    string href = n.GetAttributeValue("href", "");
                    if (string.IsNullOrEmpty(href))
                    {
                        continue; // anchor without a target: nothing to crawl
                    }

                    ModelPost post = new ModelPost();
                    post.name = n.InnerText;
                    post.link = SiteRoot + href;
                    Console.WriteLine("\n -----第 " + s + " 篇開始-----");
                    s++;
                    Console.WriteLine(post.name + "\n" + post.link);
                    enter(post.link); // print the article itself
                }
            }

            // Second anchor of the paging group; an anchor without href
            // (presumably a disabled button - TODO confirm) is ignored.
            string nextUrl = null;
            var pagers = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"btn-group btn-group-paging\"]//a[2]");
            if (pagers != null)
            {
                foreach (var n in pagers)
                {
                    string href = n.GetAttributeValue("href", "");
                    if (string.IsNullOrEmpty(href))
                    {
                        continue;
                    }
                    nextUrl = SiteRoot + href;
                    Console.WriteLine("Next: " + nextUrl);
                }
            }

            if (nextUrl == null)
            {
                // Loading a null URL would throw, so end the crawl here.
                Console.WriteLine("No next page link found; stopping.");
                return;
            }
            next = nextUrl;
        }
    }

    /// <summary>
    /// Downloads one article page and prints its author, title, date, body
    /// text and reader comments to the console.
    /// </summary>
    /// <param name="link">Absolute URL of the article page.</param>
    /// <returns>
    /// Always <c>null</c>; the method reports through the console only. The
    /// return type is kept so existing callers continue to compile.
    /// </returns>
    public static string enter(string link)
    {
        HtmlWeb web = new HtmlWeb();
        HtmlDocument htmlDoc = LoadDocument(web, link);
        if (htmlDoc == null)
        {
            Console.WriteLine("fail");
            return null;
        }
        Console.WriteLine(" ");

        HtmlNode top = htmlDoc.DocumentNode.SelectSingleNode("//div[@id=\"main-content\"]"); // whole article
        if (top == null)
        {
            // Not an article page (or an unexpected layout): nothing to print.
            Console.WriteLine("fail");
            return null;
        }

        ModelComment m_comment = new ModelComment();

        // The queries below are relative to "top", so every match is a direct
        // child of it and RemoveChild cannot fail with "not a child" errors.
        // Matched nodes are removed so that top.InnerText ends up holding only
        // the article body.
        var metaLines = top.SelectNodes("div[@class=\"article-metaline\"]"); // header rows
        if (metaLines != null)
        {
            foreach (var n in metaLines)
            {
                string label = SpanText(n, 1);
                string value = SpanText(n, 2);
                top.RemoveChild(n);
                switch (label)
                {
                    case "作者":
                        m_comment.maker = value;
                        break;
                    case "標題":
                        m_comment.title = value;
                        break;
                    case "時間":
                        m_comment.date = value;
                        break;
                }
            }
        }
        else
        {
            Console.WriteLine("-----此頁為公告 因此為空值-----");
        }

        var pushes = top.SelectNodes("div[@class=\"push\"]"); // reader comments
        if (pushes != null)
        {
            // Collect every comment; assigning inside the loop would keep
            // only the last one.
            StringBuilder allPushes = new StringBuilder();
            foreach (var n in pushes)
            {
                // spans 2..4 are concatenated into one line per comment
                string line = SpanText(n, 2) + SpanText(n, 3) + SpanText(n, 4);
                top.RemoveChild(n);
                allPushes.AppendLine(line.TrimEnd('\r', '\n'));
            }
            m_comment.recall = allPushes.ToString();
        }

        var boardLines = top.SelectNodes("div[@class=\"article-metaline-right\"]"); // board name row
        if (boardLines != null)
        {
            foreach (var n in boardLines)
            {
                top.RemoveChild(n);
            }
        }

        Console.WriteLine(" 作者:" + m_comment.maker + "\n 標題:" + m_comment.title + "\n 日期:" + m_comment.date);
        Console.WriteLine(top.InnerText);
        Console.WriteLine(m_comment.recall);
        Console.WriteLine("-----此篇已結束-----");
        return null;
    }

    /// <summary>
    /// Loads <paramref name="url"/> with <paramref name="web"/>; a network
    /// failure is reported on the console and turned into a <c>null</c> result
    /// so one unreachable page does not abort the whole crawl.
    /// </summary>
    private static HtmlDocument LoadDocument(HtmlWeb web, string url)
    {
        try
        {
            return web.Load(url);
        }
        catch (System.Net.WebException ex)
        {
            Console.WriteLine(ex.Message);
            return null;
        }
    }

    /// <summary>
    /// Returns the inner text of the <paramref name="index"/>-th (1-based)
    /// child span of <paramref name="node"/>, or an empty string if there is
    /// no such span.
    /// </summary>
    private static string SpanText(HtmlNode node, int index)
    {
        HtmlNode span = node.SelectSingleNode("span[" + index + "]");
        return span == null ? string.Empty : span.InnerText;
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment