Last active
August 22, 2016 00:58
-
-
Save katz/b9725cf1debdc33793eeb8a53edcc305 to your computer and use it in GitHub Desktop.
C#のScrapySharpでスクレイピングする / Web scraping with ScrapySharp in C#
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>title</title>
</head>
<body>
    <!-- Sample markup: an unordered list and a two-column key/value table. -->
    <ul>
        <li>hoge 1</li>
        <li>fuga 2</li>
        <li>piyo 3</li>
    </ul>
    <table>
        <tr>
            <td>weather</td>
            <td>sunny</td>
        </tr>
        <tr>
            <td>my location is</td>
            <td>Tokyo</td>
        </tr>
    </table>
</body>
</html>
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using HtmlAgilityPack; | |
using ScrapySharp.Extensions; | |
using ScrapySharp.Network; | |
using System.Linq; | |
using System.Text.RegularExpressions; | |
namespace ScrapySharpTest
{
    /// <summary>
    /// Console sample that downloads a page with ScrapySharp and extracts text
    /// from it using CSS selectors and an XPath query.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Fetches the target page, runs three queries against its DOM and writes
        /// each extracted text to standard output.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var browser = new ScrapingBrowser();
            browser.AllowAutoRedirect = true;
            browser.AllowMetaRedirect = true;

            // Fetch the target web page.
            var pageResult = browser.NavigateToPage(new Uri("http://example.com/page.html"));

            // Apply a CSS selector to the page and take the first matching DOM node.
            // For the sample page this yields "hoge 1".
            // Enumerable.First throws InvalidOperationException when nothing matches.
            var firstItemText = pageResult.Html.CssSelect("ul li").First().InnerText;
            Console.WriteLine(firstItemText);

            // Apply the same CSS selector, then use LINQ to pick the first node whose
            // inner text contains "fuga". For the sample page this yields "fuga 2".
            var fugaItemText = pageResult.Html
                .CssSelect("ul li")
                .First(elem => elem.InnerText.Contains("fuga"))
                .InnerText;
            Console.WriteLine(fugaItemText);

            // Use XPath to find the <td> that follows a <td> whose text contains
            // "location". For the sample page this yields "Tokyo".
            var locationCells = pageResult.Html.SelectNodes(
                "//td[contains(text(),'location')]/following-sibling::td");

            // HtmlAgilityPack's SelectNodes returns null (not an empty collection)
            // when the expression matches nothing, so guard before calling First().
            if (locationCells != null)
            {
                Console.WriteLine(locationCells.First().InnerText);
            }
            else
            {
                Console.Error.WriteLine("No <td> following a 'location' cell was found.");
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment