Created
May 20, 2020 07:35
-
-
Save sappho192/36af8e2ffd191b65e1ab768de677ff0d to your computer and use it in GitHub Desktop.
[C#] Webpage crawler with HtmlAgilityPack
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using HtmlAgilityPack; | |
using System; | |
using System.Net; | |
namespace WebCrawlerApp
{
    /// <summary>
    /// Console sample that downloads a web page and prints the text of its
    /// heading and paragraphs, located through HtmlAgilityPack XPath queries.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point: downloads the page, parses it, and prints the selected elements.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            const string PageUrl = @"http://example.com";

            string pageSource;
            try
            {
                pageSource = GetPageSource(PageUrl);
            }
            catch (WebException ex)
            {
                // A network failure is an expected condition for a crawler:
                // report it and signal failure instead of crashing with a stack trace.
                Console.Error.WriteLine($"Failed to download {PageUrl}: {ex.Message}");
                Environment.ExitCode = 1;
                return;
            }

            var pageHtml = new HtmlDocument();
            pageHtml.LoadHtml(pageSource);
            PrintHtmlElements(pageHtml);
        }

        /// <summary>
        /// Writes the text of the first <c>h1</c> and of every <c>p</c> found under
        /// <c>/html/body/div[1]</c> to standard output.
        /// </summary>
        /// <param name="pageHtml">The parsed HTML document to read from.</param>
        /// <exception cref="ArgumentNullException"><paramref name="pageHtml"/> is null.</exception>
        private static void PrintHtmlElements(HtmlDocument pageHtml)
        {
            if (pageHtml == null)
            {
                throw new ArgumentNullException(nameof(pageHtml));
            }

            // SelectSingleNode returns null when the XPath matches nothing,
            // so the node must be checked before reading InnerText.
            var headingNode = pageHtml.DocumentNode.SelectSingleNode(@"/html/body/div[1]/h1");
            if (headingNode != null)
            {
                Console.WriteLine($"[Title]\n{headingNode.InnerText}");
            }

            // SelectNodes returns null (not an empty collection) when nothing matches;
            // enumerating it directly would throw a NullReferenceException.
            var paragraphs = pageHtml.DocumentNode.SelectNodes(@"/html/body/div[1]/p");
            if (paragraphs == null)
            {
                return;
            }

            foreach (var pNode in paragraphs)
            {
                Console.WriteLine($"[Paragraph]\n{pNode.InnerText}");
            }
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns it as a string.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The downloaded page content.</returns>
        private static string GetPageSource(string url)
        {
            // WebClient is IDisposable; dispose it so its resources are released promptly.
            using (var webClient = new WebClient())
            {
                return webClient.DownloadString(url);
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment