Created
October 25, 2019 13:08
-
-
Save joe-oli/50d6daf118a40d003da3b833f60c32fa to your computer and use it in GitHub Desktop.
example crawler using HtmlAgilityPack
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Example crawler built on HtmlAgilityPack: walks a paginated listing and
/// collects one <see cref="Page"/> per offer found on each page.
/// </summary>
public static class OtoDomCrawler
{
    /// <summary>
    /// Loads <paramref name="url"/>, collects its offers, then follows the
    /// <c>a[data-dir='next']</c> link until no such link is found or a URL repeats.
    /// </summary>
    /// <param name="url">Address of the first listing page; <c>null</c> yields an empty list.</param>
    /// <returns>All offers gathered from the visited pages, in visit order.</returns>
    public static async Task<List<Page>> Search(string url)
    {
        var offers = new List<Page>();

        // One loader is enough for the whole crawl; no need to allocate per page.
        var web = new HtmlWeb();

        // Remember visited URLs so a "next" link that points back to an
        // already-crawled page cannot trap the loop forever.
        var visited = new HashSet<string>();

        while (url != null && visited.Add(url))
        {
            var html = (await web.LoadFromWebAsync(url)).DocumentNode;
            offers.AddRange(GetOffers(html));
            url = html.SelectSingleNode("//a[@data-dir='next']")?.GetAttributeValue("href", null);
        }

        return offers;
    }

    /// <summary>
    /// Yields one <see cref="Page"/> for every element whose class attribute is
    /// exactly <c>offer-item-details</c> under <paramref name="html"/>.
    /// </summary>
    /// <param name="html">Root node of the document to scan.</param>
    /// <returns>
    /// Lazily produced offers; <c>Link</c> and <c>Title</c> are <c>null</c> when the
    /// corresponding node or attribute is missing.
    /// </returns>
    private static IEnumerable<Page> GetOffers(HtmlNode html)
    {
        // SelectNodes returns null (not an empty collection) when nothing matches,
        // so a page without offers must be handled explicitly.
        var offerNodes = html.SelectNodes("//*[@class='offer-item-details']");
        if (offerNodes == null)
        {
            yield break;
        }

        foreach (var offer in offerNodes)
        {
            var title = offer.SelectSingleNode(".//*[@class='offer-item-title']");

            // The title element is expected to sit inside (or be) the anchor that links to the offer.
            var link = title?.AncestorsAndSelf("a").FirstOrDefault();

            yield return new Page
            {
                // Keep only the part of the href before the first '#' (drops the fragment).
                Link = link?.GetAttributeValue("href", null)?.Split('#')[0],
                Title = title?.InnerText
            };
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment