Skip to content

Instantly share code, notes, and snippets.

@joe-oli
Created October 25, 2019 13:08
Show Gist options
  • Save joe-oli/50d6daf118a40d003da3b833f60c32fa to your computer and use it in GitHub Desktop.
Save joe-oli/50d6daf118a40d003da3b833f60c32fa to your computer and use it in GitHub Desktop.
Example crawler using HtmlAgilityPack.
/// <summary>
/// Crawls paginated listing pages with HtmlAgilityPack and produces one
/// <see cref="Page"/> per offer element found on each page.
/// </summary>
public static class OtoDomCrawler
{
    /// <summary>
    /// Loads the page at <paramref name="url"/>, collects its offers, and keeps following the
    /// anchor marked <c>data-dir='next'</c> until a page has no such anchor (or its target was
    /// already visited).
    /// </summary>
    /// <param name="url">Address of the first page to load. When <c>null</c>, nothing is loaded.</param>
    /// <returns>The offers from every visited page, in visiting order.</returns>
    public static async Task<List<Page>> Search(string url)
    {
        var offers = new List<Page>();

        // One loader is enough for the whole crawl; no need to allocate one per page.
        var web = new HtmlWeb();

        // Remember visited addresses so a "next" link that points back to an earlier page
        // cannot keep the loop running forever.
        var visited = new HashSet<string>();

        while (url != null && visited.Add(url))
        {
            var html = (await web.LoadFromWebAsync(url)).DocumentNode;
            offers.AddRange(GetOffers(html));

            var nextHref = html.SelectSingleNode("//a[@data-dir='next']")?.GetAttributeValue("href", null);
            url = ResolveNextUrl(url, nextHref);
        }

        return offers;
    }

    /// <summary>
    /// Turns the raw <c>href</c> of the "next" anchor into the address to load next.
    /// </summary>
    /// <param name="currentUrl">Address of the page the anchor was found on.</param>
    /// <param name="nextHref">Raw attribute value; may be <c>null</c>, blank, relative or absolute.</param>
    /// <returns>
    /// <c>null</c> when there is no usable link; otherwise the link resolved against
    /// <paramref name="currentUrl"/>, or the raw value if it cannot be resolved.
    /// </returns>
    private static string ResolveNextUrl(string currentUrl, string nextHref)
    {
        // A missing or blank href means there is no further page.
        if (string.IsNullOrWhiteSpace(nextHref))
        {
            return null;
        }

        // Relative links (e.g. "?page=2") must be combined with the current page address;
        // absolute links pass through this unchanged apart from URI normalisation.
        if (System.Uri.TryCreate(currentUrl, System.UriKind.Absolute, out var baseUri)
            && System.Uri.TryCreate(baseUri, nextHref.Trim(), out var resolved))
        {
            return resolved.AbsoluteUri;
        }

        return nextHref;
    }

    /// <summary>
    /// Lazily yields one <see cref="Page"/> for every element whose <c>class</c> attribute is
    /// exactly <c>offer-item-details</c>.
    /// </summary>
    /// <param name="html">Root node of the loaded document.</param>
    /// <returns>
    /// The offers on the page; empty when no element matches. <c>Link</c> and <c>Title</c> are
    /// <c>null</c> when the corresponding title element or enclosing anchor is missing.
    /// </returns>
    private static IEnumerable<Page> GetOffers(HtmlNode html)
    {
        // HtmlAgilityPack's SelectNodes returns null, not an empty collection, when nothing matches.
        var offerNodes = html.SelectNodes("//*[@class='offer-item-details']");
        if (offerNodes == null)
        {
            yield break;
        }

        foreach (var offer in offerNodes)
        {
            var title = offer.SelectSingleNode(".//*[@class='offer-item-title']");

            // The title element is expected to be an anchor itself or sit inside one.
            var link = title?.AncestorsAndSelf("a").FirstOrDefault();
            var href = link?.GetAttributeValue("href", null);

            yield return new Page
            {
                // Keep only the part before the fragment marker.
                Link = href?.Split('#')[0],
                Title = title?.InnerText
            };
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment