Skip to content

Instantly share code, notes, and snippets.

@vmandic
Last active October 16, 2018 17:24
Show Gist options
  • Save vmandic/ef2d4b8c6e7a1555c040c4f88035262c to your computer and use it in GitHub Desktop.
Save vmandic/ef2d4b8c6e7a1555c040c4f88035262c to your computer and use it in GitHub Desktop.
meds-processor, part 1 of 4, snippet 9
/// <summary>
/// Parses every given HTML document and merges the resulting download DTOs
/// into a single set.
/// </summary>
/// <param name="docs">The HTML documents to parse.</param>
/// <returns>
/// A set holding the DTOs produced by <c>ParseHtmlDocument</c> for all documents;
/// empty when <paramref name="docs"/> is empty.
/// </returns>
ISet<HzzoMedsDownloadDto> ParseHtmlDocuments(IDocument[] docs) =>
    // Flatten all per-document results and build the set once, instead of
    // re-copying the whole accumulated set into a new HashSet per document.
    new HashSet<HzzoMedsDownloadDto>(docs.SelectMany(doc => ParseHtmlDocument(doc)));
/// <summary>
/// Builds the set of download DTOs described by the given list-item elements.
/// </summary>
/// <remarks>
/// For each element the <c>href</c> of its first anchor is read. Elements
/// without an anchor or without an <c>href</c> are skipped, as are links
/// pointing to <c>cdn.hzzo.hr</c>. A DTO is kept only when its
/// <c>ValidFrom</c> is later than <c>filterDtParsable2013</c>.
/// </remarks>
/// <param name="elems">The list-item elements to convert.</param>
/// <returns>The set of DTOs that passed the filters above.</returns>
static ISet<HzzoMedsDownloadDto> ParseMedsLiElements(IEnumerable<IElement> elems)
{
    var medsList = new HashSet<HzzoMedsDownloadDto>();

    foreach (var li in elems)
    {
        // An item without a link (or a link without an href) has nothing to
        // download, so skip it instead of failing with a NullReferenceException.
        var href = li.QuerySelector("a")?.GetAttribute("href");
        if (href == null)
        {
            continue;
        }

        // NOTE: this domain is not available, links don't work :-(
        if (href.Contains("cdn.hzzo.hr"))
        {
            continue;
        }

        // The second argument is the last space-separated token of the item's
        // trimmed text (presumably the list's date; confirm against the DTO).
        var downloadDto = new HzzoMedsDownloadDto(
            href,
            li.TextContent.TrimEnd().Split(' ').LastOrDefault(),
            DOWNLOAD_DIR
        );

        // NOTE: that's it folks, docs from 2013 and older are messed up
        // and can't be approached with this generic parser in this app.
        // A more sophisticated parser (more if/else branches...) would be needed.
        if (downloadDto.ValidFrom > filterDtParsable2013)
        {
            medsList.Add(downloadDto);
        }
    }

    return medsList;
}
/// <summary>
/// Parses a single HTML document into its set of download DTOs.
/// </summary>
/// <param name="doc">The HTML document to parse.</param>
/// <returns>
/// The result of <c>ParseMedsLiElements</c> applied to the list items that
/// <c>SelectLiElements</c> picks from <paramref name="doc"/>.
/// </returns>
static ISet<HzzoMedsDownloadDto> ParseHtmlDocument(IDocument doc)
{
    var medsListItems = SelectLiElements(doc);
    return ParseMedsLiElements(medsListItems);
}
/// <summary>
/// Selects the list items of a document that refer to a meds list.
/// </summary>
/// <param name="doc">The HTML document to query.</param>
/// <returns>
/// The elements matching the CSS selector <c>section#main &gt; ul li</c>
/// that also satisfy <c>_predicateForListLiMeds</c>, evaluated lazily.
/// </returns>
static IEnumerable<IElement> SelectLiElements(IDocument doc)
{
    var listItems = doc.QuerySelectorAll("section#main > ul li");
    return listItems.Where(_predicateForListLiMeds);
}
/// <summary>
/// Predicate that accepts an element when its text content mentions either
/// the primary or the supplementary meds list.
/// </summary>
static Func<IElement, bool> _predicateForListLiMeds =
    li =>
    {
        var text = li.TextContent;

        // primary list:
        if (text.Contains("Osnovna lista lijekova"))
        {
            return true;
        }

        // supplementary list:
        return text.Contains("Dopunska lista lijekova");
    };
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment