Skip to content

Instantly share code, notes, and snippets.

View vmandic's full-sized avatar
🤠
chillin'

Vedran Mandić vmandic

🤠
chillin'
View GitHub Profile
@vmandic
vmandic / HzzoExcelParser.cs
Created October 25, 2018 16:51
meds-processor, p3, s2
public async Task<ISet<HzzoMedsDownloadDto>> Run(ISet<HzzoMedsDownloadDto> meds)
{
await Task.WhenAll(
// NOTE: due to excel docs designed in different ways, we do this separation of work
StartLongRunning(() => ParsePrimaryListsStartingWith2014_02(meds)),
StartLongRunning(() => ParseSupplementaryListsStartingWith2014_02(meds)),
StartLongRunning(() => ParsePrimaryListsUpTo2014_01(meds)),
StartLongRunning(() => ParseSupplementaryListsUpTo2014_01(meds))
);
@vmandic
vmandic / HzzoExcelParser.cs
Last active October 24, 2018 20:38
meds-processor, p3, s1
using System.Collections.Generic;
using System.Threading.Tasks;
using MedsProcessor.Common.Models;
namespace MedsProcessor.Parser
{
public class HzzoExcelParser
{
public Task<ISet<HzzoMedsDownloadDto>> Run(ISet<HzzoMedsDownloadDto> meds)
{
@vmandic
vmandic / AppController.cs
Created October 19, 2018 19:41
meds-processor, p2, s10
public async Task<ActionResult> Index(
[FromServices] HzzoHtmlScraper scraper,
[FromServices] HzzoExcelDownloader downloader)
{
var startTime = DateTime.Now;
var meds = await downloader.Run(await scraper.Run());
var totalTime = startTime - DateTime.Now;
return Ok(
$"Done! Handler duration: {totalTime.Duration()}" +
@vmandic
vmandic / HzzoMedsDownloadDto.cs
Last active October 19, 2018 19:59
meds-processor, p2, s9
public class HzzoMedsDownloadDto
{
private readonly string _rootLocation;
/// <summary>
/// Creates a download descriptor from a link, its validity date text and a root location.
/// </summary>
/// <param name="href">Value stored in <c>Href</c>.</param>
/// <param name="validFrom">Date text converted with <see cref="DateTime.Parse(string)"/> and stored in <c>ValidFrom</c>.</param>
/// <param name="rootLocation">Value kept in the private <c>_rootLocation</c> field.</param>
public HzzoMedsDownloadDto(string href, string validFrom, string rootLocation)
{
    Href = href;

    // NOTE(review): DateTime.Parse(string) uses the current culture; confirm the
    // caller always passes a format that parses the same way on every host.
    var parsedValidFrom = DateTime.Parse(validFrom);
    ValidFrom = parsedValidFrom;

    _rootLocation = rootLocation;
}
@vmandic
vmandic / HzzoHtmlScraper.cs
Created October 19, 2018 19:32
meds-processor, p2, s8
ISet<HzzoMedsDownloadDto> ParseMedsLiElements(IEnumerable<IElement> elems) =>
elems.Aggregate(new HashSet<HzzoMedsDownloadDto>(), (medsList, li) =>
{
var href = li.QuerySelector("a").GetAttribute("href");
// NOTE: this domain is not available, links don't work :-(
if (!href.Contains("cdn.hzzo.hr"))
{
var dtParts = li.TextContent.TrimEnd().Split(' ').LastOrDefault().Split('.');
var downloadDto = new HzzoMedsDownloadDto(
@vmandic
vmandic / HzzoHtmlScraper.cs
Last active October 19, 2018 19:32
meds-processor, p2, s7
readonly AppPathsInfo _appPathsInfo;
/// <summary>
/// Creates the scraper and keeps references to the supplied dependencies.
/// </summary>
/// <param name="browsingContext">Browsing context stored in <c>_browsingContext</c>.</param>
/// <param name="appPathsInfo">Path information stored in <c>_appPathsInfo</c>.</param>
public HzzoHtmlScraper(IBrowsingContext browsingContext, AppPathsInfo appPathsInfo)
{
    _browsingContext = browsingContext;
    _appPathsInfo = appPathsInfo;
}
@vmandic
vmandic / HzzoExcelDownloader.cs
Created October 19, 2018 19:19
meds-processor, p2, s6
/// <summary>
/// Requests the stream at <c>doc.Href</c> through <c>_httpCli</c> and attaches it to the document.
/// </summary>
/// <param name="doc">Document whose <c>Href</c> is requested and whose <c>DocumentStream</c> is set.</param>
/// <returns>The same <paramref name="doc"/> instance, with <c>DocumentStream</c> assigned.</returns>
async Task<HzzoMedsDownloadDto> DownloadExcel(HzzoMedsDownloadDto doc)
{
    var responseStream = await _httpCli.GetStreamAsync(doc.Href);
    doc.DocumentStream = responseStream;

    return doc;
}
static Task SaveExcel(HzzoMedsDownloadDto doc) =>
Task.Factory.StartNew(() =>
{
using(var fileStream = File.Create(doc.FilePath, BUFFER_SIZE, FileOptions.Asynchronous))
@vmandic
vmandic / HzzoExcelDownloader.cs
Last active October 19, 2018 19:16
meds-processor, p2, s5
public async Task<ISet<HzzoMedsDownloadDto>> Run(ISet<HzzoMedsDownloadDto> meds)
{
// NOTE: throttle requests in parallel
var parallelismDegree = 5;
var waitBetweenRequestsMs = 500;
var savingItems = new List<Task>();
var notDownloadedDocs = meds.Where(x => !x.IsAlreadyDownloaded).ToList();
for (int i = 0; i < notDownloadedDocs.Count; i += parallelismDegree)
@vmandic
vmandic / Startup.cs
Created October 18, 2018 19:03
meds-processor, p2, s4
public void ConfigureServices(IServiceCollection services)
{
// we need the http client factory:
services.AddHttpClient();
services.AddAngleSharp();
services.AddSingleton(
s => new AppPathsInfo(s.GetService<IHostingEnvironment>().ContentRootPath));
services.AddSingleton<HzzoHtmlScraper>();
@vmandic
vmandic / HzzoExcelDownloader.cs
Last active October 20, 2018 09:02
meds-processor, p2, s3
using System.Collections.Generic;
using System.IO;
using System.Net.Http;
using System.Threading.Tasks;
using MedsProcessor.Common.Models;
using static MedsProcessor.Common.Constants;
namespace MedsProcessor.Downloader
{
public class HzzoExcelDownloader