Skip to content

Instantly share code, notes, and snippets.

@Gutek
Created March 1, 2016 14:58
Show Gist options
  • Save Gutek/70d5b5223e5c35b5bbd5 to your computer and use it in GitHub Desktop.
Save Gutek/70d5b5223e5c35b5bbd5 to your computer and use it in GitHub Desktop.
Finds RSS feeds for blogs
open System.IO
open FSharp.Data
/// Reads the text file at `filePath` and returns all of its lines as an array.
let readLines (filePath: string) : string[] =
    File.ReadAllLines filePath
/// Builds one asynchronous HTML-load computation per URL in `links`.
/// The result is a lazy sequence of not-yet-started computations; the caller
/// decides how to run them.
let downloadWebAsync (links:seq<string>) =
    seq { for url in links -> HtmlDocument.AsyncLoad(url) }
/// Projects `link` elements to `(href, type)` string pairs.
/// Elements missing either attribute are skipped. The `href` lookup uses
/// `TryGetAttribute` because `Attribute` raises when the attribute is absent,
/// which would abort the scan on a `<link type="...">` without an `href`.
let castToOption (linkElements:seq<HtmlNode>) =
    linkElements
    |> Seq.choose (fun x ->
        match x.TryGetAttribute("type"), x.TryGetAttribute("href") with
        | Some t, Some href -> Some (href.Value(), t.Value())
        | _ -> None)
/// Returns the descendant elements of `doc` whose tag name is `link`.
let getLinkElements (doc:HtmlDocument) : seq<HtmlNode> =
    let tagNames = ["link"]
    doc.Descendants(tagNames)
/// Predicate over `(uri, type)` pairs: true when the type equals the RSS MIME
/// type, compared ordinally and ignoring case. The uri component is ignored.
let filterOnlyRss (_:string, t:string) =
    let rssMimeType = "application/rss+xml"
    t.Equals(rssMimeType, System.StringComparison.OrdinalIgnoreCase)
/// Predicate over `(uri, type)` pairs: true when the lower-cased uri does not
/// contain the substring "comments". The type component is ignored.
let filterNoComments (uri:string, _:string) =
    uri.ToLower().Contains("comments") |> not
/// URLs to scan, read from the relative path `urls.txt`.
/// Lines starting with `//` are treated as comments and dropped.
/// Blank or whitespace-only lines are dropped too: they are not URLs, and
/// passing them on would only produce a failing download.
/// This is a value, so the file is read once when the binding is initialised;
/// the filtering itself is lazy.
let getLinks =
    readLines "urls.txt"
    |> Seq.filter (fun line ->
        not (System.String.IsNullOrWhiteSpace line)
        // Ordinal comparison: "//" is a syntactic marker, not culture-sensitive text.
        && not (line.StartsWith("//", System.StringComparison.Ordinal)))
/// Downloads every page listed in `getLinks` in parallel and yields, for each
/// page that loaded, the `(uri, type)` pairs of its RSS `link` elements whose
/// uri does not mention comments.
/// A page that fails to download is reported on stderr and skipped, so one
/// bad URL no longer aborts the whole run.
/// This is a value: the downloads run once, when the binding is initialised.
let loadAll =
    // Per-document extraction pipeline, applied lazily to each loaded page.
    let rssLinksOf =
        getLinkElements
        >> castToOption
        >> Seq.filter filterOnlyRss
        >> Seq.filter filterNoComments
    // Materialise the URLs so each result can be paired with its source.
    let urls = Seq.toArray getLinks
    let results =
        urls
        |> downloadWebAsync
        // Async.Catch turns a raised exception into Choice2Of2, so a single
        // failure cannot fail the Async.Parallel computation as a whole.
        |> Seq.map Async.Catch
        |> Async.Parallel
        |> Async.RunSynchronously
    // Array.choose is eager, so every failure is reported exactly once.
    Array.zip urls results
    |> Array.choose (fun (url, result) ->
        match result with
        | Choice1Of2 doc -> Some doc
        | Choice2Of2 error ->
            eprintfn "Skipping %s: %s" url error.Message
            None)
    |> Seq.map rssLinksOf
//
/// Entry point: prints every distinct RSS feed URL found by `loadAll`, one per
/// line. `argv` is not used. Always returns exit code 0.
[<EntryPoint>]
let main argv =
    loadAll
    |> Seq.concat
    // Deduplicate on the URL alone. Only the URL is printed, and comparing whole
    // (uri, type) pairs would print the same URL twice when two pages declare
    // the type with different casing.
    |> Seq.map fst
    |> Seq.distinct
    |> Seq.iter (printfn "%s")
    0
@Gutek
Copy link
Author

Gutek commented Mar 1, 2016

Thanks, that works perfectly. Now I need to understand what's going on there :)

@orient-man
Copy link

Krzysiek przerobił świetnie, ale dorzucę jeszcze swoje "muśnięcia":

#r "./packages/FSharp.Data/lib/net40/FSharp.Data.dll"
open System
open System.IO
open FSharp.Data

/// Loads the text file at `filePath` and returns its lines as an array.
let readLines (filePath: string) : string[] =
    File.ReadAllLines filePath

/// Asynchronously loads the HTML document at `url`. The computation is wrapped
/// in `Async.Catch`, so a failure is returned as `Choice2Of2 exn` instead of
/// being raised.
let downloadWebAsync (url: string) =
    HtmlDocument.AsyncLoad(url) |> Async.Catch

/// Returns the descendant elements of `doc` whose tag name is `link`.
let getLinkElements (doc:HtmlDocument) =
    let tagNames = ["link"]
    doc.Descendants(tagNames)

/// Returns `Some (href, type)` for a `link` element carrying both attributes,
/// and `None` otherwise.
/// The `href` lookup uses `TryGetAttribute` because `Attribute` raises when the
/// attribute is absent. That exception would surface while the results are
/// being enumerated, outside the `Async.Catch` that guards the downloads.
let tryGetLinkInfo (link:HtmlNode) =
    match link.TryGetAttribute("type"), link.TryGetAttribute("href") with
    | Some t, Some href -> Some (href.Value(), t.Value())
    | _ -> None

/// Predicate over `(uri, type)` pairs: true when the type equals the RSS MIME
/// type, compared ordinally and ignoring case. The first component is ignored.
let filterOnlyRss (_, t:string) =
    let rssMimeType = "application/rss+xml"
    t.Equals(rssMimeType, StringComparison.OrdinalIgnoreCase)

/// Predicate over `(uri, type)` pairs: true when the lower-cased uri does not
/// contain the substring "comments". The second component is ignored.
let filterNoComments (uri:string, _) =
    uri.ToLower().Contains("comments") |> not

/// URLs to scan, read from a fixed file path; lines starting with `//` are
/// treated as comments and dropped. The file is read once, when this value is
/// initialised; the filtering is lazy.
let getLinks =
    let isComment (line: string) = line.StartsWith("//")
    readLines "h:/projekty/_robol/urls.txt"
    |> Seq.filter (isComment >> not)

/// Runs the download of every URL in `getLinks` in parallel and blocks until
/// all have finished. Returns one `downloadWebAsync` result per URL.
let loadAll () =
    let downloads = Seq.map downloadWebAsync getLinks
    Async.RunSynchronously (Async.Parallel downloads)

/// Extracts from `doc` the `(href, type)` pairs of its `link` elements,
/// keeping only those accepted by both `filterOnlyRss` and `filterNoComments`.
let findLinks (doc: HtmlDocument) =
    doc
    |> getLinkElements
    |> Seq.choose tryGetLinkInfo
    |> Seq.filter (fun info -> filterOnlyRss info && filterNoComments info)

/// Unwraps a two-case `Choice`: `Choice1Of2 d` becomes `Some d`; `Choice2Of2 e`
/// is reported and becomes `None`.
/// The report is written to stderr so that diagnostics do not get mixed into
/// the list of feed URLs the script prints on stdout.
let filterErrors = function
    | Choice1Of2 d -> Some d
    | Choice2Of2 e ->
        eprintfn "FAILED WITH: %A" e
        None

// Download all pages, drop the ones that failed, and print each distinct RSS
// feed URL once, one per line.
loadAll ()
|> Seq.choose filterErrors
|> Seq.collect findLinks
// Deduplicate on the URL alone. Only the URL is printed, and comparing whole
// (uri, type) pairs would print the same URL twice when two pages declare the
// type with different casing.
|> Seq.map fst
|> Seq.distinct
|> Seq.iter (printfn "%s")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment