-
-
Save paralax/d1d9c2b236055d3285fa22b339c419a8 to your computer and use it in GitHub Desktop.
small HTML table scraper and demo in F#
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| open System | |
| open System.Net | |
| open System.Text | |
| // https://social.msdn.microsoft.com/Forums/en-US/5a26bb89-c0e4-4ca4-b0c7-220c5fe1f495/how-to-get-a-html-table-using-regex?forum=regexp | |
| (* | |
| let table_pattern = "<table.*?>(.*?)</table>" | |
| let tr_pattern = "<tr.*?>(.*?)</tr>" | |
| let td_pattern = "<td.*?>(.*?)</td>" | |
| *) | |
/// Returns the inner HTML of every `<tag ...>...</tag>` element found in `html`.
///
/// Line breaks are stripped from `html` before matching, so the returned
/// fragments never contain "\n" or "\r". Matching is case-insensitive and
/// non-greedy: each match ends at the first closing tag, so nested elements
/// of the same tag are not balanced.
///
/// tag  - element name to look for, e.g. "table", "tr" or "td".
/// html - markup to search.
let tagData (tag:string) (html:string): string list =
    // '.' does not match a newline by default, so flatten the input first.
    let flattened = html.Replace("\n", "").Trim().Replace("\r", "")
    // Escape the tag so it is matched literally, and require a word boundary
    // after it so that "tr" does not also open on "<track>" (or "th" on "<thead>").
    let name = RegularExpressions.Regex.Escape tag
    let pattern = String.Format(@"<{0}\b[^>]*>(.*?)</{0}\s*>", name)
    [ for m in RegularExpressions.Regex.Matches(flattened, pattern, RegularExpressions.RegexOptions.IgnoreCase)
        -> m.Groups.Item(1).Value ]
/// Inner HTML of each `table` element in `html`, as extracted by `tagData`.
let tables(html:string): string list =
    html |> tagData "table"
/// Inner HTML of each `tr` element in `html`, as extracted by `tagData`.
let rows(html:string):string list =
    html |> tagData "tr"
/// Inner HTML of each `td` element in `html`, as extracted by `tagData`.
let cells(html:string): string list =
    html |> tagData "td"
/// Removes every `<...>` tag from `html`, keeping only the text between tags.
let stripHtml(html:string): string =
    // Anything from '<' up to the next '>' counts as a tag.
    let anyTag = RegularExpressions.Regex("<[^>]*>")
    anyTag.Replace(html, "")
/// Prints the min/avg/max of `latencies` for `location`, followed by a
/// verdict line: "Looks like a bad day on the net" when the largest latency
/// exceeds `threshhold`, otherwise "All OK".
let output (location:string) (latencies:float list) (threshhold:float): unit =
    let lowest = List.min latencies
    let mean = List.average latencies
    let highest = List.max latencies
    printfn "%s min/avg/max = %f/%f/%f" location lowest mean highest
    if highest > threshhold then
        printfn "Looks like a bad day on the net"
    else
        printfn "All OK"
/// Entry point: downloads the latency page from verizonenterprise.com,
/// reads the rows of its second HTML table and prints a min/avg/max summary
/// plus a verdict for each row via `output`.
///
/// Returns exit code 0 on completion. Download failures, a page with an
/// unexpected layout, or cells that do not parse as numbers surface as
/// unhandled exceptions.
[<EntryPoint>]
let main args =
    // `use` disposes the WebClient when main returns.
    use wc = new WebClient()
    let html = wc.DownloadString("http://www.verizonenterprise.com/about/network/latency/")
    // Reduce the page to table -> row -> cell text with all markup removed.
    let cellText =
        tables html
        |> List.map (fun table ->
            rows table
            |> List.map (fun row -> cells row |> List.map stripHtml))
    cellText
    |> List.tail    // skip the first table on the page...
    |> List.head    // ...and work on the second one
    |> List.skip 3  // drop the first three rows (presumably headers -- confirm against the page)
    |> List.map (fun row ->
        let location = List.head row
        let latencies = row |> List.tail |> List.map float
        // The threshold is the decimal number embedded in the first cell.
        // The dot is escaped so only a real decimal point matches; if no
        // number is present the empty capture makes `float` raise.
        let threshold =
            RegularExpressions.Regex.Match(location, @"(\d+\.\d+)").Groups.Item(1).Value |> float
        location, latencies, threshold)
    |> List.iter (fun (location, latencies, threshold) -> output location latencies threshold)
    0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment