Created
August 15, 2017 03:58
-
-
Save majorgreys/ce3a19bee4147a0fedb897b76d668ad7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #load @".paket/load/net45/canopy.fsx" | |
| #load @".paket/load/net45/FSharp.Data.fsx" | |
| open FSharp.Data | |
| open canopy | |
| open OpenQA.Selenium | |
| open System.Text.RegularExpressions | |
| open FSharp.Data.HttpContentTypes | |
| // `choco install selenium-all-drivers` | |
| chromeDir <- "C:\\tools\\selenium" | |
| let baseUrl = "http://voices.revealdigital.com" | |
| let indexUrl = "/cgi-bin/independentvoices?a=cl&cl=CL1&sp=IBJBJF&ai=1&e=-------en-20--1--txt-txIN---------------1" | |
| let topLevel = HtmlDocument.Load(baseUrl + indexUrl) | |
| let issues = | |
| topLevel.CssSelect "li > a" | |
| |> List.map (fun a -> | |
| (baseUrl + a.AttributeValue("href"), a.InnerText().Trim())) | |
| let (|FirstRegexGroup|_|) pattern input = | |
| let m = Regex.Match(input,pattern) | |
| if (m.Success) then Some m.Groups.[1].Value else None | |
| let listPagePdfUrls issueUrl = | |
| let parsePageUrlForPageId pageUrl = | |
| match pageUrl with | |
| | FirstRegexGroup ".*&d=(.*)&" pageId -> | |
| Some pageId | |
| | _ -> None | |
| let issueId = parsePageUrlForPageId issueUrl | |
| // browse to url | |
| url issueUrl | |
| // give some time for page to load | |
| sleep 5 | |
| // list pdf urls by examining each issue page | |
| elements "td a" | |
| |> List.map (fun e -> e.GetAttribute "href") | |
| |> List.distinct | |
| |> List.map parsePageUrlForPageId | |
| |> List.choose id // remove None | |
| |> List.filter (fun e -> e.Equals(issueId.Value) |> not) // remove link to issue index page | |
| // urls to static pdfs are easy to construct once we have the page ids | |
| |> List.map (fun e -> | |
| (e, sprintf "http://voices.revealdigital.com/cgi-bin/independentvoices?a=is&oid=%s&type=staticpdf" e)) | |
| let listPdfUrls () = | |
| let pdfUrls = | |
| issues | |
| |> List.map (fun e -> listPagePdfUrls (fst e)) | |
| |> List.collect id | |
| pdfUrls | |
| let pdfUrls = listPdfUrls() | |
| System.IO.File.WriteAllLines("pdfurls.txt", pdfUrls |> List.map snd) | |
| let R = System.Random() | |
| start chrome | |
| System.IO.File.ReadAllLines("pdfurls.txt") | |
| |> Array.toList | |
| |> List.filter (fun e -> e.Contains("1963")) | |
| |> List.iter (fun e -> | |
| printfn "Downloading %s" e | |
| url e | |
| // Wait before proceeding to next file so as not to overwhelm the server with repeated requests | |
| sleep (R.Next(5, 15)) | |
| ) | |
| quit () |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment