Last active
December 20, 2015 01:08
-
-
Save ptrelford/6046430 to your computer and use it in GitHub Desktop.
Morning Dew
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open System.IO | |
open System.Text | |
open System.Xml | |
open System.Web | |
open FSharp.Data | |
open HtmlAgilityPack | |
/// Prints one CSV row for every link found in the "post-entry" div of a post page.
///
/// prefix - leading CSV fields, copied verbatim to the start of each row.
/// doc    - the post page loaded as an XML document.
///
/// Inside the first div whose class is "post-entry", each `h3` sets the current
/// section label and each `li` of a following `ul` whose first child is an `a`
/// element yields a row of the form
///     prefix,"label","href","text","author"
/// written to both the debug listeners and stdout. The author is taken from the
/// last parenthesised group after the link text, or is empty if there is none.
/// The div's inner XML is echoed to stdout before the rows.
///
/// A page without a "post-entry" div is reported on stderr and skipped; list
/// items whose link has no href are skipped.
let parse prefix (doc:XmlDocument) =
    // Doubles embedded quotes so a value can be wrapped in quotes as one CSV field.
    let quote (s:string) = s.Replace("\"", "\"\"")
    let isPostEntry (div:XmlNode) =
        match div.Attributes.["class"] with
        | null -> false
        | cls -> cls.Value = "post-entry"
    let content =
        doc.GetElementsByTagName("div")
        |> Seq.cast<XmlNode>
        |> Seq.tryFind isPostEntry
    match content with
    | None ->
        // Seq.find would throw here and abort the whole run; report and carry on.
        eprintfn "No post-entry div found for %s" prefix
    | Some content ->
        printfn "%s" content.InnerXml
        let mutable label = None
        for node in content.ChildNodes do
            if node.Name = "h3" then label <- Some node.InnerText
            if node.Name = "ul" then
                match label with
                | None -> ()
                | Some heading ->
                    for item in node.ChildNodes do
                        let a = item.FirstChild
                        if a <> null && a.Name = "a" then
                            match a.Attributes.["href"] with
                            | null -> () // a link without a target cannot be recorded
                            | href ->
                                // Decode both texts so the link text can actually be found
                                // inside the item text when it contains entities.
                                let linkText = HttpUtility.HtmlDecode(a.InnerText)
                                let itemText = HttpUtility.HtmlDecode(item.InnerText)
                                // String.Replace throws on an empty search string
                                // (e.g. an image-only link), so guard against it.
                                let remainder =
                                    if linkText = "" then itemText
                                    else itemText.Replace(linkText, "")
                                let remainder = remainder.Trim()
                                let index = remainder.LastIndexOf('(')
                                let author =
                                    if index = -1 then ""
                                    else remainder.Substring(index).TrimStart('(').TrimEnd(')')
                                let text = quote (linkText.Replace("\r","").Replace("\n",""))
                                let entry =
                                    sprintf "%s,\"%s\",\"%s\",\"%s\",\"%s\""
                                        prefix (quote heading) (quote href.Value) text (quote author)
                                System.Diagnostics.Debug.WriteLine(entry)
                                printfn "%s" entry
/// Downloads the page at `url` through HtmlAgilityPack's `LoadHtmlAsXml` and
/// returns the resulting XML text.
///
/// Three literal fragments are removed from the text before it is returned:
/// two `g:plusone` elements and one malformed list item. Presumably these are
/// removed because they would keep the text from loading as well-formed XML
/// (NOTE(review): confirm).
let toXml url =
    let web = HtmlWeb()
    // `use` disposes both writers once the result string has been built.
    use s = new StringWriter(StringBuilder())
    use writer = new XmlTextWriter(s)
    web.LoadHtmlAsXml(url, writer)
    // Push anything still buffered in the XML writer into the StringWriter
    // before reading it back.
    writer.Flush()
    s.ToString()
        .Replace("""<g:plusone size="medium" />""","")
        .Replace("""<g:plusone href="http://www.alvinashcraft.com/2013/07/19/dew-drop-july-19-2013-1588/"></g:plusone>""","")
        .Replace("""<li><a _hrere28093_the="" windows="" phone="" 7="" numbers="" that="" /> (Brandon Watson)</li>""","")
/// Path of the CSV index of posts. The same path is given to the CsvProvider
/// as its static argument and to `Posts.Load` at run time.
[<Literal>]
let path = "C:/morningdew/posts.csv"
type Posts = CsvProvider<path>
let posts = Posts.Load(path)

// For each indexed post: fetch the page as XML, load it into a fresh document
// and print its links as CSV rows prefixed with the post's number and date.
posts.Data
|> Seq.iter (fun post ->
    // Sample URL: "http://www.alvinashcraft.com/2011/03/31/dew-drop-March-31-2011/"
    let pageXml = toXml post.Url
    let document = XmlDocument()
    document.LoadXml(pageXml)
    let rowPrefix = sprintf "%d,%d,%d,%d" post.Number post.Year post.Month post.Day
    parse rowPrefix document)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open System | |
open System.Net | |
//http://www.alvinashcraft.com/2013/07/19/dew-drop-july-19-2013-1588/ | |
//http://www.alvinashcraft.com/2013/03/20/dew-drop-March-20-2013-1510/ | |
//http://www.alvinashcraft.com/2013/03/05/dew-drop-march-5-2013-1509/ | |
//1218,2011,12,13, | |
//http://www.alvinashcraft.com/2011/12/13/dew-drop-December-13-2011-1218/ | |
//http://www.alvinashcraft.com/2011/12/12/dew-drop-december-12-2011/ | |
// Dew Drop – March 5, 2013 (#1,509) | |
//http://www.alvinashcraft.com/2008/11/28/dew-drop-november-28-2008/ | |
//http://www.alvinashcraft.com/2008/06/23/dew-drop-June-23-2008/ | |
//http://www.alvinashcraft.com/2008/06/20/dew-droplet-june-20-2008/ | |
//http://www.alvinashcraft.com/2008/06/11/dew-drop-june-11-2008/ | |
//http://www.alvinashcraft.com/2008/03/08/dew-drop-March-8-2008/ | |
//http://www.alvinashcraft.com/2008/03/07/daily-bits-march-7-2008/ | |
//108,2007,12,10,http://www.alvinashcraft.com/2007/12/10/daily-bits-December-10-2007/ | |
//http://www.alvinashcraft.com/2007/12/07/daily-dose-of-links-20071207/ | |
//http://www.alvinashcraft.com/2007/11/21/daily-dose-of-links-20071121/ | |
//93,2007,11,2,http://www.alvinashcraft.com/2007/11/02/daily-dose-of-links-20071102/ | |
/// Builds the expected permalink of the post published on (year, month, day).
///
/// (year, month, day) - publication date of the post.
/// n                  - running post number; only used for the numbered slug.
///
/// The slug format depends on the date:
///   from 2011-12-13          dew-drop-<Month>-<d>-<yyyy>-<n>
///   2008-06-11 .. 2008-06-22 dew-droplet-<Month>-<d>-<yyyy>
///   from 2008-03-08          dew-drop-<Month>-<d>-<yyyy>
///   from 2007-12-10          daily-bits-<Month>-<d>-<yyyy>
///   earlier                  daily-dose-of-links-<yyyyMMdd>
let postUrl (year,month,day) n =
    let date = DateTime(year,month,day)
    // Use the invariant culture so the slug always carries the English month
    // name, whatever the locale of the machine running the script.
    let monthName = date.ToString("MMMM", System.Globalization.CultureInfo.InvariantCulture)
    // Numbered slugs start with post #1218 on 2011-12-13 (2011-12-12 is still unnumbered).
    if date >= DateTime(2011,12,13) then
        sprintf "http://www.alvinashcraft.com/%d/%02d/%02d/dew-drop-%s-%d-%d-%d/" year month day monthName day year n
    // NOTE(review): a sample URL for 2008-06-11 uses "dew-drop", not "dew-droplet" - confirm the lower bound.
    elif date < DateTime(2008,6,23) && date >= DateTime(2008,6,11) then
        sprintf "http://www.alvinashcraft.com/%d/%02d/%02d/dew-droplet-%s-%d-%d/" year month day monthName day year
    elif date >= DateTime(2008,3,8) then
        sprintf "http://www.alvinashcraft.com/%d/%02d/%02d/dew-drop-%s-%d-%d/" year month day monthName day year
    elif date >= DateTime(2007,12,10) then
        sprintf "http://www.alvinashcraft.com/%d/%02d/%02d/daily-bits-%s-%d-%d/" year month day monthName day year
    else
        sprintf "http://www.alvinashcraft.com/%d/%02d/%02d/daily-dose-of-links-%d%02d%02d/" year month day year month day
/// Requests `url` and reports whether the server answered with HTTP 200 (OK).
/// The URL is logged to stdout before the request is sent. Any exception
/// raised while getting the response is treated as "does not exist".
let postExists (url:String) =
    let request = WebRequest.CreateHttp(url)
    printfn "Requesting %s" url
    try
        use reply = request.GetResponse() :?> HttpWebResponse
        reply.StatusCode = HttpStatusCode.OK
    with _ ->
        false
/// Walks backwards through post numbers, probing one candidate date per call.
///
/// find  - date from which the search for post `n` restarts once `n` is settled.
/// n     - post number currently being looked for; the walk stops at 90.
/// date  - candidate publication date to probe for post `n`.
/// count - how many more earlier days may still be tried for this `n`.
///
/// A found post is printed as "OK n,year,month,day,url" (and sent to the debug
/// listeners), then the search moves on to `n - 1` starting the day before.
/// When the retries run out, "***No n" is printed and the search moves on to
/// `n - 1` from `find`.
let rec next (find:DateTime) n (date:DateTime) (count:int) =
    match n with
    | 90 -> printfn "Done"
    | _ ->
        // no posts for a few weeks before #1509, so its date is pinned
        let candidate =
            match n with
            | 1509 -> DateTime(2013,3,5)
            | _ -> date
        let url = postUrl (candidate.Year, candidate.Month, candidate.Day) n
        match postExists url, count > 0 with
        | true, _ ->
            let line = sprintf "%d,%d,%d,%d,%s" n candidate.Year candidate.Month candidate.Day url
            System.Diagnostics.Debug.WriteLine(line)
            printfn "OK %s" line
            // From 2007-12-05 the walk jumps straight back to 2007-11-21.
            let resumeFrom =
                if candidate = DateTime(2007,12,5) then DateTime(2007,11,21)
                else candidate.AddDays(-1.)
            next resumeFrom (n-1) resumeFrom 10
        | false, true ->
            next find n (candidate.AddDays(-1.)) (count-1)
        | false, false ->
            printfn "***No %d" n
            next find (n-1) find 10
// Starting point of the backwards walk: date and number of the newest post.
let date, n =
    let newestDate = DateTime(2013,7,19)
    let newestNumber = 1588
    newestDate, newestNumber
// Other starting points that can be swapped in:
//let date, n = (DateTime(2012,08,22)), 1173
//http://www.alvinashcraft.com/2011/12/13/dew-drop-December-13-2011-1218/
//let date, n = DateTime(2013,3,20), 1510
//let date, n = DateTime(2011,12,13), 1218
//let date, n = DateTime(2008,6,23), 267 // http://www.alvinashcraft.com/2008/06/23/dew-drop-June-23-2008/
//let date, n = DateTime(2008,3,8), 177 // http://www.alvinashcraft.com/2008/03/08/dew-drop-March-8-2008/
//let date, n = DateTime(2007,12,10), 108 // http://www.alvinashcraft.com/2007/12/10/daily-bits-December-10-2007/
//let date, n = DateTime(2007,12,5), 105 //http://www.alvinashcraft.com/2007/12/05/daily-dose-of-links-20071205/
// Search for post `n` from `date`, allowing 10 earlier days per post.
next date n date 10
// 1046 12/02
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment