Skip to content

Instantly share code, notes, and snippets.

@SchlenkR
Created December 27, 2023 09:20
Show Gist options
  • Save SchlenkR/698e8eff4f448a106fd13f0202049591 to your computer and use it in GitHub Desktop.
Save SchlenkR/698e8eff4f448a106fd13f0202049591 to your computer and use it in GitHub Desktop.
#r "nuget: FsHttp"
#r "nuget: FSharp.Data"
open FsHttp
open FsHttp.Operators
open FSharp.Data
// Request URL for the WordPress.com `wpcom/v2/sites/15048173/articles` endpoint,
// written with one query parameter per line. Because this is an ordinary string
// literal, the line breaks between the parameters are part of the string value.
// NOTE(review): presumably FsHttp removes the line breaks before sending the
// request - confirm against the FsHttp URL documentation.
// NOTE(review): the doubled/repeated ampersands (before `categories` and
// `typeScale`) produce empty query segments; they look like leftovers from
// removed parameters - confirm the server ignores them.
let url = "https://public-api.wordpress.com/wpcom/v2/sites/15048173/articles
?className=is-style-borders
&showExcerpt=0
&excerptLength=55
&showReadMore=0
&readMoreLabel=Keep%20reading
&showDate=1
&showImage=1
&showCaption=1
&disableImageLazyLoad=0
&fetchPriority
&imageShape=landscape
&minHeight=0
&moreButton=1
&moreButtonText
&showAuthor=0
&showAvatar=1
&showCategory=0
&postLayout=grid
&columns=3
&colGap=3
&postsToShow=100000
&mediaPosition=top
&&categories%5B0%5D=121184674
&includeSubcategories=1
&&&&&typeScale=4
&imageScale=3
&mobileStack=0
&sectionHeader
&specificMode=0
&textColor
&customTextColor
&singleMode=0
&showSubtitle=0
&postType%5B0%5D=post
&textAlign=left
&includedPostStatuses%5B0%5D=publish
&deduplicate=1
&wpcom_site=15048173
&page=1
"
// Requests `url`, converts the response to JSON and returns the enumerator
// over the array stored in the top-level `items` property.
let posts =
    // `%` comes from FsHttp.Operators and is applied to the GET request for `url`.
    let response = % get url
    let json = Response.toJson response
    (json?items).EnumerateArray()
// Lazy sequence with the string value of the `html` property of every entry in `posts`.
let postHtmls =
    let htmlOf (post: System.Text.Json.JsonElement) = (post?html).GetString()
    posts |> Seq.map htmlOf
// One anonymous record per entry of `postHtmls`, holding:
//   imgSrc - `Some src` of the first <img> in the fragment, `None` if there is no <img>
//   href   - the `href` attribute of the post link
//   title  - the inner text of the post link
// The post link is the first <a> inside the first `div.entry-wrapper` found under
// `article.category-f-weekly`. The `.Head` calls on that path fail if nothing
// matches, which aborts the whole list construction.
let postInfos =
    [
        for i, htmlText in postHtmls |> Seq.indexed do
            // Progress output, one line per processed post.
            printfn $"iteration {i}"
            let html = HtmlDocument.Parse(htmlText)

            // A missing image is an expected case, so it is modelled with `tryHead`
            // rather than by catching whatever exception `.Head` raises; any other
            // failure is no longer silently turned into `None`.
            let imgSrc =
                html.CssSelect("img")
                |> Seq.tryHead
                |> Option.map (fun img -> img.AttributeValue("src"))

            let theLink =
                html
                    .CssSelect("article.category-f-weekly")
                    .CssSelect("div.entry-wrapper").Head
                    .CssSelect("a").Head

            let href = theLink.AttributeValue("href")
            let title = theLink.InnerText()
            {| imgSrc = imgSrc; href = href; title = title |}
    ]
// Markdown document built from `postInfos`: a fixed header (title, intro, stats,
// "Posts" heading) followed by a heading line and an image line for each post.
// All pieces are joined with a blank line between them.
let mdContent =
    let header =
        [
            "# Thank You Sergey :)"
            "This is a list of all the posts that were scraped from the [F# Weekly](https://sergeytihon.com/category/f-weekly/) blog."
            "## Some Stats"
            $"- {postInfos.Length} posts scraped"
            "## Posts"
        ]

    // Image tag for a post, with "&crop=1" rewritten to "&crop=0" in the rendered
    // tag; a placeholder text is used when the post has no image.
    let renderImage (imgSrc: string option) =
        match imgSrc with
        | Some x -> $"""<img src="{x}">""".Replace("&crop=1", "&crop=0")
        | None -> "No Image Available"

    let body =
        postInfos
        |> List.collect (fun post ->
            [
                $"## [{post.title}]({post.href})"
                renderImage post.imgSrc
            ])

    String.concat "\n\n" (header @ body)
// Writes the generated markdown to `posts.md` in the directory of this script,
// replacing the file's contents if it already exists.
do
    let outputPath = __SOURCE_DIRECTORY__ + "/posts.md"
    System.IO.File.WriteAllText(outputPath, mdContent)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment