Created
November 17, 2015 23:54
-
-
Save evelinag/0ce68655f2aae1ecabcb to your computer and use it in GitHub Desktop.
Analysing box office success of James Bond films using HTML type provider
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#load "packages/FsLab/FsLab.fsx" | |
open FSharp.Data | |
open XPlot.GoogleCharts | |
// Address of the Wikipedia "List of James Bond films" page used as the data source.
// NOTE(review): the `oldid` query parameter presumably pins one fixed revision of the
// page, which the hard-coded row offsets further down would rely on - confirm.
// Declared as a literal so the same constant can serve both as the type provider's
// compile-time sample and as the address loaded at run time (previously the string
// was written out twice and the two copies could drift apart).
[<Literal>]
let bondUrl = "https://en.wikipedia.org/w/index.php?title=List_of_James_Bond_films&oldid=688916363"

// Typed view of the page's HTML tables, inferred from the sample at `bondUrl`.
type BondProvider = HtmlProvider<bondUrl>

// The page content, loaded from the same address the types were inferred from.
let bondWiki = BondProvider.Load(bondUrl)
// Per-film rows built from the "Box office" table of `bondWiki`, as tuples of
// (title, year, budget, box office, actor).
let boxOffice =
    // Text after the first '!' when the cell contains one, otherwise the cell unchanged.
    // NOTE(review): presumably the scraped cell carries a hidden sort key before the '!' - confirm.
    let afterBang (cell: string) =
        match cell |> Seq.tryFindIndex (fun ch -> ch = '!') with
        | None -> cell
        | Some bang -> cell.[bang + 1 ..]
    // Tail of the cell starting one character past its midpoint.
    // NOTE(review): presumably the cell holds the name twice (sort form, then display form) - confirm.
    let tailHalf (cell: string) = cell.[cell.Length / 2 + 1 ..]
    let scraped =
        bondWiki.Tables.``Box office``.Rows
        |> Seq.map (fun row ->
            row.Title, row.Year, row.Budget2, row.``Box office 2``, row.``Bond actor``)
        |> Array.ofSeq
    // The first row and the last two rows of the table are left out.
    scraped.[1 .. scraped.Length - 3]
    |> Array.map (fun (rawTitle, year, budget, gross, rawActor) ->
        let actor = tailHalf rawActor
        let title = afterBang rawTitle
        title, int year, float budget, float gross, actor)
// Per-film rows built from the "Reception and accolades" table of `bondWiki`,
// as tuples of (film, Rotten Tomatoes value as a float).
let rating =
    // Converts the text in front of the first '%' to a float.
    let percentValue (cell: string) =
        let cut = cell.IndexOf('%')
        float (cell.[0 .. cut - 1])
    let scraped =
        bondWiki.Tables.``Reception and accolades``.Rows
        |> Seq.map (fun row -> row.Film, row.``Rotten Tomatoes``)
        |> Array.ofSeq
    // The last row of the table is left out.
    scraped.[0 .. scraped.Length - 2]
    |> Array.map (fun (film, score) -> film, percentValue score)
// Display options for the bubble chart: chart title, axis titles, transparent
// bubble text, and the two colours used for the colour scale.
let options =
    Options(
        // Title typo corrected ("fims" -> "films").
        title = "Bond films - rating and box office",
        hAxis = Axis(title = "Year"),
        vAxis = Axis(title = "Box office (millions $)"),
        // Transparent text style so the labels are not drawn on top of the bubbles.
        bubble = Bubble(textStyle = TextStyle(color = "transparent")),
        colors = [| "red"; "gold" |]
    )
// Bubble chart with one entry per film. `boxOffice` and `rating` are paired up by
// position, so both arrays must have the same length and list films in the same order.
// Each entry is (label, year, box office, rating, budget), matching the labels below.
(boxOffice, rating)
||> Array.map2 (fun (film, year, budget, gross, actor) (_, score) ->
    film + " (" + actor + ")", year, gross, score, budget)
|> Chart.Bubble
|> Chart.WithLabels([ "Title"; "Year"; "Box office"; "Rating"; "Budget" ])
|> Chart.WithOptions(options)
// Use RProvider to replicate the plot from http://opiateforthemass.es/articles/james-bond-film-ratings/
open RProvider | |
open RProvider.ggplot2 | |
// Infix helper for composing ggplot2 layers: forwards both operands to R's `+`.
let (++) (plot1: RDotNet.SymbolicExpression) (plot2: RDotNet.SymbolicExpression) : RDotNet.SymbolicExpression =
    R.``+``(plot1, plot2)
// R data frame with one row per film. The first five columns come from `boxOffice`
// and "Rating" comes from `rating`, so the two arrays are expected to be aligned
// by position.
let df =
    let titles = boxOffice |> Array.map (fun (title, _, _, _, _) -> title)
    // Actor names are handed to R as a factor rather than as plain strings.
    let actors = boxOffice |> Array.map (fun (_, _, _, _, actor) -> actor) |> R.as_factor
    let years = boxOffice |> Array.map (fun (_, year, _, _, _) -> year)
    let budgets = boxOffice |> Array.map (fun (_, _, budget, _, _) -> budget)
    let grosses = boxOffice |> Array.map (fun (_, _, _, gross, _) -> gross)
    let scores = rating |> Array.map snd
    namedParams [
        "Title", box titles
        "Actor", box actors
        "Year", box years
        "Budget", box budgets
        "BoxOffice", box grosses
        "Rating", box scores
    ]
    |> R.data_frame
// R data frame with one row per actor: the actor's name plus the earliest and
// latest film year found for that actor in `boxOffice`. Grouping is by actor
// only, so every actor yields exactly one (YearMin, YearMax) span.
let dfActors =
    let spans =
        boxOffice
        |> Seq.groupBy (fun (_, _, _, _, actor) -> actor)
        |> Seq.map (fun (actor, films) ->
            let years = films |> Seq.map (fun (_, year, _, _, _) -> year)
            let first = Seq.min years
            let last = Seq.max years
            // An actor whose films all share one year gets the end pushed out by
            // one year, so the span is never zero-width.
            actor, first, (if first = last then last + 1 else last))
        |> Array.ofSeq
    let actors, firstYears, lastYears = Array.unzip3 spans
    namedParams [
        "Actor", box actors
        "YearMin", box firstYears
        "YearMax", box lastYears
    ]
    |> R.data_frame
// ggplot2 figure assembled layer by layer with the `++` helper.
R.ggplot()
// Layer 1: a semi-transparent rectangle per actor, from YearMin to YearMax
// horizontally and unbounded vertically, filled by actor.
++ R.geom__rect(
    namedParams [
        "data", box dfActors
        "mapping", box (
            R.aes__string(
                namedParams [
                    "xmin", box "YearMin"
                    "xmax", box "YearMax"
                    "ymin", box "-Inf"
                    "ymax", box "Inf"
                    "fill", box "Actor" ]))
        "alpha", box 0.3 ])
// Layer 2: actor names, rotated 90 degrees, placed at each span's first year
// and at the height of the largest box-office value.
++ R.geom__text(
    namedParams [
        "data", box dfActors
        "mapping", box (
            R.aes__string(
                namedParams [
                    "x", box "YearMin"
                    "y", box (boxOffice |> Array.map (fun (_, _, _, gross, _) -> gross) |> Array.max)
                    "label", box "Actor"
                    "angle", box 90
                    "hjust", box 1
                    "vjust", box 1 ]))
        "alpha", box 0.6
        "size", box 5 ])
// Layer 3: film titles, rotated 90 degrees, starting at y = 0 above each film's year.
++ R.geom__text(
    namedParams [
        "data", box df
        "mapping", box (
            R.aes__string(
                namedParams [
                    "x", box "Year"
                    "y", box 0
                    "label", box "Title"
                    "angle", box 90
                    "hjust", box 0
                    "vjust", box 0.5 ]))
        "size", box 4 ])
// Layer 4: one point per film - position from year and box office,
// size from budget, colour from rating.
++ R.geom__point(
    data = df,
    mapping = R.aes__string(
        namedParams [ "x", "Year"; "y", "BoxOffice"; "size", "Budget"; "colour", "Rating" ]))
// Colour gradient for the Rotten Tomatoes rating, from red (low) to green (high).
++ R.scale__colour__continuous(
    namedParams [ "low", "red"; "high", "green"; "name", "Rotten Tomatoes rating" ])
// Point-size range raised at the low end so small budgets remain readable.
++ R.scale__size__continuous(
    namedParams [
        "name", box "Budget (2005 mil. dollars)"
        "range", box [3; 10] ])
++ R.theme__bw()
++ R.theme(namedParams [ "plot.title", R.element__text(lineheight = 0.8, face = "bold") ])
// No legend for the actor fill colours.
++ R.guides(namedParams [ "fill", false ])
++ R.labs(
    namedParams [
        "title", "Box office results, budgets, and ratings of James Bond films\n"
        "x", ""
        "y", "Box office earnings (in 2005 mil. dollars)" ])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment