mathias-brandewinder · December 7, 2016 08:27 · tianke0711 · Dec 7, 2016
diff --git a/gradient-boosting-1.fsx b/gradient-boosting-1.fsx
 // blog post: brandewinder.com/2016/08/06/gradient-boosting-part-1
 // https://en.wikipedia.org/wiki/Gradient_boosting#Algorithm

 (*
 Exploring the dataset
 *)

 #I "./packages/"
 #r "fsharp.data/lib/net40/fsharp.data.dll"
 open FSharp.Data
 #r "xplot.googlecharts/lib/net45/xplot.googlecharts.dll"
 #r "google.datatable.net.wrapper/lib/google.datatable.net.wrapper.dll"
 open XPlot.GoogleCharts

 // Type provider over the red wine CSV file: ";" is passed as the separator,
 // and InferRows=1500 is the number of rows used to infer the column types.
 type Wine = CsvProvider<"data/winequality-red.csv",";",InferRows=1500>

 // the dataset, loaded from the same file the schema is inferred from
 let reds = Wine.Load("data/winequality-red.csv")

 // an observation is one row of the dataset
 type Observation = Wine.Row

 // a feature extracts a float value from an observation
 type Feature = Observation -> float

 // alcohol column of the wine, converted to float
 let ``Alcohol Level`` : Feature =
    fun wine -> float wine.Alcohol

 // volatile acidity column of the wine, converted to float
 let ``Volatile Acidity`` : Feature =
    fun wine -> float wine.``Volatile acidity``

 // fixed acidity column of the wine, converted to float
 let ``Fixed Acidity`` : Feature =
    fun wine -> float wine.``Fixed acidity``

 // marker settings shared by the scatter plots
 let options = Configuration.Options()
 options.dataOpacity <- 0.25
 options.pointSize <- 10

 // scatter plot of one feature (x axis) against wine quality (y axis);
 // name is used for both the chart title and the x axis title
 let plotAgainstQuality (name:string) (feature:Feature) =
    reds.Rows
    |> Seq.map (fun wine -> feature wine, wine.Quality)
    |> Chart.Scatter
    |> Chart.WithOptions options
    |> Chart.WithTitle (name + " vs. Quality")
    |> Chart.WithXTitle name
    |> Chart.WithYTitle "Quality"
    |> Chart.Show

 plotAgainstQuality "Alcohol Level" ``Alcohol Level``

 plotAgainstQuality "Volatile Acidity" ``Volatile Acidity``

 plotAgainstQuality "Fixed Acidity" ``Fixed Acidity``

 (*
 Stumps
 *)

 // a labelled observation: the wine, and the value to predict
 type Example = Observation * float

 // a predictor maps an observation to a predicted value
 type Predictor = Observation -> float

 // Learns a stump for a feature and threshold: the returned predictor gives
 // the average label of the examples at or under the threshold, or the
 // average label of the examples over it. Both averages are computed once,
 // as soon as the threshold is supplied.
 let learnStump (sample:Example seq) (feature:Feature) threshold =
    // average label of the examples whose feature value passes the test
    let averageWhere test =
        sample
        |> Seq.filter (fst >> feature >> test)
        |> Seq.averageBy snd
    let under = averageWhere (fun value -> value <= threshold)
    let over = averageWhere (fun value -> value > threshold)
    fun obs ->
        match feature obs <= threshold with
        | true -> under
        | false -> over

 // every wine, paired with its quality converted to float
 let redSample =
    reds.Rows
    |> Seq.map (fun wine -> wine, float wine.Quality)

 // a stump splitting the sample at an alcohol level of 11.0
 let testStump = learnStump redSample ``Alcohol Level`` 11.0

 // (alcohol level, value predicted by testStump) for every example
 let predicted =
    seq { for (obs,_) in redSample -> ``Alcohol Level`` obs, testStump obs }

 // sorted by alcohol level, so that the line is drawn left to right
 Seq.sortBy fst predicted
 |> Chart.Line
 |> Chart.WithTitle "Alcohol Level vs. Quality"
 |> Chart.WithXTitle "Alcohol Level"
 |> Chart.WithYTitle "Quality"
 |> Chart.Show

 (*
 Picking the best stump
 2 issues: how to measure the quality of a stump, and which splits to try
 *)

 // Sum, over the sample, of the squared difference between each label
 // and the value the predictor returns for that observation.
 let sumOfSquares (sample:Example seq) predictor =
    let squaredError (obs,lbl) = pown (lbl - predictor obs) 2
    Seq.sumBy squaredError sample

 sumOfSquares redSample testStump

 // Candidate thresholds for a feature, spaced by (max - min) / (n + 1),
 // starting one step above the smallest feature value in the sample and
 // ending one step below the largest.
 let evenSplits (sample:Example seq) (feature:Feature) (n:int) =
    let values = Seq.map (fun (obs,_) -> feature obs) sample
    let lowest = Seq.min values
    let highest = Seq.max values
    let width = (highest - lowest) / float (n + 1)
    [ lowest + width .. width .. highest - width ]

 // candidate thresholds on alcohol level
 let alcoholSplits = evenSplits redSample ``Alcohol Level`` 10

 // one stump per candidate split; keep the one with the smallest
 // sum of squared errors over the sample
 let bestStump =
    let candidates =
        [ for split in alcoholSplits -> learnStump redSample ``Alcohol Level`` split ]
    candidates
    |> List.minBy (fun stump -> sumOfSquares redSample stump)

 sumOfSquares redSample bestStump

 // predictions of the best stump, as a function of alcohol level
 seq { for (obs,_) in redSample -> ``Alcohol Level`` obs, bestStump obs }
 |> Seq.sortBy fst
 |> Chart.Line
 |> Chart.WithTitle "Alcohol Level vs. Quality"
 |> Chart.WithXTitle "Alcohol Level"
 |> Chart.WithYTitle "Quality"
 |> Chart.Show

 (*
 Analyzing the residuals    
 *)

 // (alcohol level, residual) for every example, where the residual is
 // the label minus the value predicted by bestStump
 let alcoholResiduals =
    redSample
    |> Seq.map (fun (obs,lbl) -> ``Alcohol Level`` obs, lbl - bestStump obs)

 alcoholResiduals
 |> Chart.Scatter
 |> Chart.WithOptions options
 |> Chart.WithTitle "Alcohol Level vs. Residuals"
 |> Chart.WithXTitle "Alcohol Level"
 |> Chart.WithYTitle "Residuals"
 |> Chart.Show

 // alternate chart: one point per distinct alcohol level,
 // showing the average residual at that level
 alcoholResiduals
 |> Seq.groupBy fst
 |> Seq.map (fun (level,group) -> level, Seq.averageBy snd group)
 |> Chart.Scatter
 |> Chart.WithOptions options
 |> Chart.WithTitle "Alcohol Level vs. Residuals"
 |> Chart.WithXTitle "Alcohol Level"
 |> Chart.WithYTitle "Residuals"
 |> Chart.Show

 (*
 Fitting another stump on the residuals
 *)

 // what bestStump leaves unexplained: each label minus its prediction
 let residualsSample =
    redSample
    |> Seq.map (fun (obs,lbl) -> obs, lbl - (obs |> bestStump))

 // Stump fitted to the residuals. Candidates are trained on residualsSample,
 // so they must also be compared on their error against residualsSample:
 // scoring them against the original labels would pick the stump by how
 // close its residual-sized predictions are to the quality values.
 let residualsStump = 
    alcoholSplits 
    |> List.map (learnStump residualsSample ``Alcohol Level``)
    |> List.minBy (sumOfSquares residualsSample)

 // first stump, corrected by the stump learnt on its residuals
 let combined = fun obs -> bestStump obs + residualsStump obs

 sumOfSquares redSample combined 

 redSample
 |> Seq.map (fun (obs,value) -> (``Alcohol Level`` obs, obs |> combined))
 |> Seq.sortBy fst
 |> Chart.Line
 |> Chart.WithTitle "Alcohol Level vs. Quality"
 |> Chart.WithXTitle "Alcohol Level"
 |> Chart.WithYTitle "Quality"
 |> Chart.Show

 // residuals of the combined predictor
 redSample
 |> Seq.map (fun (obs,lbl) -> ``Alcohol Level`` obs, lbl - (obs |> combined))
 |> Chart.Scatter
 |> Chart.WithOptions options
 |> Chart.WithTitle "Alcohol Level vs. Residuals"
 |> Chart.WithXTitle "Alcohol Level"
 |> Chart.WithYTitle "Residuals"
 |> Chart.Show

 (*
 Iteratively adding stumps
 *)

 // Boosting with stumps on a single feature: starting from a constant
 // predictor (the average label), repeatedly fit the best stump to the
 // residuals of the current predictor and add it. depth is the number
 // of stumps added.
 let learn (sample:Example seq) (feature:Feature) (depth:int) =

    // candidate thresholds, computed once from the full sample
    let splits = evenSplits sample feature 10

    // adds one stump per remaining round on top of the current predictor
    let rec boost remaining (current:Observation -> float) =
        match remaining with
        | 0 -> current
        | _ ->
            // what the current predictor leaves unexplained
            let residuals =
                sample
                |> Seq.map (fun (obs,y) -> obs, y - current obs)
            // stump with the smallest squared error against the residuals
            let stump =
                splits
                |> Seq.map (fun split -> learnStump residuals feature split)
                |> Seq.minBy (fun candidate -> sumOfSquares residuals candidate)
            // stack the stump on the current predictor, and keep going
            boost (remaining - 1) (fun obs -> current obs + stump obs)

    // start from a predictor that always returns the average label
    let baseValue = sample |> Seq.averageBy snd

    boost depth (fun (_:Observation) -> baseValue)

 // boosted model: 10 stumps on alcohol level
 let model = learn redSample ``Alcohol Level`` 10

 sumOfSquares redSample model

 // predictions of the model, as a function of alcohol level
 seq { for (obs,_) in redSample -> ``Alcohol Level`` obs, model obs }
 |> Seq.sortBy fst
 |> Chart.Line
 |> Chart.WithTitle "Alcohol Level vs. Quality"
 |> Chart.WithXTitle "Alcohol Level"
 |> Chart.WithYTitle "Quality"
 |> Chart.Show

 // increasing depth: error on the sample for 1 to 15 stumps

 seq { for depth in 1 .. 15 ->
        depth, sumOfSquares redSample (learn redSample ``Alcohol Level`` depth) }
 |> Chart.Column
 |> Chart.Show
diff --git a/gradient-boosting-2.fsx b/gradient-boosting-2.fsx
 // blog post: brandewinder.com/2016/08/14/gradient-boosting-part-2
 //https://en.wikipedia.org/wiki/Gradient_boosting#Algorithm

 (*
 Exploring the dataset
 *)

 #I "./packages/"
 #r "fsharp.data/lib/net40/fsharp.data.dll"
 open FSharp.Data
 #r "xplot.googlecharts/lib/net45/xplot.googlecharts.dll"
 #r "google.datatable.net.wrapper/lib/google.datatable.net.wrapper.dll"
 open XPlot.GoogleCharts

 // Type provider over the red wine CSV file: ";" is passed as the separator,
 // and InferRows=1500 is the number of rows used to infer the column types.
 type Wine = CsvProvider<"data/winequality-red.csv",";",InferRows=1500>

 // the dataset, loaded from the same file the schema is inferred from
 let reds = Wine.Load("data/winequality-red.csv")

 // an observation is one row of the dataset
 type Observation = Wine.Row

 // a feature extracts a float value from an observation
 type Feature = Observation -> float

 // one feature per input column, each converted to float

 let ``Alcohol Level`` : Feature =
    fun wine -> float wine.Alcohol

 let ``Chlorides`` : Feature =
    fun wine -> float wine.Chlorides

 let ``Citric Acid`` : Feature =
    fun wine -> float wine.``Citric acid``

 let ``Density`` : Feature =
    fun wine -> float wine.Density

 let ``Fixed Acidity`` : Feature =
    fun wine -> float wine.``Fixed acidity``

 let ``Free Sulfur Dioxide`` : Feature =
    fun wine -> float wine.``Free sulfur dioxide``

 let ``PH`` : Feature =
    fun wine -> float wine.PH

 let ``Residual Sugar`` : Feature =
    fun wine -> float wine.``Residual sugar``

 let ``Total Sulfur Dioxide`` : Feature =
    fun wine -> float wine.``Total sulfur dioxide``

 let ``Volatile Acidity`` : Feature =
    fun wine -> float wine.``Volatile acidity``

 (*
 Trees
 *)

 // a labelled observation: the wine, and the value to predict
 type Example = Observation * float

 // a predictor maps an observation to a predicted value
 type Predictor = Observation -> float

 // A regression tree: a Leaf holds a predicted value; a Branch holds a
 // feature and a split value, the sub-tree for observations at or under
 // the split, and the sub-tree for observations over it.
 type Tree =
    | Leaf of float
    | Branch of (Feature * float) * Tree * Tree

 // hand-built tree: split on alcohol level first, then, for wines at or
 // under 10.5, on volatile acidity
 let exampleTree =
    let lowAlcohol =
        Branch((``Volatile Acidity``, 0.8), Leaf(6.0), Leaf(3.0))
    Branch((``Alcohol Level``, 10.5), lowAlcohol, Leaf(5.5))

 // Walks the tree from the root: at each branch, go to the "under" side
 // when the feature value is at or under the split, to the "over" side
 // otherwise; the value of the leaf reached is the prediction.
 let rec predict (tree:Tree) (obs:Observation) =
    match tree with
    | Leaf(value) -> value
    | Branch((feature,split),under,over) ->
        let side = if feature obs <= split then under else over
        predict side obs

 // prediction for the first wine of the dataset
 predict exampleTree (Seq.head reds.Rows)

 // a tree, partially applied, is a predictor
 let examplePredictor = predict exampleTree

 // Sum, over the sample, of the squared difference between each label
 // and the value the predictor returns for that observation.
 let sumOfSquares (sample:Example seq) predictor =
    let squaredError (obs,lbl) = pown (lbl - predictor obs) 2
    Seq.sumBy squaredError sample

 // every wine, paired with its quality converted to float
 let redSample =
    reds.Rows
    |> Seq.map (fun wine -> wine, float wine.Quality)

 sumOfSquares redSample examplePredictor

 (*
 Learning a Tree
 *)

 // Learns a stump for a feature and threshold: the returned predictor gives
 // the average label of the examples at or under the threshold, or the
 // average label of the examples over it. Both averages are computed once,
 // as soon as the threshold is supplied.
 let learnStump (sample:Example seq) (feature:Feature) threshold =
    // average label of the examples whose feature value passes the test
    let averageWhere test =
        sample
        |> Seq.filter (fst >> feature >> test)
        |> Seq.averageBy snd
    let under = averageWhere (fun value -> value <= threshold)
    let over = averageWhere (fun value -> value > threshold)
    fun obs ->
        match feature obs <= threshold with
        | true -> under
        | false -> over

 // Candidate thresholds for a feature, spaced by (max - min) / (n + 1),
 // starting one step above the smallest feature value in the sample and
 // ending one step below the largest.
 let evenSplits (sample:Example seq) (feature:Feature) (n:int) =
    let values = Seq.map (fun (obs,_) -> feature obs) sample
    let lowest = Seq.min values
    let highest = Seq.max values
    let width = (highest - lowest) / float (n + 1)
    [ lowest + width .. width .. highest - width ]

 // First draft of tree learning: at each level, try every feature with its
 // even splits, keep the feature/split whose stump has the smallest squared
 // error, split the sample accordingly and recurse until depth reaches 0,
 // where a leaf predicts the average label.
 let rec draftLearnTree (sample:Example seq) (features:Feature list) (depth:int) =

    match depth with
    | 0 -> Leaf(Seq.averageBy snd sample)
    | _ ->
        // error of the stump learnt for a feature/split combination
        let errorOf (feature,split) =
            sumOfSquares sample (learnStump sample feature split)

        let (bestFeature,bestSplit) =
            // all feature * split combinations ...
            features
            |> Seq.collect (fun feature ->
                evenSplits sample feature 10
                |> Seq.map (fun split -> feature,split))
            // ... and the one with the smallest error
            |> Seq.minBy errorOf

        // examples on each side of the best split
        let goesUnder (obs,_) = bestFeature obs <= bestSplit
        let goesOver (obs,_) = bestFeature obs > bestSplit
        let under = Seq.filter goesUnder sample
        let over = Seq.filter goesOver sample

        let underTree = draftLearnTree under features (depth - 1)
        let overTree =  draftLearnTree over features (depth - 1)

        Branch((bestFeature,bestSplit),underTree,overTree)

 // replicate the original stump
 let originalStump = draftLearnTree redSample [ ``Alcohol Level`` ] 1
 sumOfSquares redSample (predict originalStump)

 let deeperTree = draftLearnTree redSample [``Alcohol Level``;``Volatile Acidity``] 4
 sumOfSquares redSample (predict deeperTree)

 // problem!
 let explodingTree = draftLearnTree redSample [``Alcohol Level``] 5

 (*
 Cleaning things up    
 *)

 // Splits a sample in two lazy sequences: the examples whose feature value
 // is at or under the split, and the examples whose value is over it.
 let underOver (sample:Example seq) (feat:Feature,split:float) =
    let keepWhere test =
        sample |> Seq.filter (fun (obs,_) -> test (feat obs))
    keepWhere (fun value -> value <= split), keepWhere (fun value -> value > split)

 // a splitter proposes candidate split values for a feature, given a sample
 type Splitter = Example seq -> Feature -> float list

 // a cost measures a sample with a single number; learnTree only keeps
 // splits that strictly lower it
 type Cost = Example seq -> float

 // Learns a regression tree of at most the given depth.
 // - splitter proposes candidate split values for each feature,
 // - cost measures a sample; a split is a candidate only if the sum of the
 //   costs of its two halves is strictly under the cost of the sample.
 // A leaf (average label of the sample) is produced when depth reaches 0,
 // or when no candidate split improves the cost; otherwise the candidate
 // with the lowest cost becomes a branch, and each half is learnt recursively.
 let rec learnTree (splitter:Splitter,cost:Cost) (sample:Example seq) (features:Feature list) (depth:int) =

    // The sample is enumerated many times below, and, past the first level,
    // it is a lazy filter stacked on the parent's sample: cache it, so that
    // the chain of filters is run only once per node.
    let sample = Seq.cache sample

    // a leaf predicts the average label of the examples reaching it
    let leaf () = Leaf(sample |> Seq.averageBy snd)

    if depth = 0
    then leaf ()
    else
        let initialCost = cost sample
        let candidates =
            // build up all the feature/split candidates,
            // and their associated sample splits
            seq {
                for feature in features do
                    let splits = splitter sample feature
                    for split in splits ->
                        let under,over = underOver sample (feature,split)
                        (feature,split),(under,over)
            }
            // compute and append cost of split
            |> Seq.map (fun (candidate,(under,over)) ->
                candidate,(under,over), cost under + cost over)
            // retain only candidates with strict cost improvement
            |> Seq.filter (fun (_,_,splitCost) -> splitCost < initialCost)
            // materialize the candidates: they are inspected twice below,
            // and a lazy sequence would compute every cost again each time
            |> Seq.toList

        match candidates with
        | [] -> leaf ()
        | _ ->
            let ((bestFeature,bestSplit),(under,over),_) =
                candidates
                |> List.minBy (fun (_,_,splitCost) -> splitCost)

            let underTree = learnTree (splitter,cost) under features (depth - 1)
            let overTree =  learnTree (splitter,cost) over features (depth - 1)

            Branch((bestFeature,bestSplit),underTree,overTree)

 // Splitter proposing candidate thresholds spaced by (max - min) / (n + 1)
 // between the smallest and largest feature value of the sample.
 // When every example has the same feature value there is nothing
 // to split on, and no candidate is proposed.
 let evenSplitter n (sample:Example seq) (feature:Feature) =
    let values = Seq.map (fun (obs,_) -> feature obs) sample
    let lowest = Seq.min values
    let highest = Seq.max values
    match lowest = highest with
    | true -> []
    | false ->
        let width = (highest - lowest) / float (n + 1)
        [ lowest + width .. width .. highest - width ]

 // cost of a sample: sum of the squared differences between
 // each label and the average label
 let sumOfSquaresCost (sample:Example seq) =
    let mean = Seq.averageBy snd sample
    let squaredGap (_,lbl) = pown (lbl - mean) 2
    Seq.sumBy squaredGap sample

 // alternate cost specification: sum of the absolute differences
 // between each label and the average label
 let manhattanCost (sample:Example seq) =
    let mean = Seq.averageBy snd sample
    let absoluteGap (_,lbl) = abs (lbl - mean)
    Seq.sumBy absoluteGap sample

 // tree on 2 features, up to depth 10, with 10 candidate splits per feature
 let stableTree =
    let settings = evenSplitter 10, sumOfSquaresCost
    learnTree settings redSample [``Alcohol Level``;``Volatile Acidity``] 10

 sumOfSquares redSample (predict stableTree)

 // we include every feature available
 let features = [
    ``Alcohol Level``
    ``Chlorides``
    ``Citric Acid``
    ``Density``
    ``Fixed Acidity``
    ``Free Sulfur Dioxide``
    ``PH``
    ``Residual Sugar``
    ``Total Sulfur Dioxide``
    ``Volatile Acidity``
 ]

 // tree on all the features, up to depth 10, with 5 candidate splits per feature
 let fullTree =
    let settings = evenSplitter 5, sumOfSquaresCost
    learnTree settings redSample features 10
 sumOfSquares redSample (predict fullTree)

 // plotting actual vs. predicted values
 let options = Configuration.Options()
 options.dataOpacity <- 0.25
 options.pointSize <- 10

 seq { for (obs,lbl) in redSample -> lbl, predict fullTree obs }
 |> Chart.Scatter
 |> Chart.WithOptions options
 |> Chart.WithTitle "Wine Quality: Actual vs. Predicted"
 |> Chart.WithXTitle "Actual"
 |> Chart.WithYTitle "Predicted"
 |> Chart.Show

 (*
 Over-fitting?    
 *)

 // we split the sample in halves
 // first half of the sample for training, the rest for testing
 let sampleSize = Seq.length redSample
 let training = Seq.take (sampleSize/2) redSample
 let testing = Seq.skip (sampleSize/2) redSample

 // careful - this takes a bit of time :)
 // one tree per depth from 1 to 10, each trained on the training sample
 let trees =
    [ 1 .. 10 ]
    |> List.map (fun depth ->
        depth,
        learnTree (evenSplitter 10,sumOfSquaresCost) training features depth)

 // error of each tree, on the training and on the testing samples
 let trainingError = [ for (d,tree) in trees -> d, sumOfSquares training (predict tree) ]
 let testingError = [ for (d,tree) in trees -> d, sumOfSquares testing (predict tree) ]

 [ trainingError; testingError ]
 |> Chart.Line
 |> Chart.WithLabels ["Train"; "Test"]
 |> Chart.WithTitle "Over-Fitting Analysis"
 |> Chart.WithXTitle "Depth"
 |> Chart.WithYTitle "Error"
 |> Chart.Show
diff --git a/gradient-boosting-3.fsx b/gradient-boosting-3.fsx
 // blog post: brandewinder.com/2016/09/03/gradient-boosting-part-3

 (*
 Dependencies
 *)

 #I "./packages/"

 #r "fsharp.data/lib/net40/fsharp.data.dll"
 open FSharp.Data

 #r "xplot.googlecharts/lib/net45/xplot.googlecharts.dll"
 #r "google.datatable.net.wrapper/lib/google.datatable.net.wrapper.dll"
 open XPlot.GoogleCharts

 #r "fsalg/lib/fsalg.dll"
 #r "diffsharp/lib/diffsharp.dll"
 open DiffSharp.Numerical

 // Options shared by the actual vs. predicted scatter plots below:
 // translucent markers of size 10, and both axes given a 0 to 10 range
 // so that the charts are directly comparable.
 let scatterOptions = Configuration.Options() 
 scatterOptions.dataOpacity <- 0.25 
 scatterOptions.pointSize <- 10 
 scatterOptions.hAxis <- Axis(minValue = 0, maxValue = 10)
 scatterOptions.vAxis <- Axis(minValue = 0, maxValue = 10)

 (*
 Declaring our core types and importing the data.
 *)

 // Type provider over the red wine CSV file: ";" is passed as the separator,
 // and InferRows=1500 is the number of rows used to infer the column types.
 type Wine = CsvProvider<"data/winequality-red.csv",";",InferRows=1500>

 // an observation is one row of the dataset
 type Observation = Wine.Row
 // a feature extracts a float value from an observation
 type Feature = Observation -> float
 // a labelled observation: the wine, and the value to predict
 type Example = Observation * float
 // a predictor maps an observation to a predicted value
 type Predictor = Observation -> float

 // every wine of the dataset, paired with its quality converted to float
 let redSample =
    let wines = Wine.Load("data/winequality-red.csv").Rows
    wines |> Seq.map (fun wine -> wine, float wine.Quality)

 (*
 Creating features for that dataset
 *)

 // one feature per input column, each converted to float

 let ``Alcohol Level`` : Feature =
    fun wine -> float wine.Alcohol

 let ``Chlorides`` : Feature =
    fun wine -> float wine.Chlorides

 let ``Citric Acid`` : Feature =
    fun wine -> float wine.``Citric acid``

 let ``Density`` : Feature =
    fun wine -> float wine.Density

 let ``Fixed Acidity`` : Feature =
    fun wine -> float wine.``Fixed acidity``

 let ``Free Sulfur Dioxide`` : Feature =
    fun wine -> float wine.``Free sulfur dioxide``

 let ``PH`` : Feature =
    fun wine -> float wine.PH

 let ``Residual Sugar`` : Feature =
    fun wine -> float wine.``Residual sugar``

 let ``Total Sulfur Dioxide`` : Feature =
    fun wine -> float wine.``Total sulfur dioxide``

 let ``Volatile Acidity`` : Feature =
    fun wine -> float wine.``Volatile acidity``

 // all the features defined above, in alphabetical order
 let features = [
    ``Alcohol Level``
    ``Chlorides``
    ``Citric Acid``
    ``Density``
    ``Fixed Acidity``
    ``Free Sulfur Dioxide``
    ``PH``
    ``Residual Sugar``
    ``Total Sulfur Dioxide``
    ``Volatile Acidity``
 ]

 (*
 Basic regression tree implementation
 *)

 // A regression tree: a Leaf holds a predicted value; a Branch holds a
 // feature and a split value, the sub-tree for observations at or under
 // the split, and the sub-tree for observations over it.
 type Tree =
    | Leaf of float
    | Branch of (Feature * float) * Tree * Tree

 // Walks the tree from the root: at each branch, go to the "under" side
 // when the feature value is at or under the split, to the "over" side
 // otherwise; the value of the leaf reached is the prediction.
 let rec predict (tree:Tree) (obs:Observation) =
    match tree with
    | Leaf(value) -> value
    | Branch((feature,split),under,over) ->
        let side = if feature obs <= split then under else over
        predict side obs

 // Splits a sample in two lazy sequences: the examples whose feature value
 // is at or under the split, and the examples whose value is over it.
 let underOver (sample:Example seq) (feat:Feature,split:float) =
    let keepWhere test =
        sample |> Seq.filter (fun (obs,_) -> test (feat obs))
    keepWhere (fun value -> value <= split), keepWhere (fun value -> value > split)

 // a splitter proposes candidate split values for a feature, given a sample
 type Splitter = Example seq -> Feature -> float list
 // a cost measures a sample with a single number
 type Cost = Example seq -> float

 // Learns a regression tree of at most the given depth.
 // - splitter proposes candidate split values for each feature,
 // - cost measures a sample; the cost of a split is the average of the costs
 //   of its two halves, weighted by their share of the examples, and a split
 //   is a candidate only if that cost is strictly under the cost of the sample.
 // A leaf (average label of the sample) is produced when depth reaches 0,
 // or when no candidate split improves the cost; otherwise the candidate
 // with the lowest cost becomes a branch, and each half is learnt recursively.
 let rec learnTree (splitter:Splitter,cost:Cost) (sample:Example seq) (features:Feature list) (depth:int) =

    // The sample is enumerated many times below, and, past the first level,
    // it is a lazy filter stacked on the parent's sample: cache it, so that
    // the chain of filters is run only once per node.
    let sample = Seq.cache sample

    // a leaf predicts the average label of the examples reaching it
    let leaf () = Leaf(sample |> Seq.averageBy snd)

    if depth = 0
    then leaf ()
    else
        let initialCost = cost sample
        let candidates =
            // build up all the feature/split candidates,
            // and their associated sample splits
            seq {
                for feature in features do
                    let splits = splitter sample feature
                    for split in splits ->
                        let under,over = underOver sample (feature,split)
                        (feature,split),(under,over)
            }
            // compute and append cost of split
            |> Seq.map (fun (candidate,(under,over)) ->
                let underSize = under |> Seq.length |> float
                let overSize = over |> Seq.length |> float
                let size = underSize + overSize
                let weightedCost = (underSize / size) * (cost under) + (overSize / size) * (cost over)
                candidate,(under,over), weightedCost)
            // retain only candidates with strict cost improvement
            |> Seq.filter (fun (_,_,splitCost) -> splitCost < initialCost)
            // materialize the candidates: they are inspected twice below,
            // and a lazy sequence would compute every cost again each time
            |> Seq.toList

        match candidates with
        | [] -> leaf ()
        | _ ->
            let ((bestFeature,bestSplit),(under,over),_) =
                candidates
                |> List.minBy (fun (_,_,splitCost) -> splitCost)

            let underTree = learnTree (splitter,cost) under features (depth - 1)
            let overTree =  learnTree (splitter,cost) over features (depth - 1)

            Branch((bestFeature,bestSplit),underTree,overTree)

 // Splitter proposing candidate thresholds spaced by (max - min) / (n + 1)
 // between the smallest and largest feature value of the sample.
 // When every example has the same feature value there is nothing
 // to split on, and no candidate is proposed.
 let evenSplitter n (sample:Example seq) (feature:Feature) =
    let values = Seq.map (fun (obs,_) -> feature obs) sample
    let lowest = Seq.min values
    let highest = Seq.max values
    match lowest = highest with
    | true -> []
    | false ->
        let width = (highest - lowest) / float (n + 1)
        [ lowest + width .. width .. highest - width ]

 // cost of a sample: sum of the squared differences between
 // each label and the average label
 let sumOfSquaresCost (sample:Example seq) =
    let mean = Seq.averageBy snd sample
    let squaredGap (_,lbl) = pown (lbl - mean) 2
    Seq.sumBy squaredGap sample

 // tree on all the features, up to depth 3, with 5 candidate splits per feature
 let fullTree =
    let settings = evenSplitter 5, sumOfSquaresCost
    learnTree settings redSample features 3

 // Average, over the sample, of the squared difference between each label
 // and the value the predictor returns for that observation.
 let averageSquareError (sample:Example seq) predictor =
    let squaredError (obs,lbl) = pown (lbl - predictor obs) 2
    Seq.averageBy squaredError sample

 averageSquareError redSample (predict fullTree)

 // actual quality (x axis) vs. quality predicted by the tree (y axis)
 seq { for (obs,lbl) in redSample -> lbl, predict fullTree obs }
 |> Chart.Scatter
 |> Chart.WithOptions scatterOptions
 |> Chart.WithTitle "Wine Quality: Actual vs. Predicted (Tree)"
 |> Chart.WithXTitle "Actual"
 |> Chart.WithYTitle "Predicted"
 |> Chart.Show

 (*
 Gradient Boosting
 *)
    
 // a learner turns a sample into a predictor
 type Learner = Example seq -> Predictor

 // Boosting with an arbitrary learner: starting from a constant predictor
 // (the average label), repeatedly learn a predictor for the residuals of
 // the current predictor, and add it. depth is the number of rounds.
 let learn (sample:Example seq) (learner:Learner) (depth:int) =

    // adds one residuals predictor per remaining round
    let rec boost remaining (current:Predictor) =
        match remaining with
        | 0 -> current
        | _ ->
            // what the current predictor leaves unexplained
            let residuals =
                seq { for (obs,y) in sample -> obs, y - current obs }

            // predictor learnt against the residuals
            let correction = learner residuals

            // stack it on the current predictor, and keep going
            boost (remaining - 1) (fun obs -> current obs + correction obs)

    // start from a predictor that always returns the average label
    let baseValue = sample |> Seq.averageBy snd

    boost depth (fun _ -> baseValue)

 // learner producing the predictor of a tree of depth up to 3,
 // learnt on all the features with 5 candidate splits per feature
 let treeLearner (sample:Example seq) =
    let tree = learnTree (evenSplitter 5,sumOfSquaresCost) sample features 3
    predict tree

 // evaluate boosting at different depth
 [ for depth in 1 .. 5 ->
    let model = learn redSample treeLearner depth
    depth, averageSquareError redSample model ]

 (*
 True Gradient Boosting, using pseudo-residuals
 *)

 // a loss maps a residual to a penalty
 type Loss = float -> float

 // Same boosting loop as learn, except that the learner is trained on
 // pseudo-residuals: the derivative of the loss, evaluated at each residual.
 let draftBoostedLearn (sample:Example seq) (learner:Learner) (loss:Loss) (depth:int) =

    // derivative of the loss
    let pseudoResiduals = diff loss

    // adds one pseudo-residuals predictor per remaining round
    let rec boost remaining (current:Predictor) =
        match remaining with
        | 0 -> current
        | _ ->
            // derivative of the loss at each residual
            let targets =
                seq { for (obs,y) in sample ->
                        obs, pseudoResiduals (y - current obs) }

            // predictor learnt against the pseudo-residuals
            let correction = learner targets

            // stack it on the current predictor, and keep going
            boost (remaining - 1) (fun obs -> current obs + correction obs)

    // start from a predictor that always returns the average label
    let baseValue = sample |> Seq.averageBy snd

    boost depth (fun _ -> baseValue)

 // we should have the same results as before

 // half the squared residual
 let squareLoss : Loss = fun x -> 0.5 * pown x 2

 [ for depth in 1 .. 5 ->
    let model = draftBoostedLearn redSample treeLearner squareLoss depth
    depth, averageSquareError redSample model ]


 // illustration: differentiating the square loss function
 // does produce the residuals.
 // derivative of the square loss
 let diffSquareLoss = diff squareLoss

 // plot of the derivative, over -5.0 to 5.0 by steps of 0.1
 [ for x in [ - 5.0 .. 0.1 .. 5.0 ] -> x, diffSquareLoss x ]
 |> Chart.Line
 |> Chart.Show


 (*
 Optimal combination of predictors
 *)

 // predictor adding to f1 the prediction of f2 scaled by gamma
 let combination f1 f2 gamma : Predictor =
    fun obs ->
        let adjustment = gamma * f2 obs
        f1 obs + adjustment

 // Gradient descent on a function of one variable.
 // - f: function to minimize,
 // - x0: starting point,
 // - eta: step size; each iteration moves x by -eta * derivative of f at x,
 // - epsilon: the search stops, and returns x, once the absolute value of
 //   the derivative is under epsilon.
 // The value of x is printed at every iteration.
 // Fails with an exception when the derivative stops being a finite number
 // (typically because eta is too large and the search diverges): its absolute
 // value can then never fall under epsilon, and the loop would never end.
 let gradientDescent f x0 eta epsilon =
    let rec desc x =
        let g = diff f x
        if System.Double.IsNaN g || System.Double.IsInfinity g
        then failwithf "gradientDescent diverged: derivative is %f at x = %f" g x
        elif abs g < epsilon
        then x
        else
            printfn "%.3f" x
            desc (x - eta * g)
    desc x0

 // illustration: minimizing x squared by gradient descent, starting from 10.,
 // with a step size of 0.1, until the derivative is under 0.0001
 let foo x = pown x 2
 let min_foo = gradientDescent foo 10. 0.1 0.0001

 // Searches, by gradient descent starting from 1.0, for the weight gamma
 // such that f1 + gamma * f2 has the smallest total loss over the sample.
 let optimalGamma (sample:Example seq) f1 f2 (loss:Loss) =

    // total loss over the sample of the combined predictor, for a given gamma
    let costOf gamma =
        let combined = combination f1 f2 gamma
        sample
        |> Seq.sumBy (fun (obs,y) -> loss (combined obs - y))

    gradientDescent costOf 1.0 0.001 0.01

 // Gradient boosting: at each round, a predictor is learnt against the
 // pseudo-residuals (derivative of the loss at each residual), and added to
 // the current predictor with the weight gamma found by optimalGamma.
 let boostedLearn (sample:Example seq) (learner:Learner) (loss:Loss) (depth:int) =

    // derivative of the loss
    let pseudoResiduals = diff loss

    // adds one weighted pseudo-residuals predictor per remaining round
    let rec boost remaining (current:Predictor) =
        match remaining with
        | 0 -> current
        | _ ->
            // derivative of the loss at each residual
            let targets =
                seq { for (obs,y) in sample ->
                        obs, pseudoResiduals (y - current obs) }

            // predictor learnt against the pseudo-residuals
            let correction = learner targets

            // weight to give to the correction
            let gamma = optimalGamma sample current correction loss

            // stack the weighted correction on the current predictor
            boost (remaining - 1) (fun obs -> current obs + gamma * correction obs)

    // start from a predictor that always returns the average label
    let baseValue = sample |> Seq.averageBy snd

    boost depth (fun _ -> baseValue)

 // evaluate boosting with the square loss at different depth
 [ for depth in 1 .. 5 ->
    let model = boostedLearn redSample treeLearner squareLoss depth
    depth, averageSquareError redSample model ]

 let ssrPredictor = boostedLearn redSample treeLearner squareLoss 5

 // actual quality (x axis) vs. quality predicted by the boosted model (y axis)
 seq { for (obs,lbl) in redSample -> lbl, ssrPredictor obs }
 |> Chart.Scatter
 |> Chart.WithOptions scatterOptions
 |> Chart.WithTitle "Wine Quality: Actual vs. Predicted (SSR)"
 |> Chart.WithXTitle "Actual"
 |> Chart.WithYTitle "Predicted"
 |> Chart.Show

 (*
 Using a more complex Loss function, the Huber Loss
 *)

 // https://en.wikipedia.org/wiki/Huber_loss#Definition
 // Huber loss: half the square of x when the absolute value of x is
 // at most delta, and linear in the absolute value of x beyond that.
 let huber delta x =
    let size = abs x
    match size <= delta with
    | true -> 0.5 * pown x 2
    | false -> delta * (size - 0.5 * delta)

 // plot of the Huber loss with delta 1.0, over -5.0 to 5.0 by steps of 0.1
 [ for x in [ - 5.0 .. 0.1 .. 5.0 ] -> x, huber 1.0 x ]
 |> Chart.Line
 |> Chart.Show

 // illustration: differentiating the Huber loss function
 // produces the residuals, clipped to the range -delta to +delta.
 // derivative of the Huber loss with delta 1.0
 let diffHuber = diff (huber 1.0)

 // plot of the derivative, over -5.0 to 5.0 by steps of 0.1
 [ for x in [ - 5.0 .. 0.1 .. 5.0 ] -> x, diffHuber x ]
 |> Chart.Line
 |> Chart.Show

 // evaluate boosting with the Huber loss at different depth
 [ for depth in 1 .. 5 ->
    let model = boostedLearn redSample treeLearner (huber 1.0) depth
    depth, averageSquareError redSample model ]

 let huberPredictor = boostedLearn redSample treeLearner (huber 1.0) 5

 // actual quality (x axis) vs. quality predicted by the boosted model (y axis)
 seq { for (obs,lbl) in redSample -> lbl, huberPredictor obs }
 |> Chart.Scatter
 |> Chart.WithOptions scatterOptions
 |> Chart.WithTitle "Wine Quality: Actual vs. Predicted (Huber 1.0)"
 |> Chart.WithXTitle "Actual"
 |> Chart.WithYTitle "Predicted"
 |> Chart.Show
	// blog post: brandewinder.com/2016/08/06/gradient-boosting-part-1
	// https://en.wikipedia.org/wiki/Gradient_boosting#Algorithm

	(*
	Exploring the dataset
	*)

	#I "./packages/"
	#r "fsharp.data/lib/net40/fsharp.data.dll"
	open FSharp.Data
	#r "xplot.googlecharts/lib/net45/xplot.googlecharts.dll"
	#r "google.datatable.net.wrapper/lib/google.datatable.net.wrapper.dll"
	open XPlot.GoogleCharts

	// Type provider over the red wine CSV file: ";" is passed as the separator,
	// and InferRows=1500 is the number of rows used to infer the column types.
	type Wine = CsvProvider<"data/winequality-red.csv",";",InferRows=1500>

	// the dataset, loaded from the same file the schema is inferred from
	let reds = Wine.Load("data/winequality-red.csv")

	// an observation is one row of the dataset
	type Observation = Wine.Row

	// a feature extracts a float value from an observation
	type Feature = Observation -> float

	// alcohol column of the wine, converted to float
	let ``Alcohol Level`` : Feature =
	    fun obs -> obs.Alcohol |> float

	// volatile acidity column of the wine, converted to float
	let ``Volatile Acidity`` : Feature =
	    fun obs -> obs.``Volatile acidity`` |> float

	// fixed acidity column of the wine, converted to float
	let ``Fixed Acidity`` : Feature =
	    fun obs -> obs.``Fixed acidity`` |> float

	// marker settings shared by the scatter plots
	let options = Configuration.Options()
	options.dataOpacity <- 0.25
	options.pointSize <- 10

	// alcohol level (x axis) against quality (y axis)
	reds.Rows
	|> Seq.map (fun obs -> ``Alcohol Level`` obs, obs.Quality)
	|> Chart.Scatter
	|> Chart.WithOptions options
	|> Chart.WithTitle "Alcohol Level vs. Quality"
	|> Chart.WithXTitle "Alcohol Level"
	|> Chart.WithYTitle "Quality"
	|> Chart.Show

	// volatile acidity (x axis) against quality (y axis)
	reds.Rows
	|> Seq.map (fun obs -> ``Volatile Acidity`` obs, obs.Quality)
	|> Chart.Scatter
	|> Chart.WithOptions options
	|> Chart.WithTitle "Volatile Acidity vs. Quality"
	|> Chart.WithXTitle "Volatile Acidity"
	|> Chart.WithYTitle "Quality"
	|> Chart.Show

	// fixed acidity (x axis) against quality (y axis)
	reds.Rows
	|> Seq.map (fun obs -> ``Fixed Acidity`` obs, obs.Quality)
	|> Chart.Scatter
	|> Chart.WithOptions options
	|> Chart.WithTitle "Fixed Acidity vs. Quality"
	|> Chart.WithXTitle "Fixed Acidity"
	|> Chart.WithYTitle "Quality"
	|> Chart.Show

	(*
	Stumps
	*)

	// a labelled observation: the wine, and the value to predict
	type Example = Observation * float

	// a predictor maps an observation to a predicted value
	type Predictor = Observation -> float

	// Learns a stump for a feature and threshold: the returned predictor gives
	// the average label of the examples at or under the threshold, or the
	// average label of the examples over it.
	let learnStump (sample:Example seq) (feature:Feature) threshold =
	    let under =
	        sample
	        |> Seq.filter (fun (obs,lbl) -> feature obs <= threshold)
	        |> Seq.averageBy (fun (obs,lbl) -> lbl)
	    let over =
	        sample
	        |> Seq.filter (fun (obs,lbl) -> feature obs > threshold)
	        |> Seq.averageBy (fun (obs,lbl) -> lbl)
	    fun obs ->
	        if (feature obs <= threshold)
	        then under
	        else over

	// every wine, paired with its quality converted to float
	let redSample =
	    reds.Rows
	    |> Seq.map (fun row -> row, row.Quality |> float)

	// a stump splitting the sample at an alcohol level of 11.0
	let testStump = learnStump redSample ``Alcohol Level`` 11.0

	// (alcohol level, value predicted by testStump) for every example
	let predicted =
	    redSample
	    |> Seq.map (fun (obs,value) -> (``Alcohol Level`` obs, obs |> testStump))

	// sorted by alcohol level, so that the line is drawn left to right
	predicted
	|> Seq.sortBy fst
	|> Chart.Line
	|> Chart.WithTitle "Alcohol Level vs. Quality"
	|> Chart.WithXTitle "Alcohol Level"
	|> Chart.WithYTitle "Quality"
	|> Chart.Show

	(*
	Picking the best stump
	2 issues: quality, and possible splits
	*)

	// Sum, over the sample, of the squared difference between each label
	// and the value the predictor returns for that observation.
	let sumOfSquares (sample:Example seq) predictor =
	    sample
	    |> Seq.sumBy (fun (obs,lbl) ->
	        pown (lbl - predictor obs) 2)

	sumOfSquares redSample testStump

	// Candidate thresholds for a feature, spaced by (max - min) / (n + 1),
	// starting one step above the smallest feature value in the sample and
	// ending one step below the largest.
	let evenSplits (sample:Example seq) (feature:Feature) (n:int) =
	    let values = sample |> Seq.map (fst >> feature)
	    let min = values |> Seq.min
	    let max = values |> Seq.max
	    let width = (max-min) / (float (n + 1))
	    [ min + width .. width .. max - width ]

	// candidate thresholds on alcohol level
	let alcoholSplits = evenSplits redSample ``Alcohol Level`` 10

	// one stump per candidate split; keep the one with the smallest
	// sum of squared errors over the sample
	let bestStump =
	    alcoholSplits
	    |> List.map (learnStump redSample ``Alcohol Level``)
	    |> List.minBy (sumOfSquares redSample)

	sumOfSquares redSample bestStump

	// predictions of the best stump, as a function of alcohol level
	redSample
	|> Seq.map (fun (obs,value) -> (``Alcohol Level`` obs, obs |> bestStump))
	|> Seq.sortBy fst
	|> Chart.Line
	|> Chart.WithTitle "Alcohol Level vs. Quality"
	|> Chart.WithXTitle "Alcohol Level"
	|> Chart.WithYTitle "Quality"
	|> Chart.Show

	(*
	Analyzing the residuals
	*)

	// residuals (label minus best stump prediction) against alcohol level
	redSample
	|> Seq.map (fun (wine,quality) -> ``Alcohol Level`` wine, quality - bestStump wine)
	|> Chart.Scatter
	|> Chart.WithOptions options
	|> Chart.WithTitle "Alcohol Level vs. Residuals"
	|> Chart.WithXTitle "Alcohol Level"
	|> Chart.WithYTitle "Residuals"
	|> Chart.Show

	// alternate chart: observations sharing the same alcohol level are
	// aggregated together, and their residuals averaged
	redSample
	|> Seq.map (fun (wine,quality) -> ``Alcohol Level`` wine, quality - bestStump wine)
	|> Seq.groupBy fst
	|> Seq.map (fun (level,points) -> level, points |> Seq.averageBy snd)
	|> Chart.Scatter
	|> Chart.WithOptions options
	|> Chart.WithTitle "Alcohol Level vs. Residuals"
	|> Chart.WithXTitle "Alcohol Level"
	|> Chart.WithYTitle "Residuals"
	|> Chart.Show

	(*
	Fitting another stump on the residuals
	*)

	// same observations as redSample, but the label is now the residual:
	// what remains of the quality once the best stump prediction is removed
	let residualsSample =
	    redSample
	    |> Seq.map (fun (wine,quality) -> wine, quality - bestStump wine)

	// Best stump fitted on the residuals of bestStump.
	// Each candidate is learnt on residualsSample, so it must also be scored
	// against residualsSample: scoring against redSample would compare
	// residual-sized predictions with raw quality labels, and select the
	// wrong stump.
	let residualsStump =
	    alcoholSplits
	    |> List.map (learnStump residualsSample ``Alcohol Level``)
	    |> List.minBy (sumOfSquares residualsSample)

	// stacked model: best stump, corrected by the stump learnt on its residuals
	let combined obs = bestStump obs + residualsStump obs

	combined |> sumOfSquares redSample

	// predictions of the combined model, ordered by alcohol level
	redSample
	|> Seq.map (fun (wine,_) -> ``Alcohol Level`` wine, combined wine)
	|> Seq.sortBy fst
	|> Chart.Line
	|> Chart.WithTitle "Alcohol Level vs. Quality"
	|> Chart.WithXTitle "Alcohol Level"
	|> Chart.WithYTitle "Quality"
	|> Chart.Show

	// residuals of the combined model
	redSample
	|> Seq.map (fun (wine,quality) -> ``Alcohol Level`` wine, quality - combined wine)
	|> Chart.Scatter
	|> Chart.WithOptions options
	|> Chart.WithTitle "Alcohol Level vs. Residuals"
	|> Chart.WithXTitle "Alcohol Level"
	|> Chart.WithYTitle "Residuals"
	|> Chart.Show

	(*
	Iteratively adding stumps
	*)

	// Iteratively stacks `depth` stumps on a single feature.
	// Starting from a predictor returning the average label of the sample,
	// each iteration fits the best stump against the residuals of the current
	// predictor, and adds that stump to the predictor.
	let learn (sample:Example seq) (feature:Feature) (depth:int) =

	    // candidate thresholds, computed once on the original sample
	    let splits = evenSplits sample feature 10

	    // adds to `current` the stump that best fits its residuals
	    let boost (current:Predictor) : Predictor =
	        let residuals =
	            sample
	            |> Seq.map (fun (obs,y) -> obs, y - current obs)
	        // the stump is selected before the new predictor is returned
	        let stump =
	            splits
	            |> Seq.map (learnStump residuals feature)
	            |> Seq.minBy (sumOfSquares residuals)
	        fun obs -> current obs + stump obs

	    let rec next iterationsLeft predictor =
	        match iterationsLeft with
	        // no iteration left: we are done
	        | 0 -> predictor
	        | _ -> next (iterationsLeft - 1) (boost predictor)

	    // initial predictor: always predicts the average sample value
	    let baseValue = sample |> Seq.averageBy snd
	    let basePredictor = fun (obs:Observation) -> baseValue

	    next depth basePredictor

	// 10 stumps stacked on alcohol level
	let model = learn redSample ``Alcohol Level`` 10

	model |> sumOfSquares redSample

	// predictions of the stacked model, ordered by alcohol level
	redSample
	|> Seq.map (fun (wine,_) -> ``Alcohol Level`` wine, model wine)
	|> Seq.sortBy fst
	|> Chart.Line
	|> Chart.WithTitle "Alcohol Level vs. Quality"
	|> Chart.WithXTitle "Alcohol Level"
	|> Chart.WithYTitle "Quality"
	|> Chart.Show

	// increasing depth: error of the model for 1 to 15 stacked stumps
	[ 1 .. 15 ]
	|> Seq.map (fun depth ->
	    depth, sumOfSquares redSample (learn redSample ``Alcohol Level`` depth))
	|> Chart.Column
	|> Chart.Show
	// blog post: brandewinder.com/2016/08/14/gradient-boosting-part-2
	//https://en.wikipedia.org/wiki/Gradient_boosting#Algorithm

	(*
	Exploring the dataset
	*)

	#I "./packages/"
	#r "fsharp.data/lib/net40/fsharp.data.dll"
	open FSharp.Data
	#r "xplot.googlecharts/lib/net45/xplot.googlecharts.dll"
	#r "google.datatable.net.wrapper/lib/google.datatable.net.wrapper.dll"
	open XPlot.GoogleCharts

	// CSV type provider over the red wine dataset; the file uses ";" as
	// separator, and InferRows is set to 1500.
	type Wine = CsvProvider<"data/winequality-red.csv",";",InferRows=1500>

	// the dataset itself, loaded from the same file
	let reds = Wine.Load("data/winequality-red.csv")

	// an observation is one row of the dataset
	type Observation = Wine.Row

	// a feature extracts a float value from an observation
	type Feature = Observation -> float

	// One feature per column of the dataset: each reads the corresponding
	// column of the observation, and converts it to a float.

	let ``Alcohol Level`` : Feature =
	    fun obs -> float obs.Alcohol

	let ``Chlorides`` : Feature =
	    fun obs -> float obs.Chlorides

	let ``Citric Acid`` : Feature =
	    fun obs -> float obs.``Citric acid``

	let ``Density`` : Feature =
	    fun obs -> float obs.Density

	let ``Fixed Acidity`` : Feature =
	    fun obs -> float obs.``Fixed acidity``

	let ``Free Sulfur Dioxide`` : Feature =
	    fun obs -> float obs.``Free sulfur dioxide``

	let ``PH`` : Feature =
	    fun obs -> float obs.PH

	let ``Residual Sugar`` : Feature =
	    fun obs -> float obs.``Residual sugar``

	let ``Total Sulfur Dioxide`` : Feature =
	    fun obs -> float obs.``Total sulfur dioxide``

	let ``Volatile Acidity`` : Feature =
	    fun obs -> float obs.``Volatile acidity``

	(*
	Trees
	*)

	// A labelled example: an observation, paired with the float value
	// we are trying to predict for it.
	type Example = Observation * float

	// A model: a function producing a float prediction for an observation.
	type Predictor = Observation -> float

	// A regression tree over observations.
	type Tree =
	    /// terminal node, carrying the value to predict
	    | Leaf of float
	    /// split node: the (feature, split value) pair to test, then the
	    /// sub-tree followed when the feature value is <= the split value,
	    /// and the sub-tree followed otherwise (see predict)
	    | Branch of (Feature * float) * Tree * Tree

	// A hand-written tree: alcohol level is tested first; observations at or
	// under 10.5 are then split again, on volatile acidity.
	let exampleTree =
	    let lowAlcohol =
	        Branch((``Volatile Acidity``, 0.8), Leaf(6.0), Leaf(3.0))
	    let highAlcohol = Leaf(5.5)
	    Branch((``Alcohol Level``, 10.5), lowAlcohol, highAlcohol)

	// Prediction of a tree for an observation: walk down the tree, taking the
	// first sub-tree when the feature value is <= the split value and the
	// second one otherwise, until a leaf is reached; return the leaf value.
	let rec predict (tree:Tree) (obs:Observation) =
	    match tree with
	    | Leaf(prediction) -> prediction
	    | Branch((feature,split),under,over) ->
	        let subTree =
	            if feature obs <= split then under else over
	        predict subTree obs

	// prediction of the example tree for the first row of the dataset
	reds.Rows |> Seq.head |> predict exampleTree

	// the example tree, used as a predictor
	let examplePredictor = fun obs -> predict exampleTree obs

	// Quality of a predictor on a sample: the sum, over all examples, of the
	// squared difference between the label and the predicted value.
	let sumOfSquares (sample:Example seq) predictor =
	    let squaredError (obs,lbl) =
	        let delta = lbl - predictor obs
	        pown delta 2
	    sample |> Seq.sumBy squaredError

	// labelled sample: every row, paired with its quality converted to float
	let redSample =
	    reds.Rows
	    |> Seq.map (fun wine -> wine, float wine.Quality)

	// error of the hand-written tree
	examplePredictor |> sumOfSquares redSample

	(*
	Learning a Tree
	*)

	// Learns a stump (a one-split predictor) from `sample`.
	// Examples whose feature value is <= threshold are predicted the average
	// label of that group; the others get the average label of the group
	// whose feature value is > threshold.
	// Both averages are computed once, before the predictor is returned;
	// Seq.averageBy raises if one side of the split contains no example.
	let learnStump (sample:Example seq) (feature:Feature) threshold =
	    // average label over the examples whose feature value satisfies `keep`
	    let averageLabel keep =
	        sample
	        |> Seq.filter (fun (obs,_) -> keep (feature obs))
	        |> Seq.averageBy snd
	    let under = averageLabel (fun value -> value <= threshold)
	    let over = averageLabel (fun value -> value > threshold)
	    fun obs ->
	        match feature obs <= threshold with
	        | true -> under
	        | false -> over

	// Candidate thresholds for a feature: values spaced evenly between the
	// smallest and largest feature value observed in the sample, excluding
	// both extremes. The range step is (max - min) / (n + 1).
	let evenSplits (sample:Example seq) (feature:Feature) (n:int) =
	    let values = sample |> Seq.map (fun (obs,_) -> feature obs)
	    let lowest = Seq.min values
	    let highest = Seq.max values
	    let step = (highest - lowest) / (float (n + 1))
	    [ lowest + step .. step .. highest - step ]

	// First draft of a tree learner.
	// At depth 0, produces a leaf predicting the average label of the sample.
	// Otherwise, picks the (feature, split) pair whose stump has the smallest
	// sum of squares on the sample, partitions the sample around that split,
	// and learns one sub-tree per side, with one less level of depth.
	let rec draftLearnTree (sample:Example seq) (features:Feature list) (depth:int) =
	    match depth with
	    | 0 ->
	        let avg = sample |> Seq.averageBy snd
	        Leaf(avg)
	    | _ ->
	        // error of the stump learnt for a feature / split combination
	        let stumpError (feature,split) =
	            learnStump sample feature split
	            |> sumOfSquares sample

	        let (bestFeature,bestSplit) =
	            // all feature * split combinations, 10 even splits per feature
	            features
	            |> Seq.collect (fun feature ->
	                evenSplits sample feature 10
	                |> Seq.map (fun split -> feature,split))
	            // the combination with the smallest error wins
	            |> Seq.minBy stumpError

	        let isUnder (obs,_) = bestFeature obs <= bestSplit
	        let isOver (obs,_) = bestFeature obs > bestSplit

	        let underTree =
	            draftLearnTree (sample |> Seq.filter isUnder) features (depth - 1)
	        let overTree =
	            draftLearnTree (sample |> Seq.filter isOver) features (depth - 1)

	        Branch((bestFeature,bestSplit),underTree,overTree)

	// a tree of depth 1 on a single feature replicates the original stump
	let originalStump = draftLearnTree redSample [ ``Alcohol Level`` ] 1
	originalStump |> predict |> sumOfSquares redSample

	// a deeper tree, using two features
	let deeperTree = draftLearnTree redSample [ ``Alcohol Level``; ``Volatile Acidity`` ] 4
	deeperTree |> predict |> sumOfSquares redSample

	// problem! this call is flagged as failing; presumably a sub-sample ends
	// up with no usable split at that depth - see the cleaned-up learner below
	let explodingTree = draftLearnTree redSample [ ``Alcohol Level`` ] 5

	(*
	Cleaning things up
	*)

	// Partitions a sample around a split: returns the (lazy) sequence of
	// examples whose feature value is <= split, and the (lazy) sequence of
	// examples whose feature value is > split.
	let underOver (sample:Example seq) (feat:Feature,split:float) =
	    let isUnder (obs,_) = feat obs <= split
	    let isOver (obs,_) = feat obs > split
	    (sample |> Seq.filter isUnder), (sample |> Seq.filter isOver)

	// A splitter proposes candidate split values for a feature, given a sample.
	type Splitter = Example seq -> Feature -> float list

	// A cost measures a sample as a single float; learnTree only keeps
	// splits that strictly lower it.
	type Cost = Example seq -> float

	// Learns a regression tree of at most `depth` levels on `sample`.
	//   splitter: proposes candidate split values for a feature
	//   cost: cost of a sample; the cost of a split is the sum of the costs
	//         of the two sub-samples it produces
	// A node becomes a leaf, predicting the average label of its sample, when
	// depth reaches 0, or when no candidate split has a cost strictly lower
	// than the cost of the un-split sample. Otherwise the cheapest split is
	// retained, and a sub-tree is learnt on each side of it.
	// Seq.averageBy raises if a leaf has to be built from an empty sample.
	let rec learnTree (splitter:Splitter,cost:Cost) (sample:Example seq) (features:Feature list) (depth:int) =

	    // the sample is enumerated once per candidate split and cost
	    // evaluation, and sub-samples are lazy filters stacked on their
	    // parent: caching avoids re-running the whole chain of filters,
	    // from the root sample, on every enumeration
	    let sample = Seq.cache sample

	    // terminal node, predicting the average label of the sample
	    let leaf () =
	        let avg = sample |> Seq.averageBy snd
	        Leaf(avg)

	    if depth = 0
	    then leaf ()
	    else
	        let initialCost = cost sample
	        // materialized as a list, so that each split and its cost are
	        // computed exactly once (and not once for the emptiness check,
	        // then again for the search of the minimum)
	        let candidates =
	            [
	                for feature in features do
	                    for split in splitter sample feature do
	                        let under,over = underOver sample (feature,split)
	                        let splitCost = cost under + cost over
	                        // retain only candidates with strict cost improvement
	                        if splitCost < initialCost then
	                            yield (feature,split),(under,over),splitCost
	            ]

	        match candidates with
	        | [] -> leaf ()
	        | _ ->
	            let ((bestFeature,bestSplit),(under,over),_) =
	                candidates
	                |> List.minBy (fun (_,_,splitCost) -> splitCost)

	            let underTree = learnTree (splitter,cost) under features (depth - 1)
	            let overTree = learnTree (splitter,cost) over features (depth - 1)

	            Branch((bestFeature,bestSplit),underTree,overTree)

	// Splitter proposing values spaced evenly between the smallest and largest
	// feature value of the sample (both excluded), with a step of
	// (max - min) / (n + 1). When every observation shares the same feature
	// value, there is nothing to split on: no candidate is returned.
	let evenSplitter n (sample:Example seq) (feature:Feature) =
	    let values = sample |> Seq.map (fun (obs,_) -> feature obs)
	    let lowest = Seq.min values
	    let highest = Seq.max values
	    if lowest <> highest
	    then
	        let step = (highest - lowest) / (float (n + 1))
	        [ lowest + step .. step .. highest - step ]
	    else []

	// Cost of a sample: sum of the squared differences between each label
	// and the average label of the sample.
	let sumOfSquaresCost (sample:Example seq) =
	    let mean = sample |> Seq.averageBy snd
	    let squaredDeviation (_,lbl) = pown (lbl - mean) 2
	    sample |> Seq.sumBy squaredDeviation

	// Alternate cost specification: sum of the absolute differences between
	// each label and the average label of the sample.
	let manhattanCost (sample:Example seq) =
	    let mean = sample |> Seq.averageBy snd
	    let absoluteDeviation (_,lbl) = abs (lbl - mean)
	    sample |> Seq.sumBy absoluteDeviation

	// a tree of depth 10 on two features, 10 even splits per feature
	let stableTree =
	    let settings = (evenSplitter 10, sumOfSquaresCost)
	    learnTree settings redSample [ ``Alcohol Level``; ``Volatile Acidity`` ] 10

	stableTree |> predict |> sumOfSquares redSample

	// every feature defined above is included
	let features : Feature list =
	    [ ``Alcohol Level``; ``Chlorides``; ``Citric Acid``; ``Density``;
	      ``Fixed Acidity``; ``Free Sulfur Dioxide``; ``PH``; ``Residual Sugar``;
	      ``Total Sulfur Dioxide``; ``Volatile Acidity`` ]

	// a tree of depth 10 on every feature, 5 even splits per feature
	let fullTree =
	    let settings = (evenSplitter 5, sumOfSquaresCost)
	    learnTree settings redSample features 10
	fullTree |> predict |> sumOfSquares redSample

	// chart options for the actual vs. predicted scatter plot
	let options = Configuration.Options()
	options.dataOpacity <- 0.25
	options.pointSize <- 10

	// one point per example: actual quality (x), predicted quality (y)
	redSample
	|> Seq.map (fun (wine,quality) -> quality, predict fullTree wine)
	|> Chart.Scatter
	|> Chart.WithOptions options
	|> Chart.WithTitle "Wine Quality: Actual vs. Predicted"
	|> Chart.WithXTitle "Actual"
	|> Chart.WithYTitle "Predicted"
	|> Chart.Show

	(*
	Over-fitting?
	*)

	// the sample is split in halves: the first sampleSize/2 examples are
	// used for training, the remaining ones for testing
	let sampleSize = Seq.length redSample
	let training = Seq.take (sampleSize/2) redSample
	let testing = Seq.skip (sampleSize/2) redSample

	// careful - this takes a bit of time :)
	// one tree per depth, from 1 to 10, each trained on the training sample
	let trees =
	    [ 1 .. 10 ]
	    |> List.map (fun depth ->
	        depth,
	        learnTree (evenSplitter 10,sumOfSquaresCost) training features depth)

	// error of each tree, by depth, on the training and the testing samples
	let trainingError =
	    [ for (depth,tree) in trees -> depth, sumOfSquares training (predict tree) ]
	let testingError =
	    [ for (depth,tree) in trees -> depth, sumOfSquares testing (predict tree) ]

	// both error curves on the same chart
	[ trainingError; testingError ]
	|> Chart.Line
	|> Chart.WithLabels ["Train"; "Test"]
	|> Chart.WithTitle "Over-Fitting Analysis"
	|> Chart.WithXTitle "Depth"
	|> Chart.WithYTitle "Error"
	|> Chart.Show
	// blog post: brandewinder.com/2016/09/03/gradient-boosting-part-3

	(*
	Dependencies
	*)

	#I "./packages/"

	#r "fsharp.data/lib/net40/fsharp.data.dll"
	open FSharp.Data

	#r "xplot.googlecharts/lib/net45/xplot.googlecharts.dll"
	#r "google.datatable.net.wrapper/lib/google.datatable.net.wrapper.dll"
	open XPlot.GoogleCharts

	#r "fsalg/lib/fsalg.dll"
	#r "diffsharp/lib/diffsharp.dll"
	open DiffSharp.Numerical

	// Options shared by the scatter plots below: data opacity of 0.25,
	// point size of 10, and both axes going from 0 to 10.
	let scatterOptions = Configuration.Options()
	scatterOptions.dataOpacity <- 0.25
	scatterOptions.pointSize <- 10
	scatterOptions.hAxis <- Axis(minValue = 0, maxValue = 10)
	scatterOptions.vAxis <- Axis(minValue = 0, maxValue = 10)

	(*
	Declaring our core types and importing the data.
	*)

	// CSV type provider over the red wine dataset; the file uses ";" as
	// separator, and InferRows is set to 1500.
	type Wine = CsvProvider<"data/winequality-red.csv",";",InferRows=1500>

	// an observation is one row of the dataset
	type Observation = Wine.Row
	// a feature extracts a float value from an observation
	type Feature = Observation -> float
	// a labelled example: an observation and the value to predict for it
	type Example = Observation * float
	// a model: produces a float prediction for an observation
	type Predictor = Observation -> float

	// labelled sample: every row of the file, paired with its quality
	// converted to float
	let redSample =
	    Wine.Load("data/winequality-red.csv").Rows
	    |> Seq.map (fun wine -> wine, float wine.Quality)

	(*
	Creating features for that dataset
	*)

	// One feature per column of the dataset: each reads the corresponding
	// column of the observation, and converts it to a float.

	let ``Alcohol Level`` : Feature =
	    fun obs -> float obs.Alcohol

	let ``Chlorides`` : Feature =
	    fun obs -> float obs.Chlorides

	let ``Citric Acid`` : Feature =
	    fun obs -> float obs.``Citric acid``

	let ``Density`` : Feature =
	    fun obs -> float obs.Density

	let ``Fixed Acidity`` : Feature =
	    fun obs -> float obs.``Fixed acidity``

	let ``Free Sulfur Dioxide`` : Feature =
	    fun obs -> float obs.``Free sulfur dioxide``

	let ``PH`` : Feature =
	    fun obs -> float obs.PH

	let ``Residual Sugar`` : Feature =
	    fun obs -> float obs.``Residual sugar``

	let ``Total Sulfur Dioxide`` : Feature =
	    fun obs -> float obs.``Total sulfur dioxide``

	let ``Volatile Acidity`` : Feature =
	    fun obs -> float obs.``Volatile acidity``

	// every feature defined above
	let features : Feature list =
	    [ ``Alcohol Level``; ``Chlorides``; ``Citric Acid``; ``Density``;
	      ``Fixed Acidity``; ``Free Sulfur Dioxide``; ``PH``; ``Residual Sugar``;
	      ``Total Sulfur Dioxide``; ``Volatile Acidity`` ]

	(*
	Basic regression tree implementation
	*)

	// A regression tree over observations.
	type Tree =
	    /// terminal node, carrying the value to predict
	    | Leaf of float
	    /// split node: the (feature, split value) pair to test, then the
	    /// sub-tree followed when the feature value is <= the split value,
	    /// and the sub-tree followed otherwise (see predict)
	    | Branch of (Feature * float) * Tree * Tree

	// Prediction of a tree for an observation: walk down the tree, taking the
	// first sub-tree when the feature value is <= the split value and the
	// second one otherwise, until a leaf is reached; return the leaf value.
	let rec predict (tree:Tree) (obs:Observation) =
	    match tree with
	    | Leaf(prediction) -> prediction
	    | Branch((feature,split),under,over) ->
	        let subTree =
	            if feature obs <= split then under else over
	        predict subTree obs

	// Partitions a sample around a split: returns the (lazy) sequence of
	// examples whose feature value is <= split, and the (lazy) sequence of
	// examples whose feature value is > split.
	let underOver (sample:Example seq) (feat:Feature,split:float) =
	    let isUnder (obs,_) = feat obs <= split
	    let isOver (obs,_) = feat obs > split
	    (sample |> Seq.filter isUnder), (sample |> Seq.filter isOver)

	// A splitter proposes candidate split values for a feature, given a sample.
	type Splitter = Example seq -> Feature -> float list
	// A cost measures a sample as a single float; learnTree only keeps
	// splits that strictly lower it.
	type Cost = Example seq -> float

	// Learns a regression tree of at most `depth` levels on `sample`.
	//   splitter: proposes candidate split values for a feature
	//   cost: cost of a sample; the cost of a split is the average of the
	//         costs of its two sub-samples, weighted by their relative size
	// A node becomes a leaf, predicting the average label of its sample, when
	// depth reaches 0, or when no candidate split has a cost strictly lower
	// than the cost of the un-split sample. Otherwise the cheapest split is
	// retained, and a sub-tree is learnt on each side of it.
	// Seq.averageBy raises if a leaf has to be built from an empty sample.
	let rec learnTree (splitter:Splitter,cost:Cost) (sample:Example seq) (features:Feature list) (depth:int) =

	    // the sample is enumerated several times per candidate split (sizes
	    // and costs), and sub-samples are lazy filters stacked on their
	    // parent: caching avoids re-running the whole chain of filters,
	    // from the root sample, on every enumeration
	    let sample = Seq.cache sample

	    // terminal node, predicting the average label of the sample
	    let leaf () =
	        let avg = sample |> Seq.averageBy snd
	        Leaf(avg)

	    if depth = 0
	    then leaf ()
	    else
	        let initialCost = cost sample
	        // materialized as a list, so that each split and its cost are
	        // computed exactly once (and not once for the emptiness check,
	        // then again for the search of the minimum)
	        let candidates =
	            [
	                for feature in features do
	                    for split in splitter sample feature do
	                        let under,over = underOver sample (feature,split)
	                        let underSize = under |> Seq.length |> float
	                        let overSize = over |> Seq.length |> float
	                        let size = underSize + overSize
	                        let weightedCost = (underSize / size) * (cost under) + (overSize / size) * (cost over)
	                        // retain only candidates with strict cost improvement
	                        if weightedCost < initialCost then
	                            yield (feature,split),(under,over),weightedCost
	            ]

	        match candidates with
	        | [] -> leaf ()
	        | _ ->
	            let ((bestFeature,bestSplit),(under,over),_) =
	                candidates
	                |> List.minBy (fun (_,_,splitCost) -> splitCost)

	            let underTree = learnTree (splitter,cost) under features (depth - 1)
	            let overTree = learnTree (splitter,cost) over features (depth - 1)

	            Branch((bestFeature,bestSplit),underTree,overTree)

	// Splitter proposing values spaced evenly between the smallest and largest
	// feature value of the sample (both excluded), with a step of
	// (max - min) / (n + 1). When every observation shares the same feature
	// value, there is nothing to split on: no candidate is returned.
	let evenSplitter n (sample:Example seq) (feature:Feature) =
	    let values = sample |> Seq.map (fun (obs,_) -> feature obs)
	    let lowest = Seq.min values
	    let highest = Seq.max values
	    if lowest <> highest
	    then
	        let step = (highest - lowest) / (float (n + 1))
	        [ lowest + step .. step .. highest - step ]
	    else []

	// Cost of a sample: sum of the squared differences between each label
	// and the average label of the sample.
	let sumOfSquaresCost (sample:Example seq) =
	    let mean = sample |> Seq.averageBy snd
	    let squaredDeviation (_,lbl) = pown (lbl - mean) 2
	    sample |> Seq.sumBy squaredDeviation

	// a tree of depth 3 on every feature, 5 even splits per feature
	let fullTree =
	    let settings = (evenSplitter 5, sumOfSquaresCost)
	    learnTree settings redSample features 3

	// Quality of a predictor on a sample: the average, over all examples, of
	// the squared difference between the label and the predicted value.
	let averageSquareError (sample:Example seq) predictor =
	    let squaredError (obs,lbl) =
	        let delta = lbl - predictor obs
	        pown delta 2
	    sample |> Seq.averageBy squaredError

	// error of the tree on the full sample
	fullTree |> predict |> averageSquareError redSample

	// one point per example: actual quality (x), predicted quality (y)
	redSample
	|> Seq.map (fun (wine,quality) -> quality, predict fullTree wine)
	|> Chart.Scatter
	|> Chart.WithOptions scatterOptions
	|> Chart.WithTitle "Wine Quality: Actual vs. Predicted (Tree)"
	|> Chart.WithXTitle "Actual"
	|> Chart.WithYTitle "Predicted"
	|> Chart.Show

	(*
	Gradient Boosting
	*)

	// A learner builds a predictor from a sample of labelled examples.
	type Learner = Example seq -> Predictor

	// Boosting on plain residuals.
	// Starting from a predictor returning the average label of the sample,
	// each of the `depth` iterations asks the learner for a predictor fitted
	// against the residuals of the current predictor, and adds it on top.
	let learn (sample:Example seq) (learner:Learner) (depth:int) =

	    // adds to `current` a predictor learnt against its residuals
	    let boost (current:Predictor) : Predictor =
	        let residuals =
	            sample
	            |> Seq.map (fun (obs,y) -> obs, y - current obs)
	        // learnt before the new predictor is returned
	        let correction = learner residuals
	        fun obs -> current obs + correction obs

	    let rec next iterationsLeft predictor =
	        match iterationsLeft with
	        // no iteration left: we are done
	        | 0 -> predictor
	        | _ -> next (iterationsLeft - 1) (boost predictor)

	    // initial predictor: always predicts the average sample value
	    let baseValue = sample |> Seq.averageBy snd
	    let basePredictor = fun (obs:Observation) -> baseValue

	    next depth basePredictor

	// Learner based on regression trees: learns a tree of depth 3 on every
	// feature (5 even splits per feature), and returns its predictor.
	let treeLearner (sample:Example seq) =
	    let tree = learnTree (evenSplitter 5,sumOfSquaresCost) sample features 3
	    predict tree

	// evaluate boosting at different depth: (depth, error) for 1 to 5 rounds
	[ for depth in 1 .. 5 ->
	    let model = learn redSample treeLearner depth
	    depth, averageSquareError redSample model ]

	(*
	True Gradient Boosting, using pseudo-residuals
	*)

	// A loss maps a float to a float; the boosting code below applies it,
	// and its derivative, to differences between labels and predictions.
	type Loss = float -> float

	// Boosting on pseudo-residuals.
	// Same procedure as `learn`, except that each round fits the learner
	// against the derivative of the loss, evaluated at the residual
	// (y - current prediction), instead of the residual itself.
	let draftBoostedLearn (sample:Example seq) (learner:Learner) (loss:Loss) (depth:int) =

	    // derivative of the loss function
	    let pseudoResiduals = diff loss

	    // adds to `current` a predictor learnt against its pseudo-residuals
	    let boost (current:Predictor) : Predictor =
	        let targets =
	            sample
	            |> Seq.map (fun (obs,y) ->
	                obs, pseudoResiduals (y - current obs))
	        // learnt before the new predictor is returned
	        let correction = learner targets
	        fun obs -> current obs + correction obs

	    let rec next iterationsLeft predictor =
	        match iterationsLeft with
	        // no iteration left: we are done
	        | 0 -> predictor
	        | _ -> next (iterationsLeft - 1) (boost predictor)

	    // initial predictor: always predicts the average sample value
	    let baseValue = sample |> Seq.averageBy snd
	    let basePredictor = fun (obs:Observation) -> baseValue

	    next depth basePredictor

	// half of the squared value: its derivative is the value itself, so
	// boosting with this loss should give the same results as before
	let squareLoss : Loss = fun residual -> 0.5 * pown residual 2

	// (depth, error) for 1 to 5 rounds of boosting
	[ for depth in 1 .. 5 ->
	    let model = draftBoostedLearn redSample treeLearner squareLoss depth
	    depth, averageSquareError redSample model ]


	// illustration: the derivative of the square loss function,
	// which is expected to be the residual itself
	let diffSquareLoss = diff squareLoss

	// derivative of the square loss, charted between -5 and 5
	[ for x in -5.0 .. 0.1 .. 5.0 -> x, diffSquareLoss x ]
	|> Chart.Line
	|> Chart.Show


	(*
	Optimal combination of predictors
	*)

	// Combines two predictors: the prediction of f1, plus gamma times the
	// prediction of f2.
	let combination f1 f2 gamma : Predictor =
	    fun obs ->
	        let baseline = f1 obs
	        let correction = f2 obs
	        baseline + gamma * correction

	// Gradient descent on a function of one variable.
	//   f: function to minimize
	//   x0: starting point
	//   eta: step size, applied to the derivative
	//   epsilon: the search stops once |derivative| < epsilon
	// Every point visited before the final one is printed with 3 decimals.
	// There is no cap on the number of iterations.
	let gradientDescent f x0 eta epsilon =
	    let slope = diff f
	    let mutable x = x0
	    let mutable g = slope x
	    while not (abs g < epsilon) do
	        printfn "%.3f" x
	        x <- x - eta * g
	        g <- slope x
	    x

	// illustration: descending x^2, starting from 10,
	// with a step size of 0.1 and a tolerance of 0.0001
	let foo x = pown x 2
	let min_foo = gradientDescent foo 10.0 0.1 0.0001

	// Searches, by gradient descent, for the weight gamma such that the
	// combination f1 + gamma * f2 has the smallest total loss on the sample.
	// The search starts at gamma = 1.0, with a step size of 0.001, and stops
	// when the derivative of the total loss is under 0.01 in absolute value.
	let optimalGamma (sample:Example seq) f1 f2 (loss:Loss) =

	    // total loss over the sample, for a given weight on f2
	    let costOf gamma =
	        let candidate = combination f1 f2 gamma
	        sample
	        |> Seq.sumBy (fun (obs,y) -> loss (candidate obs - y))

	    gradientDescent costOf 1.0 0.001 0.01

	// Gradient boosting, with an optimal weight on each new predictor.
	// Starting from a predictor returning the average label of the sample,
	// each of the `depth` rounds:
	// - fits the learner against the pseudo-residuals (the derivative of the
	//   loss, evaluated at y - current prediction),
	// - searches for the weight gamma giving the best combination of the
	//   current predictor and the new one,
	// - adds gamma times the new predictor to the current predictor.
	let boostedLearn (sample:Example seq) (learner:Learner) (loss:Loss) (depth:int) =

	    // derivative of the loss function
	    let pseudoResiduals = diff loss

	    // one round of boosting on top of `current`
	    let boost (current:Predictor) : Predictor =
	        let targets =
	            sample
	            |> Seq.map (fun (obs,y) ->
	                obs, pseudoResiduals (y - current obs))
	        // both the correction and its weight are computed
	        // before the new predictor is returned
	        let correction = learner targets
	        let gamma = optimalGamma sample current correction loss
	        fun obs -> current obs + gamma * correction obs

	    let rec next iterationsLeft predictor =
	        match iterationsLeft with
	        // no iteration left: we are done
	        | 0 -> predictor
	        | _ -> next (iterationsLeft - 1) (boost predictor)

	    // initial predictor: always predicts the average sample value
	    let baseValue = sample |> Seq.averageBy snd
	    let basePredictor = fun (obs:Observation) -> baseValue

	    next depth basePredictor

	// (depth, error) for 1 to 5 rounds of boosting with the square loss
	[ for depth in 1 .. 5 ->
	    let model = boostedLearn redSample treeLearner squareLoss depth
	    depth, averageSquareError redSample model ]

	// 5 rounds of boosting with the square loss
	let ssrPredictor = boostedLearn redSample treeLearner squareLoss 5

	// one point per example: actual quality (x), predicted quality (y)
	redSample
	|> Seq.map (fun (wine,quality) -> quality, ssrPredictor wine)
	|> Chart.Scatter
	|> Chart.WithOptions scatterOptions
	|> Chart.WithTitle "Wine Quality: Actual vs. Predicted (SSR)"
	|> Chart.WithXTitle "Actual"
	|> Chart.WithYTitle "Predicted"
	|> Chart.Show

	(*
	Using a more complex Loss function, the Huber Loss
	*)

	// https://en.wikipedia.org/wiki/Huber_loss#Definition
	// Huber loss: quadratic (0.5 * x^2) while |x| <= delta,
	// linear (delta * (|x| - 0.5 * delta)) beyond.
	let huber delta x =
	    match abs x with
	    | magnitude when magnitude <= delta -> 0.5 * pown x 2
	    | magnitude -> delta * (magnitude - 0.5 * delta)

	// Huber loss with delta = 1.0, charted between -5 and 5
	[ for x in -5.0 .. 0.1 .. 5.0 -> x, huber 1.0 x ]
	|> Chart.Line
	|> Chart.Show

	// illustration: differentiating the Huber loss function
	// produces the residuals, clipped between -delta and +delta.
	// derivative of the Huber loss, for delta = 1.0
	let diffHuber = diff (huber 1.0)

	// derivative of the Huber loss, charted between -5 and 5
	[ for x in -5.0 .. 0.1 .. 5.0 -> x, diffHuber x ]
	|> Chart.Line
	|> Chart.Show

	// (depth, error) for 1 to 5 rounds of boosting with the Huber loss
	[ for depth in 1 .. 5 ->
	    let model = boostedLearn redSample treeLearner (huber 1.0) depth
	    depth, averageSquareError redSample model ]

	// 5 rounds of boosting with the Huber loss (delta = 1.0)
	let huberPredictor = boostedLearn redSample treeLearner (huber 1.0) 5

	// one point per example: actual quality (x), predicted quality (y)
	redSample
	|> Seq.map (fun (wine,quality) -> quality, huberPredictor wine)
	|> Chart.Scatter
	|> Chart.WithOptions scatterOptions
	|> Chart.WithTitle "Wine Quality: Actual vs. Predicted (Huber 1.0)"
	|> Chart.WithXTitle "Actual"
	|> Chart.WithYTitle "Predicted"
	|> Chart.Show