Last active
November 20, 2019 13:59
-
-
Save mathias-brandewinder/5650553 to your computer and use it in GitHub Desktop.
Recursive minimal entropy partitioning, based on Fayyad & Irani: break a continuous variable into discrete intervals top-down, maximizing the entropy gained at each step, with a stopping rule using the Minimum Description Length principle.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace Discretization | |
// Recursive minimal entropy partitioning, | |
// based on Fayyad & Irani 1993. | |
// See the following article, section 3.3, | |
// for a description of the algorithm: | |
// http://www.math.unipd.it/~dulli/corso04/disc.pdf | |
// Note: this can certainly be optimized. | |
module MDL = | |
open System | |
// Logarithm of n in base b | |
let logb n b = log n / log b | |
let entropy (data: (_ * _) seq) = | |
let N = data |> Seq.length |> (float) | |
data | |
|> Seq.countBy snd | |
|> Seq.sumBy (fun (_,count) -> | |
let p = (float)count/N | |
- p * log p) | |
// A Block of data to be split, with its | |
// relevant characteristics (size, number | |
// of classes, entropy) | |
type Block (data: (float * int) []) = | |
let s = data |> Array.length |> (float) | |
let classes = data |> Array.map snd |> Set.ofArray |> Set.count | |
let k = classes |> (float) | |
let h = entropy (data) | |
member this.Data = data | |
member this.Classes = classes | |
member this.S = s | |
member this.K = k | |
member this.H = h | |
// Entropy gained by splitting "original" block | |
// into 2 blocks "left" and "right" | |
let private entropyGain (original:Block) (left:Block) (right:Block) = | |
original.H - | |
((left.S / original.S) * left.H + (right.S / original.S) * right.H) | |
// Minimum entropy gain required | |
// for a split of the "original" block | |
// into 2 blocks "left" and "right" | |
let private minGain (original:Block) (left:Block) (right:Block) = | |
let delta = | |
logb (pown 3. original.Classes - 2.) 2. - | |
(original.K * original.H - left.K * left.H - right.K * right.H) | |
((logb (original.S - 1.) 2.) / original.S) + (delta / original.S) | |
// Identify the best acceptable value | |
// to split a block of data | |
let split (data:Block) = | |
// Candidate values to use as split | |
// We remove the smallest, because | |
// by definition no value is smaller | |
let candidates = | |
data.Data | |
|> Array.map fst | |
|> Seq.distinct | |
|> Seq.sort | |
|> Seq.toList | |
|> List.tail | |
let walls = seq { | |
for value in candidates do | |
// Split the data into 2 groups, | |
// below/above the value | |
let g1, g2 = | |
data.Data | |
|> Array.partition (fun (v,c) -> v < value) | |
let block1 = Block(g1) | |
let block2 = Block(g2) | |
let gain = entropyGain data block1 block2 | |
let threshold = minGain data block1 block2 | |
// if minimum threshold is met, | |
// the value is an acceptable candidate | |
if gain >= threshold | |
then yield (value, gain, block1, block2) } | |
if (Seq.isEmpty walls) then None | |
else | |
// Return the split value that | |
// yields the best entropy gain | |
walls | |
|> Seq.maxBy (fun (value, gain, b1, b2) -> gain) | |
|> Some | |
// Top-down recursive partition of a data block, | |
// accumulating the partitioning values into | |
// a list of "walls" (splitting values) | |
let partition (data:Block) = | |
let rec recursiveSplit (walls:float list) (data:Block) = | |
match (split data) with | |
| None -> walls // no split found | |
| Some(value, gain, b1, b2) -> | |
// append new split value | |
let walls = value::walls | |
// Search for new splits in first group | |
let walls = recursiveSplit walls b1 | |
// Search for new splits in second group | |
recursiveSplit walls b2 | |
// and go search! | |
recursiveSplit [] data |> List.sort |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#load "MDL.fs" | |
open System | |
open Discretization.MDL | |
let rng = System.Random() | |
let tests = [ | |
// one single class | |
"Single", | |
[| for i in 1..100 -> (rng.NextDouble(), 0) |]; | |
// class 0 from 0 to 1, class 1 from 1 to 2 | |
"Separate", | |
[| for i in 1..100 -> (rng.NextDouble(), 0) | |
for i in 1..100 -> (rng.NextDouble() + 1.0, 1) |]; | |
// overlapping classes | |
"Mixture", | |
[| for i in 1..100 -> (rng.NextDouble(), rng.Next(0,2)) | |
for i in 1..100 -> (rng.NextDouble() + 0.5, rng.Next(1,3)) | |
for i in 1..100 -> (rng.NextDouble() + 1.0, rng.Next(2,4)) | |
for i in 1..100 -> (rng.NextDouble() + 1.5, rng.Next(3,5)) |]; | |
"Alternating", | |
[| for i in 0 .. 100 -> ((float)i, if i % 2 = 0 then 0 else 1) |]; ] | |
tests |> List.iter (fun (title, testData) -> | |
printfn "Sample: %s" title | |
let data = Block(testData) | |
let result = partition data | |
printfn "[ %s ]" (String.Join(", ", result))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment