Last active
November 3, 2015 01:29
-
-
Save hallettj/9cfa8a4763ba670bb1fd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Data.List (groupBy, sort, sortOn, transpose) | |
type Vector = [Double] | |
type Matrix = [Vector] -- two-dimensional matrix | |
{- Distance functions -} | |
-- type for a distance function | |
type Distance = Vector -> Vector -> Double | |
pearson :: Vector -> Vector -> Double | |
pearson v1 v2 = if den == 0 | |
then 0 | |
else 1 - (num / den) | |
where | |
num = sum (zipWith (*) v1 v2) - (sumv1 * sumv2 / len v1) | |
den = sqrt (((sumSq v1 - (sumv1 ^^ 2)) / len v1) * ((sumSq v2 - (sumv2 ^^ 2)) / len v1)) | |
sumSq v = sum (map (^^ 2) v) | |
sumv1 = sum v1 | |
sumv2 = sum v2 | |
-- The native `length` function returns an Int, but we need a Double | |
len :: [a] -> Double | |
len xs = fromIntegral (length xs) | |
{- Hierarchical clustering -} | |
data Bicluster = Bicluster { clustLeft :: Bicluster, clustRight :: Bicluster, clustDist :: Double, clustVec :: Vector } | |
| Emptycluster | |
deriving (Eq, Show) | |
hcluster :: Distance -> Matrix -> Bicluster | |
hcluster dist vs = hclusterRec dist vs (map (Bicluster Emptycluster Emptycluster 0) vs) | |
hclusterRec :: Distance -> Matrix -> [Bicluster] -> Bicluster | |
hclusterRec dist vs [c] = c | |
hclusterRec dist vs cs = hclusterRec dist vs mergeClosest | |
where | |
clusterPairs = [(c1, c2) | c1 <- cs, c2 <- cs, c1 /= c2] | |
closest = head $ sortOn (\(c1, c2) -> dist (clustVec c1) (clustVec c2)) clusterPairs | |
(c1, c2) = closest | |
mergeClosest = mergeClusters dist c1 c2 : [c | c <- cs, c /= c1 && c /= c2] | |
mergeClusters :: Distance -> Bicluster -> Bicluster -> Bicluster | |
mergeClusters dist c1 c2 = | |
Bicluster { clustLeft = c1 | |
, clustRight = c2 | |
, clustDist = dist (clustVec c1) (clustVec c2) | |
, clustVec = zipWith avg (clustVec c1) (clustVec c2) | |
} | |
avg :: Double -> Double -> Double | |
avg x y = (x + y) / 2 | |
{- K-Means clustering -} | |
type Centroid = Vector | |
type Cluster = [Vector] | |
kcluster :: Distance -> Int -> Matrix -> [Cluster] | |
kcluster dist k vs = kclusterRec dist vs initCentroids [] | |
where | |
initCentroids = take k vs | |
kclusterRec :: Distance -> Matrix -> [Centroid] -> [Cluster] -> [Cluster] | |
kclusterRec dist vs centroids clusters = if newClusters == clusters | |
then newClusters | |
else kclusterRec dist vs newCentroids newClusters | |
where | |
assignments :: [(Vector, Centroid)] | |
assignments = zip vs (map (nearest dist centroids) vs) | |
newClusters = sort $ map (sort . map vector) $ groupOn centroid assignments | |
newCentroids = map averageVec newClusters | |
vector (v, c) = v | |
centroid (v, c) = c | |
nearest :: Distance -> [Centroid] -> Vector -> Centroid | |
nearest dist cs v = head $ sortOn (dist v) cs | |
averageVec :: [Vector] -> Vector | |
averageVec vs = map average (transpose vs) | |
average :: [Double] -> Double | |
average xs = sum xs / len xs | |
groupOn :: Eq b => (a -> b) -> [a] -> [[a]] | |
groupOn f xs = groupBy (\x y -> f x == f y) xs | |
{- test program -} | |
fixture :: Matrix | |
fixture = [ [1, 2, 3] | |
, [4, 5, 6] | |
] | |
main :: IO () | |
main = do | |
let res = pearson (fixture !! 0) (fixture !! 1) | |
let res' = pearson (transpose fixture !! 0) (transpose fixture !! 1) | |
putStrLn (show res) | |
putStrLn (show res') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment