Skip to content

Instantly share code, notes, and snippets.

@hallettj
Last active November 3, 2015 01:29
Show Gist options
  • Save hallettj/9cfa8a4763ba670bb1fd to your computer and use it in GitHub Desktop.
Save hallettj/9cfa8a4763ba670bb1fd to your computer and use it in GitHub Desktop.
import Data.List (groupBy, sort, sortOn, transpose)
type Vector = [Double]
type Matrix = [Vector] -- two-dimensional matrix
{- Distance functions -}
-- type for a distance function
type Distance = Vector -> Vector -> Double
pearson :: Vector -> Vector -> Double
pearson v1 v2 = if den == 0
then 0
else 1 - (num / den)
where
num = sum (zipWith (*) v1 v2) - (sumv1 * sumv2 / len v1)
den = sqrt (((sumSq v1 - (sumv1 ^^ 2)) / len v1) * ((sumSq v2 - (sumv2 ^^ 2)) / len v1))
sumSq v = sum (map (^^ 2) v)
sumv1 = sum v1
sumv2 = sum v2
-- The native `length` function returns an Int, but we need a Double
len :: [a] -> Double
len xs = fromIntegral (length xs)
{- Hierarchical clustering -}
data Bicluster = Bicluster { clustLeft :: Bicluster, clustRight :: Bicluster, clustDist :: Double, clustVec :: Vector }
| Emptycluster
deriving (Eq, Show)
hcluster :: Distance -> Matrix -> Bicluster
hcluster dist vs = hclusterRec dist vs (map (Bicluster Emptycluster Emptycluster 0) vs)
hclusterRec :: Distance -> Matrix -> [Bicluster] -> Bicluster
hclusterRec dist vs [c] = c
hclusterRec dist vs cs = hclusterRec dist vs mergeClosest
where
clusterPairs = [(c1, c2) | c1 <- cs, c2 <- cs, c1 /= c2]
closest = head $ sortOn (\(c1, c2) -> dist (clustVec c1) (clustVec c2)) clusterPairs
(c1, c2) = closest
mergeClosest = mergeClusters dist c1 c2 : [c | c <- cs, c /= c1 && c /= c2]
mergeClusters :: Distance -> Bicluster -> Bicluster -> Bicluster
mergeClusters dist c1 c2 =
Bicluster { clustLeft = c1
, clustRight = c2
, clustDist = dist (clustVec c1) (clustVec c2)
, clustVec = zipWith avg (clustVec c1) (clustVec c2)
}
avg :: Double -> Double -> Double
avg x y = (x + y) / 2
{- K-Means clustering -}
type Centroid = Vector
type Cluster = [Vector]
kcluster :: Distance -> Int -> Matrix -> [Cluster]
kcluster dist k vs = kclusterRec dist vs initCentroids []
where
initCentroids = take k vs
kclusterRec :: Distance -> Matrix -> [Centroid] -> [Cluster] -> [Cluster]
kclusterRec dist vs centroids clusters = if newClusters == clusters
then newClusters
else kclusterRec dist vs newCentroids newClusters
where
assignments :: [(Vector, Centroid)]
assignments = zip vs (map (nearest dist centroids) vs)
newClusters = sort $ map (sort . map vector) $ groupOn centroid assignments
newCentroids = map averageVec newClusters
vector (v, c) = v
centroid (v, c) = c
nearest :: Distance -> [Centroid] -> Vector -> Centroid
nearest dist cs v = head $ sortOn (dist v) cs
averageVec :: [Vector] -> Vector
averageVec vs = map average (transpose vs)
average :: [Double] -> Double
average xs = sum xs / len xs
groupOn :: Eq b => (a -> b) -> [a] -> [[a]]
groupOn f xs = groupBy (\x y -> f x == f y) xs
{- test program -}
fixture :: Matrix
fixture = [ [1, 2, 3]
, [4, 5, 6]
]
main :: IO ()
main = do
let res = pearson (fixture !! 0) (fixture !! 1)
let res' = pearson (transpose fixture !! 0) (transpose fixture !! 1)
putStrLn (show res)
putStrLn (show res')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment