Last active
June 11, 2019 02:33
-
-
Save sir-deenicus/4958646 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Learn more about F# at http://fsharp.net | |
// See the 'F# Tutorial' project for more help. | |
open System.IO | |
open SevenZip | |
open System | |
SevenZipCompressor.SetLibraryPath(@"path/to/7z.dll") | |
let dir = @"files/path/..." | |
let txts = Directory.GetFiles(dir + @"Test") | |
let sz = SevenZip.SevenZipCompressor() | |
let st = System.Diagnostics.Stopwatch() | |
sz.CompressionMethod <- CompressionMethod.Ppmd | |
sz.CompressionLevel <- CompressionLevel.Low | |
let compress2 (f:byte[]) = | |
use mio = new MemoryStream(f) | |
use m2 = new MemoryStream(f.Length * 2) | |
sz.CompressStream(mio, m2) | |
m2.GetBuffer().[0..int m2.Length - 1] | |
st.Start() | |
let compressionMap = txts |> Array.map (fun f -> Path.GetFileNameWithoutExtension f, f |> File.ReadAllBytes |> compress2) |> Map.ofArray | |
let compredist f1 f2 n1 n2 = | |
let code = compressionMap.[n1] | |
let code2 = compressionMap.[n2] | |
let fxy = Array.append f1 f2 | |
let code3 = compress2 (fxy) | |
float(code3.Length - (min (code.Length) (code2.Length))) / float(max (code.Length) (code2.Length)) | |
let nearEdges = [| for f in txts -> let n1 = Path.GetFileNameWithoutExtension f | |
n1, | |
let fbytes = File.ReadAllBytes f | |
txts |> Array.map (fun fname -> | |
let n2 = Path.GetFileNameWithoutExtension fname | |
Path.GetFileNameWithoutExtension fname, compredist fbytes (File.ReadAllBytes(fname)) n1 n2) | |
|> Array.sortBy snd|] | |
st.Stop() | |
let nearEdgesMap = nearEdges |> Map.ofArray | |
let pairs = nearEdgesMap |> Map.map (fun _ v -> Map.ofArray v) | |
printfn "%A" nearEdges | |
type 'a Tree = | |
| Node of 'a | |
| Branch of 'a Tree * 'a Tree | |
type Cluster<'a when 'a : comparison> = | |
| Singleton of 'a Set | |
| Clusters of 'a Set * 'a Tree | |
let completelinkage (ps:Map<'a, Map<'a,float>>) (a: 'a Set) (b:'a Set) = | |
a |> Set.map (fun item1 -> b |> Set.map (fun item2 -> ps.[item1].[item2]) | |
|> Set.maxElement) //we only want the two largest pair distances) | |
|> Set.maxElement | |
let distclust ps = function | |
| Singleton (item), Clusters(items, _) -> completelinkage ps item items | |
| Clusters (items, _) , Singleton(item) -> completelinkage ps items item | |
| Clusters (items1, _), Clusters(items2, _) -> completelinkage ps items1 items2 | |
| Singleton (item1) , Singleton(item2) -> ps.[item1.MaximumElement].[item2.MaximumElement] | |
let mergeClusters = function | |
| Singleton (item), Clusters(items, dendogram) | |
| Clusters (items, dendogram) , Singleton(item) -> Clusters(Set.union item items, Branch(dendogram, Node item.MinimumElement)) | |
| Clusters (items1, dendogram1), Clusters(items2, dendogram2) -> Clusters(Set.union items1 items2, Branch(dendogram1, dendogram2)) | |
| Singleton (item1) , Singleton(item2) -> Clusters(Set.union item1 item2, Branch(Node item1.MinimumElement, Node item2.MinimumElement)) | |
let r = Random() | |
(* | |
A function that takes a cluster and a set of clusters and finds the nearest item using cluster dist functions | |
A function that takes a cluster and an item and calculates distance as maxdist (item, clustermember) | |
There is a map that holds every item and its neighbiors | |
If we have an item we find the closest item by looking it up in the map. | |
But we also need to find the closest in the cluster. So we must compare the item to a cluster | |
To do this we for each cluster, compare the distance to our current item | |
If an item is closest we add the merged 2 to the cluster stack as a branch and remove the item from actives | |
If a cluster is closest we merge the item to the tree, remove it from the cluster stack and add the new tree to the stack | |
If the next item we are looking at is a cluster we must find the closest item. | |
To find it in the single set we map each item to its distance from the cluster using dist clust | |
We also sort the cluster set by distance from current cluster | |
Again if the single item is the closest we merge with cluster and remove from map; | |
If the cluster is the closest we remove both clusters from clusterset, merge them and put them back | |
Recurse | |
Item , Item -> Pack as a Singleton | |
*) | |
// (clusterset : Map<string, string Cluster>) | |
let asCluster x = Singleton (set [x]) | |
let closestinActives distances cluster (item : string Set) = | |
item |> Set.map (fun s -> distclust distances (asCluster s, cluster), s) | |
|> Set.minElement | |
let find points first closest (distances : Map<string, Map<string,float>>) = | |
let initialActives = points |> Set.ofArray |> Set.remove first | |
|> Set.remove closest | |
let rec seek (stack : string Cluster list) (actives : string Set) = | |
let current = stack.Head | |
if stack.Length = 1 && actives = Set.empty then current | |
else | |
let nextDist, next = if actives.Count = 0 then Double.MaxValue,"" else closestinActives distances current actives | |
if stack.Length = 1 then seek (asCluster next :: stack) (actives.Remove(next)) | |
else let topofstack = stack.Tail.Head | |
let stackDist = distclust distances (topofstack, current) | |
if nextDist < stackDist then | |
seek (asCluster next :: stack) (actives.Remove(next)) | |
else seek ((mergeClusters (current, topofstack)) :: (stack.Tail.Tail)) actives | |
seek [asCluster closest ; asCluster first] initialActives | |
let rec toGraph depth = function | |
| Node(x) -> x, "", " node\r\n [\r\n id\t\""+x+"\"\r\n label\t\"" + x + "\"\r\n ]\r\n" | |
| Branch(ltree,rtree) -> let lname, lgraph, names1 = toGraph (depth + 1) ltree | |
let rname, rgraph, names2 = toGraph (depth + 1) rtree | |
let name = string (r.Next(0, int(2. ** (float depth + 9.))) ) | |
name, sprintf "%s\r\n%s\r\n edge\r\n [\r\n source\t\"%s\"\r\n target\t\"%s\"\r\n ]\r\n edge\r\n [\r\n source\t\"%s\"\r\n target\t\"%s\"\r\n ]" | |
lgraph rgraph name lname name rname, | |
(sprintf " node\r\n [\r\n id\t\"%s\"\r\n label\t\"\"\r\n ]\r\n" name) + names1 + names2 | |
let first = fst nearEdges.[r.Next(0,txts.Length)] | |
let closest = fst nearEdgesMap.[first].[1] | |
let items, fcluster = (function | Clusters(leset, letree) -> leset, letree) (find (nearEdges |> Array.map fst) first closest pairs) | |
let _, outgraph, nodes = toGraph 0 fcluster | |
let n = "graph [" + nodes + outgraph + "]" | |
File.WriteAllText("mbook.gml", n) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment