Last active
September 25, 2021 17:52
-
-
Save palladin/bc278fc010e4d244ef7a to your computer and use it in GitHub Desktop.
Experiment
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// F# Interactive directive: switches on timing output for the evaluations that follow.
#time "on" | |
/// Reads the lines of `sentencesFile`, skips the first `nSkip` of them, takes
/// `nSentences` of the remainder, and maps every kept line to the result of
/// `sentence2hashes` applied to the text that follows the line's first space.
let inline readAllHashes nSkip nSentences =
    // Drops everything up to and including the first ' '. When the line holds
    // no space, IndexOf returns -1, so Substring(0) keeps the whole line.
    let hashesOfLine (line : string) =
        let firstSpace = line.IndexOf(' ')
        sentence2hashes (line.Substring(firstSpace + 1))
    File.ReadLines(sentencesFile)
    |> Stream.ofSeq
    |> Stream.skip nSkip
    |> Stream.take nSentences
    |> Stream.map hashesOfLine
/// Produces a stream of `spair shingle id` values for the sentences selected by
/// `readAllHashes nSkip nSentences`.
///
/// Each hashes value is first paired with the index supplied by `Stream.mapi`
/// (used as the sentence id); `shingling nWordInShingle` is then applied to the
/// hashes, and every resulting shingle is emitted together with that id.
let inline shinglingBeginingAndEnd nSkip nSentences =
    readAllHashes nSkip nSentences
    |> Stream.mapi(fun id hashes -> spair id hashes)
    |> Stream.collect(fun pair ->
        let id = spair_fst pair
        let hashes = spair_snd pair
        // The shingles have to be piped into Stream.ofArray. Without this `|>`
        // the result of `shingling` is discarded and the function
        // `Stream.ofArray` itself is fed to Stream.map, which does not type-check.
        // NOTE(review): assumes `shingling` returns an array, as Stream.ofArray implies.
        shingling nWordInShingle hashes
        |> Stream.ofArray
        |> Stream.map(fun shingle -> spair shingle id))
/// Groups the pairs produced by `shinglingBeginingAndEnd nSkip nSentences` by
/// their first component and returns, for every group holding more than one
/// pair, an array of the pairs' second components. The result is an array of
/// those arrays.
let inline allGroups nSkip nSentences =
    // Materialise the sequential stream as a seq so it can feed a ParStream.
    let shinglePairs =
        shinglingBeginingAndEnd nSkip nSentences
        |> Stream.toSeq
    shinglePairs
    |> ParStream.ofSeq
    // Bucket the pairs by their first component.
    |> ParStream.groupBy(fun entry -> spair_fst entry)
    // Discard buckets that contain just one pair.
    |> ParStream.filter(fun (_, members) -> Seq.length members > 1)
    // Keep only the second component of every pair in a surviving bucket.
    |> ParStream.map(fun (_, members) ->
        members
        |> Stream.ofSeq
        |> Stream.map(fun entry -> spair_snd entry)
        |> Stream.toArray)
    |> ParStream.toArray
// Script entry point: evaluates allGroups with nSkip and nSentences, which are not defined in the visible part of the file.
allGroups nSkip nSentences |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment