Last active
January 1, 2016 12:09
-
-
Save sir-deenicus/8143136 to your computer and use it in GitHub Desktop.
Hyphenation algorithm used by TeX.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Trie of hyphenation patterns, keyed one character per level.
type HyphTree =
    /// A trie node: the optional score array of a pattern that ends exactly
    /// at this node, plus the child nodes indexed by the next character.
    | Node of int[] option * Map<char, HyphTree>
    /// Absence of a node (an empty trie, or a missing child).
    | Empty
/// Inserts one hyphenation pattern (for example ".ach4") into the trie `tree`
/// and returns the updated trie.
///
/// The pattern is split into its non-digit characters, which form the path
/// through the trie, and its digit scores. `points` has one entry per gap
/// around those characters (length = chars.Length + 1); a gap with no digit
/// scores 0. The score array is stored on the node where the path ends.
let insertpattern tree (pattern:string) =
    // One definition of "digit" for both the path and the scores, so the two
    // arrays stay aligned for any pattern alphabet (not only '.' and a-z).
    let isdigit c = c >= '0' && c <= '9'
    let chars = pattern |> Seq.filter (isdigit >> not) |> Seq.toArray
    // Splitting on every non-digit keeps the empty pieces, so each gap gets
    // exactly one slot; an empty piece becomes score 0.
    let points =
        Text.RegularExpressions.Regex.Split(pattern, "[^0-9]")
        |> Array.map (fun piece -> let ok, v = Int32.TryParse piece in if ok then v else 0)
    let rec bt t i =
        // Payload and children already stored at this position (none for Empty).
        let pl, tdat = match t with | Node (l,m) -> l,m | Empty -> None,Map.empty
        if i = chars.Length then
            // End of the pattern: set the scores but KEEP the existing children,
            // otherwise longer patterns sharing this prefix would be dropped.
            Node(Some points, tdat)
        else
            // NOTE(review): mapAddGeneric is defined elsewhere; presumably it
            // rewrites the child under chars.[i] with the function when the key
            // exists and inserts the supplied default otherwise - confirm.
            Node(pl, mapAddGeneric tdat chars.[i] (fun intree -> bt intree (i+1)) (bt Empty (i+1)))
    bt tree 0
/// Splits a word into hyphenation pieces using the pattern trie `tree`
/// (the pattern-based scheme used by TeX).
///
/// Returns an array of pieces whose concatenation is the lowercased word
/// (the pieces are cut from the result of `tolower`, not from `worda`).
/// Words of four characters or fewer are returned unsplit. A break is never
/// placed after the first letter or within the last two letters.
///
/// NOTE(review): `tolower` and `mapGet` are defined elsewhere; this assumes
/// `tolower` lowercases a string and `mapGet m k d` returns the value under
/// `k`, or `d` when the key is missing - confirm.
let hyphenate tree (worda:string) =
    let word = tolower worda
    if word.Length <= 4 then [|word|]
    else
        // '.' marks both word boundaries, matching the '.' anchors in patterns.
        let usedword = "." + word + "."
        // ps.[j] is the highest score seen for the gap just before usedword.[j].
        let ps = Array.create (usedword.Length + 1) 0
        // Merge one pattern's scores into ps for a match starting at `start`.
        let applypoints start (p:int[]) =
            for k in 0..p.Length - 1 do
                ps.[start + k] <- max ps.[start + k] p.[k]
        // Follow usedword through the trie from `start`, applying the scores of
        // every node reached. The scores are applied BEFORE the end-of-word
        // check so that the node reached by the final '.' is included;
        // stopping first would ignore every end-anchored pattern.
        let rec walk start pos t =
            match t with
            | Node(optpoint, treedict) ->
                Option.iter (applypoints start) optpoint
                if pos < usedword.Length then
                    walk start (pos + 1) (mapGet treedict usedword.[pos] Empty)
            | Empty -> ()
        for i in 0..usedword.Length - 1 do
            walk i i tree
        // Forbid a break after the first letter, after the last letter, and
        // before the last letter.
        ps.[2] <- 0; ps.[ps.Length - 2] <- 0; ps.[ps.Length - 3] <- 0
        // usepoint.[i] is the score of the gap right after word.[i].
        let usepoint = ps.[2..]
        // An odd score closes the current piece after this character.
        let sylls,lastsyll,_ =
            word |> Seq.fold (fun (wlist,curstring,i) c ->
                        if usepoint.[i] % 2 = 1 then ((curstring + string c)::wlist, "", i + 1)
                        else wlist, curstring + string c, i + 1) ([],"",0)
        lastsyll::sylls |> List.rev |> List.toArray
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment