Last active
September 14, 2015 04:06
-
-
Save crzysdrs/11414475 to your computer and use it in GitHub Desktop.
Pandoc Sentence Splitter (for diff output)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env runhaskell | |
| {- | |
| Splits Pandoc text into one sentence per line. Useful for diff output. | |
| Put PDsentences.hs on your PATH and pass it to pandoc as a filter. | |
| Because most Markdown files are primarily prose, this gives much | |
| cleaner diff views. | |
| .gitconfig: | |
| [diff "markdown"] | |
| textconv = pandoc -t markdown_github --no-wrap --filter=PDsentences.hs | |
| .gitattributes: | |
| *.md diff=markdown | |
| -} | |
| import Text.Pandoc.JSON | |
| import Text.Regex.Posix | |
| import Text.Regex | |
-- | POSIX extended regex matched against a single word token. A token counts
-- as a sentence end when it finishes with one of @.@, @?@, @!@ or @;@,
-- optionally followed by one closing bracket or quote character.
--
-- The terminator class deliberately contains no backslash: inside a POSIX
-- bracket expression a backslash is an ordinary member rather than an escape,
-- so including one would also accept tokens that end in a literal backslash.
--
-- NOTE(review): the closer class contains non-ASCII quote characters; confirm
-- that the regex backend handles them as intended when given a 'String'.
endOfSentencePat :: String
endOfSentencePat = ".*[.?!;][)\"\'>»’”]?$"

-- | Regex for whole tokens that end in a period but are abbreviations rather
-- than sentence ends (e.g., i.e., mr., mrs., ms., dr., jr., sr., vs.).
-- The alternatives are written in lower case.
ignoredPat :: String
ignoredPat = "^(e\\.g|i\\.e|mr|mrs|ms|dr|jr|sr|vs)\\.$"
-- | Decide whether a line break should follow the given word token.
--
-- A break is wanted when the token matches 'endOfSentencePat' and is not one
-- of the abbreviations described by 'ignoredPat'.
makeBreak :: [Char] -> Bool
makeBreak s
  | isAbbreviation = False
  | otherwise      = s =~ endOfSentencePat
  where
    -- The final 'False' argument asks regex-compat for case-insensitive
    -- matching, so capitalised forms such as "Mr." are recognised as well.
    abbreviationRegex = mkRegexWithOpts ignoredPat True False

    isAbbreviation :: Bool
    isAbbreviation = match abbreviationRegex s
-- | Discard any run of leading 'Space' inlines, then continue splitting the
-- remainder into sentences with 'breakSentence'.
--
-- Used right after a sentence break so the next sentence does not start with
-- inter-sentence whitespace.
consumeSpace :: [Inline] -> [Inline]
consumeSpace = breakSentence . dropWhile isSpace
  where
    isSpace Space = True
    isSpace _     = False
-- | Walk a list of inlines and insert a 'LineBreak' after every 'Str' token
-- that 'makeBreak' classifies as the end of a sentence.
--
-- Spaces that directly follow a sentence end are dropped (via 'consumeSpace')
-- so the next sentence starts at the beginning of its line. Inlines other
-- than 'Str' are passed through unchanged.
--
-- A 'LineBreak' is only emitted when something follows the sentence: if the
-- sentence-ending token is the last inline (or only spaces remain), no
-- dangling break is appended to the end of the paragraph.
breakSentence :: [Inline] -> [Inline]
breakSentence [] = []
breakSentence (Str s : rest)
  | makeBreak s = Str s : lineBreakBefore (consumeSpace rest)
  | otherwise   = Str s : breakSentence rest
  where
    -- 'consumeSpace' yields an empty list exactly when nothing but spaces
    -- followed the sentence; in that case there is nothing to separate.
    lineBreakBefore :: [Inline] -> [Inline]
    lineBreakBefore []   = []
    lineBreakBefore more = LineBreak : more
breakSentence (other : rest) = other : breakSentence rest
-- | Block-level transformation applied by the filter.
--
-- The inline content of 'Plain' and 'Para' blocks is run through
-- 'breakSentence'; every other kind of block is returned untouched.
splitSentences :: Block -> Block
splitSentences block = case block of
  Plain inlines -> Plain (breakSentence inlines)
  Para inlines  -> Para (breakSentence inlines)
  other         -> other
-- | Program entry point: hand 'splitSentences' to 'toJSONFilter' so it is run
-- as a Pandoc JSON filter.
main :: IO ()
main = toJSONFilter (splitSentences :: Block -> Block)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment