Last active
December 31, 2015 14:48
-
-
Save ivant/8002288 to your computer and use it in GitHub Desktop.
Build a sorted word frequency list from a file, trimmed to a given quantile.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Build a sorted word frequency list from a file, trimmed to a given quantile.
--
-- Usage: WordStats <book.txt> <quantile>
--
-- `quantile` is a number between 0 and 1: words are listed from most to least
-- frequent for as long as their cumulative share of all word occurrences does
-- not exceed it.
--
-- Example:
--   ./WordStats "Don Quijote.txt" 0.85 > "Don Quijote.words.85"
import Control.Applicative
import Control.Monad (forM_)
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import Data.Char (isAlpha)
import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as M
import Data.List (sortBy)
import Data.Ord (comparing, Down(..))
import Data.Ratio
import qualified Data.Text as T
import qualified Data.Text.Encoding as E
import System.Environment (getArgs)
import System.Exit (exitFailure)
import System.IO (hPutStrLn, stderr)
import Text.Printf
import Text.Read (readMaybe)
-- | Split text into lowercase words made only of alphabetic characters.
--
-- The text is split on whitespace, each token is lowercased, and every
-- character that is not alphabetic according to 'isAlpha' (punctuation,
-- digits, ...) is removed from it. Tokens that end up empty are discarded.
getWords :: T.Text -> [T.Text]
getWords = filter (not . T.null) . map (T.filter isAlpha . T.toLower) . T.words
-- | Count how many times each word occurs in the given list.
--
-- Every word contributes 1, and contributions for equal words are added up.
wordMap :: [T.Text] -> HashMap T.Text Int
wordMap ws = M.fromListWith (+) [ (w, 1) | w <- ws ]
-- | Keep the longest prefix of a frequency list whose cumulative share of all
-- occurrences does not exceed the given quantile.
--
-- The list is expected to be sorted by descending count (as 'main' does); the
-- running total of counts is compared, as a fraction of the grand total,
-- against @quantile@, and the list is cut at the first entry that exceeds it.
trimToQuantile :: Double -> [(T.Text, Int)] -> [(T.Text, Int)]
trimToQuantile quantile freqList =
  map fst (takeWhile withinQuantile (zip freqList runningTotals))
  where
    counts = map snd freqList
    runningTotals = scanl1 (+) counts
    total = fromIntegral (sum counts) :: Integer
    -- For an empty list this is never evaluated, so the zero denominator is
    -- harmless.
    withinQuantile (_, running) =
      fromRational (fromIntegral running % total) <= quantile

-- | Entry point: @WordStats \<book.txt\> \<quantile\>@.
--
-- Reads the file as UTF-8, counts its words, sorts them by descending
-- frequency and prints one @count word@ line per word for the part of the
-- list that fits within the quantile.
--
-- Missing arguments, an unparsable quantile, or a file that is not valid
-- UTF-8 are reported on stderr and the program exits with a failure status.
-- Arguments after the first two are ignored.
main :: IO ()
main = do
  args <- getArgs
  case args of
    (file : quantileArg : _) ->
      case readMaybe quantileArg :: Maybe Double of
        Nothing ->
          failWith ("WordStats: invalid quantile: " ++ quantileArg ++ "\n" ++ usage)
        Just quantile -> do
          bytes <- B.readFile file
          -- decodeUtf8' returns the decoding error as a value instead of
          -- throwing it from pure code.
          case E.decodeUtf8' bytes of
            Left err ->
              failWith ("WordStats: " ++ file ++ " is not valid UTF-8: " ++ show err)
            Right text -> do
              let counts = M.toList (wordMap (getWords text))
                  freqList = sortBy (comparing (Down . snd)) counts
              forM_ (trimToQuantile quantile freqList) $ \(w, c) ->
                printf "%d %s\n" c (T.unpack w) :: IO ()
    _ -> failWith usage
  where
    usage = "Usage: WordStats <book.txt> <quantile>"
    failWith msg = hPutStrLn stderr msg >> exitFailure
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment