Skip to content

Instantly share code, notes, and snippets.

@raichoo
Last active January 10, 2016 13:21
Show Gist options
  • Save raichoo/3ae8dd14699b1f731916 to your computer and use it in GitHub Desktop.
Save raichoo/3ae8dd14699b1f731916 to your computer and use it in GitHub Desktop.
{-# OPTIONS_GHC -O2 #-}
{-# LANGUAGE OverloadedStrings #-}
module Main where
import System.IO
import Text.Regex.PCRE
import Text.Regex.PCRE.ByteString.Lazy
import Data.Maybe (mapMaybe)
import Data.Char (ord)
import Data.Word (Word8)
import qualified Data.ByteString.Lazy as BSL
-- | Byte value of the ASCII space character (@' '@).
space :: Word8
space = fromIntegral . ord $ ' '
-- | Byte value of the ASCII line-feed character (@'\\n'@).
newline :: Word8
newline = fromIntegral . ord $ '\n'
-- | Byte value of the ASCII semicolon character (@';'@).
semicolon :: Word8
semicolon = fromIntegral . ord $ ';'
-- | Byte value of the ASCII double-quote character (@'"'@).
quote :: Word8
quote = fromIntegral . ord $ '"'
-- | Match one input line against the @Microsoft@ extraction pattern.
--
-- The result holds one inner list per match; each inner list is the whole
-- matched text followed by the text of the single capture group
-- @(Microsoft.*)@.
--
-- Deliberately written point-free: 'regex' takes no syntactic argument, so
-- @compiledPattern@ is built by 'makeRegex' once and shared by every call.
-- The '=~' operator instead runs 'makeRegex' on its pattern argument each
-- time it is applied, which can mean recompiling the pattern for every line
-- unless the optimiser happens to float it out.
regex :: BSL.ByteString -> [[BSL.ByteString]]
regex = match compiledPattern
  where
    -- Same pattern and same default compile/exec options that '=~' would use.
    compiledPattern :: Regex
    compiledPattern = makeRegex ("\\{.*(Microsoft.*)\\|\\].*" :: String)
-- | Pull the single capture group out of a match result.
--
-- Yields 'Just' the second element only when the result consists of exactly
-- one match made of exactly two elements (whole match, then the group); any
-- other shape gives 'Nothing'.
firstGroup :: [[BSL.ByteString]] -> Maybe BSL.ByteString
firstGroup matches = case matches of
  [[_whole, captured]] -> Just captured
  _ -> Nothing
-- | Drop every semicolon and double-quote byte from the input, then split
-- what remains on space bytes.
splitLine :: BSL.ByteString -> [BSL.ByteString]
splitLine line = BSL.split space stripped
  where
    stripped = BSL.filter keep line
    -- A byte survives only if it is neither of the two unwanted characters.
    keep byte = byte /= semicolon && byte /= quote
-- | Rebuild a name by joining the pieces produced by 'splitLine' with a
-- single dot between consecutive pieces.
cleanupName :: BSL.ByteString -> BSL.ByteString
cleanupName line = BSL.intercalate separator (splitLine line)
  where
    separator = "."
-- | Read @../big.txt@, look at no more than its first 10000000
-- newline-separated lines, and print how many of them yield a capture group
-- from 'regex'.
--
-- The contents are read lazily; the count is printed inside 'withFile', so
-- the handle is still open while the input is being consumed.
main :: IO ()
main =
  withFile "../big.txt" ReadMode $ \input ->
    BSL.hGetContents input >>= print . countMatches
  where
    -- NOTE(review): 'length' only forces the list spine, so the
    -- 'cleanupName' results are never evaluated here -- confirm that is
    -- intended if this program is used as a benchmark.
    countMatches :: BSL.ByteString -> Int
    countMatches =
      length
        . map cleanupName
        . mapMaybe (firstGroup . regex)
        . take 10000000
        . BSL.split newline
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment