Skip to content

Instantly share code, notes, and snippets.

@erantapaa
Created September 8, 2015 18:00
Show Gist options
  • Save erantapaa/bf831346378e28d908f5 to your computer and use it in GitHub Desktop.
Save erantapaa/bf831346378e28d908f5 to your computer and use it in GitHub Desktop.
Split a line-oriented file into equal parts on line boundaries
module SplitFile
( splitFile, splitFileOn, splitHandle, splitHandleOn, findNext, findNewLine
) where
import System.IO
import qualified Data.ByteString.Char8 as B
-- | Split the file at @path@ into pieces that end on a character matching
-- @find@, by running 'splitHandleOn' over the whole file.
--
-- The file is opened in binary mode. 'withBinaryFile' guarantees the handle
-- is closed once the offsets have been computed, including when an
-- exception is raised while scanning, so repeated calls do not leak file
-- descriptors.
--
-- Returns the list of offsets produced by 'splitHandleOn'.
splitFileOn :: (Char -> Bool) -> Int -> FilePath -> IO [Integer]
splitFileOn find parts path =
  withBinaryFile path ReadMode (splitHandleOn find parts)
-- | Split the file at @path@ on line boundaries.
--
-- This is 'splitFileOn' with a newline test as the delimiter predicate.
splitFile :: Int -> FilePath -> IO [Integer]
splitFile parts path = splitFileOn isNewline parts path
  where
    isNewline c = c == '\n'
-- | Split the rest of a handle into at most @parts@ roughly equal pieces,
-- ending each piece on a character matching @find@.
--
-- Splitting starts at the handle's current position and runs to the end of
-- the file. N.B. the handle should be opened in binary mode.
--
-- Returns the end offset of each piece in increasing order. Every offset
-- obtained from 'findNext' is the position of the delimiter itself (or of
-- the last byte when the file runs out first); when all @parts@ pieces are
-- produced, the final offset is the file size. Fewer than @parts@ offsets
-- are returned when the end of the file is reached early, and none when
-- @parts <= 0@ or the handle is already at the end of the file.
splitHandleOn :: (Char -> Bool) -> Int -> Handle -> IO [Integer]
splitHandleOn find parts h = do
  end <- hFileSize h
  begin <- hTell h
  let
    -- Skip one equal share of the bytes left after @from@, then return the
    -- position of the next delimiter at or after that point. The share is
    -- computed from the number of pieces still to be produced; dividing by
    -- the original piece count instead makes every later piece smaller and
    -- leaves the last one oversized.
    nextBoundary :: Integer -> Int -> IO Integer
    nextBoundary from remaining = do
      let share = (end - from) `div` fromIntegral remaining
      hSeek h AbsoluteSeek (from + share)
      findNext find h

    -- @from@ is the first byte of the piece about to be produced; it always
    -- matches the handle position left behind by 'findNext'.
    go :: Int -> Integer -> IO [Integer]
    go remaining from
      | remaining <= 0 = return []
      | from >= end = return []
      | remaining == 1 = return [end]
      | otherwise = do
          boundary <- nextBoundary from remaining
          rest <- go (remaining - 1) (boundary + 1)
          return (boundary : rest)
  go parts begin
-- | Split a handle into @parts@ equal parts on line boundaries.
--
-- Yields the end offset of each part. N.B. the result may contain fewer
-- than @parts@ entries.
splitHandle :: Int -> Handle -> IO [Integer]
splitHandle parts h = splitHandleOn isNewline parts h
  where
    isNewline c = c == '\n'
-- | Scan forward from the handle's current position for the first character
-- satisfying @p@, reading the file in 1024-byte blocks.
--
-- On a match, returns the file position of the matching character and
-- leaves the handle one character past it.
--
-- If the end of the file is reached without a match, returns the position
-- of the last character of the file (one less than the end-of-file
-- position) and leaves the handle at the end of the file.
findNext :: (Char -> Bool) -> Handle -> IO Integer
findNext p h = do
  chunk <- B.hGet h chunkSize
  if B.null chunk
    then do
      -- Nothing left to read: report the byte just before the current
      -- position and leave the handle where it is.
      here <- hTell h
      return (here - 1)
    else case B.findIndex p chunk of
      -- No match in this block; keep reading from where the block ended.
      Nothing -> findNext p h
      Just ix -> do
        here <- hTell h
        -- The handle sits just past the block, so the match lies a negative
        -- distance back from the current position.
        let back = fromIntegral (ix - B.length chunk)
        hSeek h RelativeSeek (back + 1)
        return (here + back)
  where
    chunkSize = 1024
-- | Locate the next newline character in a file.
--
-- This is 'findNext' with a newline test as the predicate.
findNewLine :: Handle -> IO Integer
findNewLine h = findNext isNewline h
  where
    isNewline c = c == '\n'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment