Created
September 8, 2015 18:00
-
-
Save erantapaa/bf831346378e28d908f5 to your computer and use it in GitHub Desktop.
Split a line-oriented file into equal parts on line boundaries.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module SplitFile | |
( splitFile, splitFileOn, splitHandle, splitHandleOn, findNext, findNewLine | |
) where | |
import System.IO | |
import qualified Data.ByteString.Char8 as B | |
-- | Open @path@ in binary mode and split it with 'splitHandleOn'.
--
-- @find@ marks the characters a part may end on, and @parts@ is the
-- requested number of parts. The result is whatever 'splitHandleOn'
-- returns for the freshly opened handle.
--
-- The file is opened with 'withBinaryFile', so the handle is closed when
-- splitting finishes, including when an exception is thrown.
splitFileOn :: (Char -> Bool) -> Int -> FilePath -> IO [Integer]
splitFileOn find parts path =
  withBinaryFile path ReadMode (splitHandleOn find parts)
-- | Split the file at @path@ into @parts@ parts on line boundaries.
--
-- This is 'splitFileOn' with the newline character as the boundary.
splitFile :: Int -> FilePath -> IO [Integer]
splitFile parts path = splitFileOn isNewline parts path
  where
    isNewline c = c == '\n'
-- | Split the rest of a handle into at most @parts@ roughly equal pieces,
-- ending each piece on a character that satisfies @find@.
--
-- N.B. The handle should be opened in binary mode. Splitting starts at the
-- handle's current position and extends to the size reported by
-- 'hFileSize'.
--
-- Returns the offsets at which the pieces end:
--
-- * every piece except the last ends at the offset returned by 'findNext',
--   i.e. the position of the matching character;
-- * if the last requested piece is reached while data remains, the file
--   size itself is returned for it;
-- * if the scan runs out of file earlier, fewer than @parts@ offsets are
--   returned, and none at all when @parts <= 0@ or the handle is already
--   at the end.
splitHandleOn :: (Char -> Bool) -> Int -> Handle -> IO [Integer]
splitHandleOn find parts h = do
  end <- hFileSize h
  begin <- hTell h
  let -- Jump an equal share of the bytes still to be split ahead of
      -- @from@, then scan forward to the next boundary character.
      -- The share is computed from the number of pieces that remain,
      -- not the original total, so that all pieces come out equal.
      cutAfter from remaining = do
        let target = from + (end - from) `div` fromIntegral remaining
        hSeek h AbsoluteSeek target
        findNext find h
      go remaining from
        | remaining <= 0 = return []
        | from >= end = return []
        -- The final piece always runs to the end of the file.
        | remaining == 1 = return [end]
        | otherwise = do
            cut <- cutAfter from remaining
            -- The next piece starts just past the boundary character.
            rest <- go (remaining - 1) (cut + 1)
            return (cut : rest)
  go parts begin
-- | Split a handle into @parts@ equal parts on line boundaries.
--
-- Returns the offsets of the ends of the parts, as computed by
-- 'splitHandleOn' with the newline character as the boundary.
-- N.B. May return fewer than @parts@ offsets.
splitHandle :: Int -> Handle -> IO [Integer]
splitHandle parts h = splitHandleOn isNewline parts h
  where
    isNewline c = c == '\n'
-- | Scan forward from the handle's current position until a character
-- satisfying the predicate is found.
--
-- The file is read in blocks of 1024 bytes.
--
-- Returns the file position of the matching character. If the end of the
-- file is reached without a match, returns the handle position at end of
-- file minus one, i.e. the position of the last character.
--
-- On a match the handle is left one character past the matching
-- character; without a match it is left at the end of the file.
findNext :: (Char -> Bool) -> Handle -> IO Integer
findNext p h = do
  chunk <- B.hGet h chunkSize
  if B.null chunk
    -- Nothing left to read: report the position just before end of file.
    then subtract 1 <$> hTell h
    else case B.findIndex p chunk of
      -- No match in this chunk; keep scanning from where the read ended.
      Nothing -> findNext p h
      Just ix -> do
        afterChunk <- hTell h
        -- Negative distance from the end of the chunk back to the match.
        let back = fromIntegral (ix - B.length chunk) :: Integer
        -- Rewind so the handle sits just past the matching character.
        hSeek h RelativeSeek (back + 1)
        return (afterChunk + back)
  where
    chunkSize = 1024
-- | Locate the next newline character in a file, starting at the handle's
-- current position. Behaves as 'findNext' with a newline predicate.
findNewLine :: Handle -> IO Integer
findNewLine h = findNext isNewline h
  where
    isNewline c = c == '\n'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment