Created
March 4, 2024 23:55
-
-
Save Bodigrim/e145727a5d9f44cfe2d5292007fffdce to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env cabal | |
{- cabal: | |
build-depends: base >= 4.19, bytestring, containers | |
default-language: GHC2021 | |
-} | |
{-# LANGUAGE ExtendedLiterals #-} | |
{-# LANGUAGE MagicHash #-} | |
{-# OPTIONS_GHC -Wall -O2 -threaded #-} | |
import Control.Monad | |
import Data.ByteString (ByteString) | |
import Data.ByteString qualified as B | |
import Data.ByteString.Char8 qualified as C8 | |
import Data.ByteString.Unsafe qualified as B | |
import Data.Foldable | |
import Data.Map (Map) | |
import Data.Map.Strict qualified as M | |
import Data.Map.Internal qualified as M (balance) | |
import Data.Map.Strict.Internal qualified as M | |
import Text.Printf | |
import GHC.Word | |
import System.IO | |
import Data.IORef | |
import Control.Concurrent | |
newtype Station = Station ByteString | |
deriving (Eq, Show) | |
instance Ord Station where | |
compare (Station xs) (Station ys) = | |
compare (B.length xs) (B.length ys) <> compare xs ys | |
data Entry = Entry | |
{ _station :: !Station | |
, _temperature :: !Int | |
} deriving (Show) | |
-- Bayawan;-21.1 | |
-- Andranomenatsa;-1.2 | |
-- Benton Harbor;36.2 | |
-- Taulahā;0.6 | |
parseLine :: ByteString -> Entry | |
parseLine xs = case x4 of | |
W8# 59#Word8 -- ord ';' | |
-> Entry (Station $ B.unsafeTake (l - 4) xs) (x3' * 10 + x1' - 528) | |
W8# 45#Word8 -- ord '-' | |
-> Entry (Station $ B.unsafeTake (l - 5) xs) (528 - x3' * 10 - x1') | |
_ -> case x5 of | |
W8# 59#Word8 -- ord ';' | |
-> Entry (Station $ B.unsafeTake (l - 5) xs) (x4' * 100 + x3' * 10 + x1' - 5328) | |
_ -- ord '-' | |
-> Entry (Station $ B.unsafeTake (l - 6) xs) (5328 - x4' * 100 - x3' * 10 - x1') | |
where | |
l = B.length xs | |
x1 = B.unsafeIndex xs (l - 1) -- last digit | |
x3 = B.unsafeIndex xs (l - 3) -- another digit | |
x4 = B.unsafeIndex xs (l - 4) -- digit or sign or semicolon | |
x5 = B.unsafeIndex xs (l - 5) -- sign or semicolon | |
x1' = fromIntegral x1 | |
x3' = fromIntegral x3 | |
x4' = fromIntegral x4 | |
validateParseLine :: ByteString -> IO () | |
validateParseLine xs | |
| ds == 0 | |
= unless (xs == ss <> C8.pack ";0.0" || xs == ss <> C8.pack ";-0.0") | |
$ error $ "bad parse " ++ C8.unpack xs | |
| otherwise | |
= unless (xs == ss <> C8.pack (';' : printf "%.1f" (fromIntegral ds / 10 :: Double))) | |
$ error $ "bad parse " ++ C8.unpack xs ++ " but got " ++ show (parseLine xs) | |
where | |
Entry (Station ss) ds = parseLine xs | |
data Quartet = Quartet | |
{ _min :: !Int | |
, _total :: !Int | |
, _cnt :: !Word | |
, _max :: !Int | |
} deriving (Eq) | |
mkQuartet :: Int -> Quartet | |
mkQuartet x = Quartet x x 1 x | |
updateQuartet :: Int -> Quartet -> Quartet | |
updateQuartet x (Quartet a b c d) = Quartet (min a x) (b + x) (c + 1) (max d x) | |
instance Semigroup Quartet where | |
Quartet a b c d <> Quartet a' b' c' d' = | |
Quartet (min a a') (b + b') (c + c') (max d d') | |
-- https://github.com/haskell/containers/issues/809 | |
upsert :: forall k a. Ord k => (Maybe a -> a) -> k -> Map k a -> Map k a | |
upsert f = go | |
where | |
go :: k -> Map k a -> Map k a | |
go !k M.Tip = M.singleton k (f Nothing) | |
go k (M.Bin sx kx x l r) = case compare k kx of | |
LT -> M.balance kx x (go k l) r | |
GT -> M.balance kx x l (go k r) | |
EQ -> let fx = f (Just x) in | |
fx `seq` M.Bin sx kx fx l r | |
parse :: ByteString -> Map Station Quartet | |
parse xs = foldl' go mempty entries | |
where | |
entries = map parseLine $ C8.lines xs | |
go :: Map Station Quartet -> Entry -> Map Station Quartet | |
go m (Entry ss ds) = upsert (maybe (mkQuartet ds) (updateQuartet ds)) ss m | |
aggregate :: Map Station Quartet -> ByteString | |
aggregate m = C8.cons '{' (C8.snoc (B.drop 2 (M.foldMapWithKey go m')) '}') | |
where | |
m' = M.fromList $ map (\(Station ss, q) -> (ss, q)) $ M.assocs m | |
go ss (Quartet a b c d) = C8.pack ", " <> ss <> C8.pack | |
(printf "=%.1f/%.1f/%.1f" (fromIntegral a / 10 :: Double) (fromIntegral b / (fromIntegral c * 10) :: Double) (fromIntegral d / 10 :: Double)) | |
parseInChunks :: MVar () -> IORef (Map Station Quartet) -> Handle -> IO () | |
parseInChunks mv ref h = do | |
xs <- B.hGetSome h 1048576 | |
-- putStrLn $ "read " ++ show (B.length xs) | |
eof <- hIsEOF h | |
xs' <- if eof then pure xs else do | |
let (ys, zs) = C8.breakEnd (== '\n') xs | |
-- putStrLn $ "seek back " ++ show (B.length zs) | |
hSeek h (RelativeSeek) (negate (toInteger (B.length zs))) | |
pure ys | |
_ <- forkIO $ do | |
atomicModifyIORef' ref (\old -> (M.unionWith (<>) old (parse xs'), ())) | |
when eof $ | |
putMVar mv () | |
if eof then pure () else | |
parseInChunks mv ref h | |
main :: IO () | |
main = do | |
cnt <- B.readFile "data/measurements.txt" | |
let ls = C8.lines cnt | |
when debug $ | |
traverse_ validateParseLine ls | |
let parsed = parse cnt | |
h <- openFile "data/measurements.txt" ReadMode | |
parsedRef <- newIORef mempty | |
mv <- newEmptyMVar | |
parseInChunks mv parsedRef h | |
!_ <- takeMVar mv | |
parsed' <- readIORef parsedRef | |
when debug $ | |
print (parsed == parsed') | |
C8.putStrLn $ aggregate parsed' | |
debug :: Bool | |
debug = False |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment