{-# LANGUAGE TupleSections #-}

import Control.Monad (when, foldM_)
import Data.Functor
import Data.List
import Data.Ord (comparing)
import Development.Shake
import Development.Shake.FilePath
import qualified System.Directory as Dir
import System.Posix.Files (touchFile)

main :: IO ()
main = shake shakeOptions $ do
  let mainFiles = ["README.md", "stack.yaml"]
      rule = cachingRule "_build/cache"
  want ["_build/main"]
  rule "main" mainFiles $ \out -> do
    putNormal "building main from scratch"
    Stdout contents <- cmd "cat" mainFiles
    writeFile' out contents
  liftIO $ gcCache "_build/cache" 5

-- | A Rule which keeps a cache of whatever you build, so that if the files are in the same state as
-- they were in a previous build, they will not be rebuilt (even if there was an intermediate build
-- where the files WERE different).
cachingRule :: FilePath -> FilePath -> [FilePath] -> (FilePath -> Action ()) -> Rules ()
cachingRule cacheDir target needs buildAction = do
  -- _always_ hash the inputs, and then touch the cache directory, so even no-op rebuilds still let
  -- the GC know that some artifacts have been used recently. Is this too slow?
  hash <- liftIO $ hashAll needs
  let hashDir = cacheDir </> hash
      cacheTarget = hashDir </> target
      fullTarget = "_build" </> target
  liftIO $ do
    e <- Dir.doesDirectoryExist hashDir
    when e (touchFile hashDir)
  -- We define two rules: one for doing the compilation, which runs the passed in action with an
  -- output location inside the hash-named cache directory:
  cacheDir </> "*" </> target %> buildAction
  -- And another for copying from that cache directory to the final destination.
  fullTarget %> \_ -> do
    putNormal ("checking hash for " ++ target)
    need (cacheTarget : needs)
    copyFile' cacheTarget fullTarget

-- | Generate a single checksum of all the given files.
hashAll :: [FilePath] -> IO String
hashAll files = do
  let files' = sort files
  Stdout out <- cmd "md5sum" files'
  Stdout final <- cmd (Stdin out) "md5sum"
  case words final of
    [checksum, "-"] -> return checksum
    _ -> error ("strange output from md5sum: " ++ show final)

-- | If the `cacheDir` is more than `gcMax` megabytes, delete the oldest directories inside of it
-- until the size is under `gcMax`.
gcCache :: FilePath -> Int -> IO ()
gcCache cacheDir gcMax = do
  Stdout duOutput <- cmd "du -m --max-depth=1" [cacheDir]
  let sizesWithTotal :: [(FilePath, Int)]
      sizesWithTotal = map parseLine (lines duOutput)
      total = snd (last sizesWithTotal)
      sizes = init sizesWithTotal
  putStrLn $ "[GC] " ++ cacheDir ++ " is " ++ show total ++ " MB."
  when (total > gcMax) $ putStrLn ("[GC] Performing GC on " ++ cacheDir)
  fileInfo <- mapM (\(fn, size) -> (fn, size,) <$> Dir.getModificationTime fn) sizes
  let fileInfoSorted = sortBy (comparing (\(_, _, mtime) -> mtime)) fileInfo
  forFoldM_ total fileInfoSorted $ \reducedSize (filename, size, _) ->
    if reducedSize < gcMax then
      return reducedSize
    else do
      putStrLn $ "[GC] Removing " ++ show filename
      Dir.removeDirectoryRecursive filename
      return (reducedSize - size)
  where
    forFoldM_ base l action = foldM_ action base l
    parseLine :: String -> (FilePath, Int)
    parseLine l = case words l of
      [s, fn] -> (fn, read s :: Int)
      other -> error ("strange output from `du`: " ++ show other)
Also TBD: garbage collection. I think something like "if the work directory is bigger than N gigabytes, delete the oldest directories" would work well.
It would be nice to touch the directories when they are used, as a weak form of reference counting for garbage collection, kinda like nix's delete-older-than.
Oh, also, ignore files would be nice for things like .git.
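A minimal sketch of what that could look like, reusing hashAll from the gist above (hashAllIgnoring and the naive substring match are hypothetical, not part of the gist):

import Data.List (isInfixOf)

-- Hypothetical helper: drop ignored paths (anything containing one of the
-- patterns, e.g. ".git") before handing the rest to hashAll.
hashAllIgnoring :: [String] -> [FilePath] -> IO String
hashAllIgnoring ignorePatterns files =
  hashAll (filter (not . ignored) files)
  where
    ignored f = any (`isInfixOf` f) ignorePatterns

So e.g. hashAllIgnoring [".git"] needs would skip anything under .git.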
@eborden I've updated it!
- generally refactored so that it's in a reusable form (hackage package TBD...)
- implemented GC
- always update the modification time of cache directories so the GC knows they're in use.
Hm, the way I'm doing the hashing/touching breaks when depending on a file that is not statically available. Working on a fix.
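One possible direction (just a sketch, not what the gist currently does): compute the hash inside the copy rule's Action, after need has produced the dynamic inputs, instead of at Rules time when generated files may not exist yet. cachingRule' is a hypothetical variant that skips the touch/GC bookkeeping:

-- Sketch: hash inside the Action rather than at Rules time, so dynamically
-- built inputs exist on disk before md5sum runs. Reuses hashAll from above.
cachingRule' :: FilePath -> FilePath -> [FilePath] -> (FilePath -> Action ()) -> Rules ()
cachingRule' cacheDir target needs buildAction = do
  cacheDir </> "*" </> target %> buildAction
  ("_build" </> target) %> \out -> do
    need needs                      -- build any dynamic inputs first
    hash <- liftIO (hashAll needs)  -- now every input exists on disk
    let cacheTarget = cacheDir </> hash </> target
    need [cacheTarget]
    copyFile' cacheTarget out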
I'm working on applying this to my company's build. I've found some issues.
TODO:
- hashing/touching is broken when working with dynamically built files (I have a super inefficient workaround for now)
- hardcoded _build (fixed locally)
- doesn't work with rules that build multiple files (still working this out; rough sketch below)
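For #3, here's a rough sketch of a multi-output variant built on Shake's &%> operator, which runs one action to produce several files. cachingRuleMulti is hypothetical and, like the sketch above, skips the touch/GC bookkeeping (forM_ is from Control.Monad):

-- Sketch: one cached action that produces several outputs at once.
cachingRuleMulti :: FilePath -> [FilePath] -> [FilePath] -> ([FilePath] -> Action ()) -> Rules ()
cachingRuleMulti cacheDir targets needs buildAction = do
  hash <- liftIO (hashAll needs)
  let hashDir = cacheDir </> hash
  -- One action produces every output inside the hash-named cache directory.
  map (\t -> cacheDir </> "*" </> t) targets &%> buildAction
  -- Each final target is copied out of the cache individually.
  forM_ targets $ \t ->
    ("_build" </> t) %> \out -> do
      need ((hashDir </> t) : needs)
      copyFile' (hashDir </> t) out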
Very cool! CC @snowleopard, as this is essentially lots of simultaneous input-output pairs at once, which is a prerequisite for something like Shake-on-a-server with multiple users.
@radix: What is the case for #3 (doesn't work with rules that build multiple files)? Could you elaborate?
Here's a proof-of-concept of a "Nixy Shake" -- Shake rules which don't rebuild their target if there's already a cached output, and whose cache ISN'T thrown away when you change the inputs. So you can switch back and forth between branches and get fast builds.
Convenience functions for doing this easily are TBD.