Skip to content

Instantly share code, notes, and snippets.

@ryo1kato
Created August 16, 2014 15:07
Show Gist options
  • Select an option

  • Save ryo1kato/ef95223112506606ed26 to your computer and use it in GitHub Desktop.

Select an option

Save ryo1kato/ef95223112506606ed26 to your computer and use it in GitHub Desktop.
--
-- hmlgrep - Haskell Multi-Line Grep
--
{-
TODOs:
* Show filenames with --count option for multiple file input
* Implement match highlight
* FIX: '-' and '--' handling in optparse-applicative
* https://github.com/pcapriotti/optparse-applicative/pull/99
* Use Boyer-Moore for non-regex patterns using stringsearch library:
http://hackage.haskell.org/package/stringsearch-0.3.3/docs/Data-ByteString-Search.html
INSTALL
$ cabal install directory
$ cabal install optparse-appricative
$ ghc --make hmlgrep.hs
-}
import Control.Monad
import Data.Int
import Data.Maybe
import Options.Applicative
import System.Console.ANSI
import System.Directory
import System.Exit
import System.IO
import System.Posix.IO ( stdOutput )
import System.Posix.Terminal ( queryTerminal )
import Text.Regex.PCRE
import Text.Regex.PCRE.ByteString.Lazy
import qualified Data.ByteString.Lazy.Char8 as BS
import qualified Data.List as DL
helpdoc = concat $ DL.intersperse " "
[
"grep(1) like tool, but \"record-oriented\", instead of line-oriented,",
"to search and print multi-line log entries separated by empty lines,",
"'----' or timestamps, etc.",
"If an argument in argument list is a name of",
"existing file or '-', that argument and",
"everything after that will be treated as filenames to read from.",
"Otherwise arguments are considered to be patterns. ('-' means stdin)",
"(could be confusing if you specify nonexistent filename!)",
"If a file name ends with .gz, .bz2 or .xz, uncompress it on-the-fly before",
"reading from it."
]
default_rs = "^$|^(====*|----)*$"
re_dow = "((Mon|Tue|Wed|Thu|Fri|Sat),?[ \t]+)?"
re_month = "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Dec),?[ \t]"
re_date = "[0-9]{1,2},?"
re_time = "[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?"
re_year = "(,?[ \t]20[0-9][0-9])?"
re_dty = re_date ++ "[ \t]" ++ re_time ++ re_year
re_isodate = "20[0-9][0-9]-(0[0-9]|11|12)-(0[1-9]|[12][0-9]|3[01])"
timestamp_rs = "^(" ++ re_dow ++ re_month ++ re_dty ++ "|"
++ re_isodate ++ ")"
----------------------------------------------------------------------------
data HmlGrepOpts = HmlGrepOpts {
opt_and :: Bool
, opt_rs :: Maybe String
, opt_timestamp :: Bool
, opt_count :: Bool
, opt_invert :: Bool
, ignoreCase :: Bool
, opt_highlight :: Bool
, opt_mono :: Bool
, opt_args :: [String]
}
-- type LogEntry = (Maybe String, [String])
type LogEntry = (Maybe BS.ByteString, [BS.ByteString])
type Log = [LogEntry]
type Pattern = Regex
----------------------------------------------------------------------------
--
-- Match Highlights
--
hlCode = setSGRCode [SetColor Foreground Vivid Red]
hlReset = setSGRCode [Reset]
highlightRange :: BS.ByteString -> (Int, Int) -> BS.ByteString
highlightRange str mch = BS.concat [ (BS.take start str), (BS.pack hlCode) ,
(BS.take len $ BS.drop start str), (BS.pack hlReset) ,
(BS.drop (start+len) str)]
-- ByteString versions of 'take', 'drop', etc. take Int64, not Int
where start = fromIntegral (fst mch) :: Int64
len = fromIntegral (snd mch) :: Int64
-- lighlight matches in ByteString with _reverse sorted_ list of matches
-- It has to be reversed so that we don't have to re-calculate offset everytime
-- control codes are inserted.
highlightRangesRSorted :: BS.ByteString -> [(Int, Int)] -> BS.ByteString
highlightRangesRSorted str [] = str
highlightRangesRSorted str (r:rs) = highlightRangesRSorted (highlightRange str r) rs
allMatchAsList re str = getAllMatches $
(match re str :: AllMatches [] (MatchOffset, MatchLength))
highlightAllMatches :: Regex -> BS.ByteString -> Maybe BS.ByteString
highlightAllMatches re str =
if m == []
then Nothing
else Just (highlightRangesRSorted str (reverse m))
where m = allMatchAsList re str
----------------------------------------------------------------------------
--
-- line parsing and regex matching
--
-- Similar to =~ but RHS is Regex
(==~) :: BS.ByteString -> Regex -> Bool
(==~) source re = match re source
-- toLogEntry :: Pattern -> [String] -> LogEntry
toLogEntry _ [] = (Nothing, [])
toLogEntry sep (l:ls) = if (l ==~ sep)
then (Just l, ls)
else (Nothing, (l:ls)) -- First record without header(separator)
lines2log sep [] = []
lines2log sep (l:ls) = head : tail
where head = toLogEntry sep $ l:(takeWhile notsep ls)
tail = lines2log sep (dropWhile notsep ls)
notsep line = not (line ==~ sep)
log2lines [] = []
log2lines ((Nothing, l):[]) = l
log2lines ((Just h, l):[]) = h : l
log2lines ((Nothing, l):logs) = l ++ (log2lines logs)
log2lines ((Just h, l):logs) = h : l ++ (log2lines logs)
logEntry2lines (hdr,ls) = case hdr of
Nothing -> ls
Just x -> (x:ls)
matchAny lines re = or $ map (match re) lines
matchRecord p le = matchAny (logEntry2lines le) p
matchRecordAll ps le = and $ map (matchAny $ logEntry2lines le) ps
----------------------------------------------------------------------------
--
-- main logic
--
toRE opts str = makeRegexOpts (ic) execBlank str
where ic = if (ignoreCase opts) then compCaseless else compBlank
-- all RE strings combined with '|'
-- used for OR search and highlights
-- composeRE :: [String] -> Regex
composeRE opts str = toRE opts $ DL.intercalate "|" str
{-
highlightAllMatchesInRecord :: Regex -> LogEntry -> Maybe LogEntry
highlightAllMatchesInRecord re (h,l)
map highlightAllMatches l
hmlgrep'' re log
| otherwise =
if hl
then hmlgrep_hl o_and regexs log
else filter (matcher) log
where matcher = matchRecord o_and regexs
highlightAllMatches
-}
hmlgrep' :: HmlGrepOpts -> Bool -> [String] -> Log -> Log
hmlgrep' _ _ [] log = []
hmlgrep' _ _ _ [] = []
hmlgrep' opts hl patterns log
| o_invert = filter (not.matcher) log -- never highlights
-- | hl && not opt_and = hmlgrep'' regexOR log
| otherwise = filter (matcher) log
where o_and = opt_and opts
o_invert = opt_invert opts
regexs = map (toRE opts) patterns
regexOR = composeRE opts patterns
matcher = if o_and
then matchRecordAll regexs
else matchRecord regexOR
-- when there's only one pattern, opt_and is meaningless
regex_or = (length patterns) == 1 || not o_and
hmlgrep :: HmlGrepOpts -> Bool -> [String] -> BS.ByteString -> (BS.ByteString, Bool)
hmlgrep opts hl patterns indata =
if do_command == []
then (toString do_command, False)
else (toString do_command, True)
where recsep = if opt_timestamp opts
then timestamp_rs
else withDefault default_rs $ opt_rs opts
logs = lines2log (toRE opts recsep) $ BS.lines indata
toString = if opt_count opts
then (\x -> BS.pack (((show.length) x) ++ "\n"))
else BS.unlines.log2lines
do_command = hmlgrep' opts hl patterns logs
----------------------------------------------------------------------------
--
-- Run as a Unix command-line filter (pipe)
--
-- runPipe :: (String -> (String, Bool)) -> Handle -> [Handle] -> IO Bool
runPipe cmd outHandle inHandles = do
streams <- forM inHandles BS.hGetContents
case (cmd $ BS.concat streams) of
(result, ret) -> do
BS.hPutStr outHandle result
return ret
withDefault :: a -> (Maybe a) -> a
withDefault def (Just val) = val
withDefault def Nothing = def
useColor opts istty =
if opt_invert opts
then False
else
if istty
then not $ opt_mono opts
else opt_highlight opts
runWithOptions :: HmlGrepOpts -> IO ()
runWithOptions opts = do
(ps, fs) <- splitArg (opt_args opts)
istty <- queryTerminal stdOutput
ret <- if fs == []
then runPipe (mainProc (useColor opts istty) ps) stdout [stdin]
else forM fs openRO >>= runPipe (mainProc (useColor opts istty) ps) stdout
if ret
then exitSuccess
else exitFailure
where
mainProc = hmlgrep opts
openRO fname
-- FIXME. optparse-applicative seems to strips off
-- all '-' and '--' occurrence in arguments
| fname == "-" = return stdin
| otherwise = openFile fname ReadMode
----------------------------------------------------------------------------
-- Parse ARG1 ARG2 [--] ARG3 ARG4 to ([ARG1, ARG2], [ARG3, ARG4])
splitArg' :: [String] -> [String] -> IO ([String], [String])
splitArg' ps [] = return (ps, [])
splitArg' ps (a:as)
| a == "--" = return (ps, as)
| otherwise = do
isFile <- doesFileExist a
if isFile
then return (ps, a:as)
else (splitArg' (ps++[a]) as)
splitArg = splitArg' []
main :: IO ()
main = execParser opts >>= runWithOptions
where
opts = info (helper <*> parser) ( fullDesc
<> progDesc helpdoc
)
parser = HmlGrepOpts
<$> switch (short 'a' <>
long "and" <>
help "Extract records with all of patterns (default: any)")
<*> (optional $ strOption (
short 'r' <>
long "rs" <>
metavar "RS_REGEX" <>
help ("Input record separator. default: /" ++ default_rs ++ "/") ) )
<*> switch (short 't' <> long "timestamp" <>
help ("Same as --rs=TIMESTAMP_REGEX, where the regex matches " ++
"timestamps often used in log files, e.g., " ++
"'2014-12-31 12:34:56' or 'Dec 31 12:34:56'."))
<*> switch (short 'c' <> long "count" <>
help "Print number of matches. (same as grep -c)")
<*> switch (short 'v' <> long "invert" <>
help "Select non-matching records (same as grep -v).")
<*> switch (short 'i' <> long "ignore-case" <>
help "Case insensitive matching. Default is case sensitive")
<*> switch (long "hl" <> long "highlight" <>
help "Highlight matches. Default is enabled iff stdout is a TTY")
<*> switch (short 'm' <> long "mono" <>
help "Do not Highlight matches.")
<*> some (argument str (metavar "PATTERN[...] [--] [FILES...]"))
-- vim: set makeprg=ghc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment