Skip to content

Instantly share code, notes, and snippets.

@ryo1kato
Created September 2, 2014 23:17
Show Gist options
  • Select an option

  • Save ryo1kato/8e5e888808cc9489ab23 to your computer and use it in GitHub Desktop.

Select an option

Save ryo1kato/8e5e888808cc9489ab23 to your computer and use it in GitHub Desktop.
fgrep with regex support for --rs
--
-- hmlgrep - Haskell Multi-Line Grep
--
{-
TODOs:
* FIX: '-' and '--' handling in optparse-applicative
* https://github.com/pcapriotti/optparse-applicative/pull/99
* Use Cabal for build?
* Use Boyer-Moore for non-regex patterns using stringsearch library:
http://hackage.haskell.org/package/stringsearch-0.3.3/docs/Data-ByteString-Search.html
INSTALL
$ cabal install directory
$ cabal install optparse-appricative
$ cabal install regex-pcre
$ cabal install ansi-terminal
$ ghc --make hmlgrep.hs
-}
import Control.Monad
import Data.Int
import Data.Maybe
import Options.Applicative
import System.Console.ANSI
import System.Directory
import System.Exit
import System.Posix.IO ( stdOutput )
import System.IO.MMap
import System.Posix.Terminal ( queryTerminal )
import Text.Regex.PCRE
import Data.ByteString.Search
import qualified Data.List as DL
import System.IO
import Text.Regex.PCRE.ByteString
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.Unsafe as BSUS
type ByteStr = BS.ByteString
type BSInt = Int
pack = BS.pack
helpdoc = concat $ DL.intersperse " "
[
"grep(1) like tool, but \"record-oriented\", instead of line-oriented,",
"to search and print multi-line log entries separated by empty lines,",
"'----' or timestamps, etc.",
"If an argument in argument list is a name of",
"existing file or '-', that argument and",
"everything after that will be treated as filenames to read from.",
"Otherwise arguments are considered to be patterns. ('-' means stdin)",
"(could be confusing if you specify nonexistent filename!)",
"If a file name ends with .gz, .bz2 or .xz, uncompress it on-the-fly before",
"reading from it."
]
default_rs = "\n\n|\n(=====*|-----*)$"
re_dow = "((Mon|Tue|Wed|Thu|Fri|Sat),?[ \t]+)?"
re_month = "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Dec),?[ \t]"
re_date = "[0-9]{1,2},?"
re_time = "[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?"
re_year = "(,?[ \t]20[0-9][0-9])?"
re_dty = re_date ++ "[ \t]" ++ re_time ++ re_year
re_isodate = "20[0-9][0-9]-(0[0-9]|11|12)-(0[1-9]|[12][0-9]|3[01])"
timestamp_rs = "^(" ++ re_dow ++ re_month ++ re_dty ++ "|"
++ re_isodate ++ ")"
----------------------------------------------------------------------------
data HmlGrepOpts = HmlGrepOpts {
opt_and :: Bool
, opt_rs :: Maybe String
, opt_timestamp :: Bool
, opt_count :: Bool
, opt_invert :: Bool
, opt_ignorecase :: Bool
, opt_highlight :: Bool
, opt_mono :: Bool
, opt_args :: [String]
}
-- type LogEntry = (Maybe String, [String])
type LogEntry = (Maybe ByteStr, [ByteStr])
type Log = [LogEntry]
type Pattern = Regex
----------------------------------------------------------------------------
--
-- plain text (non-regex) search
--
regexChars = "^$(|)[]{}.*"
regexCharsLast = ")]}.*"
toPlainString' :: String -> String -> Maybe String
toPlainString' s [] = Just s
toPlainString' s ('$':[]) = Just ('\n':s)
toPlainString' s (r:[]) = if r `elem` regexCharsLast
then Nothing
else Just (r:s)
toPlainString' s (r1:r2:re) = if r1 `elem` regexChars
then Nothing
else if r1 == '\\' && r2 `elem` ('\\':regexChars)
then toPlainString' (r2:s) re
else toPlainString' (r1:s) (r2:re)
-- Return Just String where string is un-escaped plain text string
-- '^' and '$' at very beginning / end will be converted to '\n'
-- e.g. /^\(/ -> '^('
toPlainString :: String -> Maybe String
toPlainString ('^':re) = liftM reverse $ toPlainString' "\n" re
toPlainString re = liftM reverse $ toPlainString' [] re
----------------------------------------------------------------------------
{- TODOs
- convert ^$ in regex to \n for breakNextRegex
- \n handling
-}
dropLast n bstr = BS.take (BS.length bstr - n) bstr
takeLast n bstr = BS.drop (BS.length bstr - n) bstr
reComp pat = makeRegexOpts compBlank execBlank pat
breakNextRegex re bstr =
if pos >= 0
then BS.splitAt pos bstr
else (bstr, BS.empty)
where (pos, len) = bstr =~ re :: (MatchOffset,MatchLength)
afterLast :: ByteStr -> ByteStr -> ByteStr
afterLast pat bstr = revSearch 0 bstr
where
revSearch n s
| BS.null s = bstr -- not found
| pat `BS.isSuffixOf` s = BS.drop (BS.length bstr - n - 1) bstr
| otherwise = revSearch (n+1) (BSUS.unsafeInit s)
afterLastRegex :: String -> ByteStr -> ByteStr
afterLastRegex re bstr = revSearchRE 0 bstr
where
revSearchRE n s
| BS.null s = bstr
| offset >= 0 = takeLast (consumed - offset) bstr
| otherwise = revSearchRE consumed remaining
where
lastline = afterLast (pack "\n") s
(offset, l) = lastline =~ re :: (MatchOffset,MatchLength)
consumed = n + BS.length lastline
remaining = dropLast consumed bstr
fgrep' afterLastRS breakNextRS rs pat bstr =
if BS.null bstr || BS.null t
then BS.empty
else BS.concat [a, rec1, b, rec2, c, remaining]
-- FIXME
-- if BS.null remaining
-- then BS.concat [rec1, rec2, newline]
-- else BS.concat [rec1, rec2, remaining]
where (h, t) = breakOn pat bstr
rec1 = afterLastRS h
(rec2, rem) = breakNextRS t
remaining = fgrep' afterLastRS breakNextRS rs pat rem
newline = if BS.null rem
then BS.empty
else (pack "\n")
a = (pack "==========\n")
b = (pack "<")
c = (pack ">>")
fgrep rsStr pat bstr =
if isNothing rsPlain
then -- Regex
fgrep' (afterLastRegex rsStr) (breakNextRegex rsStr)
(pack rsStr) (pack pat) bstr
else -- plain text pattern
fgrep' (afterLast rs) (breakOn rs)
(pack $ fromJust rsPlain) (pack pat) bstr
where
rsPlain = toPlainString rsStr
rs = (pack $ fromJust rsPlain)
rsRE = reComp rsStr
----------------------------------------------------------------------------
--
-- Match Highlights
--
-- hlCode = setSGRCode [SetColor Foreground Vivid Red]
hlCode = setSGRCode [SetSwapForegroundBackground True]
hlReset = setSGRCode [Reset]
highlightRange :: ByteStr -> (Int, Int) -> ByteStr
highlightRange str mch = BS.concat [ (BS.take start str), (pack hlCode) ,
(BS.take len $ BS.drop start str), (pack hlReset) ,
(BS.drop (start+len) str)]
-- ByteString versions of 'take', 'drop', etc. take Int64, not Int
where start = fromIntegral (fst mch) :: BSInt
len = fromIntegral (snd mch) :: BSInt
-- lighlight matches in ByteString with _reverse sorted_ list of matches
-- It has to be reversed so that we don't have to re-calculate offset everytime
-- control codes are inserted.
highlightRangesRSorted :: ByteStr -> [(Int, Int)] -> ByteStr
highlightRangesRSorted str [] = str
highlightRangesRSorted str (r:rs) = highlightRangesRSorted (highlightRange str r) rs
allMatchAsList re str = getAllMatches $
(match re str :: AllMatches [] (MatchOffset, MatchLength))
highlightAllMatches re str =
if m == []
then Nothing
else Just (highlightRangesRSorted str (reverse m))
where m = allMatchAsList re str
highlightAllMatchesLines re ls =
if concat ms == []
then Nothing
else Just (zipWith highlight ls ms)
where ms = map (allMatchAsList re) ls
highlight str m = highlightRangesRSorted str (reverse m)
----------------------------------------------------------------------------
--
-- line parsing and regex matching
--
-- Similar to =~ but RHS is Regex
(==~) :: ByteStr -> Regex -> Bool
(==~) source re = match re source
toLogEntry _ [] = (Nothing, [])
toLogEntry sep (l:ls) = if (l ==~ sep)
then (Just l, ls)
else (Nothing, (l:ls)) -- First record without header(separator)
lines2log sep [] = []
lines2log sep (l:ls) = head : tail
where head = toLogEntry sep $ l:(takeWhile notsep ls)
tail = lines2log sep (dropWhile notsep ls)
notsep line = not (line ==~ sep)
log2lines [] = []
log2lines ((Nothing, l):[]) = l
log2lines ((Just h, l):[]) = h : l
log2lines ((Nothing, l):logs) = l ++ (log2lines logs)
log2lines ((Just h, l):logs) = h : l ++ (log2lines logs)
logEntry2lines (hdr,ls) = case hdr of
Nothing -> ls
Just x -> (x:ls)
matchAny lines re = or $ map (match re) lines
matchRecord p le = matchAny (logEntry2lines le) p
matchRecordAll ps le = and $ map (matchAny $ logEntry2lines le) ps
matchRecordHighlight p (maybehdr,ls)
| isNothing maybehdr =
if isNothing hl_ls
then Nothing
else Just (Nothing, fromMaybe ls hl_ls)
| otherwise =
if isNothing hl_hdr && isNothing hl_ls
then Nothing
else Just (Just (fromMaybe hdr hl_hdr), fromMaybe ls hl_ls)
where
hl_hdr = highlightAllMatches p hdr
hl_ls = highlightAllMatchesLines p ls
hdr = fromJust maybehdr
----------------------------------------------------------------------------
--
-- main logic
--
toRE opts str = makeRegexOpts (ic) execBlank str
where ic = if (opt_ignorecase opts) then compCaseless else compBlank
-- all RE strings combined with '|'
-- used for OR search and highlights
composeRE opts str = toRE opts $ DL.intercalate "|" str
hmlgrep_hl re log = catMaybes $ map (matchRecordHighlight re) log
hmlgrep' :: HmlGrepOpts -> Bool -> [String] -> Log -> Log
hmlgrep' _ _ [] log = []
hmlgrep' _ _ _ [] = []
hmlgrep' opts hl patterns log
| o_invert = filter (not.matcher) log -- never highlights
| hl = if o_and
then hmlgrep_hl regexOR $ filter (matcher) log
else hmlgrep_hl regexOR log
| otherwise = filter (matcher) log
where
-- when there's only one pattern, opt_and is meaningless
o_and = (opt_and opts) && (length patterns) /= 1
o_invert = opt_invert opts
regexs = map (toRE opts) patterns
regexOR = composeRE opts patterns
matcher = if o_and
then matchRecordAll regexs
else matchRecord regexOR
hmlgrep :: HmlGrepOpts -> Bool -> [String] -> ByteStr -> (ByteStr, Bool)
hmlgrep opts hl patterns indata =
if isFgrep
then
if BS.null do_fgrep
then (BS.empty, False)
else (do_fgrep, True)
else
if do_command == []
then (toString do_command, False)
else (toString do_command, True)
where recsep = if opt_timestamp opts
then timestamp_rs
else fromMaybe default_rs (opt_rs opts)
logs = lines2log (toRE opts recsep) $ BS.lines indata
toString = if opt_count opts
then (\x -> pack (((show.length) x) ++ "\n"))
else BS.unlines.log2lines
do_command = hmlgrep' opts hl patterns logs
---- fgrep ----
strRS = toPlainString recsep
strPat = toPlainString (head patterns)
isFgrep = isJust strPat && (length patterns) == 1 &&
not (opt_ignorecase opts || opt_count opts || opt_invert opts)
-- FIXME: use fgrep for --count too.
do_fgrep = fgrep recsep (fromJust strPat) indata
----------------------------------------------------------------------------
--
-- Run as a Unix command-line filter (pipe)
--
runPipe' cmd outHandle inHandles = do
streams <- forM inHandles BS.hGetContents
case (cmd $ BS.concat streams) of
(result, ret) -> do
BS.hPutStr outHandle result
return ret
runPipe :: (ByteStr -> (ByteStr, Bool)) -> Handle -> Handle -> IO Bool
runPipe cmd outHandle inHandle = do
stream <- BS.hGetContents inHandle
case cmd stream of
(result, ret) -> do
BS.hPutStr outHandle result
return ret
runPipeMmap cmd outHandle fname = do
stream <- mmapFileByteString fname Nothing
case cmd stream of
(result, ret) -> do
BS.hPutStr outHandle result
return ret
runWithOptions :: HmlGrepOpts -> IO ()
runWithOptions opts = do
(ps, fs) <- splitArg (opt_args opts)
istty <- queryTerminal stdOutput
let runPipeCmd = runPipe (hmlgrep opts (useColor opts istty) ps) stdout
let runPipeCmdMmap = runPipeMmap (hmlgrep opts (useColor opts istty) ps) stdout
let runPipeCmdPrint fname =
hPutStr stdout (fname ++ ":") >> openRO fname >>= runPipeCmd
let runPipeCmdMmapPrint fname =
hPutStr stdout (fname ++ ":") >> runPipeCmdMmap fname
ret <- if fs == []
then runPipeCmd stdin
else if opt_count opts && length fs > 1
then (liftM or) (mapM runPipeCmdMmapPrint fs)
else (liftM or) (mapM runPipeCmdMmap fs)
if ret
then exitSuccess
else exitFailure
where
useColor opts istty =
if opt_invert opts
then False
else
if istty
then not $ opt_mono opts
else opt_highlight opts
openRO fname
| fname == "-" = return stdin
| otherwise = openFile fname ReadMode
----------------------------------------------------------------------------
-- Parse ARG1 ARG2 [--] ARG3 ARG4 to ([ARG1, ARG2], [ARG3, ARG4])
splitArg' :: [String] -> [String] -> IO ([String], [String])
splitArg' ps [] = return (ps, [])
splitArg' ps (a:as)
| a == "--" = return (ps, as)
| a == "-" = return (ps, (a:as))
| otherwise = do
isFile <- doesFileExist a
if isFile
then return (ps, a:as)
else (splitArg' (ps++[a]) as)
splitArg = splitArg' []
main :: IO ()
main = execParser opts >>= runWithOptions
where
opts = info (helper <*> parser) ( fullDesc
<> progDesc helpdoc
)
parser = HmlGrepOpts
<$> switch (short 'a' <>
long "and" <>
help "Extract records with all of patterns (default: any)")
<*> (optional $ strOption (
short 'r' <>
long "rs" <>
metavar "RS_REGEX" <>
help ("Input record separator. default: /" ++ default_rs ++ "/") ) )
<*> switch (short 't' <> long "timestamp" <>
help ("Same as --rs=TIMESTAMP_REGEX, where the regex matches " ++
"timestamps often used in log files, e.g., " ++
"'2014-12-31 12:34:56' or 'Dec 31 12:34:56'."))
<*> switch (short 'c' <> long "count" <>
help "Print number of matches. (same as grep -c)")
<*> switch (short 'v' <> long "invert" <>
help "Select non-matching records (same as grep -v).")
<*> switch (short 'i' <> long "ignore-case" <>
help "Case insensitive matching. Default is case sensitive")
<*> switch (long "color" <> long "hl" <>
help "Highlight matches. Default is enabled iff stdout is a TTY")
<*> switch (short 'm' <> long "mono" <>
help "Do not Highlight matches.")
<*> some (argument str (metavar "PATTERN[...] [--] [FILES...]"))
-- vim: set makeprg=ghc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment