Last active
December 17, 2015 19:38
-
-
Save nakamuray/5661331 to your computer and use it in GitHub Desktop.
指定されたファイルもしくは標準入力から正規表現にマッチする部分を抜きだし、その出現回数を表示する。
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE FlexibleContexts #-} | |
module Common | |
( Result | |
, ResultPair | |
, Regex -- re-export | |
, match | |
, addResult | |
, emptyResult | |
, showResultPair | |
, main_ | |
) where | |
import Control.Applicative ((<$>)) | |
import Data.Array (elems) | |
import System.Environment (getArgs, getProgName) | |
import System.IO (hPutStrLn, stderr) | |
--import Text.Regex.Posix (makeRegex, Regex, matchAllText) | |
--import Text.Regex.TDFA (makeRegex, Regex, matchAllText) | |
import Text.Regex.PCRE (makeRegex, Regex, RegexLike, matchAllText) | |
import qualified Data.ByteString.Char8 as B | |
import qualified Data.HashMap.Strict as M | |
-- | A list of captured substrings, as handed to 'addResult'.
type Matches = [B.ByteString]

-- | Tally mapping each distinct list of captures to the number of times it
-- has been recorded.
type Result = M.HashMap Matches Int

-- | A single entry of a 'Result': the captures paired with their count.
type ResultPair = (Matches, Int)
-- | Extract the captures of the first match of the regex in the given input.
--
-- Returns the empty list when nothing matches.  When the first match consists
-- of a single element (the whole matched text) that text is returned on its
-- own; otherwise the first element is dropped and only the remaining
-- sub-matches are returned.  Later matches in the same input are ignored.
match :: Regex -> B.ByteString -> [B.ByteString]
--match :: Regex -> String -> [String]
--match :: RegexLike Regex s => Regex -> s -> [s]
match r bs = case matchAllText r bs of
  []             -> []
  firstMatch : _ -> select (map fst (elems firstMatch))
  where
    -- Each element of a match is a (text, position) pair; only the text is kept.
    select [whole]      = [whole]
    select (_ : groups) = groups
-- | Record one occurrence of the given captures in the tally.
-- An empty capture list leaves the tally unchanged.
addResult :: Result -> Matches -> Result
addResult tally captures
  | null captures = tally
  | otherwise     = M.insertWith (+) captures 1 tally

-- | A tally with no entries.
emptyResult :: Result
emptyResult = M.empty
-- | Render one tally entry as @count TAB capture1 TAB capture2 ...@.
showResultPair :: ResultPair -> B.ByteString
showResultPair (captures, count) =
  B.concat [B.pack (show count), tab, B.intercalate tab captures]
  where
    tab = B.pack "\t"

-- | Write every entry of the tally to standard output, one per line.
printResult :: Result -> IO ()
printResult tally = mapM_ emit (M.toList tally)
  where
    emit = B.putStrLn . showResultPair
-- | Print a one-line usage message, prefixed with the program name, to stderr.
printUsage :: IO ()
printUsage =
  getProgName >>= \prog ->
    hPutStrLn stderr ("Usage: " ++ prog ++ " regex [files]")
-- | Shared command-line driver.
--
-- The first argument is compiled as the regex and the remaining arguments
-- are treated as input files; when no file is given, @/dev/stdin@ is used.
-- The supplied counting function produces the tally, which is then printed.
-- With no arguments at all, only the usage message is shown.
main_ :: (Regex -> [FilePath] -> IO Result) -> IO ()
main_ f = getArgs >>= dispatch
  where
    dispatch []                = printUsage
    dispatch (pattern : paths) = f (makeRegex pattern) inputs >>= printResult
      where
        inputs
          | null paths = ["/dev/stdin"]
          | otherwise  = paths
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Data.Conduit | |
import Data.Monoid (mconcat) | |
import qualified Data.ByteString.Char8 as B | |
import qualified Data.Conduit.Binary as CB | |
import qualified Data.Conduit.List as CL | |
import Common | |
-- | Sink that consumes lines and folds the captures of each one into a tally.
regc :: Monad m => Regex -> Sink B.ByteString m Result
regc regex = CL.fold tally emptyResult
  where
    -- Match the line and record its captures in a single fold step.
    tally acc line = addResult acc (match regex line)
-- | Stream the given files one after another, split the bytes into lines,
-- and count the regex captures with 'regc'.
main :: IO ()
main = main_ run
  where
    run regex files = runResourceT (allLines $$ regc regex)
      where
        allLines = mconcat (map CB.sourceFile files) $= CB.lines
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"fmt" | |
"github.com/glenn-brown/golang-pkg-pcre/src/pkg/pcre" | |
"io" | |
"os" | |
"strings" | |
) | |
// reCompileError is the error returned when the regular expression cannot be
// compiled; its value is the compiler's message.
type reCompileError string

// Error implements the error interface by returning the stored message.
func (msg reCompileError) Error() string {
	text := string(msg)
	return text
}
func RegCount(regexString string, inputs []io.Reader) (error, map[string]int) { | |
regex, err := pcre.Compile(regexString, 0) | |
if err != nil { | |
return reCompileError(err.Message), nil | |
} | |
c := make(chan string, 4096) | |
go func() { | |
fileReader(inputs, c) | |
close(c) | |
}() | |
result := regexCounter(regex, c) | |
return nil, result | |
} | |
// fileReader reads every input in order, line by line, and sends each line
// on output with its trailing "\r"/"\n" characters removed.
//
// A final line that is not terminated by a newline is sent as well:
// ReadString returns that data together with io.EOF, so the line has to be
// forwarded before the EOF ends the loop. The channel is not closed here;
// that is left to the caller.
//
// Any read error other than io.EOF causes a panic, because the function has
// no way to report it.
func fileReader(inputs []io.Reader, output chan string) {
	for _, in := range inputs {
		r := bufio.NewReader(in)
		for {
			line, err := r.ReadString('\n')
			if err != nil && err != io.EOF {
				panic(err)
			}
			// An empty string is only returned at EOF with nothing pending;
			// a blank line still arrives as "\n" and is forwarded as "".
			if line != "" {
				output <- strings.TrimRight(line, "\r\n")
			}
			if err == io.EOF {
				break
			}
		}
	}
}
func regexCounter(regex pcre.Regexp, input chan string) map[string]int { | |
result := make(map[string]int) | |
for { | |
line, ok := <-input | |
if !ok { | |
break | |
} | |
match := regex.MatcherString(line, 0) | |
if match.Matches() { | |
groups := match.Groups() | |
if groups == 0 { | |
result[match.GroupString(0)] += 1 | |
} else { | |
s := match.GroupString(1) | |
for i := 2; i <= groups; i++ { | |
s += "\n" + match.GroupString(i) | |
} | |
result[s] += 1 | |
} | |
} | |
} | |
return result | |
} | |
func main() { | |
if len(os.Args) == 1 { | |
fmt.Fprintln(os.Stderr, "usage: regc REGEX [files ...]") | |
os.Exit(1) | |
} | |
regex := os.Args[1] | |
inputs := []io.Reader{} | |
fileNames := os.Args[2:] | |
if len(fileNames) != 0 { | |
for i := range fileNames { | |
f, err := os.Open(fileNames[i]) | |
if err != nil { | |
fmt.Fprintln(os.Stderr, err) | |
} else { | |
inputs = append(inputs, f) | |
} | |
} | |
} else { | |
inputs = []io.Reader{os.Stdin} | |
} | |
if len(inputs) == 0 { | |
os.Exit(1) | |
} | |
err, result := RegCount(regex, inputs) | |
if err != nil { | |
panic(err) | |
} | |
sum := 0 | |
for k, v := range result { | |
fmt.Printf("%d\t%s\n", v, strings.Replace(k, "\n", "\t", -1)) | |
sum += v | |
} | |
fmt.Println(sum) | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"fmt" | |
"github.com/glenn-brown/golang-pkg-pcre/src/pkg/pcre" | |
"io" | |
"os" | |
"strings" | |
) | |
// reCompileError reports a regular expression that failed to compile.
// The underlying string is the message shown to the user.
type reCompileError string

// Error returns the compile failure message, satisfying the error interface.
func (r reCompileError) Error() string {
	message := string(r)
	return message
}
func RegCount(regexString string, inputs []io.Reader) (error, map[string]int) { | |
regex, err := pcre.Compile(regexString, 0) | |
if err != nil { | |
return reCompileError(err.Message), nil | |
} | |
result := make(map[string]int) | |
for i := range inputs { | |
input := inputs[i] | |
err := regCount(regex, bufio.NewReader(input), result) | |
if err != nil { | |
return err, nil | |
} | |
} | |
return nil, result | |
} | |
func regCount(regex pcre.Regexp, input *bufio.Reader, result map[string]int) error { | |
for { | |
line, err := input.ReadString('\n') | |
if err == io.EOF { | |
return nil | |
} else if err != nil { | |
return err | |
} | |
line = strings.TrimRight(line, "\r\n") | |
match := regex.MatcherString(line, 0) | |
if match.Matches() { | |
groups := match.Groups() | |
if groups == 0 { | |
result[match.GroupString(0)] += 1 | |
} else { | |
s := match.GroupString(1) | |
for i := 2; i <= groups; i++ { | |
s += "\n" + match.GroupString(i) | |
} | |
result[s] += 1 | |
} | |
} | |
} | |
return nil | |
} | |
func main() { | |
if len(os.Args) == 1 { | |
fmt.Fprintln(os.Stderr, "usage: regc REGEX [files ...]") | |
os.Exit(1) | |
} | |
regex := os.Args[1] | |
inputs := []io.Reader{} | |
fileNames := os.Args[2:] | |
if len(fileNames) != 0 { | |
for i := range fileNames { | |
f, err := os.Open(fileNames[i]) | |
if err != nil { | |
fmt.Fprintln(os.Stderr, err) | |
} else { | |
inputs = append(inputs, f) | |
} | |
} | |
} else { | |
inputs = []io.Reader{os.Stdin} | |
} | |
if len(inputs) == 0 { | |
os.Exit(1) | |
} | |
err, result := RegCount(regex, inputs) | |
if err != nil { | |
panic(err) | |
} | |
sum := 0 | |
for k, v := range result { | |
fmt.Printf("%d\t%s\n", v, strings.Replace(k, "\n", "\t", -1)) | |
sum += v | |
} | |
fmt.Println(sum) | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE BangPatterns #-} | |
import Data.List (foldl') | |
import System.IO (openFile, IOMode(ReadMode)) | |
import qualified Data.ByteString.Lazy.Char8 as BL8 | |
import qualified Data.ByteString as B | |
import qualified Data.ByteString.Internal as BI | |
import qualified Data.ByteString.Lazy as BL | |
import qualified Data.ByteString.Lazy.Internal as BLI | |
import Foreign.ForeignPtr | |
import Foreign.Ptr | |
import Common | |
-- http://stackoverflow.com/questions/7815402/convert-a-lazy-bytestring-to-a-strict-bytestring | |
-- | Convert a lazy ByteString to a strict one by concatenating its chunks.
toStrict :: BL.ByteString -> B.ByteString
toStrict lazy = B.concat (BL.toChunks lazy)
-- | Convert a lazy ByteString to a strict one.
--
-- An empty input yields the empty ByteString and a single-chunk input
-- returns that chunk as is.  Otherwise a buffer of the combined chunk length
-- is created and every chunk is copied into it in order.
toStrict2 :: BL.ByteString -> B.ByteString
toStrict2 BLI.Empty = B.empty
toStrict2 (BLI.Chunk only BLI.Empty) = only
toStrict2 lazy = BI.unsafeCreate total (copyChunks lazy)
  where
    -- Combined length of all chunks; this is the size of the new buffer.
    total = BLI.foldlChunks (\acc chunk -> acc + B.length chunk) 0 lazy

    -- Copy each chunk to the destination pointer, then advance the pointer
    -- by the chunk's length for the next one.
    copyChunks BLI.Empty _ = return ()
    copyChunks (BLI.Chunk (BI.PS fptr off size) rest) dst =
      withForeignPtr fptr $ \base -> do
        BI.memcpy dst (base `plusPtr` off) (fromIntegral size)
        copyChunks rest (dst `plusPtr` size)
-- | Strictly fold a list of lines into a tally of their regex captures.
regc :: Regex -> [B.ByteString] -> Result
regc regex = foldl' step emptyResult
  where
    -- Match the line and record its captures in a single fold step.
    step acc line = addResult acc (match regex line)
-- | Read every file lazily, split the contents into lines, convert each line
-- to a strict ByteString and count the regex captures with 'regc'.
main :: IO ()
main = main_ $ \regex files -> do
  contents <- mapM readLazily files
  let strictLines = map toStrict2 (concatMap BL8.lines contents)
  return (regc regex strictLines)
  where
    readLazily path = openFile path ReadMode >>= BL.hGetContents
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/node | |
/**
 * Split the data emitted by a readable stream into lines.
 *
 * Registers 'data' and 'end' listeners on `reader`. `callback` is invoked
 * once per line, without the terminating "\n". A final line that is not
 * terminated by a newline is delivered when the stream ends.
 *
 * @param {object} reader - emitter of 'data' and 'end' events (e.g. process.stdin).
 * @param {function(string)} callback - receives each complete line.
 */
function lineReader(reader, callback) {
    var pending = "";

    // Let the stream decode UTF-8 itself so that a multi-byte character split
    // across two chunks is not corrupted by per-chunk Buffer-to-string coercion.
    if (typeof reader.setEncoding === 'function') {
        reader.setEncoding('utf8');
    }

    reader.on('data', function(chunk) {
        pending += chunk;
        var lines = pending.split("\n");
        // The last piece is an incomplete line (or "" if the chunk ended on "\n").
        pending = lines.pop();
        lines.forEach(function(line) {
            callback(line);
        });
    });

    reader.on('end', function() {
        // Flush a trailing line that had no newline terminator.
        if (pending.length > 0) {
            callback(pending);
            pending = "";
        }
    });
}
/**
 * Command-line entry point: count regex captures over the lines of stdin.
 *
 * For every matching line the key is the whole match when the pattern has no
 * capture groups, otherwise the groups joined by "\n". When stdin ends, one
 * "count<TAB>capture..." line is printed per distinct key, followed by the
 * total number of matching lines.
 *
 * @param {string[]} args - command-line arguments; args[0] is the regex.
 */
function main(args) {
    if (args.length === 0) {
        console.error("usage: regc.js REGEX");
        process.exit(1);
    }
    var regex = new RegExp(args[0]);
    // Prototype-less object: keys such as "constructor" or "toString" would
    // otherwise collide with inherited Object.prototype members and be
    // miscounted.
    var result = Object.create(null);
    // TODO: read from files if specified
    lineReader(process.stdin, function(line) {
        var m = line.match(regex);
        if (!m) {
            return;
        }
        var words = (m.length === 1) ? m[0] : m.slice(1).join("\n");
        result[words] = (result[words] || 0) + 1;
    });
    process.stdin.on('end', function() {
        var sum = 0;
        for (var k in result) {
            var v = result[k];
            console.log(v + "\t" + k.replace(/\n/g, "\t"));
            sum += v;
        }
        console.log(sum);
    });
    process.stdin.resume();
}

// strip node and script name
main(process.argv.slice(2));
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use warnings; | |
use strict; | |
# regc: count how often each regex capture occurs in the input.
#
# usage: regc REGEX [files ...]
# Reads the named files (or standard input) line by line. For each matching
# line the key is the whole match when REGEX has no capture groups, otherwise
# the groups joined together. Prints "count<TAB>capture..." per distinct key
# and finally the total number of matching lines.

my $re = shift;
unless (defined $re) {
    print STDERR "usage: $0 REGEX [files ...]\n";
    exit 1;
}

my %result = ();

while (my $line = <>) {
    if (my @matches = ($line =~ /$re/po)) {
        # $#+ is the number of capture groups in the last successful match.
        # Testing "defined $1" instead would misclassify a pattern whose first
        # group did not take part in the match, e.g. (a)|(b) matching "b".
        if ($#+ > 0) {
            # Groups that did not participate are undef; count them as empty
            # strings so join() does not warn.
            $result{join("\n", map { defined $_ ? $_ : '' } @matches)} += 1;
        } else {
            $result{${^MATCH}} += 1;
        }
    }
}

my $sum = 0;
for my $k (keys %result) {
    my $v = $result{$k};
    # Groups were joined with newlines; show them tab-separated.
    $k =~ s/\n/\t/g;
    print $v, "\t", $k, "\n";
    $sum += $v;
}
print $sum, "\n";
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# vim: fileencoding=utf-8 | |
''' | |
regc.py - regex counter | |
指定されたファイルもしくは標準入力から正規表現にマッチする部分を抜きだし、 | |
その出現回数を表示する。 | |
正規表現がグルーピングを含む場合はグループ部分だけ、 | |
そうでなければ全体を抜き出してカウントする。 | |
''' | |
import re | |
def regex_counter(regex, input, flags=0):
    """Count regex captures over an iterable of lines.

    Each item of ``input`` is searched with ``regex`` compiled under
    ``flags``. For a matching item the key is the tuple of capture groups
    when the pattern defines any, otherwise the whole matched text.

    Returns a dict mapping each key to the number of items that produced it.
    """
    pattern = re.compile(regex, flags)
    counts = {}
    for text in input:
        found = pattern.search(text)
        if not found:
            continue
        # groups() is an empty tuple when the pattern has no groups.
        key = found.groups() or found.group()
        counts[key] = counts.get(key, 0) + 1
    return counts
def main():
    """Command-line entry point: ``regc.py [OPTION]... REGEX [FILE]...``.

    Counts the captures of REGEX over every line of the given files (or
    standard input when no file is named), prints one
    ``count<TAB>capture...`` line per distinct capture and finally the total
    number of matching lines.
    """
    import sys
    from optparse import OptionParser

    parser = OptionParser('Usage: %prog [OPTION]... REGEX [FILE]...\n'
                          'Search REGEX in each FILE or standard input,\n'
                          'count occurrences.')
    parser.add_option('-i', '--ignore-case',
                      action='store_true', dest='ignore_case',
                      help='Perform case-insensitive matching.')
    parser.add_option('-l', '--locale',
                      action='store_true', dest='locale',
                      help=r'Make \w, \W, \b, \B, dependent on the current locale.')
    parser.add_option('-u', '--unicode',
                      action='store_true', dest='unicode',
                      help=r'Make \w, \W, \b, \B, dependent on the Unicode locale.')

    opt, args = parser.parse_args()
    if not args:
        parser.error('REGEX is required')

    flags = 0
    if opt.ignore_case:
        flags |= re.IGNORECASE
    if opt.locale:
        flags |= re.LOCALE
    if opt.unicode:
        flags |= re.UNICODE

    regex = args[0]

    def iter_lines(paths):
        # Open one file at a time and close it as soon as it is exhausted,
        # rather than holding every handle open for the whole run.
        for path in paths:
            with open(path) as f:
                for line in f:
                    yield line

    if len(args) > 1:
        input = iter_lines(args[1:])
    else:
        input = sys.stdin

    sum_ = 0
    # items() and the parenthesised print behave the same on Python 2 and 3.
    for k, v in regex_counter(regex, input, flags=flags).items():
        if isinstance(k, tuple):
            # A group that did not take part in the match is None; show it
            # as an empty field instead of failing in join().
            k = '\t'.join(g if g is not None else '' for g in k)
        print('%d\t%s' % (v, k))
        sum_ += v
    print(sum_)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment