Skip to content

Instantly share code, notes, and snippets.

@nakamuray
Last active December 17, 2015 19:38
Show Gist options
  • Save nakamuray/5661331 to your computer and use it in GitHub Desktop.
Save nakamuray/5661331 to your computer and use it in GitHub Desktop.
指定されたファイルもしくは標準入力から正規表現にマッチする部分を抜きだし、その出現回数を表示する。
{-# LANGUAGE FlexibleContexts #-}
module Common
( Result
, ResultPair
, Regex -- re-export
, match
, addResult
, emptyResult
, showResultPair
, main_
) where
import Control.Applicative ((<$>))
import Data.Array (elems)
import System.Environment (getArgs, getProgName)
import System.IO (hPutStrLn, stderr)
--import Text.Regex.Posix (makeRegex, Regex, matchAllText)
--import Text.Regex.TDFA (makeRegex, Regex, matchAllText)
import Text.Regex.PCRE (makeRegex, Regex, RegexLike, matchAllText)
import qualified Data.ByteString.Char8 as B
import qualified Data.HashMap.Strict as M
-- | Occurrence counts, keyed by the list of strings extracted from one matching line.
type Result = M.HashMap [B.ByteString] Int
-- | A single entry of a 'Result': the extracted strings paired with their count.
type ResultPair = ([B.ByteString], Int)
-- | The strings extracted from one line (the shape 'match' returns and 'addResult' consumes).
type Matches = [B.ByteString]
-- | Extract the interesting part of the first regex match in a line.
--
-- Only the first match reported by 'matchAllText' is considered:
--
-- * no match at all yields @[]@;
-- * a match with no capture groups yields the whole matched text;
-- * a match with capture groups yields the text of each group, in order,
--   without the whole-match element.
--
-- Signatures that were used with other regex backends:
--
-- > match :: Regex -> String -> [String]
-- > match :: RegexLike Regex s => Regex -> s -> [s]
match :: Regex -> B.ByteString -> [B.ByteString]
match r bs = pick (map elems (matchAllText r bs))
  where
    -- Element 0 of each match array is the whole match; the rest are groups.
    pick []                 = []
    pick ([whole] : _)      = [fst whole]
    pick ((_ : groups) : _) = map fst groups
-- | Record one occurrence of the given matches in the accumulator.
--
-- An empty match list (the line did not match) leaves the accumulator untouched.
addResult :: Result -> Matches -> Result
addResult acc ms
  | null ms   = acc
  | otherwise = M.insertWith (+) ms 1 acc
-- | A 'Result' with no entries; the starting accumulator when folding with 'addResult'.
emptyResult :: Result
emptyResult = M.empty
-- | Render one result entry as @count TAB field1 TAB field2 ...@.
showResultPair :: ResultPair -> B.ByteString
showResultPair (fields, count) =
  B.concat [B.pack (show count), tab, B.intercalate tab fields]
  where
    tab = B.pack "\t"
-- | Write every entry of the result to stdout, one line per entry,
-- in whatever order 'M.toList' returns them.
printResult :: Result -> IO ()
printResult result = mapM_ emit (M.toList result)
  where
    emit pair = B.putStrLn (showResultPair pair)
-- | Print a one-line usage message, prefixed with the program name, to stderr.
printUsage :: IO ()
printUsage =
  getProgName >>= \name ->
    hPutStrLn stderr ("Usage: " ++ name ++ " regex [files]")
-- | Shared command-line driver for the regex counter programs.
--
-- The argument @f@ does the actual counting: it receives the compiled regex
-- and the list of input paths and returns the tally, which is then printed.
--
-- * No arguments: print the usage message.
-- * Only a regex: count over @\/dev\/stdin@.
-- * Regex plus files: count over those files.
main_ :: (Regex -> [FilePath] -> IO Result) -> IO ()
main_ f = getArgs >>= run
  where
    run []              = printUsage
    run (regex : [])    = count regex ["/dev/stdin"]
    run (regex : files) = count regex files

    count :: String -> [FilePath] -> IO ()
    count regex files = printResult =<< f (makeRegex regex) files
import Data.Conduit
import Data.Monoid (mconcat)
import qualified Data.ByteString.Char8 as B
import qualified Data.Conduit.Binary as CB
import qualified Data.Conduit.List as CL
import Common
-- | Sink that consumes lines, extracts the regex matches from each one and
-- folds them into a 'Result' tally.
regc :: Monad m => Regex -> Sink B.ByteString m Result
regc regex = extractMatches =$ tally
  where
    extractMatches = CL.map (match regex)
    tally          = CL.fold addResult emptyResult
-- | Entry point: stream all given files through a line splitter into 'regc'.
main :: IO ()
main = main_ countFiles
  where
    countFiles :: Regex -> [FilePath] -> IO Result
    countFiles regex files = runResourceT (allLines $$ regc regex)
      where
        -- Concatenation of every file's contents, re-chunked into lines.
        allLines = mconcat (map CB.sourceFile files) $= CB.lines
package main
import (
"bufio"
"fmt"
"github.com/glenn-brown/golang-pkg-pcre/src/pkg/pcre"
"io"
"os"
"strings"
)
// reCompileError carries the message of a failed regex compilation so it
// can be returned through the standard error interface.
type reCompileError string

// Error returns the stored message unchanged.
func (msg reCompileError) Error() string {
	text := string(msg)
	return text
}
// RegCount compiles regexString and counts, over every line of every input,
// how often each extracted match occurs.
//
// A producer goroutine (fileReader) pushes lines into a buffered channel,
// which it closes once all inputs are exhausted; regexCounter consumes the
// channel on the calling goroutine.
//
// It returns (err, nil) when the pattern does not compile and (nil, counts)
// otherwise.
func RegCount(regexString string, inputs []io.Reader) (error, map[string]int) {
	regex, compileErr := pcre.Compile(regexString, 0)
	if compileErr != nil {
		return reCompileError(compileErr.Message), nil
	}

	lines := make(chan string, 4096)
	go func() {
		defer close(lines)
		fileReader(inputs, lines)
	}()

	return nil, regexCounter(regex, lines)
}
func fileReader(inputs []io.Reader, output chan string) {
for i := range inputs {
input := bufio.NewReader(inputs[i])
for {
line, err := input.ReadString('\n')
if err == io.EOF {
break
} else if err != nil {
panic(err)
}
line = strings.TrimRight(line, "\r\n")
output <- line
}
}
}
// regexCounter drains input until the channel is closed and tallies the
// regex matches found in each line.
//
// For a pattern without capture groups the key is the whole match. For a
// pattern with groups the key is the text of all groups joined by "\n".
// Lines that do not match are ignored.
func regexCounter(regex pcre.Regexp, input chan string) map[string]int {
	counts := map[string]int{}
	for line := range input {
		m := regex.MatcherString(line, 0)
		if !m.Matches() {
			continue
		}
		n := m.Groups()
		if n == 0 {
			counts[m.GroupString(0)]++
			continue
		}
		parts := make([]string, 0, n)
		for g := 1; g <= n; g++ {
			parts = append(parts, m.GroupString(g))
		}
		counts[strings.Join(parts, "\n")]++
	}
	return counts
}
// main parses the command line (REGEX followed by optional file names),
// counts the matches and prints "count<TAB>match" per distinct match,
// followed by the total number of matching lines.
//
// Files that cannot be opened are reported on stderr and skipped; if none of
// the named files could be opened the program exits with status 1. Without
// file arguments, stdin is read.
func main() {
	if len(os.Args) == 1 {
		fmt.Fprintln(os.Stderr, "usage: regc REGEX [files ...]")
		os.Exit(1)
	}
	pattern, paths := os.Args[1], os.Args[2:]

	var readers []io.Reader
	if len(paths) == 0 {
		readers = append(readers, os.Stdin)
	}
	for _, path := range paths {
		f, openErr := os.Open(path)
		if openErr != nil {
			fmt.Fprintln(os.Stderr, openErr)
			continue
		}
		readers = append(readers, f)
	}
	if len(readers) == 0 {
		os.Exit(1)
	}

	err, counts := RegCount(pattern, readers)
	if err != nil {
		panic(err)
	}

	total := 0
	for key, n := range counts {
		// Group separators are stored as "\n" in the key; show them as tabs.
		fmt.Printf("%d\t%s\n", n, strings.Replace(key, "\n", "\t", -1))
		total += n
	}
	fmt.Println(total)
}
package main
import (
"bufio"
"fmt"
"github.com/glenn-brown/golang-pkg-pcre/src/pkg/pcre"
"io"
"os"
"strings"
)
// reCompileError is the error type reported when the regular expression
// cannot be compiled; its value is the compiler's message.
type reCompileError string

// Error implements the error interface by returning the message itself.
func (compileMsg reCompileError) Error() string {
	return string(compileMsg)
}
// RegCount compiles regexString and tallies the matches found in each input,
// reading the inputs sequentially into a single shared map.
//
// It returns (err, nil) if the pattern does not compile or if reading any
// input fails, and (nil, counts) otherwise.
func RegCount(regexString string, inputs []io.Reader) (error, map[string]int) {
	regex, compileErr := pcre.Compile(regexString, 0)
	if compileErr != nil {
		return reCompileError(compileErr.Message), nil
	}

	counts := map[string]int{}
	for _, in := range inputs {
		if err := regCount(regex, bufio.NewReader(in), counts); err != nil {
			return err, nil
		}
	}
	return nil, counts
}
// regCount reads input line by line and adds the regex matches of each line
// to result (which is modified in place).
//
// For a pattern without capture groups the key is the whole match; with
// groups, the key is the text of all groups joined by "\n". Trailing "\r\n"
// or "\n" is stripped from each line before matching.
//
// A last line that lacks a terminating newline is processed too: ReadString
// hands back the remaining data together with io.EOF, so the data has to be
// handled before returning.
//
// It returns nil at end of input, or the first read error other than io.EOF.
func regCount(regex pcre.Regexp, input *bufio.Reader, result map[string]int) error {
	for {
		line, err := input.ReadString('\n')
		if err != nil && err != io.EOF {
			return err
		}
		// Skip only the "EOF with no pending data" case.
		if err == nil || len(line) > 0 {
			line = strings.TrimRight(line, "\r\n")
			match := regex.MatcherString(line, 0)
			if match.Matches() {
				groups := match.Groups()
				if groups == 0 {
					result[match.GroupString(0)]++
				} else {
					s := match.GroupString(1)
					for i := 2; i <= groups; i++ {
						s += "\n" + match.GroupString(i)
					}
					result[s]++
				}
			}
		}
		if err == io.EOF {
			return nil
		}
	}
}
// main is the command-line entry point: "regc REGEX [files ...]".
//
// It opens the named files (or uses stdin when none are named), runs
// RegCount over them and prints one "count<TAB>match" line per distinct
// match followed by the grand total. Unopenable files are reported on stderr
// and skipped; if that leaves no input at all the exit status is 1.
func main() {
	if len(os.Args) == 1 {
		fmt.Fprintln(os.Stderr, "usage: regc REGEX [files ...]")
		os.Exit(1)
	}

	regex := os.Args[1]
	fileNames := os.Args[2:]
	inputs := make([]io.Reader, 0, len(fileNames)+1)

	switch {
	case len(fileNames) == 0:
		inputs = append(inputs, os.Stdin)
	default:
		for _, name := range fileNames {
			if f, err := os.Open(name); err == nil {
				inputs = append(inputs, f)
			} else {
				fmt.Fprintln(os.Stderr, err)
			}
		}
	}

	if len(inputs) == 0 {
		os.Exit(1)
	}

	err, result := RegCount(regex, inputs)
	if err != nil {
		panic(err)
	}

	sum := 0
	for key, count := range result {
		// Keys hold group texts separated by "\n"; print them tab-separated.
		printable := strings.Replace(key, "\n", "\t", -1)
		fmt.Printf("%d\t%s\n", count, printable)
		sum += count
	}
	fmt.Println(sum)
}
{-# LANGUAGE BangPatterns #-}
import Data.List (foldl')
import System.IO (openFile, IOMode(ReadMode))
import qualified Data.ByteString.Lazy.Char8 as BL8
import qualified Data.ByteString as B
import qualified Data.ByteString.Internal as BI
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Lazy.Internal as BLI
import Foreign.ForeignPtr
import Foreign.Ptr
import Common
-- http://stackoverflow.com/questions/7815402/convert-a-lazy-bytestring-to-a-strict-bytestring
-- | Convert a lazy ByteString to a strict one by concatenating its chunks.
toStrict :: BL.ByteString -> B.ByteString
toStrict lazy = B.concat (BL.toChunks lazy)
-- | Convert a lazy ByteString to a strict one, copying only when the input
-- consists of more than one chunk.
toStrict2 :: BL.ByteString -> B.ByteString
-- No chunks at all: the empty strict ByteString.
toStrict2 BLI.Empty = B.empty
-- Exactly one chunk: return that chunk as is, without copying.
toStrict2 (BLI.Chunk c BLI.Empty) = c
-- Several chunks: create a buffer of the total length and let 'go' fill it.
toStrict2 lb = BI.unsafeCreate len $ go lb
where
-- Total number of bytes, summed over all chunks.
len = BLI.foldlChunks (\l sb -> l + B.length sb) 0 lb
-- Copy each chunk to the destination pointer, advancing the pointer by the
-- chunk length after every copy.
go BLI.Empty _ = return ()
go (BLI.Chunk (BI.PS fp s l) r) ptr =
withForeignPtr fp $ \p -> do
-- The chunk's payload starts at offset s of its buffer and is l bytes long.
BI.memcpy ptr (p `plusPtr` s) (fromIntegral l)
go r (ptr `plusPtr` l)
-- | Tally the regex matches over a list of lines with a strict left fold.
regc :: Regex -> [B.ByteString] -> Result
regc regex inputLines = foldl' step emptyResult inputLines
  where
    step acc line = addResult acc (match regex line)
-- | Entry point: read every file lazily, split the contents into lines,
-- convert each line to a strict ByteString and count the matches.
main :: IO ()
main = main_ countFiles
  where
    countFiles :: Regex -> [FilePath] -> IO Result
    countFiles regex files = do
      contents <- mapM readLazily files
      let strictLines = map toStrict2 (concatMap BL8.lines contents)
      return (regc regex strictLines)

    readLazily path = openFile path ReadMode >>= BL.hGetContents
#!/usr/bin/node
/**
 * Split a stream into lines and hand each line (without its "\n") to a callback.
 *
 * Incoming chunks are appended to an internal buffer; whenever the buffer
 * contains at least one newline, every complete line is delivered and the
 * trailing partial line is kept for the next chunk. When the stream ends, any
 * text still buffered (a final line with no terminating newline) is delivered
 * as well, so the last line is not lost.
 *
 * @param {object} reader - emitter of 'data' and 'end' events, e.g. a readable stream.
 * @param {function(string): void} callback - invoked once per line, in order.
 */
function lineReader(reader, callback) {
    var buf = "";
    reader.on('data', function(chunk) {
        buf += chunk;
        if (buf.indexOf("\n") < 0) {
            return;
        }
        var lines = buf.split("\n");
        // The last element is the (possibly empty) unfinished line.
        buf = lines.pop();
        for (var i = 0; i < lines.length; i++) {
            callback(lines[i]);
        }
    });
    reader.on('end', function() {
        // Flush an unterminated final line.
        if (buf.length > 0) {
            callback(buf);
            buf = "";
        }
    });
}
/**
 * Count regex matches on stdin and print "count<TAB>match" per distinct
 * match, followed by the total number of matching lines.
 *
 * For a pattern without capture groups the whole match is counted; with
 * groups, the group texts (joined by "\n" internally, shown tab-separated)
 * are counted instead.
 *
 * @param {string[]} args - command-line arguments; args[0] is the regex source.
 */
function main(args) {
    if (args.length === 0) {
        console.error("usage: regc.js REGEX");
        process.exit(1);
    }

    var regex = new RegExp(args[0]);
    // Prototype-less object: matched text such as "constructor" or
    // "toString" must not collide with properties inherited from
    // Object.prototype, which would corrupt the counts.
    var result = Object.create(null);

    // TODO: read from files if specified
    lineReader(process.stdin, function(line) {
        var m = line.match(regex);
        if (!m) {
            return;
        }
        // m[0] is the whole match; m[1..] are the capture groups.
        var words = (m.length === 1) ? m[0] : m.slice(1).join("\n");
        result[words] = (result[words] || 0) + 1;
    });

    process.stdin.on('end', function() {
        var sum = 0;
        for (var k in result) {
            var v = result[k];
            console.log(v + "\t" + k.replace(/\n/g, "\t"));
            sum += v;
        }
        console.log(sum);
    });

    process.stdin.resume();
}
// Script entry point: drop the first two argv entries (the node executable
// and the script name) so that main() receives only the user's arguments.
main(process.argv.slice(2));
#!/usr/bin/perl
use warnings;
use strict;
# Regex counter: the first argument is the pattern, the remaining arguments
# are input files (stdin when there are none, via the <> operator).
#
# For every line matching the pattern, count either the whole match (pattern
# without capture groups) or the captured groups (pattern with groups). Then
# print "count<TAB>match" per distinct key and, last, the total.
my $re = shift;
my %result = ();

while (my $line = <>) {
    # /p makes ${^MATCH} available; /o compiles the pattern only once.
    if (my @matches = ($line =~ /$re/po)) {
        # $#+ is the number of capture groups in the last successful match.
        # Testing `defined $1` instead would misclassify a pattern whose
        # first group is optional and did not take part in the match.
        if ($#+ > 0) {
            # Groups that did not participate are undef; count them as
            # empty strings rather than triggering "uninitialized" warnings.
            my $key = join("\n", map { defined $_ ? $_ : '' } @matches);
            $result{$key} += 1;
        } else {
            $result{${^MATCH}} += 1;
        }
    }
}

my $sum = 0;
for my $k (keys %result) {
    my $v = $result{$k};
    # Groups are joined with "\n" in the key; display them tab-separated.
    (my $label = $k) =~ s/\n/\t/g;
    print $v, "\t", $label, "\n";
    $sum += $v;
}
print $sum, "\n";
#!/usr/bin/python
# vim: fileencoding=utf-8
'''
regc.py - regex counter
指定されたファイルもしくは標準入力から正規表現にマッチする部分を抜きだし、
その出現回数を表示する。
正規表現がグルーピングを含む場合はグループ部分だけ、
そうでなければ全体を抜き出してカウントする。
'''
import re
def regex_counter(regex, input, flags=0):
    """Count what ``regex`` matches across the lines of ``input``.

    Each line is searched once. If the pattern has capture groups, the tuple
    of group values is the key; otherwise the whole matched text is the key.
    Lines without a match are skipped.

    :param regex: pattern source passed to ``re.compile``.
    :param input: iterable of lines.
    :param flags: ``re`` flags passed to ``re.compile``.
    :returns: dict mapping key -> number of lines that produced it.
    """
    pattern = re.compile(regex, flags)
    counts = {}
    for line in input:
        found = pattern.search(line)
        if not found:
            continue
        # groups() is an empty tuple when the pattern has no groups.
        key = found.groups() or found.group()
        counts[key] = counts.get(key, 0) + 1
    return counts
def _iter_lines(paths):
    """Yield every line of each file in ``paths``, one file open at a time."""
    for path in paths:
        # Open lazily and close promptly instead of opening every file up
        # front and never closing any of them.
        with open(path) as fh:
            for line in fh:
                yield line


def main():
    """Command-line entry point: ``regc.py [OPTION]... REGEX [FILE]...``.

    Searches REGEX in each FILE (or standard input when no file is given) and
    prints ``count<TAB>match`` for every distinct match, followed by the total
    number of matching lines. Works under both Python 2 and Python 3.
    """
    import sys
    from optparse import OptionParser

    parser = OptionParser('Usage: %prog [OPTION]... REGEX [FILE]...\n'
                          'Search REGEX in each FILE or standard input,\n'
                          'count occurrences.')
    parser.add_option('-i', '--ignore-case',
                      action='store_true', dest='ignore_case',
                      help='Perform case-insensitive matching.')
    parser.add_option('-l', '--locale',
                      action='store_true', dest='locale',
                      help=r'Make \w, \W, \b, \B, dependent on the current locale.')
    parser.add_option('-u', '--unicode',
                      action='store_true', dest='unicode',
                      help=r'Make \w, \W, \b, \B, dependent on the Unicode locale.')
    opt, args = parser.parse_args()

    if not args:
        parser.error('REGEX is required')

    flags = 0
    if opt.ignore_case:
        flags = flags | re.IGNORECASE
    if opt.locale:
        flags = flags | re.LOCALE
    if opt.unicode:
        flags = flags | re.UNICODE

    regex = args[0]
    if len(args) > 1:
        input = _iter_lines(args[1:])
    else:
        input = sys.stdin

    sum_ = 0
    # .items() and the parenthesised single-argument print behave the same on
    # Python 2 and Python 3.
    for k, v in regex_counter(regex, input, flags=flags).items():
        if isinstance(k, tuple):
            # A group that did not participate in the match is None; render it
            # as an empty field instead of failing in str.join.
            k = '\t'.join(g if g is not None else '' for g in k)
        print('%d\t%s' % (v, k))
        sum_ += v
    print(sum_)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment