nakamuray · December 17, 2015 19:38
diff --git a/Common.hs b/Common.hs
 {-# LANGUAGE FlexibleContexts #-}
 module Common
    ( Result
    , ResultPair
    , Regex     -- re-export
    , match
    , addResult
    , emptyResult
    , showResultPair
    , main_
    ) where

 import Control.Applicative ((<$>))
 import Data.Array (elems)
 import System.Environment (getArgs, getProgName)
 import System.IO (hPutStrLn, stderr)
 --import Text.Regex.Posix (makeRegex, Regex, matchAllText)
 --import Text.Regex.TDFA (makeRegex, Regex, matchAllText)
 import Text.Regex.PCRE (makeRegex, Regex, RegexLike, matchAllText)

 import qualified Data.ByteString.Char8 as B
 import qualified Data.HashMap.Strict as M

 type Result = M.HashMap [B.ByteString] Int
 type ResultPair = ([B.ByteString], Int)
 type Matches = [B.ByteString]

 -- | Extract the interesting text from the first match of @r@ in @bs@.
 --
 -- Returns the capture groups when the match has any, the whole matched text
 -- when it has none, and @[]@ when the input does not match at all.
 match :: Regex -> B.ByteString -> [B.ByteString]
 --match :: Regex -> String -> [String]
 --match :: RegexLike Regex s => Regex -> s -> [s]
 match r bs = case elems <$> matchAllText r bs of
     []           -> []
     (groups : _) -> case map fst groups of
         [whole]    -> [whole]
         (_ : subs) -> subs
         -- Presumably unreachable (index 0 should hold the whole match), but
         -- handling it keeps the function total instead of crashing.
         []         -> []

 -- | Record one more occurrence of the matched texts @ms@ in the table @r@.
 -- An empty list (no match) leaves the table unchanged.
 addResult :: Result -> Matches -> Result
 addResult r ms
     | null ms   = r
     | otherwise = M.insertWith (+) ms 1 r

 -- | The table that holds no counts yet; the starting point for 'addResult'.
 emptyResult :: Result
 emptyResult = M.fromList []

 -- | Render one table entry as @count TAB group1 TAB group2 ...@.
 showResultPair :: ResultPair -> B.ByteString
 showResultPair (groups, count) =
     B.concat [B.pack (show count), tab, B.intercalate tab groups]
   where
     tab = B.pack "\t"

 -- | Write every entry of the table to stdout, one rendered pair per line.
 printResult :: Result -> IO ()
 printResult table = mapM_ printPair (M.toList table)
   where
     printPair pair = B.putStrLn (showResultPair pair)

 -- | Print a one-line usage message, naming the running program, to stderr.
 printUsage :: IO ()
 printUsage =
     getProgName >>= \name ->
         hPutStrLn stderr ("Usage: " ++ name ++ " regex [files]")

 -- | Shared command-line driver.
 --
 -- The first argument is compiled as the regex; the remaining arguments are
 -- the input files, defaulting to @/dev/stdin@ when none are given.  The
 -- supplied counting function @f@ does the work and its result is printed.
 -- With no arguments at all, the usage message is shown instead.
 main_ :: (Regex -> [FilePath] -> IO Result) -> IO ()
 main_ f = getArgs >>= dispatch
   where
     dispatch []              = printUsage
     dispatch (regex : files) =
         f (makeRegex regex) (inputsFor files) >>= printResult

     inputsFor []    = ["/dev/stdin"]
     inputsFor paths = paths
diff --git a/regc-conduit.hs b/regc-conduit.hs
 import Data.Conduit

 import Data.Monoid (mconcat)

 import qualified Data.ByteString.Char8 as B
 import qualified Data.Conduit.Binary as CB
 import qualified Data.Conduit.List as CL

 import Common


 -- | Sink that matches every incoming line against @regex@ and folds the
 -- matches into a count table.
 regc :: Monad m => Regex -> Sink B.ByteString m Result
 regc regex = CL.fold countLine emptyResult
   where
     countLine acc line = addResult acc (match regex line)

 -- | Stream all input files, split them into lines and count the matches.
 main :: IO ()
 main = main_ $ \regex files ->
     let source = mconcat (map CB.sourceFile files)
     in  runResourceT $ source $= CB.lines $$ regc regex
diff --git a/regc-goroutine.go b/regc-goroutine.go
 package main

 import (
 	"bufio"
 	"fmt"
 	"github.com/glenn-brown/golang-pkg-pcre/src/pkg/pcre"
 	"io"
 	"os"
 	"strings"
 )

 // reCompileError holds the message of a regular expression that failed to
 // compile, so it can be returned as an ordinary error value.
 type reCompileError string

 // Error implements the error interface by returning the stored message.
 func (msg reCompileError) Error() string {
 	return string(msg)
 }

 // RegCount compiles regexString and counts its matches over all lines of
 // inputs. Lines are read by a background goroutine and handed to the
 // counter through a buffered channel. It returns a reCompileError when the
 // pattern does not compile.
 func RegCount(regexString string, inputs []io.Reader) (error, map[string]int) {
 	regex, compileErr := pcre.Compile(regexString, 0)
 	if compileErr != nil {
 		return reCompileError(compileErr.Message), nil
 	}

 	lines := make(chan string, 4096)

 	// Producer: feed every input line into the channel, then signal the end.
 	go func() {
 		fileReader(inputs, lines)
 		close(lines)
 	}()

 	return nil, regexCounter(regex, lines)
 }

 func fileReader(inputs []io.Reader, output chan string) {
 	for i := range inputs {
 		input := bufio.NewReader(inputs[i])

 		for {
 			line, err := input.ReadString('\n')

 			if err == io.EOF {
 				break
 			} else if err != nil {
 				panic(err)
 			}

 			line = strings.TrimRight(line, "\r\n")
 			output <- line
 		}
 	}
 }

 // regexCounter consumes lines from input until the channel is closed and
 // counts how often each match occurs. When the match has capture groups the
 // key is the groups joined by "\n"; otherwise the key is the whole match.
 func regexCounter(regex pcre.Regexp, input chan string) map[string]int {
 	counts := map[string]int{}

 	for line := range input {
 		m := regex.MatcherString(line, 0)
 		if !m.Matches() {
 			continue
 		}

 		n := m.Groups()
 		if n == 0 {
 			counts[m.GroupString(0)]++
 			continue
 		}

 		parts := make([]string, 0, n)
 		for g := 1; g <= n; g++ {
 			parts = append(parts, m.GroupString(g))
 		}
 		counts[strings.Join(parts, "\n")]++
 	}

 	return counts
 }

 // main parses the command line (REGEX followed by optional file names),
 // counts the matches over the files, or over stdin when none are given,
 // and prints "count<TAB>groups" per distinct match followed by the total.
 func main() {
 	if len(os.Args) == 1 {
 		fmt.Fprintln(os.Stderr, "usage: regc REGEX [files ...]")
 		os.Exit(1)
 	}

 	regex := os.Args[1]
 	fileNames := os.Args[2:]
 	inputs := []io.Reader{}

 	if len(fileNames) == 0 {
 		inputs = append(inputs, os.Stdin)
 	}

 	for _, name := range fileNames {
 		f, err := os.Open(name)
 		if err != nil {
 			// Report the unreadable file but keep going with the others.
 			fmt.Fprintln(os.Stderr, err)
 			continue
 		}
 		inputs = append(inputs, f)
 	}

 	// Every named file failed to open: nothing left to scan.
 	if len(inputs) == 0 {
 		os.Exit(1)
 	}

 	err, result := RegCount(regex, inputs)
 	if err != nil {
 		// A bad pattern is a user error, not a programming error: report it
 		// on stderr and exit non-zero instead of panicking with a stack trace.
 		fmt.Fprintln(os.Stderr, err)
 		os.Exit(1)
 	}

 	sum := 0

 	for k, v := range result {
 		// Groups are joined with "\n" inside the key; show them tab-separated.
 		fmt.Printf("%d\t%s\n", v, strings.Replace(k, "\n", "\t", -1))

 		sum += v
 	}

 	fmt.Println(sum)
 }
diff --git a/regc.go b/regc.go
 package main

 import (
 	"bufio"
 	"fmt"
 	"github.com/glenn-brown/golang-pkg-pcre/src/pkg/pcre"
 	"io"
 	"os"
 	"strings"
 )

 // reCompileError holds the message of a regular expression that failed to
 // compile, so it can be returned as an ordinary error value.
 type reCompileError string

 // Error implements the error interface by returning the stored message.
 func (msg reCompileError) Error() string {
 	return string(msg)
 }

 // RegCount compiles regexString and counts its matches over all lines of
 // inputs, processing the inputs one after another into a single table.
 // It returns a reCompileError when the pattern does not compile, or the
 // first read error reported while scanning an input.
 func RegCount(regexString string, inputs []io.Reader) (error, map[string]int) {
 	regex, compileErr := pcre.Compile(regexString, 0)
 	if compileErr != nil {
 		return reCompileError(compileErr.Message), nil
 	}

 	counts := make(map[string]int)

 	for _, in := range inputs {
 		if err := regCount(regex, bufio.NewReader(in), counts); err != nil {
 			return err, nil
 		}
 	}

 	return nil, counts
 }

 // regCount reads input line by line, strips trailing "\r\n" characters and
 // adds each match to result. When the match has capture groups the key is
 // the groups joined by "\n"; otherwise the key is the whole match.
 // It returns nil at end of input, or the first non-EOF read error.
 func regCount(regex pcre.Regexp, input *bufio.Reader, result map[string]int) error {
 	for {
 		line, err := input.ReadString('\n')
 		if err != nil && err != io.EOF {
 			return err
 		}

 		// ReadString returns whatever it read before hitting EOF, so a final
 		// line without a trailing newline arrives together with io.EOF and
 		// must still be counted.
 		if line != "" {
 			line = strings.TrimRight(line, "\r\n")
 			match := regex.MatcherString(line, 0)

 			if match.Matches() {
 				groups := match.Groups()
 				if groups == 0 {
 					result[match.GroupString(0)] += 1
 				} else {
 					s := match.GroupString(1)
 					for i := 2; i <= groups; i++ {
 						s += "\n" + match.GroupString(i)
 					}
 					result[s] += 1
 				}
 			}
 		}

 		if err == io.EOF {
 			return nil
 		}
 	}
 }

 // main parses the command line (REGEX followed by optional file names),
 // counts the matches over the files, or over stdin when none are given,
 // and prints "count<TAB>groups" per distinct match followed by the total.
 func main() {
 	if len(os.Args) == 1 {
 		fmt.Fprintln(os.Stderr, "usage: regc REGEX [files ...]")
 		os.Exit(1)
 	}

 	regex := os.Args[1]
 	fileNames := os.Args[2:]
 	inputs := []io.Reader{}

 	if len(fileNames) == 0 {
 		inputs = append(inputs, os.Stdin)
 	}

 	for _, name := range fileNames {
 		f, err := os.Open(name)
 		if err != nil {
 			// Report the unreadable file but keep going with the others.
 			fmt.Fprintln(os.Stderr, err)
 			continue
 		}
 		inputs = append(inputs, f)
 	}

 	// Every named file failed to open: nothing left to scan.
 	if len(inputs) == 0 {
 		os.Exit(1)
 	}

 	err, result := RegCount(regex, inputs)
 	if err != nil {
 		// A bad pattern or a read failure is an ordinary runtime error: report
 		// it on stderr and exit non-zero instead of panicking with a stack trace.
 		fmt.Fprintln(os.Stderr, err)
 		os.Exit(1)
 	}

 	sum := 0

 	for k, v := range result {
 		// Groups are joined with "\n" inside the key; show them tab-separated.
 		fmt.Printf("%d\t%s\n", v, strings.Replace(k, "\n", "\t", -1))

 		sum += v
 	}

 	fmt.Println(sum)
 }
diff --git a/regc.hs b/regc.hs
 {-# LANGUAGE BangPatterns #-}
 import Data.List (foldl')
 import System.IO (openFile, IOMode(ReadMode))

 import qualified Data.ByteString.Lazy.Char8 as BL8

 import qualified Data.ByteString               as B
 import qualified Data.ByteString.Internal      as BI
 import qualified Data.ByteString.Lazy          as BL
 import qualified Data.ByteString.Lazy.Internal as BLI
 import           Foreign.ForeignPtr
 import           Foreign.Ptr

 import Common


 -- http://stackoverflow.com/questions/7815402/convert-a-lazy-bytestring-to-a-strict-bytestring
 -- | Convert a lazy ByteString to a strict one by concatenating its chunks.
 toStrict :: BL.ByteString -> B.ByteString
 toStrict lazy = B.concat (BL.toChunks lazy)

 -- | Convert a lazy ByteString to a strict one.
 --
 -- An empty input yields 'B.empty' and a single-chunk input returns that
 -- chunk as-is.  Otherwise a buffer of the total length is created and
 -- every chunk is copied into it in order.
 toStrict2 :: BL.ByteString -> B.ByteString
 toStrict2 BLI.Empty = B.empty
 toStrict2 (BLI.Chunk c BLI.Empty) = c
 toStrict2 lb = BI.unsafeCreate len $ go lb
  where
    -- Total number of bytes over all chunks.
    len = BLI.foldlChunks (\l sb -> l + B.length sb) 0 lb

    -- Copy each chunk to the current write position, then advance the
    -- position by the chunk length and continue with the remaining chunks.
    go  BLI.Empty                   _   = return ()
    go (BLI.Chunk (BI.PS fp s l) r) ptr =
        withForeignPtr fp $ \p -> do
            -- The chunk's bytes start at offset s inside its buffer.
            BI.memcpy ptr (p `plusPtr` s) (fromIntegral l)
            go r (ptr `plusPtr` l)

 -- | Match every line against @regex@ and strictly fold the matches into a
 -- count table.
 regc :: Regex -> [B.ByteString] -> Result
 regc regex = foldl' countLine emptyResult
   where
     countLine acc line = addResult acc (match regex line)


 -- | Read every input file lazily, split the contents into lines, convert
 -- each line to a strict ByteString and count the regex matches.
 main :: IO ()
 main = main_ $ \regex files -> do
     contents <- mapM readLazily files
     let strictLines = [toStrict2 l | content <- contents, l <- BL8.lines content]
     return (regc regex strictLines)
   where
     readLazily path = openFile path ReadMode >>= BL.hGetContents
diff --git a/regc.js b/regc.js
 #!/usr/bin/node
 /**
  * Split the data emitted by a readable stream into lines.
  *
  * @param {EventEmitter} reader   stream emitting 'data' and 'end' events
  * @param {function(string)} callback  called once per line, without the "\n"
  */
 function lineReader(reader, callback) {
    var buf = "";

    // Let the stream decode UTF-8 itself so a multi-byte character split
    // across two chunks is not corrupted by per-chunk string conversion.
    if (typeof reader.setEncoding === 'function') {
        reader.setEncoding('utf8');
    }

    reader.on('data', function(chunk) {
        buf += chunk;

        // Everything before the last "\n" is complete; the remainder is a
        // partial line that has to wait for more data.
        var lines = buf.split("\n");
        buf = lines.pop();

        for (var i = 0; i < lines.length; i++) {
            callback(lines[i]);
        }
    });

    // A final line without a trailing newline is still sitting in buf when
    // the stream ends; deliver it instead of dropping it.
    reader.on('end', function() {
        if (buf.length > 0) {
            callback(buf);
            buf = "";
        }
    });
 }

 /**
  * Count regex matches over the lines of stdin and print the tally.
  *
  * @param {string[]} args  command-line arguments; args[0] is the regex
  */
 function main(args) {
    if (args.length === 0) {
        console.error("usage: regc.js REGEX");
        process.exit(1);
    }

    var regex = new RegExp(args[0]);
    // Prototype-less table: with a plain {} a matched text such as
    // "constructor" or "toString" would collide with inherited properties
    // and be counted wrongly.
    var result = Object.create(null);

    // TODO: read from files if specified
    lineReader(process.stdin, function(line) {
        var m = line.match(regex);
        if (!m) {
            return;
        }

        // No capture groups: count the whole match. Otherwise count the
        // groups, joined by "\n" so they form a single key.
        var words = (m.length === 1) ? m[0] : m.slice(1).join("\n");

        result[words] = (result[words] || 0) + 1;
    });

    process.stdin.on('end', function() {
        var sum = 0;

        for (var k in result) {
            var v = result[k];
            console.log(v + "\t" + k.replace(/\n/g, "\t"));

            sum += v;
        }

        console.log(sum);
    });
    process.stdin.resume();
 }

 // Entry point: strip the node executable and the script name from argv and
 // hand the remaining command-line arguments to main().
 main(process.argv.slice(2));
diff --git a/regc.pl b/regc.pl
 #!/usr/bin/perl

 use warnings;
 use strict;

 # The first argument is the regex; the remaining arguments (or stdin when
 # there are none) are read through <>.
 my $re = shift;
 if (!defined $re) {
    print STDERR "usage: regc.pl REGEX [files ...]\n";
    exit 1;
 }

 my %result = ();

 while (my $line = <>) {
    # Strip the line terminator (LF or CRLF) so it can never become part of a
    # key, where it would be confused with the "\n" group separator.
    $line =~ s/\r?\n\z//;

    if (my @matches = ($line =~ /$re/p)) {
        # $#+ is the number of capture groups in the last successful match.
        # Testing it, rather than "defined $1", also handles a regex whose
        # first group is optional and did not take part in the match.
        if ($#+ > 0) {
            # Groups that did not participate are undef; count them as empty.
            $result{join("\n", map { defined $_ ? $_ : '' } @matches)} += 1;
        } else {
            $result{${^MATCH}} += 1;
        }
    }
 }

 my $sum = 0;

 for my $k (keys %result) {
    my $v = $result{$k};
    # Groups are joined with "\n" inside the key; print them tab-separated.
    (my $shown = $k) =~ s/\n/\t/g;
    print $v, "\t", $shown, "\n";

    $sum += $v;
 }

 print $sum, "\n";
diff --git a/regc.py b/regc.py
 #!/usr/bin/python
 # vim: fileencoding=utf-8
 '''
 regc.py - regex counter

 指定されたファイルもしくは標準入力から正規表現にマッチする部分を抜きだし、
 その出現回数を表示する。

 正規表現がグルーピングを含む場合はグループ部分だけ、
 そうでなければ全体を抜き出してカウントする。
 '''
 import re

 def regex_counter(regex, input, flags=0):
    """Count how often each match of ``regex`` occurs in the lines of ``input``.

    Each line is searched once. The key is the tuple of capture groups when
    the pattern has groups, otherwise the whole matched string. Returns a
    dict mapping key to occurrence count.
    """
    pattern = re.compile(regex, flags)
    counts = {}
    for text in input:
        found = pattern.search(text)
        if not found:
            continue
        key = found.groups() or found.group()
        counts[key] = counts.get(key, 0) + 1

    return counts


 def main():
    """Command-line entry point.

    Parses the options and the REGEX argument, counts the matches over the
    given files (or standard input when none are given), then prints one
    "count<TAB>groups" line per distinct match followed by the total.
    """
    import itertools
    import sys

    from optparse import OptionParser

    parser = OptionParser('Usage: %prog [OPTION]... REGEX [FILE]...\n'
                          'Search REGEX in each FILE or standard input,\n'
                          'count occurrences.')
    parser.add_option('-i', '--ignore-case',
                      action='store_true', dest='ignore_case',
                      help='Perform case-insensitive matching.')
    parser.add_option('-l', '--locale',
                      action='store_true', dest='locale',
                      help=r'Make \w, \W, \b, \B, dependent on the current locale.')
    parser.add_option('-u', '--unicode',
                      action='store_true', dest='unicode',
                      help=r'Make \w, \W, \b, \B, dependent on the Unicode locale.')

    opt, args = parser.parse_args()

    if not args:
        parser.error('REGEX is required')

    flags = 0
    if opt.ignore_case:
        flags = flags | re.IGNORECASE
    if opt.locale:
        flags = flags | re.LOCALE
    if opt.unicode:
        flags = flags | re.UNICODE

    regex = args[0]

    # Keep the opened files so they can be closed once counting is done.
    files = [open(name) for name in args[1:]]
    if files:
        input = itertools.chain(*files)
    else:
        input = sys.stdin

    try:
        counts = regex_counter(regex, input, flags=flags)
    finally:
        for f in files:
            f.close()

    sum_ = 0
    for k, v in counts.items():
        if isinstance(k, tuple):
            # An optional group that took no part in the match is None;
            # print it as an empty field instead of crashing in join().
            k = '\t'.join(g or '' for g in k)

        # sys.stdout.write behaves the same on Python 2 and Python 3.
        sys.stdout.write('%d\t%s\n' % (v, k))
        sum_ += v

    sys.stdout.write('%d\n' % sum_)


 # Run main() only when this file is executed as a script, not when imported.
 if __name__ == '__main__':
    main()
	{-# LANGUAGE FlexibleContexts #-}
	module Common
	( Result
	, ResultPair
	, Regex -- re-export
	, match
	, addResult
	, emptyResult
	, showResultPair
	, main_
	) where

	import Control.Applicative ((<$>))
	import Data.Array (elems)
	import System.Environment (getArgs, getProgName)
	import System.IO (hPutStrLn, stderr)
	--import Text.Regex.Posix (makeRegex, Regex, matchAllText)
	--import Text.Regex.TDFA (makeRegex, Regex, matchAllText)
	import Text.Regex.PCRE (makeRegex, Regex, RegexLike, matchAllText)

	import qualified Data.ByteString.Char8 as B
	import qualified Data.HashMap.Strict as M

	type Result = M.HashMap [B.ByteString] Int
	type ResultPair = ([B.ByteString], Int)
	type Matches = [B.ByteString]

	-- | Extract the interesting text from the first match of @r@ in @bs@.
	--
	-- Returns the capture groups when the match has any, the whole matched text
	-- when it has none, and @[]@ when the input does not match at all.
	match :: Regex -> B.ByteString -> [B.ByteString]
	--match :: Regex -> String -> [String]
	--match :: RegexLike Regex s => Regex -> s -> [s]
	match r bs = case elems <$> matchAllText r bs of
	    []           -> []
	    (groups : _) -> case map fst groups of
	        [whole]    -> [whole]
	        (_ : subs) -> subs
	        -- Presumably unreachable (index 0 should hold the whole match), but
	        -- handling it keeps the function total instead of crashing.
	        []         -> []

	-- | Record one more occurrence of the matched texts @ms@ in the table @r@.
	-- An empty list (no match) leaves the table unchanged.
	addResult :: Result -> Matches -> Result
	addResult r ms
	    | null ms   = r
	    | otherwise = M.insertWith (+) ms 1 r

	-- | The table that holds no counts yet; the starting point for 'addResult'.
	emptyResult :: Result
	emptyResult = M.fromList []

	-- | Render one table entry as @count TAB group1 TAB group2 ...@.
	showResultPair :: ResultPair -> B.ByteString
	showResultPair (groups, count) =
	    B.concat [B.pack (show count), tab, B.intercalate tab groups]
	  where
	    tab = B.pack "\t"

	-- | Write every entry of the table to stdout, one rendered pair per line.
	printResult :: Result -> IO ()
	printResult table = mapM_ printPair (M.toList table)
	  where
	    printPair pair = B.putStrLn (showResultPair pair)

	-- | Print a one-line usage message, naming the running program, to stderr.
	printUsage :: IO ()
	printUsage =
	    getProgName >>= \name ->
	        hPutStrLn stderr ("Usage: " ++ name ++ " regex [files]")

	-- | Shared command-line driver.
	--
	-- The first argument is compiled as the regex; the remaining arguments are
	-- the input files, defaulting to @/dev/stdin@ when none are given.  The
	-- supplied counting function @f@ does the work and its result is printed.
	-- With no arguments at all, the usage message is shown instead.
	main_ :: (Regex -> [FilePath] -> IO Result) -> IO ()
	main_ f = getArgs >>= dispatch
	  where
	    dispatch []              = printUsage
	    dispatch (regex : files) =
	        f (makeRegex regex) (inputsFor files) >>= printResult

	    inputsFor []    = ["/dev/stdin"]
	    inputsFor paths = paths
	import Data.Conduit

	import Data.Monoid (mconcat)

	import qualified Data.ByteString.Char8 as B
	import qualified Data.Conduit.Binary as CB
	import qualified Data.Conduit.List as CL

	import Common


	-- | Sink that matches every incoming line against @regex@ and folds the
	-- matches into a count table.
	regc :: Monad m => Regex -> Sink B.ByteString m Result
	regc regex = CL.fold countLine emptyResult
	  where
	    countLine acc line = addResult acc (match regex line)

	-- | Stream all input files, split them into lines and count the matches.
	main :: IO ()
	main = main_ $ \regex files ->
	    let source = mconcat (map CB.sourceFile files)
	    in  runResourceT $ source $= CB.lines $$ regc regex
	package main

	import (
	"bufio"
	"fmt"
	"github.com/glenn-brown/golang-pkg-pcre/src/pkg/pcre"
	"io"
	"os"
	"strings"
	)

	// reCompileError holds the message of a regular expression that failed to
	// compile, so it can be returned as an ordinary error value.
	type reCompileError string

	// Error implements the error interface by returning the stored message.
	func (msg reCompileError) Error() string {
		return string(msg)
	}

	// RegCount compiles regexString and counts its matches over all lines of
	// inputs. Lines are read by a background goroutine and handed to the
	// counter through a buffered channel. It returns a reCompileError when the
	// pattern does not compile.
	func RegCount(regexString string, inputs []io.Reader) (error, map[string]int) {
		regex, compileErr := pcre.Compile(regexString, 0)
		if compileErr != nil {
			return reCompileError(compileErr.Message), nil
		}

		lines := make(chan string, 4096)

		// Producer: feed every input line into the channel, then signal the end.
		go func() {
			fileReader(inputs, lines)
			close(lines)
		}()

		return nil, regexCounter(regex, lines)
	}

	func fileReader(inputs []io.Reader, output chan string) {
	for i := range inputs {
	input := bufio.NewReader(inputs[i])

	for {
	line, err := input.ReadString('\n')

	if err == io.EOF {
	break
	} else if err != nil {
	panic(err)
	}

	line = strings.TrimRight(line, "\r\n")
	output <- line
	}
	}
	}

	// regexCounter consumes lines from input until the channel is closed and
	// counts how often each match occurs. When the match has capture groups the
	// key is the groups joined by "\n"; otherwise the key is the whole match.
	func regexCounter(regex pcre.Regexp, input chan string) map[string]int {
		counts := map[string]int{}

		for line := range input {
			m := regex.MatcherString(line, 0)
			if !m.Matches() {
				continue
			}

			n := m.Groups()
			if n == 0 {
				counts[m.GroupString(0)]++
				continue
			}

			parts := make([]string, 0, n)
			for g := 1; g <= n; g++ {
				parts = append(parts, m.GroupString(g))
			}
			counts[strings.Join(parts, "\n")]++
		}

		return counts
	}

	// main parses the command line (REGEX followed by optional file names),
	// counts the matches over the files, or over stdin when none are given,
	// and prints "count<TAB>groups" per distinct match followed by the total.
	func main() {
		if len(os.Args) == 1 {
			fmt.Fprintln(os.Stderr, "usage: regc REGEX [files ...]")
			os.Exit(1)
		}

		regex := os.Args[1]
		fileNames := os.Args[2:]
		inputs := []io.Reader{}

		if len(fileNames) == 0 {
			inputs = append(inputs, os.Stdin)
		}

		for _, name := range fileNames {
			f, err := os.Open(name)
			if err != nil {
				// Report the unreadable file but keep going with the others.
				fmt.Fprintln(os.Stderr, err)
				continue
			}
			inputs = append(inputs, f)
		}

		// Every named file failed to open: nothing left to scan.
		if len(inputs) == 0 {
			os.Exit(1)
		}

		err, result := RegCount(regex, inputs)
		if err != nil {
			// A bad pattern is a user error, not a programming error: report it
			// on stderr and exit non-zero instead of panicking with a stack trace.
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}

		sum := 0

		for k, v := range result {
			// Groups are joined with "\n" inside the key; show them tab-separated.
			fmt.Printf("%d\t%s\n", v, strings.Replace(k, "\n", "\t", -1))

			sum += v
		}

		fmt.Println(sum)
	}
	{-# LANGUAGE BangPatterns #-}
	import Data.List (foldl')
	import System.IO (openFile, IOMode(ReadMode))

	import qualified Data.ByteString.Lazy.Char8 as BL8

	import qualified Data.ByteString as B
	import qualified Data.ByteString.Internal as BI
	import qualified Data.ByteString.Lazy as BL
	import qualified Data.ByteString.Lazy.Internal as BLI
	import Foreign.ForeignPtr
	import Foreign.Ptr

	import Common


	-- http://stackoverflow.com/questions/7815402/convert-a-lazy-bytestring-to-a-strict-bytestring
	-- | Convert a lazy ByteString to a strict one by concatenating its chunks.
	toStrict :: BL.ByteString -> B.ByteString
	toStrict lazy = B.concat (BL.toChunks lazy)

	-- | Convert a lazy ByteString to a strict one.
	--
	-- An empty input yields 'B.empty' and a single-chunk input returns that
	-- chunk as-is.  Otherwise a buffer of the total length is created and
	-- every chunk is copied into it in order.
	toStrict2 :: BL.ByteString -> B.ByteString
	toStrict2 BLI.Empty = B.empty
	toStrict2 (BLI.Chunk c BLI.Empty) = c
	toStrict2 lb = BI.unsafeCreate len $ go lb
	where
	-- Total number of bytes over all chunks.
	len = BLI.foldlChunks (\l sb -> l + B.length sb) 0 lb

	-- Copy each chunk to the current write position, then advance the
	-- position by the chunk length and continue with the remaining chunks.
	go BLI.Empty _ = return ()
	go (BLI.Chunk (BI.PS fp s l) r) ptr =
	withForeignPtr fp $ \p -> do
	-- The chunk's bytes start at offset s inside its buffer.
	BI.memcpy ptr (p `plusPtr` s) (fromIntegral l)
	go r (ptr `plusPtr` l)

	-- | Match every line against @regex@ and strictly fold the matches into a
	-- count table.
	regc :: Regex -> [B.ByteString] -> Result
	regc regex = foldl' countLine emptyResult
	  where
	    countLine acc line = addResult acc (match regex line)


	-- | Read every input file lazily, split the contents into lines, convert
	-- each line to a strict ByteString and count the regex matches.
	main :: IO ()
	main = main_ $ \regex files -> do
	    contents <- mapM readLazily files
	    let strictLines = [toStrict2 l | content <- contents, l <- BL8.lines content]
	    return (regc regex strictLines)
	  where
	    readLazily path = openFile path ReadMode >>= BL.hGetContents
	#!/usr/bin/node
	/**
	 * Split the data emitted by a readable stream into lines.
	 *
	 * @param {EventEmitter} reader   stream emitting 'data' and 'end' events
	 * @param {function(string)} callback  called once per line, without the "\n"
	 */
	function lineReader(reader, callback) {
	    var buf = "";

	    // Let the stream decode UTF-8 itself so a multi-byte character split
	    // across two chunks is not corrupted by per-chunk string conversion.
	    if (typeof reader.setEncoding === 'function') {
	        reader.setEncoding('utf8');
	    }

	    reader.on('data', function(chunk) {
	        buf += chunk;

	        // Everything before the last "\n" is complete; the remainder is a
	        // partial line that has to wait for more data.
	        var lines = buf.split("\n");
	        buf = lines.pop();

	        for (var i = 0; i < lines.length; i++) {
	            callback(lines[i]);
	        }
	    });

	    // A final line without a trailing newline is still sitting in buf when
	    // the stream ends; deliver it instead of dropping it.
	    reader.on('end', function() {
	        if (buf.length > 0) {
	            callback(buf);
	            buf = "";
	        }
	    });
	}

	/**
	 * Count regex matches over the lines of stdin and print the tally.
	 *
	 * @param {string[]} args  command-line arguments; args[0] is the regex
	 */
	function main(args) {
	    if (args.length === 0) {
	        console.error("usage: regc.js REGEX");
	        process.exit(1);
	    }

	    var regex = new RegExp(args[0]);
	    // Prototype-less table: with a plain {} a matched text such as
	    // "constructor" or "toString" would collide with inherited properties
	    // and be counted wrongly.
	    var result = Object.create(null);

	    // TODO: read from files if specified
	    lineReader(process.stdin, function(line) {
	        var m = line.match(regex);
	        if (!m) {
	            return;
	        }

	        // No capture groups: count the whole match. Otherwise count the
	        // groups, joined by "\n" so they form a single key.
	        var words = (m.length === 1) ? m[0] : m.slice(1).join("\n");

	        result[words] = (result[words] || 0) + 1;
	    });

	    process.stdin.on('end', function() {
	        var sum = 0;

	        for (var k in result) {
	            var v = result[k];
	            console.log(v + "\t" + k.replace(/\n/g, "\t"));

	            sum += v;
	        }

	        console.log(sum);
	    });
	    process.stdin.resume();
	}

	// Entry point: strip the node executable and the script name from argv and
	// hand the remaining command-line arguments to main().
	main(process.argv.slice(2));
	#!/usr/bin/perl

	use warnings;
	use strict;

	# The first argument is the regex; the remaining arguments (or stdin when
	# there are none) are read through <>.
	my $re = shift;
	if (!defined $re) {
	    print STDERR "usage: regc.pl REGEX [files ...]\n";
	    exit 1;
	}

	my %result = ();

	while (my $line = <>) {
	    # Strip the line terminator (LF or CRLF) so it can never become part of a
	    # key, where it would be confused with the "\n" group separator.
	    $line =~ s/\r?\n\z//;

	    if (my @matches = ($line =~ /$re/p)) {
	        # $#+ is the number of capture groups in the last successful match.
	        # Testing it, rather than "defined $1", also handles a regex whose
	        # first group is optional and did not take part in the match.
	        if ($#+ > 0) {
	            # Groups that did not participate are undef; count them as empty.
	            $result{join("\n", map { defined $_ ? $_ : '' } @matches)} += 1;
	        } else {
	            $result{${^MATCH}} += 1;
	        }
	    }
	}

	my $sum = 0;

	for my $k (keys %result) {
	    my $v = $result{$k};
	    # Groups are joined with "\n" inside the key; print them tab-separated.
	    (my $shown = $k) =~ s/\n/\t/g;
	    print $v, "\t", $shown, "\n";

	    $sum += $v;
	}

	print $sum, "\n";
	#!/usr/bin/python
	# vim: fileencoding=utf-8
	'''
	regc.py - regex counter

	指定されたファイルもしくは標準入力から正規表現にマッチする部分を抜きだし、
	その出現回数を表示する。

	正規表現がグルーピングを含む場合はグループ部分だけ、
	そうでなければ全体を抜き出してカウントする。
	'''
	import re

	def regex_counter(regex, input, flags=0):
	    """Count how often each match of ``regex`` occurs in the lines of ``input``.

	    Each line is searched once. The key is the tuple of capture groups when
	    the pattern has groups, otherwise the whole matched string. Returns a
	    dict mapping key to occurrence count.
	    """
	    pattern = re.compile(regex, flags)
	    counts = {}
	    for text in input:
	        found = pattern.search(text)
	        if not found:
	            continue
	        key = found.groups() or found.group()
	        counts[key] = counts.get(key, 0) + 1

	    return counts


	def main():
	    """Command-line entry point.

	    Parses the options and the REGEX argument, counts the matches over the
	    given files (or standard input when none are given), then prints one
	    "count<TAB>groups" line per distinct match followed by the total.
	    """
	    import itertools
	    import sys

	    from optparse import OptionParser

	    parser = OptionParser('Usage: %prog [OPTION]... REGEX [FILE]...\n'
	                          'Search REGEX in each FILE or standard input,\n'
	                          'count occurrences.')
	    parser.add_option('-i', '--ignore-case',
	                      action='store_true', dest='ignore_case',
	                      help='Perform case-insensitive matching.')
	    parser.add_option('-l', '--locale',
	                      action='store_true', dest='locale',
	                      help=r'Make \w, \W, \b, \B, dependent on the current locale.')
	    parser.add_option('-u', '--unicode',
	                      action='store_true', dest='unicode',
	                      help=r'Make \w, \W, \b, \B, dependent on the Unicode locale.')

	    opt, args = parser.parse_args()

	    if not args:
	        parser.error('REGEX is required')

	    # Combine the requested regex flags with a plain bitwise OR.
	    flags = 0
	    if opt.ignore_case:
	        flags = flags | re.IGNORECASE
	    if opt.locale:
	        flags = flags | re.LOCALE
	    if opt.unicode:
	        flags = flags | re.UNICODE

	    regex = args[0]

	    # Keep the opened files so they can be closed once counting is done.
	    files = [open(name) for name in args[1:]]
	    if files:
	        input = itertools.chain(*files)
	    else:
	        input = sys.stdin

	    try:
	        counts = regex_counter(regex, input, flags=flags)
	    finally:
	        for f in files:
	            f.close()

	    sum_ = 0
	    for k, v in counts.items():
	        if isinstance(k, tuple):
	            # An optional group that took no part in the match is None;
	            # print it as an empty field instead of crashing in join().
	            k = '\t'.join(g or '' for g in k)

	        # sys.stdout.write behaves the same on Python 2 and Python 3.
	        sys.stdout.write('%d\t%s\n' % (v, k))
	        sum_ += v

	    sys.stdout.write('%d\n' % sum_)


	# Run main() only when this file is executed as a script, not when imported.
	if __name__ == '__main__':
	main()