Skip to content

Instantly share code, notes, and snippets.

@fukata
Created July 29, 2012 14:50
Show Gist options
  • Save fukata/3199348 to your computer and use it in GitHub Desktop.
Save fukata/3199348 to your computer and use it in GitHub Desktop.
Extract nouns with MeCab, for use as a fluentd out_exec_filter command.
#!/usr/bin/env perl
use strict;
use warnings;
use utf8;
use Encode;
use Text::MeCab;
use JSON::XS;
use Data::MessagePack;
# Fluentd out_exec_filter command.
#
# Input : one JSON object per line on STDIN; only its "query" field is used.
# Output: one MessagePack map {keyword => <token>} on STDOUT for every token
#         of the query that MeCab classifies as a noun.
# Lines that are not JSON objects, or that carry no usable query, are skipped.
my $mecab = Text::MeCab->new;
my $mp    = Data::MessagePack->new;

# When run by Fluentd, STDOUT is a pipe and Perl would block-buffer it,
# holding records back until the buffer fills. Flush after every print and
# keep the binary MessagePack stream free of I/O-layer translation.
binmode STDOUT;
$| = 1;

RECORD:
while (my $json = <STDIN>) {
    # decode_json croaks on malformed input and may return a non-hash
    # (e.g. a JSON array); dereferencing that as a hash would kill the filter.
    my $record = eval { decode_json($json) };
    next RECORD unless ref $record eq 'HASH';

    my $query = $record->{query};
    next RECORD if !defined $query or ref $query;

    # Trim surrounding whitespace; anchored with \A/\z so that a query
    # containing an embedded newline is trimmed as well.
    $query =~ s/\A\s+//;
    $query =~ s/\s+\z//;
    next RECORD unless length $query;

    # decode_json yields character strings; hand MeCab UTF-8 octets, which
    # mirrors the decode('utf8', ...) applied to what MeCab returns below.
    my $node = $mecab->parse( encode('utf8', $query) );
    while ($node) {
        my $keyword = decode('utf8', $node->surface) || '';

        # Skip empty surfaces and a token that is simply the whole query.
        if ($keyword and $query ne $keyword) {
            my @features = split ',', decode('utf8', $node->feature);

            # A token counts as a noun when one of its feature fields is
            # exactly '名詞'. The test must be a block: grep('名詞', ...)
            # would evaluate a constant true string and accept every token.
            if ( $keyword ne '名詞' and grep { $_ eq '名詞' } @features ) {
                print $mp->pack({keyword => $keyword});
            }
        }
        $node = $node->next;
    }
}
# Input: follow the application log and turn each matching line into an
# event tagged hoge.queries.
<source>
type tail
# The named captures (query, page, slag, time_max, sort, order) define the
# fields extracted from a line; the exec_filter script only reads "query".
format /^q\:(?<query>.*), page\:(?<page>[0-9]+), slag\:(?<slag>[^ ]+), time\:(?<time_max>[0-9]*), sort\:(?<sort>[^ ]+), order\:(?<order>[^ ]+)$/
path /var/log/hoge/foo.log
tag hoge.queries
# File in which the tail input keeps its read position for the log above.
pos_file /var/log/td-agent/foo.queries.pos
</source>
# Filter: pipe hoge.queries events through an external command and re-emit
# whatever it prints under the tag hoge.queries.parsed.
<match hoge.queries>
type exec_filter
# Presumably the MeCab noun-extraction Perl script; verify the deployed path.
command /hoge/foo/parse.pl
# Events are written to the command as JSON and read back as MessagePack,
# so the command must read JSON lines and print MessagePack records.
in_format json
out_format msgpack
tag hoge.queries.parsed
flush_interval 1s
</match>
# Output: duplicate every hoge.queries.parsed event to both stores below.
<match hoge.queries.parsed>
type copy
# Store 1: tdlog output, database "hoge", table "queries", flushed every 300s
# from a file buffer.
<store>
type tdlog
# NOTE(review): "${API KEY}" looks like a placeholder - supply the real key.
apikey ${API KEY}
auto_create_table false
database hoge
table queries
flush_interval 300s
use_ssl true
buffer_type file
buffer_path /var/log/td-agent/buffer/hoge_queries_td
</store>
# Store 2: MongoDB at 127.0.0.1:27017, database "hoge", collection "queries",
# flushed every 10s from its own file buffer.
<store>
type mongo
database hoge
collection queries
host 127.0.0.1
port 27017
flush_interval 10s
buffer_type file
buffer_path /var/log/td-agent/buffer/hoge_queries_mongo
</store>
</match>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment