Created
March 4, 2015 03:39
-
-
Save fuba/135f3459e9104d40f4c4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package NLPDocument::MeCabNode; | |
use strict; | |
use warnings; | |
use utf8; | |
use Encode; | |
use YAML::Syck; | |
use Lingua::JA::Numbers; | |
use base qw( Class::Accessor::Fast ); | |
# Generate plain accessor methods for the fields below through the
# mk_accessors class method inherited from the base class.
# `surface` and `chars` are not generated here; this package defines them
# as explicit subs.
__PACKAGE__->mk_accessors(
    qw(
        id
        pos
        pos_detail
        form
        type
        fund
        kana
        pron
        is_number
        statistics
        line
        length
    )
);
# Constructor. $opt is a hash reference; the first of these keys that holds
# a true value selects how the object is built:
#   cache      - an existing hash reference, blessed as-is (not copied)
#   mecab_node - an object providing feature(), surface() and id(); feature
#                and surface are decoded from UTF-8 and parsed into fields
#   morp       - an existing hash reference, blessed as-is (not copied)
# With none of them an empty object is returned.
#
# Extra key used by the mecab_node branch:
#   sentence_id - prefix of the generated id ("<sentence_id>-<node id>");
#                 defaults to 0, and the default is written back into $opt.
#
# Returns the blessed object, or nothing (bare return) when the node's part
# of speech is 'BOS/EOS'.
sub new {
    my ($pkg, $opt) = @_;

    my $hash = {};
    if ($opt->{cache}) {
        $hash = $opt->{cache};
    }
    elsif ($opt->{mecab_node}) {
        my $p = $opt->{mecab_node};

        my $feature = decode('utf-8', $p->feature);
        $hash = parse_mecab_feature($feature);
        $feature = '' unless (defined $feature);

        my $surface = decode('utf-8', $p->surface);
        $surface = '' unless (defined $surface);

        # Katakana surface with the middle dot removed; used as a stand-in
        # pronunciation when the feature string carries none.
        my $pseudo_pron_for_empty_pron = $surface;
        $pseudo_pron_for_empty_pron =~ s/・//g;

        $hash->{line} = "$surface\t$feature\n";

        # An empty feature string yields no pos at all, so test definedness
        # before comparing.
        return if (defined $hash->{pos} && $hash->{pos} eq 'BOS/EOS');

        # Pronunciation fallbacks: spell out digit-only surfaces in
        # katakana, then reuse an all-katakana surface as-is.
        if (!$hash->{pron} && $surface =~ /^\d+$/) {
            $hash->{pron} = num2ja($surface, {style => 'katakana'});
        }
        if (!$hash->{pron} && $pseudo_pron_for_empty_pron =~ /^\p{InKatakana}+$/) {
            $hash->{pron} = $pseudo_pron_for_empty_pron;
        }

        $opt->{sentence_id} = (defined $opt->{sentence_id}) ? $opt->{sentence_id} : 0;

        # Use the defaulted node id; the raw $p->id may be undef.
        my $pid = (defined $p->id) ? $p->id : 0;
        $hash->{id} = $opt->{sentence_id}.'-'.$pid;

        $hash->{surface} = $surface;

        # Always leave an array reference in pos_detail, even when the
        # feature string was empty.
        $hash->{pos_detail} ||= [];
        my $first_detail = $hash->{pos_detail}->[0];
        if (defined $first_detail && $first_detail eq '数') {
            # Numbers are flagged through is_number and lose their first
            # sub-category; the surface itself is kept unchanged.
            $hash->{pos_detail}->[0] = '';
            $hash->{is_number} = 1;
        }

        $hash->{length} = length($hash->{surface});
        my @chars = ($hash->{surface} =~ m/(.)/g);
        $hash->{chars} = \@chars;
    }
    elsif ($opt->{morp}) {
        $hash = $opt->{morp};
    }

    return bless $hash, $pkg;
}
# Read-only accessor for the surface string.
# Options (passed as a key/value list):
#   normalize_num - when true and the node is flagged as a number, the
#                   placeholder '<num>' is returned instead of the surface.
sub surface {
    my ($self, %opt) = @_;

    my $use_placeholder = $opt{normalize_num} && $self->is_number;

    return $use_placeholder ? '<num>' : $self->{surface};
}
# Returns the surface as a list of single characters (the element count in
# scalar context).
# Uses the stored `chars` array when present. Objects built from a `cache`
# or `morp` hash may lack it, so the list is then derived from the stored
# surface with the same pattern the constructor uses; a missing surface
# yields an empty list.
sub chars {
    my $self = shift;

    my $stored = $self->{chars};
    return @{$stored} if (defined $stored);

    my $surface = $self->{surface};
    my @derived = (defined $surface) ? ($surface =~ m/(.)/g) : ();
    return @derived;
}
# Split a comma-separated MeCab feature string into a hash reference.
# A false argument (undef, '' or '0') yields an empty hash reference.
#
# Field positions in the feature string:
#   0 part of speech
#   1 part-of-speech subcategory 1
#   2 part-of-speech subcategory 2
#   3 part-of-speech subcategory 3
#   4 conjugation type
#   5 conjugation form
#   6 base form
#   7 reading
#   8 pronunciation
#
# Fields missing from a short feature string come back as undef.
# NOTE(review): key `form` receives field 4 (conjugation type) and key
# `type` receives field 5 (conjugation form); the names look swapped but
# are kept because callers read these keys - confirm before renaming.
sub parse_mecab_feature {
    my ($feature_csv) = @_;

    if (!$feature_csv) {
        return {};
    }

    my @fields = split /\,/, $feature_csv;

    my %parsed;
    @parsed{qw(pos form type fund kana pron)} = @fields[0, 4 .. 8];
    $parsed{pos_detail} = [ @fields[1 .. 3] ];

    return \%parsed;
}
# Decide whether this node counts as a noun. Returns 1 or 0.
#
# Base rule: the part of speech is '名詞' and the surface contains no
# non-word character (\W); a node flagged by is_number always passes the
# base rule.
#
# $option is an optional hash reference. When it is given with a false
# `is_head`, the base result is returned as-is. Otherwise (no $option, or a
# true `is_head`) the node is also rejected if any of its part-of-speech
# details is on the avoid list below.
sub is_noun {
    my ($self, $option) = @_;

    my $pos     = $self->pos;
    my $surface = $self->surface;

    my $bool = 0;
    $bool = 1 if (defined $pos && $pos eq '名詞');
    $bool = 0 if (defined $surface && $surface =~ /\W/);
    $bool = 1 if ($self->is_number);

    # Non-head position: the avoid list does not apply.
    return $bool if ($option && !$option->{is_head});

    # Already rejected; nothing the avoid list could change.
    return $bool unless ($bool);

    for my $avoid (qw/
        副詞可能 非自立 接尾 接続詞的 代名詞 ナイ形容詞語幹
        動詞非自立的 特殊
    /) {
        return 0 if ($self->in_pos_detail($avoid));
    }

    return $bool;
}
# Report whether $kw equals one of this node's part-of-speech details.
# Returns 1 on a match, nothing (bare return) otherwise.
# A missing pos_detail list, undefined entries in it (short feature
# strings leave them undef) and an undefined $kw never match and raise no
# warning.
sub in_pos_detail {
    my ($self, $kw) = @_;

    return unless (defined $kw);

    my $details = $self->pos_detail;
    return unless ($details);

    return 1 if (grep { defined $_ && $kw eq $_ } @{$details});
    return;
}
1; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment