Last active
August 29, 2015 14:06
-
-
Save nemupm/24ba835a4da10f81d774 to your computer and use it in GitHub Desktop.
Make Wikipedia dictionary for MeCab - make_csv_for_mecab
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use utf8; | |
use encoding 'utf8'; | |
use MeCab; | |
# Build a MeCab user-dictionary CSV from a list of Wikipedia article titles
# (one title per input line).  Command-line arguments are joined with spaces
# and handed to MeCab::Model as its option string.
my $model = MeCab::Model->new(join ' ', @ARGV);
my $c     = $model->createTagger();

my $file1 = "jawiki-latest-all-titles-in-ns0_formatted";
my $file2 = "wikipedia.csv";

# Decode titles on input so that the character classes below and length()
# operate on characters, and so that the :utf8 layer on the output handle
# never re-encodes bytes that are already UTF-8.
open(my $in, '<:encoding(UTF-8)', $file1)
    or die "Cannot open $file1 for reading: $!";
open(my $out, '>:utf8', $file2)
    or die "Cannot open $file2 for writing: $!";

# Read line by line instead of slurping the whole title list into memory.
TITLE:
while (my $title = <$in>) {
    chomp $title;

    ## skip titles that are useless as dictionary words
    next TITLE if $title =~ /_\(.*\)$/;                  # disambiguated title, ex. "apple_(company)"
    next TITLE if $title =~ /^[a-zA-Z]$/;                # a single ASCII letter
    next TITLE if $title =~ /^[0-9|!-\/:-@\[-`\{-~]*$/;  # only ASCII digits/marks (also an empty line)
    next TITLE if $title =~ /^[ぁ-んァ-ヴ・ー]$/;        # a single hiragana or katakana
    next TITLE if $title =~ /(曖昧さの回避)/;

    ## strip leading/trailing underscores and every comma (a comma would
    ## break the CSV row), then turn the remaining underscores into spaces
    $title =~ s/^_|_$|,//g;
    $title =~ s/_/ /g;

    ## drop terms the current dictionary already knows
    # Hand MeCab the same UTF-8 bytes that were read from the file; this
    # presumes the MeCab dictionary in use is UTF-8 -- TODO confirm.
    my $surface = $title;
    utf8::encode($surface);
    my $m = $c->parseToNode($surface);
    $m = $m->{next};    # first node after the beginning-of-sentence node

    # Skip the title only when the whole string was parsed as exactly one
    # known word: stat 0 is a normal (in-dictionary) node and stat 3 is the
    # end-of-sentence node that must follow it immediately.
    next TITLE if $m->{stat} == 0 && $m->{next}->{stat} == 3;

    # The 4th CSV field is the word cost: longer titles get a lower (more
    # preferred) cost, floored at -36000.  "**" is exponentiation; "^" would
    # be bitwise XOR and silently produce the wrong cost.
    my $cost = max(-36000, -400 * (length($title) ** 1.5));

    print {$out}
        "$title,0,0,$cost,名詞,固有名詞,*,*,*,*,$title,*,*,wikipedia_word,\n";
}

close($in);
# Buffered write errors only surface at close time, so check it.
close($out) or die "Cannot close $file2: $!";
# Return the larger of two numbers, truncated toward zero by int().
#
#   $comp - first number
#   $val  - second number; chosen when it is greater than or equal to $comp
#
# Returns an integer, e.g. max(-36000, -1131.37) yields -1131.
sub max {
    my ($comp, $val) = @_;
    my $larger = $comp <= $val ? $val : $comp;
    return int($larger);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment