nemupm · August 29, 2015 14:06
diff --git a/make_csv_for_mecab.pl b/make_csv_for_mecab.pl
 #!/usr/bin/perl

 use strict;
 use warnings;
 use utf8;
 use encoding 'utf8';
 use MeCab;

 # Build a MeCab user-dictionary CSV ("wikipedia.csv") from a list of
 # Wikipedia article titles, one title per input line.
 #
 # All command-line arguments are joined with spaces and handed to MeCab
 # unchanged as its option string.
 my $model = MeCab::Model->new(join " ", @ARGV);
 my $c = $model->createTagger();

 my $file1 = "jawiki-latest-all-titles-in-ns0_formatted";
 my $file2 = "wikipedia.csv";

 # Checked three-argument opens: a missing title list must abort the run
 # instead of silently producing an empty dictionary.
 # NOTE(review): no encoding layer is applied to the input handle, so titles
 # are presumably handled as raw bytes here (the `encoding` pragma may affect
 # this) -- confirm that length() and the single-kana filter behave as intended.
 open(my $in, '<', $file1) or die "Cannot open $file1 for reading: $!";
 open(my $out, '>', $file2) or die "Cannot open $file2 for writing: $!";
 binmode $out, ":utf8";

 # Read one title at a time rather than slurping the whole list into memory.
 while (my $title = <$in>) {
   chomp($title);

   ## pass needless words
   next if $title =~ /_\(.*\)$/; # ex. "apple_(company)"
   next if $title =~ /^[a-zA-Z]$/; # a single ASCII letter
   next if $title =~ /^[0-9|!-\/:-@\[-`\{-~]*$/; # empty, or only digits / ASCII marks
   next if $title =~ /^[ぁ-んァ-ヴ・ー]$/; # one hiragana or katakana
   next if $title =~ /(曖昧さの回避)/;

   ## convert underbar to space
   # Drop a leading/trailing underscore and every comma (the comma is the
   # CSV field separator), then turn the remaining underscores into spaces.
   $title =~ s/^_|_$|,//g;
   $title =~ s/_/ /g;

   ## remove terms already existing
   my $m = $c->parseToNode($title);
   $m = $m->{next}; # 1st word (the node after the beginning-of-sentence node)
   # Skip the title only if it is parsed as exactly one word that is already
   # known: first node has stat 0 (normal node) and the node right after it
   # has stat 3 (end of sentence).
   next if $m->{stat} == 0 && $m->{next}->{stat} == 3;

   # 4th CSV field is the word cost: -400 * length^1.5, floored at -36000.
   # "**" is Perl's exponentiation operator; "^" would be a bitwise XOR.
   print {$out} "$title,0,0,"
     . max(-36000, -400 * (length($title) ** 1.5))
     . ",名詞,固有名詞,*,*,*,*,$title,*,*,wikipedia_word,\n";
 }

 close($in);
 # Buffered write errors only surface at close time, so check it.
 close($out) or die "Cannot close $file2: $!";

 # Return the larger of the two numeric arguments, truncated to an integer
 # by int(). When both compare equal, the second argument is the one used.
 sub max {
   my ($floor, $candidate) = @_;
   return int($floor <= $candidate ? $candidate : $floor);
 }
	#!/usr/bin/perl

	use strict;
	use warnings;
	use utf8;
	use encoding 'utf8';
	use MeCab;

	# Build a MeCab user-dictionary CSV ("wikipedia.csv") from a list of
	# Wikipedia article titles, one title per input line.
	#
	# All command-line arguments are joined with spaces and handed to MeCab
	# unchanged as its option string.
	my $model = MeCab::Model->new(join " ", @ARGV);
	my $c = $model->createTagger();

	my $file1 = "jawiki-latest-all-titles-in-ns0_formatted";
	my $file2 = "wikipedia.csv";

	# Checked three-argument opens: a missing title list must abort the run
	# instead of silently producing an empty dictionary.
	# NOTE(review): no encoding layer is applied to the input handle, so titles
	# are presumably handled as raw bytes here (the `encoding` pragma may affect
	# this) -- confirm that length() and the single-kana filter behave as intended.
	open(my $in, '<', $file1) or die "Cannot open $file1 for reading: $!";
	open(my $out, '>', $file2) or die "Cannot open $file2 for writing: $!";
	binmode $out, ":utf8";

	# Read one title at a time rather than slurping the whole list into memory.
	while (my $title = <$in>) {
		chomp($title);

		## pass needless words
		next if $title =~ /_\(.*\)$/; # ex. "apple_(company)"
		next if $title =~ /^[a-zA-Z]$/; # a single ASCII letter
		next if $title =~ /^[0-9|!-\/:-@\[-`\{-~]*$/; # empty, or only digits / ASCII marks
		next if $title =~ /^[ぁ-んァ-ヴ・ー]$/; # one hiragana or katakana
		next if $title =~ /(曖昧さの回避)/;

		## convert underbar to space
		# Drop a leading/trailing underscore and every comma (the comma is the
		# CSV field separator), then turn the remaining underscores into spaces.
		# The "|" must be unescaped here so that it acts as alternation.
		$title =~ s/^_|_$|,//g;
		$title =~ s/_/ /g;

		## remove terms already existing
		my $m = $c->parseToNode($title);
		$m = $m->{next}; # 1st word (the node after the beginning-of-sentence node)
		# Skip the title only if it is parsed as exactly one word that is already
		# known: first node has stat 0 (normal node) and the node right after it
		# has stat 3 (end of sentence).
		next if $m->{stat} == 0 && $m->{next}->{stat} == 3;

		# 4th CSV field is the word cost: -400 * length^1.5, floored at -36000.
		# "**" is Perl's exponentiation operator; "^" would be a bitwise XOR.
		# Unspecified feature fields are written as "*" placeholders.
		print {$out} "$title,0,0,"
			. max(-36000, -400 * (length($title) ** 1.5))
			. ",名詞,固有名詞,*,*,*,*,$title,*,*,wikipedia_word,\n";
	}

	close($in);
	# Buffered write errors only surface at close time, so check it.
	close($out) or die "Cannot close $file2: $!";

	# Pick the greater of two numbers and return it with its fractional part
	# dropped by int(). On a tie the second argument is the one returned.
	sub max {
		my ($first, $second) = @_;
		return int($second) if $first <= $second;
		return int($first);
	}