|
#!/usr/bin/perl |
|
|
|
use strict; |
|
|
|
use Data::Dumper; |
|
$Data::Dumper::Indent= 1; |
|
|
|
binmode( STDOUT, ':utf8' ); autoflush STDOUT 1; |
|
binmode( STDERR, ':utf8' ); autoflush STDERR 1; |
|
binmode( STDIN, ':utf8' ); |
|
|
|
my $fnwk= new fnwkxx; |
|
$fnwk->parse_csv ('fnwkxx_sense_id_pairs.csv', 'en'); |
|
$fnwk->parse_csv ('fnwkde_sense_id_pairs.csv', 'de'); |
|
$fnwk->parse_csv ('fnwk_acl_2013_gold_standard.csv', 'gs', 1); |
|
|
|
$fnwk->remove_unaligned(); |
|
|
|
get_csv ($fnwk, 'en'); |
|
get_csv ($fnwk, 'de'); |
|
|
|
print_matching ($fnwk); |
|
|
|
if (open (DUMP, '>:utf8', 'wkt.dump')) |
|
{ |
|
print DUMP "fnwk: ", Dumper ($fnwk); |
|
close (DUMP); |
|
} |
|
|
|
exit; |
|
|
|
sub item_info |
|
{ |
|
my ($x, $lang)= @_; |
|
|
|
my $s= $x->{$lang}->{str}; |
|
my $y= $x->{$lang}->{wkt}; |
|
|
|
return (defined ($y)) |
|
? ($s, scalar @$y, $y->[0]->[9]) # NOTE: there might be more than one language item here |
|
: ($s, 0, undef); |
|
} |
|
|
|
sub gs_info |
|
{ |
|
my $x= shift; |
|
|
|
my $y= $x->{gs}; |
|
|
|
return (defined ($y)) |
|
? map { $y->{$_} } qw(class pos lemma), |
|
: (undef, undef, undef); |
|
} |
|
|
|
sub print_matching |
|
{ |
|
my $fnwk= shift; |
|
|
|
my $fn= $fnwk->{fn}; |
|
my $count_dropped= 0; |
|
my $count_matching= 0; |
|
|
|
open (FO1, '>:utf8', 'no_matching_ids.tsv') or die; |
|
open (FO2, '>:utf8', 'matching_ids.tsv') or die; |
|
|
|
my @col_items= qw(fn_item class pos lemma en_id en_cnt en_title de_id de_cnt de_title); |
|
print FO1 join ("\t", @col_items), "\n"; |
|
print FO2 join ("\t", @col_items), "\n"; |
|
|
|
ITEM: foreach my $item (sort { $a <=> $b } keys %$fn) |
|
{ |
|
my $x= $fn->{$item}; |
|
# print "x: ", main::Dumper ($x); |
|
|
|
my ($en_str, $en_wkt_c, $en_wkt_t)= item_info ($x, 'en'); |
|
my ($de_str, $de_wkt_c, $de_wkt_t)= item_info ($x, 'de'); |
|
my @gs= gs_info ($x); |
|
|
|
unless (exists ($x->{en}->{wkt}) && exists ($x->{de}->{wkt})) |
|
{ |
|
$count_dropped++; |
|
print FO1 join ("\t", $item, @gs, |
|
$en_str, $en_wkt_c, $en_wkt_t, |
|
$de_str, $de_wkt_c, $de_wkt_t), "\n"; |
|
|
|
next ITEM; |
|
} |
|
|
|
$count_matching++; |
|
|
|
print FO2 join ("\t", $item, @gs, |
|
$en_str, $en_wkt_c, $en_wkt_t, |
|
$de_str, $de_wkt_c, $de_wkt_t), "\n"; |
|
|
|
# printf ("%6d %-10s %-10s en=(%d)[%s] de=(%d)[%s]\n", |
|
# $item, $x->{en}->{str}, $x->{de}->{str}, |
|
# $en_wkt_c, $en_wkt_t, |
|
# $de_wkt_c, $de_wkt_t, "\n"; |
|
} |
|
|
|
print "NOTE: $count_dropped items dropped, no matching pair found\n"; |
|
print "NOTE: $count_matching matching pairs found\n"; |
|
} |
|
|
|
sub get_csv |
|
{ |
|
my $fnwk= shift; |
|
my $lang= shift; |
|
|
|
my $csv_fnm= join ('/', $lang, 'items.csv'); |
|
|
|
my $fn= $fnwk->{fn}; |
|
|
|
# map wiktionary ids to framenet ids |
|
# NOTE: one wiktionary id can map to several frament ids! |
|
my %lang_wkt_ids= (); |
|
foreach my $fn_id (keys %$fn) |
|
{ |
|
my $wkt_id= $fn->{$fn_id}->{$lang}->{id}; |
|
push (@{$lang_wkt_ids{$wkt_id}}, $fn_id); |
|
} |
|
# print "lang_wkt_ids: ", Dumper(\%lang_wkt_ids); |
|
|
|
=begin comment |
|
|
|
$ tsv --hdr en/items.csv |
|
columns: |
|
0 line |
|
1 pos |
|
2 fo_count |
|
3 fo_pos_beg |
|
4 fo_pos_end |
|
5 id |
|
6 ns |
|
7 rev_id |
|
8 rev_sha1 |
|
9 title |
|
|
|
=end comment |
|
=cut |
|
|
|
open (CSV, '<:utf8', $csv_fnm) or die "can't read $csv_fnm"; |
|
my $count= 0; |
|
while (<CSV>) |
|
{ |
|
chop; |
|
my @d= split (/\t/); |
|
my $wkt_id= $d[5]; |
|
|
|
next unless exists ($lang_wkt_ids{$wkt_id}); |
|
my $fn_ids= $lang_wkt_ids{$wkt_id}; |
|
|
|
# print "INFO: match: ", join (' ', @d), "\n"; |
|
|
|
foreach my $fn_id (@$fn_ids) |
|
{ |
|
push (@{$fn->{$fn_id}->{$lang}->{wkt}}, \@d); |
|
$count++; |
|
} |
|
} |
|
close (CSV); |
|
print "NOTE: matched $count items in $csv_fnm\n"; |
|
} |
|
|
|
|
|
package fnwkxx; |
|
|
|
sub new |
|
{ |
|
bless { fn => {} }, shift; |
|
} |
|
|
|
sub parse_csv |
|
{ |
|
my $self= shift; |
|
my $fnm= shift; |
|
my $lang= shift; |
|
my $gs_flag= shift; |
|
|
|
open (FI, '<:utf8', $fnm) or die "can't open $fnm"; |
|
my $fn= $self->{fn}; |
|
my $count= 0; |
|
while (<FI>) |
|
{ |
|
chop; |
|
next if (m/^#/); |
|
my ($fn_id, $wk_id_str, @rest)= split (','); |
|
my ($wk_id, $wk_p1, $wk_p2)= split (':', $wk_id_str); |
|
|
|
my $o= { id => $wk_id, str => $wk_id_str }; |
|
($o->{class}, $o->{pos}, $o->{lemma})= @rest if ($gs_flag); |
|
$fn->{$fn_id}->{$lang}= $o; |
|
|
|
$count++; |
|
} |
|
close (FI); |
|
|
|
print "NOTE: read $count items from $fnm\n"; |
|
} |
|
|
|
sub remove_unaligned |
|
{ |
|
my $self= shift; |
|
|
|
my $fn= $self->{fn}; |
|
my $count_unaligned= 0; |
|
my $count_aligned= 0; |
|
foreach my $fn_id (keys %$fn) |
|
{ |
|
my $x= $fn->{$fn_id}; |
|
unless (exists ($x->{en}) && exists ($x->{de})) |
|
{ |
|
delete ($fn->{$fn_id}); |
|
$count_unaligned++; |
|
} |
|
else |
|
{ |
|
$count_aligned++; |
|
} |
|
} |
|
|
|
print "NOTE: removed $count_unaligned unaligned(?) items\n"; |
|
print "NOTE: keeping $count_aligned aligned(?) items\n"; |
|
} |
|
|