gonter · August 8, 2017 06:52
diff --git a/README.textile b/README.textile
diff --git a/README.txt b/README.txt
 This archive contains additional material to our paper:

 Silvana Hartmann and Iryna Gurevych: FrameNet on the Way to Babel: Creating a Bilingual FrameNet using Wiktionary as Interlingual Connection. In: Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics, August 2013, Sofia, Bulgaria.

 Please cite this paper if you use our datasets in your work.

 Source: http://www.ukp.tu-darmstadt.de/data/fnwkde
 ---------------------------------------------------------------

 The following files are available from our homepage:

 1) fnwk_acl_2013_uby

 Contains the FrameNet - English Wiktionary alignment (fnwkxx), and the FrameNet - German Wiktionary alignment (fnwkde) as detailed in the paper.

 Data format:
 - Data are provided in .sql format compatible with UBY API release 0.1.1 (http://code.google.com/p/uby)
 - The database contains the UBY version of FrameNet 1.5, the English Wiktionary dump from 2011/04/02 and the German Wiktionary dump from 2011/04/06
 - The actual alignment is modeled in the SenseAxis table. The following alignments are contained:  
  - FrameNet senses to English Wiktionary senses: LexicalResourceId="Uby_Alignments_fnwkxx_hartmann_2013" 
  - FrameNet senses to German Wiktionary senses: LexicalResourceId="Uby_Alignments_fnwkde_hartmann_2013"
  - English Wiktionary Senses to German Wiktionary senses (from Meyer & Gurevych (2012), see: http://www.ukp.tu-darmstadt.de/data/lexical-resources/wordnet-wiktionary-alignment/): LexicalResourceId="Uby_Alignments_wkten_wktde_meyer_2012"
  - English Wiktionary Senses to their synonyms in Wiktionary (from Meyer & Gurevych (2012), see: http://www.ukp.tu-darmstadt.de/data/lexical-resources/wordnet-wiktionary-alignment/): LexicalResourceId="Uby_Alignments_wkten_synonyms_meyer_2012"

 2) fnwk_acl_2013_alignment

 Only the sense id pairs from fnwkxx and fnwkde (see 1)) 

 Data format:
 - .csv files, columns containing the following entries
  - FN 1.5 lexical unit id
  - English Wiktionary sense id as in 1) for fnwkxx_sense_id_pairs.csv, German Wiktionary sense id as in 1) for fnwkde_sense_id_pairs.csv

 3) fnwk_acl_2013_gold_standard

 The manually annotated dataset used for the intrinsic evaluation of our automatic alignment. 

 Data format: 
 - .csv file, columns containing the following entries:
  - FrameNet 1.5 lexical unit id
  - Wiktionary sense id of the English Wiktionary dump from 2011/04/02 parsed with JWKTL 15.2
  - Classification: 1="aligned", 0="not aligned"
  - Part-of-speech abbreviation (as in FrameNet)
  - lemma 

 4) fnwkde_alignment_example.pdf

 Example illustrating the alignment 

 5) fnwkxx_statistics.pdf

 Overview table on Wiktionary translations available for FrameNet senses in fnwkxx (see 1))


 ---------------------------------------------------------------

 Licenses

 This dataset is available under the Creative Commons Attribution/Share-Alike License (CC-BY-SA) in accordance with the resources used in the creation of the dataset. (See http://creativecommons.org/licenses/by-sa/3.0/ for details.)

 The Wiktionary dataset is available under the Creative Commons Attribution/Share-Alike License (CC-BY-SA). See http://creativecommons.org/licenses/by-sa/3.0/ and http://www.wiktionary.org/ for details.

 The FrameNet dataset is available under the Creative Commons Attribution-Only License (CC-BY). See http://creativecommons.org/licenses/by/3.0/ and http://framenet.icsi.berkeley.edu for details.

 The dataset linking the German and English Wiktionary was kindly provided by Christian M. Meyer. See: http://www.ukp.tu-darmstadt.de/data/lexical-resources/wordnet-wiktionary-alignment/.

 ---------------------------------------------------------------

 Contact 

 If you have any questions, please contact the main author Silvana Hartmann (www.ukp.tu-darmstadt.de/people/doctoral-researchers/silvana-hartmann)
diff --git a/matching_ids.tsv b/matching_ids.tsv
diff --git a/no_matching_ids.tsv b/no_matching_ids.tsv
diff --git a/t1.pl b/t1.pl
 #!/usr/bin/perl

 use strict;

 use Data::Dumper;
 $Data::Dumper::Indent= 1;

 use IO::Handle; # provides the autoflush() method used on the standard handles

 # All standard streams carry UTF-8 text; STDOUT and STDERR are unbuffered
 # so that progress notes and error messages appear in the order written.
 binmode (STDOUT, ':utf8'); STDOUT->autoflush (1);
 binmode (STDERR, ':utf8'); STDERR->autoflush (1);
 binmode (STDIN,  ':utf8');

 # Read the two alignment tables and the gold standard into one structure
 # keyed by the first CSV column; the third argument marks the gold standard,
 # whose rows carry the extra columns class, pos and lemma.
 my $fnwk= fnwkxx->new;
 $fnwk->parse_csv ('fnwkxx_sense_id_pairs.csv', 'en');
 $fnwk->parse_csv ('fnwkde_sense_id_pairs.csv', 'de');
 $fnwk->parse_csv ('fnwk_acl_2013_gold_standard.csv', 'gs', 1);

 # Keep only entries that have both an 'en' and a 'de' record.
 $fnwk->remove_unaligned();

 # Attach the rows of en/items.csv and de/items.csv to the kept entries.
 get_csv ($fnwk, 'en');
 get_csv ($fnwk, 'de');

 # Write matching_ids.tsv and no_matching_ids.tsv.
 print_matching ($fnwk);

 # The dump is a debugging aid only: failing to write it is reported
 # but does not abort the run.
 if (open (my $dump, '>:utf8', 'wkt.dump'))
 {
  print {$dump} "fnwk: ", Dumper ($fnwk);
  close ($dump);
 }
 else
 {
  warn "can't write wkt.dump: $!\n";
 }

 exit;

 # Summarise one language record of an entry.
 #
 # Arguments:
 #   $x    - hash ref holding the per-language records of one entry
 #   $lang - key of the record to inspect (callers pass 'en' or 'de')
 #
 # Returns a three element list:
 #   - the 'str' field of the record,
 #   - the number of rows stored under 'wkt' (0 when there are none),
 #   - column 9 of the first 'wkt' row (undef when there are none).
 # NOTE: only the first row is reported, even if several are attached.
 sub item_info
 {
  my ($entry, $lang_key)= @_;

  my $sense_str= $entry->{$lang_key}->{str};
  my $rows=      $entry->{$lang_key}->{wkt};

  unless (defined ($rows))
  {
    return ($sense_str, 0, undef);
  }

  return ($sense_str, scalar @$rows, $rows->[0]->[9]);
 }

 # Fetch the gold-standard annotation of an entry.
 #
 # Argument:
 #   $x - hash ref of one entry; the annotation is read from its 'gs' record
 #
 # Returns the list (class, pos, lemma) taken from that record, or three
 # undef values when the entry has no 'gs' record.
 sub gs_info
 {
  my ($entry)= @_;

  my $gold= $entry->{gs};
  return (undef, undef, undef) unless (defined ($gold));

  my @fields= qw(class pos lemma);
  return map { $gold->{$_} } @fields;
 }

 # Write one report row per entry into two tab separated files.
 #
 # Argument:
 #   $fnwk - object (hash ref) whose 'fn' member maps entry ids to records
 #
 # Entries are visited in ascending numeric order of their id.  An entry
 # whose 'en' and 'de' records both carry a 'wkt' list goes to
 # matching_ids.tsv; every other entry goes to no_matching_ids.tsv.
 # Both files start with the same header line.  Undefined fields are
 # written as empty columns.
 #
 # Dies if either file cannot be opened or completely written.
 # Prints the two counters to STDOUT when done.
 sub print_matching
 {
  my $fnwk= shift;

  my $fn= $fnwk->{fn};
  my $count_dropped= 0;
  my $count_matching= 0;

  my $fnm_dropped=  'no_matching_ids.tsv';
  my $fnm_matching= 'matching_ids.tsv';

  open (my $fo_dropped,  '>:utf8', $fnm_dropped)  or die "can't write $fnm_dropped: $!";
  open (my $fo_matching, '>:utf8', $fnm_matching) or die "can't write $fnm_matching: $!";

  my @col_items= qw(fn_item class pos lemma en_id en_cnt en_title de_id de_cnt de_title);
  print {$fo_dropped}  join ("\t", @col_items), "\n";
  print {$fo_matching} join ("\t", @col_items), "\n";

  ITEM: foreach my $item (sort { $a <=> $b } keys %$fn)
  {
    my $x= $fn->{$item};

    my ($en_str, $en_wkt_c, $en_wkt_t)= item_info ($x, 'en');
    my ($de_str, $de_wkt_c, $de_wkt_t)= item_info ($x, 'de');
    my @gs= gs_info ($x);

    # the row layout is the same for both files, so build it only once
    my $row= join ("\t", $item, @gs,
                   $en_str, $en_wkt_c, $en_wkt_t,
                   $de_str, $de_wkt_c, $de_wkt_t) . "\n";

    unless (exists ($x->{en}->{wkt}) && exists ($x->{de}->{wkt}))
    {
      $count_dropped++;
      print {$fo_dropped} $row;

      next ITEM;
    }

    $count_matching++;
    print {$fo_matching} $row;
  }

  # buffered write errors only show up when the handle is closed
  close ($fo_dropped)  or die "error writing $fnm_dropped: $!";
  close ($fo_matching) or die "error writing $fnm_matching: $!";

  print "NOTE: $count_dropped items dropped, no matching pair found\n";
  print "NOTE: $count_matching matching pairs found\n";
 }

 # Attach rows of "<lang>/items.csv" to the entries that refer to them.
 #
 # Arguments:
 #   $fnwk - object (hash ref) whose 'fn' member maps entry ids to records
 #   $lang - language key; also the directory that contains items.csv
 #
 # The file is tab separated.  Every row whose column 5 equals the 'id'
 # of an entry's $lang record is pushed (as an array ref of all columns)
 # onto that record's 'wkt' list.  One row may be attached to several
 # entries; they then share the same array ref.
 #
 # Dies if the file cannot be opened.  Prints the number of attachments
 # to STDOUT when done.
 sub get_csv
 {
  my $fnwk= shift;
  my $lang= shift;

  my $csv_fnm= join ('/', $lang, 'items.csv');

  my $fn= $fnwk->{fn};

  # map wiktionary ids to framenet ids
  # NOTE: one wiktionary id can map to several FrameNet ids!
  my %lang_wkt_ids= ();
  foreach my $fn_id (keys %$fn)
  {
    my $rec= $fn->{$fn_id}->{$lang};
    next unless (defined ($rec) && defined ($rec->{id})); # nothing to look up
    push (@{$lang_wkt_ids{$rec->{id}}}, $fn_id);
  }
  # print "lang_wkt_ids: ", Dumper(\%lang_wkt_ids);

  # Column layout of items.csv, as reported by
  #   $ tsv --hdr en/items.csv
  # columns:
  #    0 line
  #    1 pos
  #    2 fo_count
  #    3 fo_pos_beg
  #    4 fo_pos_end
  #    5 id
  #    6 ns
  #    7 rev_id
  #    8 rev_sha1
  #    9 title

  open (my $csv, '<:utf8', $csv_fnm) or die "can't read $csv_fnm: $!";
  my $count= 0;
  while (my $line= <$csv>)
  {
    # Strip only a real line ending: a final line without newline must
    # keep its last character, and CRLF files must not leave a "\r" behind.
    $line =~ s/\r?\n\z//;

    my @d= split (/\t/, $line);
    my $wkt_id= $d[5];

    # lines too short to have an id column can never match
    next unless (defined ($wkt_id) && exists ($lang_wkt_ids{$wkt_id}));
    my $fn_ids= $lang_wkt_ids{$wkt_id};

    # print "INFO: match: ", join (' ', @d), "\n";

    foreach my $fn_id (@$fn_ids)
    {
      push (@{$fn->{$fn_id}->{$lang}->{wkt}}, \@d);
      $count++;
    }
  }
  close ($csv);
  print "NOTE: matched $count items in $csv_fnm\n";
 }


 package fnwkxx;

 # Container for alignment data.  The object is a hash with one member,
 # 'fn', which maps the first CSV column of every parsed row (the FrameNet
 # id) to a hash of per-source records keyed by the $lang passed to
 # parse_csv().

 # Create an empty container.
 sub new
 {
  my $class= shift;
  return bless { fn => {} }, $class;
 }

 # Read a comma separated file and store one record per row.
 #
 # Arguments:
 #   $fnm     - name of the file to read
 #   $lang    - key under which the record is stored in each entry
 #   $gs_flag - when true, columns 3..5 are stored as class, pos and lemma
 #
 # Each record holds 'str' (the complete second column) and 'id' (the part
 # of the second column before the first ':').  Lines starting with '#'
 # and blank lines are skipped.  A later row with the same first column
 # and $lang replaces the earlier record.
 #
 # Dies if the file cannot be opened.  Prints the number of rows read to
 # STDOUT when done.
 sub parse_csv
 {
  my $self= shift;
  my $fnm=  shift;
  my $lang= shift;
  my $gs_flag= shift;

  open (my $fi, '<:utf8', $fnm) or die "can't open $fnm: $!";
  my $fn= $self->{fn};
  my $count= 0;
  while (my $line= <$fi>)
  {
    # Strip only a real line ending: a final line without newline must
    # keep its last character, and CRLF files must not leave a "\r" behind.
    $line =~ s/\r?\n\z//;
    next if ($line =~ m/^#/);
    next if ($line =~ m/^\s*$/); # a blank line would create an entry with an empty id

    my ($fn_id, $wk_id_str, @rest)= split (',', $line);
    my ($wk_id)= split (':', $wk_id_str);

    my $o= { id => $wk_id, str => $wk_id_str };
    ($o->{class}, $o->{pos}, $o->{lemma})= @rest if ($gs_flag);
    $fn->{$fn_id}->{$lang}= $o;

    $count++;
  }
  close ($fi);

  print "NOTE: read $count items from $fnm\n";
 }

 # Delete every entry that lacks an 'en' or a 'de' record.
 # Prints the number of removed and kept entries to STDOUT.
 sub remove_unaligned
 {
  my $self= shift;

  my $fn= $self->{fn};
  my $count_unaligned= 0;
  my $count_aligned= 0;

  # keys() returns a copy of the key list, so deleting inside the loop is safe
  foreach my $fn_id (keys %$fn)
  {
    my $x= $fn->{$fn_id};
    if (exists ($x->{en}) && exists ($x->{de}))
    {
      $count_aligned++;
    }
    else
    {
      delete ($fn->{$fn_id});
      $count_unaligned++;
    }
  }

  print "NOTE: removed $count_unaligned unaligned(?) items\n";
  print "NOTE: keeping $count_aligned aligned(?) items\n";
 }
	This archive contains additional material to our paper:

	Silvana Hartmann and Iryna Gurevych: FrameNet on the Way to Babel: Creating a Bilingual FrameNet using Wiktionary as Interlingual Connection. In: Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics, August 2013, Sofia, Bulgaria.

	Please cite this paper if you use our datasets in your work.

	Source: http://www.ukp.tu-darmstadt.de/data/fnwkde
	---------------------------------------------------------------

	The following files are available from our homepage:

	1) fnwk_acl_2013_uby

	Contains the FrameNet - English Wiktionary alignment (fnwkxx), and the FrameNet - German Wiktionary alignment (fnwkde) as detailed in the paper.

	Data format:
	- Data are provided in .sql format compatible with UBY API release 0.1.1 (http://code.google.com/p/uby)
	- The database contains the UBY version of FrameNet 1.5, the English Wiktionary dump from 2011/04/02 and the German Wiktionary dump from 2011/04/06
	- The actual alignment is modeled in the SenseAxis table. The following alignments are contained:
	- FrameNet senses to English Wiktionary senses: LexicalResourceId="Uby_Alignments_fnwkxx_hartmann_2013"
	- FrameNet senses to German Wiktionary senses: LexicalResourceId="Uby_Alignments_fnwkde_hartmann_2013"
	- English Wiktionary Senses to German Wiktionary senses (from Meyer & Gurevych (2012), see: http://www.ukp.tu-darmstadt.de/data/lexical-resources/wordnet-wiktionary-alignment/): LexicalResourceId="Uby_Alignments_wkten_wktde_meyer_2012"
	- English Wiktionary Senses to their synonyms in Wiktionary (from Meyer & Gurevych (2012), see: http://www.ukp.tu-darmstadt.de/data/lexical-resources/wordnet-wiktionary-alignment/): LexicalResourceId="Uby_Alignments_wkten_synonyms_meyer_2012"

	2) fnwk_acl_2013_alignment

	Only the sense id pairs from fnwkxx and fnwkde (see 1))

	Data format:
	- .csv files, columns containing the following entries
	- FN 1.5 lexical unit id
	- English Wiktionary sense id as in 1) for fnwkxx_sense_id_pairs.csv, German Wiktionary sense id as in 1) for fnwkde_sense_id_pairs.csv

	3) fnwk_acl_2013_gold_standard

	The manually annotated dataset used for the intrinsic evaluation of our automatic alignment.

	Data format:
	- .csv file, columns containing the following entries:
	- FrameNet 1.5 lexical unit id
	- Wiktionary sense id of the English Wiktionary dump from 2011/04/02 parsed with JWKTL 15.2
	- Classification: 1="aligned", 0="not aligned"
	- Part-of-speech abbreviation (as in FrameNet)
	- lemma

	4) fnwkde_alignment_example.pdf

	Example illustrating the alignment

	5) fnwkxx_statistics.pdf

	Overview table on Wiktionary translations available for FrameNet senses in fnwkxx (see 1))


	---------------------------------------------------------------

	Licenses

	This dataset is available under the Creative Commons Attribution/Share-Alike License (CC-BY-SA) in accordance with the resources used in the creation of the dataset. (See http://creativecommons.org/licenses/by-sa/3.0/ for details.)

	The Wiktionary dataset is available under the Creative Commons Attribution/Share-Alike License (CC-BY-SA). See http://creativecommons.org/licenses/by-sa/3.0/ and http://www.wiktionary.org/ for details.

	The FrameNet dataset is available under the Creative Commons Attribution-Only License (CC-BY). See http://creativecommons.org/licenses/by/3.0/ and http://framenet.icsi.berkeley.edu for details.

	The dataset linking the German and English Wiktionary was kindly provided by Christian M. Meyer. See: http://www.ukp.tu-darmstadt.de/data/lexical-resources/wordnet-wiktionary-alignment/.

	---------------------------------------------------------------

	Contact

	If you have any questions, please contact the main author Silvana Hartmann (www.ukp.tu-darmstadt.de/people/doctoral-researchers/silvana-hartmann)
fn_item	class	pos	lemma	en_id	en_cnt	en_title	de_id	de_cnt	de_title
8803				77499:0:1	1	ecstatic	131743:0:1	0
12555				42325:0:1	1	so far	59740:0:1	0
12558				244457:0:1	1	as yet	59740:0:1	0
	#!/usr/bin/perl

	use strict;

	use Data::Dumper;
	$Data::Dumper::Indent= 1;

	use IO::Handle; # provides the autoflush() method used on the standard handles

	# All standard streams carry UTF-8 text; STDOUT and STDERR are unbuffered
	# so that progress notes and error messages appear in the order written.
	binmode (STDOUT, ':utf8'); STDOUT->autoflush (1);
	binmode (STDERR, ':utf8'); STDERR->autoflush (1);
	binmode (STDIN,  ':utf8');

	# Read the two alignment tables and the gold standard into one structure
	# keyed by the first CSV column; the third argument marks the gold standard,
	# whose rows carry the extra columns class, pos and lemma.
	my $fnwk= fnwkxx->new;
	$fnwk->parse_csv ('fnwkxx_sense_id_pairs.csv', 'en');
	$fnwk->parse_csv ('fnwkde_sense_id_pairs.csv', 'de');
	$fnwk->parse_csv ('fnwk_acl_2013_gold_standard.csv', 'gs', 1);

	# Keep only entries that have both an 'en' and a 'de' record.
	$fnwk->remove_unaligned();

	# Attach the rows of en/items.csv and de/items.csv to the kept entries.
	get_csv ($fnwk, 'en');
	get_csv ($fnwk, 'de');

	# Write matching_ids.tsv and no_matching_ids.tsv.
	print_matching ($fnwk);

	# The dump is a debugging aid only: failing to write it is reported
	# but does not abort the run.
	if (open (my $dump, '>:utf8', 'wkt.dump'))
	{
	  print {$dump} "fnwk: ", Dumper ($fnwk);
	  close ($dump);
	}
	else
	{
	  warn "can't write wkt.dump: $!\n";
	}

	exit;

	# Summarise one language record of an entry.
	#
	# Arguments:
	#   $x    - hash ref holding the per-language records of one entry
	#   $lang - key of the record to inspect (callers pass 'en' or 'de')
	#
	# Returns a three element list:
	#   - the 'str' field of the record,
	#   - the number of rows stored under 'wkt' (0 when there are none),
	#   - column 9 of the first 'wkt' row (undef when there are none).
	# NOTE: only the first row is reported, even if several are attached.
	sub item_info
	{
	  my ($entry, $lang_key)= @_;

	  my $sense_str= $entry->{$lang_key}->{str};
	  my $rows=      $entry->{$lang_key}->{wkt};

	  unless (defined ($rows))
	  {
	    return ($sense_str, 0, undef);
	  }

	  return ($sense_str, scalar @$rows, $rows->[0]->[9]);
	}

	# Fetch the gold-standard annotation of an entry.
	#
	# Argument:
	#   $x - hash ref of one entry; the annotation is read from its 'gs' record
	#
	# Returns the list (class, pos, lemma) taken from that record, or three
	# undef values when the entry has no 'gs' record.
	sub gs_info
	{
	  my ($entry)= @_;

	  my $gold= $entry->{gs};
	  return (undef, undef, undef) unless (defined ($gold));

	  my @fields= qw(class pos lemma);
	  return map { $gold->{$_} } @fields;
	}

	# Write one report row per entry into two tab separated files.
	#
	# Argument:
	#   $fnwk - object (hash ref) whose 'fn' member maps entry ids to records
	#
	# Entries are visited in ascending numeric order of their id.  An entry
	# whose 'en' and 'de' records both carry a 'wkt' list goes to
	# matching_ids.tsv; every other entry goes to no_matching_ids.tsv.
	# Both files start with the same header line.  Undefined fields are
	# written as empty columns.
	#
	# Dies if either file cannot be opened or completely written.
	# Prints the two counters to STDOUT when done.
	sub print_matching
	{
	  my $fnwk= shift;

	  my $fn= $fnwk->{fn};
	  my $count_dropped= 0;
	  my $count_matching= 0;

	  my $fnm_dropped=  'no_matching_ids.tsv';
	  my $fnm_matching= 'matching_ids.tsv';

	  open (my $fo_dropped,  '>:utf8', $fnm_dropped)  or die "can't write $fnm_dropped: $!";
	  open (my $fo_matching, '>:utf8', $fnm_matching) or die "can't write $fnm_matching: $!";

	  my @col_items= qw(fn_item class pos lemma en_id en_cnt en_title de_id de_cnt de_title);
	  print {$fo_dropped}  join ("\t", @col_items), "\n";
	  print {$fo_matching} join ("\t", @col_items), "\n";

	  ITEM: foreach my $item (sort { $a <=> $b } keys %$fn)
	  {
	    my $x= $fn->{$item};

	    my ($en_str, $en_wkt_c, $en_wkt_t)= item_info ($x, 'en');
	    my ($de_str, $de_wkt_c, $de_wkt_t)= item_info ($x, 'de');
	    my @gs= gs_info ($x);

	    # the row layout is the same for both files, so build it only once
	    my $row= join ("\t", $item, @gs,
	                   $en_str, $en_wkt_c, $en_wkt_t,
	                   $de_str, $de_wkt_c, $de_wkt_t) . "\n";

	    unless (exists ($x->{en}->{wkt}) && exists ($x->{de}->{wkt}))
	    {
	      $count_dropped++;
	      print {$fo_dropped} $row;

	      next ITEM;
	    }

	    $count_matching++;
	    print {$fo_matching} $row;
	  }

	  # buffered write errors only show up when the handle is closed
	  close ($fo_dropped)  or die "error writing $fnm_dropped: $!";
	  close ($fo_matching) or die "error writing $fnm_matching: $!";

	  print "NOTE: $count_dropped items dropped, no matching pair found\n";
	  print "NOTE: $count_matching matching pairs found\n";
	}

	# Attach rows of "<lang>/items.csv" to the entries that refer to them.
	#
	# Arguments:
	#   $fnwk - object (hash ref) whose 'fn' member maps entry ids to records
	#   $lang - language key; also the directory that contains items.csv
	#
	# The file is tab separated.  Every row whose column 5 equals the 'id'
	# of an entry's $lang record is pushed (as an array ref of all columns)
	# onto that record's 'wkt' list.  One row may be attached to several
	# entries; they then share the same array ref.
	#
	# Dies if the file cannot be opened.  Prints the number of attachments
	# to STDOUT when done.
	sub get_csv
	{
	  my $fnwk= shift;
	  my $lang= shift;

	  my $csv_fnm= join ('/', $lang, 'items.csv');

	  my $fn= $fnwk->{fn};

	  # map wiktionary ids to framenet ids
	  # NOTE: one wiktionary id can map to several FrameNet ids!
	  my %lang_wkt_ids= ();
	  foreach my $fn_id (keys %$fn)
	  {
	    my $rec= $fn->{$fn_id}->{$lang};
	    next unless (defined ($rec) && defined ($rec->{id})); # nothing to look up
	    push (@{$lang_wkt_ids{$rec->{id}}}, $fn_id);
	  }
	  # print "lang_wkt_ids: ", Dumper(\%lang_wkt_ids);

	  # Column layout of items.csv, as reported by
	  #   $ tsv --hdr en/items.csv
	  # columns:
	  #    0 line
	  #    1 pos
	  #    2 fo_count
	  #    3 fo_pos_beg
	  #    4 fo_pos_end
	  #    5 id
	  #    6 ns
	  #    7 rev_id
	  #    8 rev_sha1
	  #    9 title

	  open (my $csv, '<:utf8', $csv_fnm) or die "can't read $csv_fnm: $!";
	  my $count= 0;
	  while (my $line= <$csv>)
	  {
	    # Strip only a real line ending: a final line without newline must
	    # keep its last character, and CRLF files must not leave a "\r" behind.
	    $line =~ s/\r?\n\z//;

	    my @d= split (/\t/, $line);
	    my $wkt_id= $d[5];

	    # lines too short to have an id column can never match
	    next unless (defined ($wkt_id) && exists ($lang_wkt_ids{$wkt_id}));
	    my $fn_ids= $lang_wkt_ids{$wkt_id};

	    # print "INFO: match: ", join (' ', @d), "\n";

	    foreach my $fn_id (@$fn_ids)
	    {
	      push (@{$fn->{$fn_id}->{$lang}->{wkt}}, \@d);
	      $count++;
	    }
	  }
	  close ($csv);
	  print "NOTE: matched $count items in $csv_fnm\n";
	}


	package fnwkxx;

	# Container for alignment data.  The object is a hash with one member,
	# 'fn', which maps the first CSV column of every parsed row (the FrameNet
	# id) to a hash of per-source records keyed by the $lang passed to
	# parse_csv().

	# Create an empty container.
	sub new
	{
	  my $class= shift;
	  return bless { fn => {} }, $class;
	}

	# Read a comma separated file and store one record per row.
	#
	# Arguments:
	#   $fnm     - name of the file to read
	#   $lang    - key under which the record is stored in each entry
	#   $gs_flag - when true, columns 3..5 are stored as class, pos and lemma
	#
	# Each record holds 'str' (the complete second column) and 'id' (the part
	# of the second column before the first ':').  Lines starting with '#'
	# and blank lines are skipped.  A later row with the same first column
	# and $lang replaces the earlier record.
	#
	# Dies if the file cannot be opened.  Prints the number of rows read to
	# STDOUT when done.
	sub parse_csv
	{
	  my $self= shift;
	  my $fnm=  shift;
	  my $lang= shift;
	  my $gs_flag= shift;

	  open (my $fi, '<:utf8', $fnm) or die "can't open $fnm: $!";
	  my $fn= $self->{fn};
	  my $count= 0;
	  while (my $line= <$fi>)
	  {
	    # Strip only a real line ending: a final line without newline must
	    # keep its last character, and CRLF files must not leave a "\r" behind.
	    $line =~ s/\r?\n\z//;
	    next if ($line =~ m/^#/);
	    next if ($line =~ m/^\s*$/); # a blank line would create an entry with an empty id

	    my ($fn_id, $wk_id_str, @rest)= split (',', $line);
	    my ($wk_id)= split (':', $wk_id_str);

	    my $o= { id => $wk_id, str => $wk_id_str };
	    ($o->{class}, $o->{pos}, $o->{lemma})= @rest if ($gs_flag);
	    $fn->{$fn_id}->{$lang}= $o;

	    $count++;
	  }
	  close ($fi);

	  print "NOTE: read $count items from $fnm\n";
	}

	# Delete every entry that lacks an 'en' or a 'de' record.
	# Prints the number of removed and kept entries to STDOUT.
	sub remove_unaligned
	{
	  my $self= shift;

	  my $fn= $self->{fn};
	  my $count_unaligned= 0;
	  my $count_aligned= 0;

	  # keys() returns a copy of the key list, so deleting inside the loop is safe
	  foreach my $fn_id (keys %$fn)
	  {
	    my $x= $fn->{$fn_id};
	    if (exists ($x->{en}) && exists ($x->{de}))
	    {
	      $count_aligned++;
	    }
	    else
	    {
	      delete ($fn->{$fn_id});
	      $count_unaligned++;
	    }
	  }

	  print "NOTE: removed $count_unaligned unaligned(?) items\n";
	  print "NOTE: keeping $count_aligned aligned(?) items\n";
	}