Created
December 6, 2016 18:14
-
-
Save llamasoft/6b18fdd75e9ebf575fa37ae5d44b0b5d to your computer and use it in GitHub Desktop.
Princeton WordNet Database Parser Example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
# Parses a synonym set line from a WordNet data.* file into a Synset hash.
#
# Parameters:
#   $line - one data line: space-separated header fields, optionally
#           followed by a '|' delimiter and the gloss text. A trailing
#           CR and/or LF is tolerated and removed before parsing.
#
# Returns a flat key/value list (assign it to a hash) with the keys:
#   pos    - 'NOUN', 'ADJECTIVE', 'VERB', 'ADVERB', or 'UNKNOWN'
#   wcount - word count as declared by the line (decoded from hex)
#   words  - array ref of the words, underscores turned into spaces and
#            any parenthesised marker removed
#   defn   - the text following the '|' delimiter, or '' if there is none
#
# A line that ends early yields only the words that were actually present;
# missing counts are treated as zero rather than reading past the end.
sub parse_synset($) {
    my $line = shift(@_);

    # Strip the line terminator here so that callers who did not chomp, or
    # files with CRLF endings, do not leak "\r"/"\n" into the last field.
    $line = '' unless defined($line);
    $line =~ s/[\r\n]+\z//;

    # Definitions from wnutil.c, function getpos
    my %parts_of_speech = (
        'n' => 'NOUN',
        'a' => 'ADJECTIVE',
        's' => 'ADJECTIVE',
        'v' => 'VERB',
        'r' => 'ADVERB'
    );

    # Parsing taken from search.c, function parse_synset
    # This parses a WordNet database line into a Synset structure
    my @tokens = split(/ /, $line);

    # Hands out the next token, or $default once the line has run out, so a
    # truncated line never feeds undef into int(), hex() or a hash lookup.
    my $next_token = sub {
        my ($default) = @_;
        return scalar(@tokens) ? shift(@tokens) : $default;
    };

    # Byte offset within file
    my $hereiam = int($next_token->('0'));

    # File number that the synonym set comes from
    my $fnum = int($next_token->('0'));

    # The part of speech these words represent, called "pos"
    my $pos = $parts_of_speech{ $next_token->('') } || 'UNKNOWN';

    # The number of words in the synonym set encoded as two-digit hex
    my $wcount = hex($next_token->('0'));

    my @words = ();
    my @lexid = ();
    foreach (1 .. $wcount) {
        # Stop instead of inventing undef words when the line is truncated
        last unless scalar(@tokens);

        # The word itself, spaces replaced with underscores,
        # comments/adjective type in parens
        my $word = shift(@tokens);
        $word =~ tr/_/ /;
        $word =~ s/\(.*\)//;
        push(@words, $word);

        # The lexical ID of the current word (one hexadecimal digit)
        push(@lexid, hex($next_token->('0')));
    }

    # Pointers and relations to other words
    my $ptrcount = int($next_token->('0'));
    my @ptrtyp = ();
    my @ptroff = ();
    my @ppos   = ();
    my @pfrom  = ();
    my @pto    = ();
    foreach (1 .. $ptrcount) {
        last unless scalar(@tokens);

        push(@ptrtyp, shift(@tokens));
        push(@ptroff, int($next_token->('0')));
        push(@ppos, $parts_of_speech{ $next_token->('') } || 'UNKNOWN');

        # Four hex digits: first two are the source word number,
        # last two are the target word number
        my $tofrom = $next_token->('0000');
        push(@pfrom, hex(substr($tofrom, 0, 2)));
        push(@pto,   hex(substr($tofrom, 2, 2)));
    }

    # Verbs contain additional information
    my $fcount = 0;
    my @frmid  = ();
    my @frmto  = ();
    if ( $pos eq 'VERB' ) {
        $fcount = int($next_token->('0'));
        foreach (1 .. $fcount) {
            last unless scalar(@tokens);

            # Removing dummy frame pointer (+)
            shift(@tokens);
            push(@frmid, int($next_token->('0')));
            push(@frmto, hex($next_token->('0')));
        }
    }

    # If anything remains, it's the definition (optional)
    my $defn = '';
    if ( scalar(@tokens) > 0 ) {
        # Removing dummy delimiter (|)
        shift(@tokens);
        $defn = join(' ', @tokens);
    }

    # Return the Synset struct as a hash
    # Elements with little value to us have been commented out
    return (
        # 'hereiam'  => $hereiam,
        # 'fnum'     => $fnum,
        'pos'      => $pos,
        'wcount'   => $wcount,
        'words'    => \@words,
        # 'lexid'    => \@lexid,
        # 'ptrcount' => $ptrcount,
        # 'ptrtyp'   => \@ptrtyp,
        # 'ptroff'   => \@ptroff,
        # 'ppos'     => \@ppos,
        # 'pfrom'    => \@pfrom,
        # 'pto'      => \@pto,
        # 'fcount'   => $fcount,
        # 'frmid'    => \@frmid,
        # 'frmto'    => \@frmto,
        'defn'     => $defn
    );
}
# Driver: read lines from the files named on the command line (or from
# standard input when none are given) and print the words of each synset.
LINE:
while ( defined( my $line = <> ) ) {
    # Only lines that begin with an 8-digit byte offset followed by
    # whitespace are parsed; everything else is skipped.
    next LINE unless $line =~ /^[0-9]{8}\s/;

    chomp($line);
    my %syn = parse_synset($line);

    # Example: just print the words, one word per line
    print join( "\n", @{ $syn{'words'} } ), "\n";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment