llamasoft · December 6, 2016 18:14
diff --git a/wordnet_parse.pl b/wordnet_parse.pl
 #!/usr/bin/perl

 use strict;
 use warnings;


 # Turns one synonym-set line of a WordNet data.* file into a flat key/value
 # list modelled on the Synset structure of the WordNet C library.
 #
 # Argument: the raw line, with its fields separated by single spaces.
 # Returns:  a list suitable for assignment to a hash, with the keys
 #             pos    - 'NOUN', 'ADJECTIVE', 'VERB', 'ADVERB' or 'UNKNOWN'
 #             wcount - number of words in the synonym set
 #             words  - array ref of the words, underscores turned into
 #                      spaces and parenthesised markers stripped
 #             defn   - the definition text, or '' when the line has none
 #
 # Every field of the record is still decoded, in order, so the token stream
 # stays aligned; the ones of little value to us are simply not returned.
 sub parse_synset($) {
    my ($record) = @_;

    # Part-of-speech codes, as defined by getpos in wnutil.c
    my %pos_name_for = (
        'n' => 'NOUN',
        'v' => 'VERB',
        'a' => 'ADJECTIVE',
        's' => 'ADJECTIVE',
        'r' => 'ADVERB'
    );

    # Field order follows parse_synset in search.c
    my @fields = split(/ /, $record);

    # Leading bookkeeping: byte offset within the file, then the number of
    # the file the synonym set comes from
    my $byte_offset = int(shift(@fields));
    my $file_number = int(shift(@fields));

    # Part of speech ("pos") shared by all words of the set
    my $pos_code = shift(@fields);
    my $pos      = $pos_name_for{$pos_code} || 'UNKNOWN';

    # Word count (two-digit hex), followed by that many word / lexical-ID pairs
    my $word_total = hex(shift(@fields));
    my (@words, @lex_ids);
    for my $word_index (1 .. $word_total) {
        # Underscores stand in for spaces; a comment or adjective type may
        # be attached to the word in parentheses
        (my $text = shift(@fields)) =~ tr/_/ /;
        $text =~ s/\(.*\)//;
        push(@words, $text);

        push(@lex_ids, shift(@fields));
    }

    # Pointer count, followed by that many four-field pointer records
    my $pointer_total = int(shift(@fields));
    my (@ptr_types, @ptr_offsets, @ptr_pos, @ptr_from, @ptr_to);
    for my $pointer_index (1 .. $pointer_total) {
        my ($symbol, $offset, $code, $from_to) = splice(@fields, 0, 4);

        push(@ptr_types,   $symbol);
        push(@ptr_offsets, int($offset));
        push(@ptr_pos,     $pos_name_for{$code} || 'UNKNOWN');

        # Two hex digits for the source word, two for the target word
        push(@ptr_from, hex(substr($from_to, 0, 2)));
        push(@ptr_to,   hex(substr($from_to, 2, 2)));
    }

    # Only verb records carry a frame section
    my $frame_total = 0;
    my (@frame_ids, @frame_words);
    if ( $pos eq 'VERB' ) {
        $frame_total = int(shift(@fields));

        for my $frame_index (1 .. $frame_total) {
            # The first field of each frame is the dummy pointer (+)
            my (undef, $frame_id, $word_number) = splice(@fields, 0, 3);

            push(@frame_ids,   int($frame_id));
            push(@frame_words, hex($word_number));
        }
    }

    # Whatever is left is the optional definition, introduced by a dummy
    # delimiter (|) that is dropped
    my (undef, @gloss) = @fields;
    my $defn = join(' ', @gloss);

    # The Synset struct, reduced to the elements of value to us
    return (
        'pos'    => $pos,
        'wcount' => $word_total,
        'words'  => \@words,
        'defn'   => $defn
    );
 }



 # Driver: read the input line by line and print the words of every synonym
 # set found, one word per line.
 while ( my $line = <> ) {
    # Synset records start with an 8-digit byte offset; any other line is
    # passed over
    next unless $line =~ /^[0-9]{8}\s/;
    chomp($line);

    my %synset   = parse_synset($line);
    my @synonyms = @{ $synset{'words'} };

    # Example output: just the words, newline-separated
    print join("\n", @synonyms), "\n";
 }
	#!/usr/bin/perl

	use strict;
	use warnings;


	# Parses a synonym set line from a data.* file into a Synset hash.
	#
	# Argument: one raw line, with its fields separated by single spaces.
	# Returns:  a key/value list (assign it to a hash) with the keys
	#             pos    - 'NOUN', 'ADJECTIVE', 'VERB', 'ADVERB' or 'UNKNOWN'
	#             wcount - number of words in the synonym set
	#             words  - array ref of the words, underscores replaced with
	#                      spaces and parenthesised markers removed
	#             defn   - the definition text, or '' when none is present
	sub parse_synset($) {
	    my $line = shift(@_);

	    # Definitions from wnutil.c, function getpos
	    my %parts_of_speech = (
	        'n' => 'NOUN',
	        'a' => 'ADJECTIVE',
	        's' => 'ADJECTIVE',
	        'v' => 'VERB',
	        'r' => 'ADVERB'
	    );


	    # Parsing taken from search.c, function parse_synset
	    # This parses a WordNet database line into a Synset structure
	    my @tokens = split(/ /, $line);


	    # Byte offset within file
	    my $hereiam = int(shift(@tokens));

	    # File number that the synonym set comes from
	    my $fnum = int(shift(@tokens));

	    # The part of speech these words represent, called "pos"
	    my $pos = $parts_of_speech{shift(@tokens)} || 'UNKNOWN';


	    # The number of words in the synonym set encoded as two-digit hex
	    my $wcount = hex(shift(@tokens));

	    my @words = ();
	    my @lexid = ();
	    foreach (1 .. $wcount) {
	        # The word itself, spaces replaced with underscores, comments/adjective type in parens
	        my $word = shift(@tokens);
	        $word =~ tr/_/ /;
	        $word =~ s/\(.*\)//;
	        push(@words, $word);

	        # The lexical ID of the current word
	        push(@lexid, shift(@tokens));
	    }


	    # Pointers and relations to other words
	    my $ptrcount = int(shift(@tokens));
	    my @ptrtyp = ();
	    my @ptroff = ();
	    my @ppos   = ();
	    my @pfrom  = ();
	    my @pto    = ();
	    foreach (1 .. $ptrcount) {
	        push(@ptrtyp, shift(@tokens));
	        push(@ptroff, int(shift(@tokens)));
	        push(@ppos,   $parts_of_speech{shift(@tokens)} || 'UNKNOWN');

	        # Four hex digits: the first two are stored as "from", the last two as "to"
	        my $tofrom = shift(@tokens);
	        push(@pfrom, hex(substr($tofrom, 0, 2)));
	        push(@pto,   hex(substr($tofrom, 2, 2)));
	    }


	    # Verbs contain additional information
	    my $fcount = 0;
	    my @frmid = ();
	    my @frmto = ();
	    if ( $pos eq 'VERB' ) {
	        $fcount = int(shift(@tokens));

	        foreach (1 .. $fcount) {
	            # Removing dummy frame pointer (+)
	            shift(@tokens);

	            push(@frmid, int(shift(@tokens)));
	            push(@frmto, hex(shift(@tokens)));
	        }
	    }


	    # If anything remains, it's the definition (optional)
	    my $defn = '';
	    if ( scalar(@tokens) > 0 ) {
	        # Removing dummy delimiter (|)
	        shift(@tokens);
	        $defn = join(' ', @tokens);
	    }


	    # Return the Synset struct as a hash
	    # Elements with little value to us have been commented out
	    return (
	    #   'hereiam'  => $hereiam,
	    #   'fnum'     => $fnum,
	        'pos'      => $pos,
	        'wcount'   => $wcount,
	        'words'    => \@words,
	    #   'lexid'    => \@lexid,
	    #   'ptrcount' => $ptrcount,
	    #   'ptrtyp'   => \@ptrtyp,
	    #   'ptroff'   => \@ptroff,
	    #   'ppos'     => \@ppos,
	    #   'pfrom'    => \@pfrom,
	    #   'pto'      => \@pto,
	    #   'fcount'   => $fcount,
	    #   'frmid'    => \@frmid,
	    #   'frmto'    => \@frmto,
	        'defn'     => $defn
	    );
	}



	# Driver: read the input line by line and print the words of every
	# synonym set found, one word per line.
	while ( my $line = <> ) {
	    # Synset records start with an 8-digit byte offset; any other line
	    # is passed over
	    next unless $line =~ /^[0-9]{8}\s/;
	    chomp($line);

	    my %synset   = parse_synset($line);
	    my @synonyms = @{ $synset{'words'} };

	    # Example output: just the words, newline-separated
	    print join("\n", @synonyms), "\n";
	}
No results found