Skip to content

Instantly share code, notes, and snippets.

@sng2c
Last active December 19, 2015 21:09
Show Gist options
  • Save sng2c/6018524 to your computer and use it in GitHub Desktop.
Save sng2c/6018524 to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
use lib './lib';
use Parse::Token::Lite;
use LWP::Simple;
use strict;
use warnings;

# ---------------------------------------------------------------------------
# Lexer rule table for Parse::Token::Lite: a small, forgiving HTML tokenizer.
#
# Each top-level key names a lexer state and maps to an ordered list of rules.
# A rule is a hash with:
#   name  - token name; names ending in "VAL" are the ones the driver prints
#   re    - pattern the rule matches
#   state - optional state transitions; '+NAME' / '-NAME' appear to enter and
#           leave the named state (see the module's documentation)
#   func  - optional callback; its second argument provides ->data
#
# The table is written on the assumption that rules are tried in order and
# the first match wins, which is why the catch-all *_ERR rules come last.
# ---------------------------------------------------------------------------

# Characters accepted in a URL.  The hyphen is escaped so that "Z-_" can never
# be read as a range; the set of matched characters is unchanged.
my $urlpat = qr{https?://[0-9a-zA-Z\-_#?=&.\@%:/]+};

# Attribute names may contain '-', ':' and '.' (data-id, http-equiv,
# xml:lang).  A plain \w+ stopped at the hyphen, so the rest of the name
# reached TAG_ERR and aborted the run.
my $attr_name = qr/[\w:.-]+/;

# Builds the callback used by the catch-all error rules.  The message names
# where the input was rejected, followed by the offending text.
my $reject = sub {
    my ($context) = @_;
    return sub { die "Unexpected input $context: " . $_[1]->data };
};

my %rules = (
    # Text outside of any tag.
    MAIN => [
        { name => 'HTML_ENTITY_VAL', re => qr/&\S+?;/ },

        # .*? rather than .+? so the empty comment "<!---->" ends at its own
        # terminator instead of running on to the next "-->" in the document.
        { name => 'HTML_COMMENT',    re => qr/<!--.*?-->/ms },
        { name => 'TAG_START_IN',    re => qr/<!?\w+/, state => ['+TAG_IN'] },
        { name => 'TAG_END',         re => qr{</\w+>} },
        { name => 'URL_VAL',         re => $urlpat },
        { name => 'WHITE_SPACE',     re => qr/\s+/ },
        { name => 'TEXT_VAL',        re => qr/[a-zA-Z_0-9]+/ },

        # '&' is excluded so that a run of punctuation stops in front of a
        # possible entity and HTML_ENTITY_VAL gets a chance to match it.
        { name => 'SIGIL_VAL',       re => qr/[^a-zA-Z_0-9\s&]+/ },

        # Last resort: an ampersand that does not start an entity (as in
        # "a & b").  Without this rule nothing in MAIN could match it.
        { name => 'AMPERSAND_VAL',   re => qr/&/ },
    ],

    # Between "<name" and the closing ">" of a start tag.
    TAG_IN => [
        { name => 'TAG_PROP_IN1_VAL',    re => qr/$attr_name\s*=\s*'/, state => ['+PROP_IN1'] },
        { name => 'TAG_PROP_IN2_VAL',    re => qr/$attr_name\s*=\s*"/, state => ['+PROP_IN2'] },
        { name => 'TAG_PROP_IN3_VAL',    re => qr/$attr_name\s*=\s*/,  state => ['+PROP_IN3'] },
        { name => 'TAG_PROP_SINGLE_VAL', re => $attr_name },
        { name => 'TAG_START_OUT',       re => qr{/?>}, state => ['-TAG_IN'] },
        { name => 'TAG_WHITE_SPACE',     re => qr/\s+/ },
        { name => 'TAG_ERR',             re => qr/.+/, func => $reject->('inside a tag') },
    ],

    # Inside a single-quoted attribute value.
    PROP_IN1 => [
        { name => 'URL_VAL',   re => $urlpat },
        { name => 'PROP1_VAL', re => qr/[^']+/ },
        { name => 'PROP1_OUT', re => qr/'/, state => ['-PROP_IN1'] },
        { name => 'PROP1_ERR', re => qr/.+/, func => $reject->('inside a single-quoted attribute value') },
    ],

    # Inside a double-quoted attribute value.
    PROP_IN2 => [
        { name => 'URL_VAL',   re => $urlpat },
        { name => 'PROP2_VAL', re => qr/[^"]+/ },
        { name => 'PROP2_OUT', re => qr/"/, state => ['-PROP_IN2'] },
        { name => 'PROP2_ERR', re => qr/.+/, func => $reject->('inside a double-quoted attribute value') },
    ],

    # Inside an unquoted attribute value: whitespace ends the value, '>' ends
    # both the value and the tag.
    PROP_IN3 => [
        { name => 'URL_VAL',       re => $urlpat },
        { name => 'PROP3_VAL',     re => qr/[^>\s]+/ },
        { name => 'PROP3_OUT',     re => qr/\s+/, state => ['-PROP_IN3'] },
        { name => 'PROP3_TAG_OUT', re => qr/>/, state => [ '-PROP_IN3', '-TAG_IN' ] },
        { name => 'PROP3_ERR',     re => qr/.+/, func => $reject->('inside an unquoted attribute value') },
    ],
);
# Driver: download an HTML page, run it through the tokenizer built from
# %rules, and print every token whose rule name ends in "VAL".
my $parser = Parse::Token::Lite->new( rulemap => \%rules );

# Built-in sample document, used when the page cannot be downloaded.
my $sample_html = <<'HTML';
<html>
<body>
ABC
<img src="http://metacpan.org">
</body>
</html>
HTML

# Page to tokenize; an optional first command-line argument overrides it.
my $url = @ARGV ? $ARGV[0] : 'https://metacpan.org/module/Parse::Token::Lite';

# LWP::Simple::get returns undef on any failure, so check before parsing.
my $html = get($url);
if ( !defined $html ) {
    warn "Could not fetch $url; tokenizing the built-in sample instead\n";
    $html = $sample_html;
}

# The fetched text may contain non-ASCII characters; give STDOUT an encoding
# layer so printing them does not trigger "Wide character" warnings.
binmode STDOUT, ':encoding(UTF-8)';

$parser->from($html);
while ( !$parser->eof ) {
    # nextToken is called in list context; only the first element is used.
    my ($token) = $parser->nextToken;
    die "Tokenizer returned no token\n" unless defined $token;

    my $rule_name = $token->rule->name;
    next unless $rule_name =~ /VAL\z/;

    my $text = $token->data;
    print "$rule_name -->$text<--\n";

    # Uncomment to trace the lexer's state stack after each printed token:
    #print "\n[ ".join(">",@{$parser->state_stack})." ]\n";
}
__DATA__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment