Skip to content

Instantly share code, notes, and snippets.

@kimmel
Created August 26, 2012 18:23
Show Gist options
  • Save kimmel/3482317 to your computer and use it in GitHub Desktop.
Save kimmel/3482317 to your computer and use it in GitHub Desktop.
perl regexp html parsing
# Pattern 1: a listing row whose subtext ends in an "N comment(s)" link.
# Expected shape: rank cell ($num followed by a dot), up-vote arrow, title
# link with no extra attributes after href, the domain in parentheses, then
# the subtext cell with score, user, age and comment count.
#
# Captures are named, but their group numbers are unchanged, so $1 .. $11
# hold the same values as before after a successful match.
$html =~ m{
    # rank cell and up-vote arrow
    >\s*$num\.</td>\s*<td>\s*<center>\s*<a\s+id=up_
    (?<id>\d+)                      # group 1: item id
    \s+href="vote\?for=\k<id>&dir=up&whence=
    [%a-f0-9]+                      # percent-encoded value, so every hex digit is allowed
    ">\s*<img\s+src="http://ycombinator\.com/images/grayarrow\.gif"
    \s+border=\d+\s+vspace=\d+\s+hspace=\d+>\s*</a>
    \s*<span\s+id=down_\k<id>>\s*</span>\s*</center>\s*</td>

    # title cell
    \s*<td\s+class="title">\s*<a\s+href="
    (?<uri>[^"]+)                   # group 2: link target
    ">
    (?<desc>[^<]+)                  # group 3: link text
    </a>\s*<span\s*class="comhead">\s*\(
    (?<dom>[^)]+)                   # group 4: domain, hyphens and other characters allowed
    \)\s*</span>\s*</td>\s*</tr>

    # subtext row
    \s*<tr>\s*<td\s+colspan=\d+>\s*</td>\s*<td\s+class="subtext">
    \s*<span\s+id=score_\k<id>>
    (?<score>\d+)                   # group 5: points
    \s+point(s)?\s*</span>\s+by\s+<a\s+href="user\?id=   # optional plural s is group 6
    (?<user>[^"]+)                  # group 7: user name
    ">\k<user></a>\s*
    (?<age_qty>\d+)                 # group 8: age quantity
    \s+
    (?<age_unit>hour|minute)        # group 9: age unit, singular form
    (s)?\s+ago\s+\|\s*<a\s+href="item\?id=\k<id>">\s*   # optional plural s is group 10
    (?<comments>\d+)                # group 11: comment count
    \s+comment(s)?\s*</a>
}ix
)
{
    # Read the named captures right away, before any other match can
    # overwrite them.
    $data->{'id'}       = $+{id};
    $data->{'uri'}      = $+{uri};
    $data->{'desc'}     = $+{desc};
    $data->{'dom'}      = $+{dom};
    $data->{'score'}    = $+{score};
    $data->{'user'}     = $+{user};
    $data->{'age_qty'}  = $+{age_qty};
    $data->{'age_unit'} = $+{age_unit};
    $data->{'comments'} = $+{comments};

    # for debugging purposes: records which pattern matched
    $data->{'pattern'} = 1;
} ## end if ( $html =~ m{ ) (})
# Pattern 2: a listing row whose title link carries rel="nofollow" and whose
# subtext ends in a "discuss" link instead of a comment count, so the
# comment count is recorded as 0.
#
# Captures are named, but their group numbers are unchanged, so $1 .. $10
# hold the same values as before after a successful match.
elsif (
$html =~ m{
    # rank cell and up-vote arrow
    >\s*$num\.</td>\s*<td>\s*<center>\s*<a\s+id=up_
    (?<id>\d+)                      # group 1: item id
    \s+href="vote\?for=\k<id>&dir=up&whence=
    [%a-f0-9]+                      # percent-encoded value, so every hex digit is allowed
    ">\s*<img\s+src="http://ycombinator\.com/images/grayarrow\.gif"
    \s+border=\d+\s+vspace=\d+\s+hspace=\d+>\s*</a>
    \s*<span\s+id=down_\k<id>></span>\s*</center>\s*</td>

    # title cell
    \s*<td\s+class="title">\s*<a\s+href="
    (?<uri>[^"]+)                   # group 2: link target
    "\s+rel="nofollow">
    (?<desc>[^<]+)                  # group 3: link text
    </a>\s*<span\s+class="comhead">\s*\(
    (?<dom>[^)]+)                   # group 4: domain
    \)\s*</span>\s*</td>\s*</tr>

    # subtext row
    \s*<tr>\s*<td\s+colspan=\d+>\s*</td>\s*<td\s+class="subtext">
    \s*<span\s+id=score_\k<id>>
    (?<score>\d+)                   # group 5: points
    \s+point(s)?\s*</span>\s*by\s+<a\s+href="user\?id=   # optional plural s is group 6
    (?<user>[^"]+)                  # group 7: user name
    ">\k<user></a>\s*
    (?<age_qty>\d+)                 # group 8: age quantity
    \s+
    (?<age_unit>hour|minute)        # group 9: age unit, singular form
    (s)?                            # group 10: plural s, optional so that "1 hour ago" matches
    \s+ago\s+\|\s*<a\s+href="item\?id=\k<id>">\s*discuss\s*</a>
}imx
)
{
    # Read the named captures right away, before any other match can
    # overwrite them.
    $data->{'id'}       = $+{id};
    $data->{'uri'}      = $+{uri};
    $data->{'desc'}     = $+{desc};
    $data->{'dom'}      = $+{dom};
    $data->{'score'}    = $+{score};
    $data->{'user'}     = $+{user};
    $data->{'age_qty'}  = $+{age_qty};
    $data->{'age_unit'} = $+{age_unit};

    # this row shows a "discuss" link rather than a comment count
    $data->{'comments'} = 0;

    # for debugging purposes: records which pattern matched
    $data->{'pattern'} = 2;
} ## end elsif ( $html =~ m{ ) [ if ( $html =~ m{ ) (})](})
elsif (
$html =~ m{
>\s*$num\.\s*</td>\s*<td>\s*</td>\s*<td\s+class="title">\s*<a\s+href="
item\?id=
(\d+) # $1 id
">
([^<]+) # $2 desc
</a>\s*</td>\s*</tr>\s*<tr>\s*<td\s+colspan=\d+>\s*</td>\s*<td\s+clas
s="subtext">\s*
(\d+) # $3 age_qty
\s+
(hour|minute)(s)? # $4 age_unit
\s+ago\s*</td>
}imx
)
{
$data->{ 'id' } = $1;
$data->{ 'uri' } = 'http://news.ycombinator.com/item?id=' . $1;
$data->{ 'dom' } = 'ycombinator.com';
$data->{ 'desc' } = $2;
$data->{ 'age_qty' } = $3;
$data->{ 'age_unit' } = $4;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment