Created
December 22, 2010 18:52
-
-
Save hippietrail/751910 to your computer and use it in GitHub Desktop.
Strip HTML but retain block/inline structure
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# TODO img alt text? | |
# TODO turn into a module so randomwikipage.pl can use it | |
use strict; | |
use Encode 'decode_utf8'; | |
use File::DosGlob; | |
use HTML::Parser; | |
binmode(STDOUT, ':utf8'); | |
my %unktags; # unknown tags | |
# from HTML 4.0 DTD | |
# | |
# <!-- %inline; covers inline or "text-level" elements --> | |
# <!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;"> | |
# <!ENTITY % fontstyle | |
# "TT | I | B | BIG | SMALL"> | |
# <!ENTITY % phrase "EM | STRONG | DFN | CODE | | |
# SAMP | KBD | VAR | CITE | ABBR | ACRONYM" > | |
# <!ENTITY % special | |
# "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO"> | |
# <!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON"> | |
# | |
# | |
# <!ENTITY % block | |
# "P | %heading; | %list; | %preformatted; | DL | DIV | NOSCRIPT | | |
# BLOCKQUOTE | FORM | HR | TABLE | FIELDSET | ADDRESS"> | |
# <!ENTITY % heading "H1|H2|H3|H4|H5|H6"> | |
# <!ENTITY % list "UL | OL"> | |
# <!ENTITY % preformatted "PRE"> | |
my %tagtype = ( | |
'TT' => 'inline', 'I' => 'inline', 'B' => 'inline', 'BIG' => 'inline', | |
'SMALL' => 'inline', | |
'EM' => 'inline', 'STRONG' => 'inline', 'DFN' => 'inline', | |
'CODE' => 'inline', | |
'SAMP' => 'inline', 'KBD' => 'inline', 'VAR' => 'inline', | |
'CITE' => 'inline', 'ABBR' => 'inline', 'ACRONYM' => 'inline', | |
'A' => 'inline', 'IMG' => 'inline', 'OBJECT' => 'inline', 'BR' => 'inline', | |
'SCRIPT' => 'inline', 'MAP' => 'inline', 'Q' => 'inline', 'SUB' => 'inline', | |
'SUP' => 'inline', 'SPAN' => 'inline', 'BDO' => 'inline', | |
'INPUT' => 'inline', 'SELECT' => 'inline', 'TEXTAREA' => 'inline', | |
'LABEL' => 'inline', 'BUTTON' => 'inline', | |
'P' => 'block', 'DL' => 'block', 'DIV' => 'block', 'NOSCRIPT' => 'block', | |
'BLOCKQUOTE' => 'block', 'FORM' => 'block', 'HR' => 'block', | |
'TABLE' => 'block', 'FIELDSET' => 'block', 'ADDRESS' => 'block', | |
'H1' => 'block', 'H2' => 'block', 'H3' => 'block', 'H4' => 'block', | |
'H5' => 'block', 'H6' => 'block', | |
'UL' => 'block', 'OL' => 'block', | |
'PRE' => 'block', | |
# not defined as block in the dtd but usually breaks lines | |
'LI' => 'block', 'DD' => 'block', 'DT' => 'block', | |
'IFRAME' => 'block', 'TBODY' => 'block', 'TR' => 'block', 'TD' => 'block', | |
# not defined as inline in the dtd but testing shows they behave so | |
'OPTION' => 'inline', 'PARAM' => 'inline', 'EMBED' => 'inline', | |
'FONT' => 'inline', | |
# peculiarity of HTML::Parser?? | |
'BR/' => 'inline', | |
# TODO BASE, CENTER, AREA, TH, LEGEND, INS, THEAD, BLINK, DEL, MARQUEE | |
); | |
my @files = glob "@ARGV"; | |
print STDERR join("\n", @files), "\n"; | |
for (@files) { | |
process_file($_); | |
} | |
print STDERR "----\n\n"; | |
#dump_unknown_tags(); | |
exit; | |
#################################################################### | |
sub process_file { | |
my $filename = shift; | |
print STDERR "stripping $filename...\n"; | |
our %inside = (); | |
# slurp in an HTML file | |
open(FH, $filename); | |
my @content = <FH>; | |
close(FH); | |
my $content = join('', @content); | |
our $txt = ''; | |
my $parser = HTML::Parser->new( | |
start_h => [\&tag, "tagname, '+1'"], | |
end_h => [\&tag, "tagname, '-1'"], | |
text_h => [\&text, "dtext"], | |
); | |
$parser->parse( decode_utf8( $content ) ); | |
$parser->eof; | |
# clean up whitespace etc | |
# turn lines with just space to blank lines | |
$txt =~ s/^[ \t]*(.*?)[ \t]*$/\1/mg; | |
# minify paragraph breaks | |
$txt =~ s/\n\n\n+/\n\n/g; | |
# minify spaces and tabs | |
$txt =~ s/[ \t]+/ /g; | |
print $txt; | |
dump_unknown_tags(); | |
return; | |
################################################################ | |
sub tag { | |
my($tag, $opt_n) = @_; | |
$inside{$tag} += $opt_n; | |
my $type = $tagtype{uc $tag}; | |
# do nothing for inline tags, script and style tags | |
if ($type eq 'inline' || $tag eq 'script' || $tag eq 'style') { | |
# \n for <br> | |
if ($tag eq 'br' || $tag eq 'br/') { | |
$txt .= "\n"; | |
} elsif ($tag eq 'img') { | |
$txt .= ' '; | |
} | |
# \n\n for block tags | |
} elsif ($type eq 'block') { | |
$txt .= "\n\n"; | |
# neither inline nor block! | |
} else { | |
unless ($inside{script} || $inside{style}) { | |
unless (grep (/^$tag$/, ('html', 'head', 'title', 'meta', 'link', 'body'))) { | |
++ $unktags{$tag} if $opt_n == 1; | |
} | |
} | |
} | |
} | |
sub text { | |
return if $inside{script} || $inside{style}; | |
return if $inside{head} && !$inside{title}; | |
my $t = $_[0]; | |
$t =~ s/\s+/ /g; | |
$txt .= $t; | |
} | |
} | |
sub dump_unknown_tags { | |
foreach (sort {$unktags{$b} <=> $unktags{$a}} keys %unktags) { | |
print STDERR "$_: $unktags{$_}\n"; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is my answer (so far) to my own question on stackoverflow:
http://stackoverflow.com/questions/4396497/stripping-html-but-retaining-block-inline-structure