Created
March 20, 2017 22:53
-
-
Save apg/69bef672e1e1e85ca90c5e43706fbdbf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### ==================================================================== | |
### @Awk-file{ | |
### author = "Nelson H. F. Beebe", | |
### version = "1.06", | |
### date = "24 October 1997", | |
### time = "21:34:34 MDT", | |
### filename = "man2html.awk", | |
### address = "Center for Scientific Computing | |
### University of Utah | |
### Department of Mathematics, 105 JWB | |
### 155 S 1400 E RM 233 | |
### Salt Lake City, UT 84112-0090 | |
### USA", | |
### telephone = "+1 801 581 5254", | |
### FAX = "+1 801 581 4148", | |
### URL = "http://www.math.utah.edu/~beebe", | |
### checksum = "01400 968 2975 23193", | |
### email = "[email protected] (Internet)", | |
### codetable = "ISO/ASCII", | |
### keywords = "nroff, troff, UNIX manual page", | |
### supported = "yes", | |
### docstring = "This program converts UNIX manual pages | |
### in nroff/troff markup to strictly-conformant | |
### HTML 2.0, 3.0, or 3.2. [Actually, only two | |
### HTML 3.x entities (`&nbsp;' and `&shy;') | |
### are used, and those rarely; otherwise, the | |
### syntax conforms strictly to HTML 2.0.] | |
### | |
### Usage: | |
### nawk -f man2html.awk [HTML=2|3|3.2] \ | |
### manpage-file >html-file | |
### | |
### The single option, HTML=2, HTML=3, or | |
### HTML=3.2, selects the HTML grammar level. | |
### The default is HTML=2. | |
### | |
### This program is normally run via a shell | |
### wrapper that offers an option for setting the | |
### output file name. It has been used to | |
### successfully convert entire man-page | |
### collections on several UNIX systems to HTML | |
### form for convenient World-Wide Web browser | |
### access. | |
### | |
### Of those nroff/troff commands defined in the | |
### -man format used for UNIX manual pages, only | |
### the most commonly-used ones are supported; | |
### unrecognized ones will be warned about, and | |
### preserved as HTML comments in the output. | |
### | |
### UNIX man pages tend to be written in a | |
### highly-stylized fashion that we apply | |
### heuristics to in order to recover high-level | |
### HTML structure from low-level nroff/troff | |
### markup. Deviations from conventional | |
### man-page writing practice will likely result | |
### in less-than-perfect translation to HTML. | |
### | |
### Although there are several other `man2html' | |
### translators available on the Internet, this | |
### one is entirely of my own authorship, with no | |
### code borrowing from anywhere else. | |
### | |
### The checksum field above contains a CRC-16 | |
### checksum as the first value, followed by the | |
### equivalent of the standard UNIX wc (word | |
### count) utility output of lines, words, and | |
### characters. This is produced by Robert | |
### Solovay's checksum utility.", | |
### } | |
### ==================================================================== | |
# ----------------------------------------------------------------------
# Main dispatch table.  Every input line is tested against the patterns
# below in the order written; each handler ends with "next", so the
# first matching rule wins.  The specific conditional-request patterns
# therefore have to stay ahead of the more general ones, and the
# catch-all /^[.][A-Za-z]/ rule has to stay after every other request
# pattern.  A line that matches no request pattern is ordinary text.
# ----------------------------------------------------------------------
BEGIN                   { initialize() }

/^[.]ie +t +[.]ds/      { getline }                     # fall through: next line should be .el
/^[.]el +[.]ds/         { define($3); next }            # dot is now matched literally, like every other pattern
/^[.']\\"/              { cmd_comment($0); next }       # save comments
/^[.]if +n *\\\{/       { cmd_comment($0); next }
/^[.]if +t *\\\{/       { cmd_comment_block($0); next } # convert troff directives to comments
/^ *\\\}/               { cmd_comment($0); next }
/^[.]if +t/             { cmd_comment($0); next }       # convert troff directives to comments
/^[.]if +n +[.]ds/      { define($4); next }
/^[.]if +n +[.]ti/      { cmd_comment($0); next }       # convert nroff spacing directives to comments
/^[.]ie +n +[.]ds/      { define($4); next }

/^[.]SH/                { cmd_SH(); next }
/^[.]SS/                { cmd_SS(); next }
/^[.]TH/                { cmd_TH(); next }
/^[.]B /                { cmd_B(); next }
/^[.]I /                { cmd_I(); next }
/^[.]IX /               { cmd_IX(); next }
/^[.]R /                { cmd_R(); next }
/^[.]ad/                { cmd_ad(); next }
/^[.][BIR]$/            { cmd_BIR(); next }
/^[.]BI /               { cmd_XY("B","I"); next }
/^[.]br/                { cmd_br(); next }
/^[.]BR /               { cmd_XY("B","R"); next }
/^[.]ce[ 0-9]*$/        { cmd_ce(); next }
/^[.]hw/                { cmd_hw(); next }
/^[.]IB /               { cmd_XY("I","B"); next }
/^[.]IR /               { cmd_XY("I","R"); next }
/^[.]ne/                { cmd_ne(); next }
/^[.]RB /               { cmd_XY("R","B"); next }
/^[.]RI /               { cmd_XY("R","I"); next }
/^[.]nf/                { cmd_nf(); next }
/^[.]fi/                { cmd_fi(); next }
/^[.]IP/                { cmd_IP(); next }
/^[.]LP/                { cmd_LP(); next }
/^[.]na/                { cmd_na(); next }
/^[.]PP/                { cmd_PP(); next }
/^[.]RE/                { cmd_RE(); next }
/^[.]RS/                { cmd_RS(); next }
/^[.]sp/                { cmd_sp(); next }
/^[.]TP/                { cmd_TP(); next }
/^[.]TS/                { cmd_TS(); next }
/^[.][A-Za-z]/          { cmd_unknown(); next }         # any other request: warn and keep as a comment

                        { print_line(strtohtml($0)) }   # ordinary text line

END                     { terminate() }
# The anchor() function is adapted from my bibtex-to-html.awk file | |
# anchor(s, type, pattern, offset, prefix, save_label)
#
# Return s with every substring that matches pattern wrapped in
# <A type="prefix...">...</A>.
#
#   s          text to scan
#   type       anchor attribute name ("HREF" or "NAME"); for "NAME" the
#              anchored text is additionally wrapped in <STRONG>
#   pattern    regular expression (as a string) locating the text
#   offset     number of leading characters of each match that are
#              context only and stay outside the anchored region
#   prefix     text placed before the matched string inside the
#              attribute value (e.g. "mailto:")
#   save_label passed through the recursion unchanged; not otherwise
#              used in this function
#
# The remaining names in the parameter list are locals.
function anchor(s,type,pattern,offset,prefix,save_label,   name,rstart,rlength,save) {
    # An empty match would make the recursion below restart on the same
    # text forever, so treat it like "no match".
    if (!match(s,pattern) || RLENGTH == 0)
        return (s)

    # Private copies of RSTART/RLENGTH are needed because the recursive
    # call runs match() again and overwrites the globals.
    rstart = RSTART + offset            # skip leading context in pattern
    rlength = RLENGTH - offset

    name = substr(s,rstart,rlength)
    sub(/ +at +/,"@",name)              # reduce "user at host" to "user@host"

    # Text before the match, the anchored match, then the (recursively
    # processed) remainder of the string.
    return (substr(s,1,rstart-1) \
            "<A " type "=\"" prefix name "\">" \
            ((type == "NAME") ? "<STRONG>" : "") \
            substr(s,rstart,rlength) \
            ((type == "NAME") ? "</STRONG>" : "") \
            "</A>" \
            anchor(substr(s,rstart+rlength),type,pattern,offset,prefix,save_label))
}
# Write the opening of the table of contents (heading plus the start of
# the top-level <UL>) to the TOC file and note that no TOC item is open.
function begin_toc() {
    In_TOC_Item = 0
    print_toc("<H1>")
    print_toc("Table of contents")
    print_toc("</H1>")
    print_toc("<UL>")
}
# .ad: turn on adjust (flush-left-and-right justification).
# There is no HTML equivalent, so the request is kept as a comment.
function cmd_ad() {
    cmd_comment($0)
}
# .B text: emit the request's text wrapped in <STRONG>...</STRONG>.
#
# Two input forms are handled:
#   .B "quoted text"   everything after the opening quote, with trailing
#                      quotes and blanks removed
#   .B word word ...   ALL words after the request (the earlier code used
#                      only $2 and silently dropped the rest)
# s is a local.
function cmd_B(    s) {
    end_font()
    if (match($0,/^[.]B *\"/)) {
        s = substr($0,RSTART+RLENGTH)
        gsub(/[" ]*$/,"",s)
    } else {
        s = substr($0,4)                # text following ".B "
        gsub(/^[ \t]+|[ \t]+$/,"",s)    # trim surrounding blanks
    }
    print_line("<STRONG>" strtohtml(s) "</STRONG>")
}
# Bare .B, .I or .R with no argument: close any open font, then let
# strtohtml() process the equivalent inline escape (\fB, \fI or \fR),
# built from the request's letter.
function cmd_BIR() {
    end_font()
    print_line(strtohtml("\\f" substr($0,2,1)))
}
# .br: handled exactly like a paragraph break by delegating to cmd_PP().
function cmd_br() {
    cmd_PP()
}
# .ce nnn: centre the next nnn input lines (nnn = 0 turns centring off).
#
# The request itself is preserved as an HTML comment.  Each of the next
# nnn lines is converted with strtohtml() and followed by <BR>; for
# HTML level 3.2 the group is wrapped in <CENTER>...</CENTER>.
#
# The HTML 3.2 grammar supports <CENTER> ... </CENTER> as a shorthand
# for the more general <DIV ALIGN=CENTER> ... </DIV> (CENTER can be
# replaced by LEFT or RIGHT).  However, except for amaya (W3C's testbed
# for HTML 3.2), none of the current browsers support DIV.  grail,
# hotjava, netscape all recognize CENTER.  arena, chimera, lynx, and
# xmosaic do not recognize it either.
#
# Changes from the earlier code:
#   * a bare ".ce" now centres one line (the troff default) instead of
#     being treated as zero
#   * reading stops at end of input instead of re-emitting the last line
# k and n are locals.
function cmd_ce(    k,n) {
    n = (NF >= 2) ? $2 + 0 : 1
    cmd_comment($0)
    if (n <= 0)
        return
    if (HTML == "3.2")
        print_line("<CENTER>")
    for (k = 1; k <= n; ++k) {
        if ((getline) <= 0)             # ran out of input
            break
        print_line(strtohtml($0) "<BR>")
    }
    if (HTML == "3.2")
        print_line("</CENTER>")
}
# Emit s as an HTML comment.
#
# A leading troff comment prefix (.\" or '\") is stripped first because
# it confuses html-pretty.  In_Comment is raised while strtohtml() runs
# so that it applies its comment-specific handling, then cleared.
function cmd_comment(s) {
    sub(/^[.']\\"/,"",s)
    In_Comment = 1
    print_line("<!-- " strtohtml(s) " -->")
    In_Comment = 0
}
# Turn a whole troff-only conditional block into HTML comments: the
# opening line s, then every following input line up to and including
# the one that closes the block with \}.  Lines are read into s, so $0
# is left alone.
function cmd_comment_block(s) {
    cmd_comment(s)
    In_Comment = 1
    for (;;) {
        if ((getline s) <= 0)
            break                       # end of input
        cmd_comment(s)
        if (s ~ /^ *\\\}/)
            break                       # found end of block
    }
    In_Comment = 0
}
# .I text: emit the request's text wrapped in <EM>...</EM>.
#
# Two input forms are handled:
#   .I "quoted text"   everything after the opening quote, with trailing
#                      quotes and blanks removed
#   .I word word ...   ALL words after the request (the earlier code used
#                      only $2 and silently dropped the rest)
# s is a local.
function cmd_I(    s) {
    end_font()
    if (match($0,/^[.]I *\"/)) {
        s = substr($0,RSTART+RLENGTH)
        gsub(/[" ]*$/,"",s)
    } else {
        s = substr($0,4)                # text following ".I "
        gsub(/^[ \t]+|[ \t]+$/,"",s)    # trim surrounding blanks
    }
    print_line("<EM>" strtohtml(s) "</EM>")
}
# .IX index entries are simply discarded.  While the current line ends
# in a backslash, the following (continuation) line is read and
# discarded as well.
function cmd_IX() {
    while (match($0,/\\$/)) {
        if ((getline) <= 0)
            break
    }
}
# .fi: resume fill mode.  Closes the <PRE> environment if one is open;
# otherwise the request is only recorded as a comment.
function cmd_fi() {
    end_font()
    if (!In_PRE) {
        cmd_comment($0)
        return
    }
    print_line("</PRE>")
    In_PRE = 0
}
# .hw word-hyph-en-a-tion ex-cep-tions: kept only as a comment.
function cmd_hw() {
    cmd_comment($0)
}
# .IP: start a new paragraph.
#
# Closes any open font, counts the paragraph in PP, and emits <P>.
# Inside a <PRE> environment a blank line is emitted instead, because
# <P> tags are illegal in <PRE>...</PRE>; this matches what cmd_PP()
# does (the earlier code emitted <P> unconditionally).
function cmd_IP() {
    end_font()
    PP++
    if (In_PRE)
        print_line("")
    else
        print_line("<P>")
}
# .LP: start a new paragraph.
#
# Closes any open font, counts the paragraph in PP, and emits <P>.
# Inside a <PRE> environment a blank line is emitted instead, because
# <P> tags are illegal in <PRE>...</PRE>; this matches what cmd_PP()
# does (the earlier code emitted <P> unconditionally).
function cmd_LP() {
    end_font()
    PP++
    if (In_PRE)
        print_line("")
    else
        print_line("<P>")
}
# .na: no adjust (turn off flush-left-and-right justification, producing
# ragged-right text).  No HTML equivalent, so it is kept as a comment.
function cmd_na() {
    cmd_comment($0)
}
# .ne dimen: need dimen of vertical space before the end of the page,
# otherwise force a page break (e.g. to prevent page breaks after
# headings).  Kept only as a comment.
function cmd_ne() {
    cmd_comment($0)
}
# .nf: no-fill mode.  Opens a <PRE> environment unless one is already
# open, in which case the request is only recorded as a comment.
function cmd_nf() {
    end_font()
    if (In_PRE) {
        cmd_comment($0)
        return
    }
    print_line("<PRE>")
    In_PRE = 1
}
# .PP: paragraph break.  Closes any open font, counts the paragraph in
# PP, emits <P> (or just a blank line inside <PRE>, where <P> tags are
# illegal), and then ends the current .TP list level via end_TP().
function cmd_PP() {
    end_font()
    PP++
    print_line(In_PRE ? "" : "<P>")
    end_TP()
}
# .R text: emit the request's text with no font markup.
#
# Two input forms are handled:
#   .R "quoted text"   everything after the opening quote, with trailing
#                      quotes and blanks removed
#   .R word word ...   ALL words after the request (the earlier code used
#                      only $2 and silently dropped the rest)
# s is a local.
function cmd_R(    s) {
    end_font()
    if (match($0,/^[.]R *\"/)) {
        s = substr($0,RSTART+RLENGTH)
        gsub(/[" ]*$/,"",s)
    } else {
        s = substr($0,4)                # text following ".R "
        gsub(/^[ \t]+|[ \t]+$/,"",s)    # trim surrounding blanks
    }
    print_line(strtohtml(s))
}
# .RE: end of a relative indent started by .RS.
#
# Closes any open font and (irregular pages only) an open <PRE>, ends
# every list opened since the matching .RS, pops one .RS level if any
# is open, and closes the <BLOCKQUOTE>.
function cmd_RE() {
    end_font()
    if (In_PRE)                 # should not happen, but some man pages
        cmd_fi()                # are irregular
    while (RSE_List_Level[RSE_Level] < List_Level)
        end_TP()
    if (RSE_Level > 0)
        --RSE_Level
    print_line("</BLOCKQUOTE>")
}
# .RS: start of a relative indent.
#
# Remembers the current list level for the matching .RE, starts a new
# .TP list level, and opens a <BLOCKQUOTE>.
function cmd_RS() {
    end_font()
    RSE_Level++
    RSE_List_Level[RSE_Level] = List_Level
    List_Level++                # new .TP level too
    print_line("<BLOCKQUOTE>")
}
# .SH: section heading, rendered at <H1> level by cmd_SH_SS().
# The local s is unused.
function cmd_SH(    s) {
    cmd_SH_SS("H1")
}
# .SS: subsection heading, rendered at <H2> level by cmd_SH_SS().
# The local s is unused.
function cmd_SS(    s) {
    cmd_SH_SS("H2")
}
# Common worker for .SH (tag "H1") and .SS (tag "H2").
#
# Steps, in order:
#   1. If no .TH has been seen, emit the document head via cmd_TH().
#   2. Close open fonts, .RS indents and lists.
#   3. Update the H1/H2 counters; the first H1 starts the table of
#      contents, a new H1 after subsections closes the nested TOC list,
#      and every H1 after the first is preceded by <HR>.
#   4. Emit the heading with a NAME anchor "HDR.<H1>[.<H2>]".
#   5. Emit the matching table-of-contents entry linking to the anchor.
# s is a local holding the heading text.
function cmd_SH_SS(tag,    s) {
    if (!TH_seen)               # should not happen, but some man pages are
        cmd_TH(substr($0,5))    # irregular

    end_font()
    while (RSE_Level > 0)
        cmd_RE()
    while (List_Level > 0)
        end_TP()

    if (tag == "H1") {
        if (++H1 == 1)
            begin_toc()
        if (H2 > 0) {           # close the subsection list in the TOC
            print_toc("</LI>")
            print_toc("</UL>")
        }
        H2 = 0
        if (H1 > 1)
            print_line("<HR>")  # a separating horizontal rule is a nice touch
    } else if (tag == "H2")
        H2++

    # Heading text: everything after the request, minus enclosing quotes.
    s = substr($0,5)
    sub(/^ *\"/,"",s)
    sub(/\" *$/,"",s)
    s = strtohtml(s)

    SH_SS_count = "." H1 ((H2 > 0) ? "." H2 : "")

    print_line("<" tag ">")
    print_line("<A NAME=\"HDR" SH_SS_count "\">")
    print_line(s)
    print_line("</A>")
    print_line("</" tag ">")

    if (In_TOC_Item && (H2 != 1))
        print_toc("</LI>")
    if (H2 == 1)                # first subsection: open a nested list
        print_toc("<UL>")
    In_TOC_Item = 1
    print_toc("<LI>")
    print_toc("<A HREF=\"#HDR" SH_SS_count "\">")
    print_toc(s)
    print_toc("</A>")
}
# .sp nnn: vertical space.  There is no sensible HTML equivalent, so the
# request is kept as a comment.
function cmd_sp() {
    cmd_comment($0)
}
# .TH: title heading.  Emits <HTML>, the <HEAD> section (the title taken
# from the request's arguments, plus a "made" link built from LOGNAME
# and HOSTNAME), and the opening <BODY>, then records that a .TH has
# been seen.
#
# line is used as a local: whatever a caller passes is overwritten with
# the current input line.  Backslash-continued .TH lines are joined
# before the title is extracted.
#
# Fix: the continuation loop now stops at end of input.  Previously a
# failed getline left $0 unchanged, so a file ending in a continued .TH
# line looped forever.
function cmd_TH(    line) {
    end_font()
    print_line("<HTML>")
    print_line("<HEAD>")
    print_line("<TITLE>")
    line = $0
    while (line ~ /\\$/) {
        if ((getline) <= 0) {
            line = substr(line,1,length(line)-1)    # drop the dangling backslash
            break
        }
        line = substr(line,1,length(line)-1) $0
    }
    print_line(strtohtml(substr(line,4)))
    print_line("</TITLE>")
    print_line("<LINK REV=\"made\" HREF=\"mailto:" LOGNAME "@" HOSTNAME "\">")
    print_line("</HEAD>")
    print_line("")
    print_line("<BODY>")
    print_line("")
    TH_seen = 1
}
# .TP: tagged paragraph.  The line after the request is the item label,
# usually "\(bu" or ".B ...".
#
# When the current list level has no items yet, a new list is opened one
# level deeper: a <UL> if the label is exactly \(bu, otherwise a <DL>.
# For a <DL> the label line is rendered as the <DT> (through the
# matching font handler when it is a font request) and a <DD> is opened
# for the text that follows; for a <UL> only <LI> is emitted.  The
# previous item of the same list is closed first.
function cmd_TP() {
    end_font()
    getline                     # this is the item label

    if (Item_Count[List_Level] == 0) {  # then first item of new list
        List_Level++
        Item_Count[List_Level] = 0
        if ($0 == "\\(bu") {
            List_Name[List_Level] = "UL"
            List_Item[List_Level] = "LI"
        } else {
            List_Name[List_Level] = "DL"
            List_Item[List_Level] = "DT"
        }
        print_line("<" List_Name[List_Level] ">")
    }
    Item_Count[List_Level]++

    if (List_Name[List_Level] != "DL") {
        # must be <UL> <LI> ... </LI> </UL> type list
        if (Item_Count[List_Level] > 1)
            print_line("</LI>")
        print_line("<LI>")
        return
    }

    if (Item_Count[List_Level] > 1)
        print_line("</DD>")
    print_line("<DT>")
    if ($0 ~ /^[.]B /)
        cmd_B()
    else if ($0 ~ /^[.]I /)
        cmd_I()
    else if ($0 ~ /^[.]R /)
        cmd_R()
    else if ($0 ~ /^[.](BR|BI|IB|IR|RB|RI)/)
        cmd_XY(substr($0,2,1),substr($0,3,1))   # the two font letters of the request
    else
        print_line(strtohtml($0))
    end_font()
    if (In_PRE)                 # should not happen, but some man pages
        cmd_fi()                # are irregular
    print_line("</DT>")
    print_line("<DD>")
}
# .TS ... .TE: table.
#
# The table source (from the .TS line through the .TE line, or to end of
# input) is copied to TBLFILE, formatted by the external pipeline
# "tbl | nroff -man | col -b", and the resulting text is included as a
# <PRE> environment.  TBLFILE is removed afterwards.
# tbl_nroff_cmd is a local.
function cmd_TS(    tbl_nroff_cmd) {
    # Copy the table to a temporary file; the first line written is the
    # .TS request itself.
    do {
        print $0 >TBLFILE
        if ($0 ~ /^[.]TE/)      # then end of table found
            break
    } while ((getline) > 0)
    close (TBLFILE)

    tbl_nroff_cmd = "tbl " TBLFILE " | nroff -man | col -b"
    print_line("<PRE>")
    while ((tbl_nroff_cmd | getline) > 0)
        print_line(strtohtml($0))
    print_line("</PRE>")
    close (tbl_nroff_cmd)
    delete_file(TBLFILE)
}
# Any request with no handler: report it on the warning channel and
# preserve the line in the output as an HTML comment.
function cmd_unknown() {
    end_font()
    warning("Unrecognized nroff/troff command in [" $0 "] changed to comment")
    cmd_comment($0)
}
# Alternating-font requests (.BI, .BR, .IB, .IR, .RB, .RI).
#
#   x  font letter for the 1st, 3rd, ... argument
#   y  font letter for the 2nd, 4th, ... argument
#
# Arguments are the fields of $0 from $2 on; protect_quoted_args() first
# hides blanks inside quoted arguments so each one stays a single field.
# The pieces are written to TMPFILE with no separator, followed by a
# line terminator.  font and k are locals.
function cmd_XY(x,y,    font,k) {
    end_font()
    protect_quoted_args()
    k = 2
    while (k <= NF) {
        if (k % 2)
            font = Font_Map[y]
        else
            font = Font_Map[x]
        printf("%s%s%s", html_font_begintag(font), strtohtml(unprotect_quoted_arg($k)), \
               html_font_endtag(font)) > TMPFILE
        k++
    }
    print_line("")
}
# define(name): record a troff string definition taken from $0.
#
# Typical values of $0:
#       .if n .ds Bi BibTeX
#       .el .ds Bi BibTeX
# The string is used in text as \*(Bi (two-character name) or \*x
# (one-character name); it is stored in Macro[] keyed by a regular
# expression matching that use, with the definition text as the value.
#
# Fixes over the earlier code, which assumed the value started exactly
# three characters after the name:
#   * the value is located from the name's real length, and any run of
#     blanks between name and value is skipped
#   * the name is looked for right after the ".ds" request when present,
#     so an earlier occurrence of the same letters is not picked up
#   * one-character names get the \*x form of the regexp
# regexp and value are locals.
function define(name,    regexp,value) {
    if (length(name) == 1)
        regexp = "\\\\\\*" name
    else
        regexp = "\\\\\\*\\(" name

    if (match($0,/[.]ds[ \t]+/))
        value = substr($0,RSTART+RLENGTH+length(name))
    else
        value = substr($0,index($0,name)+length(name))
    sub(/^[ \t]+/,"",value)

    Macro[regexp] = value
}
# Remove the file named s by running "/bin/rm -f" through system().
function delete_file(s) {
    system("/bin/rm -f " s)
}
# Close every font that is still open, innermost first, leaving
# Font_Level at zero.
function end_font() {
    while (Font_Level > 0) {
        print_line(html_font_endtag(HTML_Font_Name[Font_Level]))
        Font_Level--
    }
}
# Finish the table of contents: close the last item and the list, add a
# horizontal rule, and close the TOC file so it can be read back.
function end_toc() {
    print_toc("</LI>")
    print_toc("</UL>")
    print_toc("<HR>")
    close (TOCFILE)
}
# End the list at the current List_Level.
#
# If the level has items, the open item and the list itself are closed
# (</DD></DL> for a DL list, </LI></UL> otherwise).  The level's item
# count is reset and List_Level drops by one, never below zero.
function end_TP() {
    if (Item_Count[List_Level] > 0) {
        print_line((List_Name[List_Level] == "DL") ? "</DD>" : "</LI>")
        print_line((List_Name[List_Level] == "DL") ? "</DL>" : "</UL>")
    }
    Item_Count[List_Level] = 0
    if (List_Level > 0)
        List_Level--
}
# Replace each troff font escape \fX (X one of B C I P R S T) in s with
# HTML tags and return the result.
#
#   \fP       closes the current font and pops one font level
#   \fX       opens the font Font_Map[X]; if another font was already
#             open it is closed first and replaced, so
#             ...\fB...\fR... style input never nests
#
# Font_Level and HTML_Font_Name[] carry the open font between calls.
# tag is a local.
function font_sub(s,    tag) {
    while (match(s,/\\f[BCIPRST]/)) {
        if (substr(s,RSTART+2,1) != "P") {      # set explicit font
            Font_Level++
            HTML_Font_Name[Font_Level] = Font_Map[substr(s,RSTART+2,1)]
            tag = html_font_begintag(HTML_Font_Name[Font_Level])
            if (Font_Level > 1) {
                # Handle ...\fB...\fR... style by ending previous font
                tag = html_font_endtag(HTML_Font_Name[Font_Level-1]) tag
                HTML_Font_Name[Font_Level-1] = HTML_Font_Name[Font_Level]
                Font_Level--
            }
        } else {                                # revert to previous font
            tag = html_font_endtag(HTML_Font_Name[Font_Level])
            if (Font_Level > 0)
                Font_Level--
        }
        s = substr(s,1,RSTART-1) tag substr(s,RSTART+3)
    }
    return (s)
}
# Return the opening tag "<name>", or the empty string when name is
# empty (a font with no HTML markup).
function html_font_begintag(name) {
    return ((name == "") ? "" : ("<" name ">"))
}
# Return the closing tag "</name>", or the empty string when name is
# empty (a font with no HTML markup).
function html_font_endtag(name) {
    return ((name == "") ? "" : ("</" name ">"))
}
# One-time setup, run from the BEGIN rule: version strings, LOGNAME /
# HOSTNAME / DATE from the environment, the HTML level, the font and
# macro translation tables, scratch-file names, the URL and e-mail
# patterns used for automatic anchors, and finally the output header.
#
# Fixes over the earlier code:
#   * the entity replacements for \0 and \(bu are spelled out again as
#     &#160; / &nbsp; / &#164; (they had been reduced to the raw
#     characters, contradicting the comments beside them)
#   * warning() is called with a single, concatenated argument; it
#     declares only one parameter
#   * the three command pipes are closed after being read
#
# NOTE(review): POSIX awk processes "var=value" command-line operands
# only after BEGIN, so an HTML level given that way (as in the Usage
# text) is presumably not visible here yet -- confirm, and pass it with
# -v if it must be.
function initialize() {
    # Change these two lines whenever the program is modified
    VERSION_NUMBER = "1.06"
    VERSION_DATE = "[24-Oct-1997]"
    VERSION = "Version " VERSION_NUMBER " " VERSION_DATE

    "echo $LOGNAME" | getline LOGNAME
    close("echo $LOGNAME")
    "hostname" | getline HOSTNAME
    close("hostname")
    "date" | getline DATE
    close("date")

    if (HTML == "")
        HTML = 2
    if ((HTML != 2) && (HTML != 3) && (HTML != "3.2")) {
        warning("Unsupported HTML level " HTML " requested: defaulting to HTML level 2")
        HTML = 2
    }

    # troff font letter -> HTML element name ("" means no markup)
    Font_Map["B"] = "STRONG"
    Font_Map["C"] = "TT"
    Font_Map["I"] = "EM"
    Font_Map["R"] = ""
    Font_Map["S"] = ""          # cannot map symbol font yet
    Font_Map["T"] = "TT"

    # Macro[] maps a regular expression to its replacement text; the
    # leading "\\" in a replacement keeps gsub() from treating & as
    # "the matched text".
    Macro["\\\\e"] = "\\"
    if (HTML == 2)
        Macro["\\\\0"] = "\\&#160;"     # change non-breakable space to numeric entity
    else if (HTML >= 3)
        Macro["\\\\0"] = "\\&nbsp;"     # can finally use named entity
    else
        warning("No conversion implemented for \\\\0 (non-breakable space) in HTML level " HTML)

    TOCFILE = "/tmp/man2html.toc"
    TBLFILE = "/tmp/man2html.tbl"
    TMPFILE = "/tmp/man2html.tmp"

    H1 = 0
    H2 = 0

    Macro["\\\\\\(bu"] = "\\&#164;"     # bullet -> general currency sign, numeric form
    Macro["\\\\\\(em"] = "---"
    Macro["\\\\\\(en"] = "--"

    # The following fragment for setting URL_xxx variables
    # is borrowed intact from my bibtex-to-html.awk file:
    #
    # According to Internet RFC 1614 (May 1994), a URL is
    # defined in the document T. Berners-Lee, ``Uniform
    # Resource Locators'', March 1993, available at URL
    # ftp://info.cern.ch/pub/ietf/url4.ps.  Unfortunately,
    # that address is no longer valid.  However, I was able to
    # track down pointers from http://www.w3.org/ to locate a
    # suitable description in Internet RFC 1630 (June 1994).
    # NB: We additionally disallow & in a URL because it is
    # needed in SGML entities "&name;".  We also disallow =
    # and | because these are commonly used in \path=...= and
    # \path|...| strings in BibTeX files.  These restrictions
    # could be removed if we went to the trouble of first
    # encoding these special characters in %xy hexadecimal
    # format, but they are rare enough that I am not going to
    # do so for now.  The worst that will happen from this
    # decision is that an occasional URL in a BibTeX file will
    # be missing a surrounding anchor.
    # Bug fix [24-Oct-1997]: Add < and > to the set of excluded
    # characters, to avoid incorrectly including SGML markup inside a
    # URL.  Before this fix, "\fChttp://www/\fP" got translated
    # incorrectly to
    #       <TT><A HREF="http://www/</TT>">http://www/</TT></A>
    # instead of the correct
    #       <TT><A HREF="http://www">http://www</A></TT>
    URL_PATTERN = "[A-Za-z]+://[^ \",&=|<>]+"
    URL_OFFSET = 0
    URL_PREFIX = ""
    URL_SAVE_LABEL = 0

    E_MAIL_PATTERN = "[A-Za-z0-9_-]+@[A-Za-z0-9-]+([.][A-Za-z0-9-]+)*"
    E_MAIL_OFFSET = 0
    E_MAIL_PREFIX = "mailto:"
    E_MAIL_SAVE_LABEL = 0

    print_header()
}
# Emit the leading comments (do-not-edit warning, generator version and
# date, source file and host) followed by the DOCTYPE declaration that
# matches the selected HTML level.
# NOTE(review): this is called from initialize(), i.e. from the BEGIN
# rule, where FILENAME is presumably still empty -- confirm.
function print_header() {
    print_line("<!-- Warning: Do NOT edit this file. -->")
    print_line("<!-- It was created automatically by man2html.awk " VERSION " on " DATE " -->")
    print_line("<!-- from the file " strtohtml(FILENAME) " at " HOSTNAME " -->")
    print_line("")

    if (HTML == 2) {
        print_line("<!DOCTYPE HTML public \"-//IETF//DTD HTML//EN\">")
    } else if (HTML == 3) {
        # We need level 3 HTML only because of our use of &nbsp; and &shy;
        print_line("<!DOCTYPE HTML public \"-//IETF//DTD HTML 3.0//EN\">")
    } else if (HTML == "3.2") {
        # HTML 3.2 released 5-Nov-1996 at http://www.w3.org/pub/WWW
        print_line("<!DOCTYPE HTML public \"-//W3C//DTD HTML 3.2//EN\">")
    }
}
# Write s, followed by the output record separator, to the body scratch
# file TMPFILE.
function print_line(s) {
    print s > TMPFILE
}
# Write s, followed by the output record separator, to the
# table-of-contents scratch file TOCFILE.
function print_toc(s) {
    print s > TOCFILE
}
# Rewrite $0 so that every blank inside a double-quoted argument becomes
# the character \177.  Field splitting then keeps each quoted argument
# in one field; unprotect_quoted_arg() reverses the substitution.
# Lines with no double quote are left untouched.
# inside, k and s are locals.
function protect_quoted_args(    inside,k,s) {
    if (index($0,"\"") == 0)
        return

    s = $0
    inside = 0
    k = 0
    while (++k <= length(s)) {
        if (substr(s,k,1) == "\"") {
            inside = !inside            # entering or leaving a quoted argument
            continue
        }
        if (inside && (substr(s,k,1) == " "))
            s = substr(s,1,k-1) "\177" substr(s,k+1)
    }
    $0 = s
}
# strtohtml(s): convert one line of nroff/troff text to HTML and return
# it.
#
# In order: troff escapes that have no HTML meaning are removed or
# simplified; the SGML special characters are turned into entities;
# literal spaces and discretionary hyphens are mapped according to the
# HTML level; double quotes are escaped (or, inside a comment, "--" is
# hidden); the Macro[] table and the font escapes are applied; doubled
# backslashes are reduced; and, outside comments, URLs and e-mail
# addresses are wrapped in anchors.
#
# Fix: every entity replacement is spelled out again (&amp; &lt; &gt;
# &#160; &nbsp; &shy; &#34; &quot;).  They had been reduced to the raw
# characters, so & < > were not escaped at all and the double-quote
# replacement was mangled.  In a gsub() replacement the leading "\\"
# keeps & from meaning "the matched text".
# name is a local.
function strtohtml(s,    name) {
    gsub(/\\$/,"",s)                    # discard backslash-newline
    gsub(/\\-/,"-",s)                   # show troff minus as ASCII minus
    gsub(/\\[&]/,"",s)                  # remove no-op macros
    # gsub(/\\[|]/," ",s)               # change thin space to space
    gsub(/\\[|]/,"",s)                  # delete thin space (nroff does too)

    gsub(/[&]/,"\\&amp;",s)             # protect 3 or 4
    gsub(/</,"\\&lt;",s)                # special SGML
    gsub(/>/,"\\&gt;",s)                # characters

    if (HTML == 2) {
        gsub(/\\ /,"\\&#160;",s)        # represent literal space by numeric entity
        gsub(/\\%/,"",s)                # squeeze out discretionary hyphens
    } else if (HTML >= 3) {
        gsub(/\\ /,"\\&nbsp;",s)        # preserve literal spaces
        # NB: several browers fail to implement soft hyphen properly: they show
        # it as an explicit hyphen when the word is not broken at end of line,
        # instead of discarding it.  We translate it correctly, and hope that
        # broken browsers eventually get fixed, sigh...
        gsub(/\\%/,"\\&shy;",s)         # discretionary hyphen -> soft hyphen
    }

    if (In_Comment)
        gsub(/--/,"__",s)               # must hide -- pairs to avoid grammar error
    else if (HTML == "3.2")
        gsub(/\"/,"\\&#34;",s)          # &quot; was left out of HTML 3.2, sigh...
    else
        gsub(/\"/,"\\&quot;",s)         # but other versions, and SGML, have &quot;

    # It is curious that browsers can display a bullet, but there is no
    # HTML markup to represent it, and it is absent from the standard
    # ISO8859-1 fonts
    # gsub(/\\\(bu/,"\\&#164;",s)       # change bullets to general currency sign
    #                                   # &curren; but use numeric code because
    #                                   # xmosaic does not recognize it

    for (name in Macro)                 # substitute macro names
        gsub(name,Macro[name],s)

    s = font_sub(s)

    gsub(/\\\\/,"\\",s)                 # reduce troff doubled backslash to single HTML one

    # if (index(s,"\\") > 0)            # check for anything we missed
    #     warning("Possible unrecognized nroff/troff markup in [" s "]")

    if (!In_Comment) {                  # no link inside comment; otherwise, browser shows text
        s = anchor(s,"HREF",URL_PATTERN,URL_OFFSET,URL_PREFIX,URL_SAVE_LABEL)
        s = anchor(s,"HREF",E_MAIL_PATTERN,E_MAIL_OFFSET,E_MAIL_PREFIX, \
                   E_MAIL_SAVE_LABEL)
    }
    return (s)
}
# Final assembly, run from the END rule.
#
# The body was written to TMPFILE and the table of contents to TOCFILE.
# After closing both, the body is copied to standard output up to (not
# including) the first line that is exactly "<H1>", the table of
# contents is inserted there, and the rest of the body follows.  Both
# scratch files are then removed.
#
# Fixes over the earlier code:
#   * the held-back line is printed only when an "<H1>" line was really
#     found; before, a body with no heading had its last line printed
#     twice
#   * each redirected getline is parenthesised so the comparison with 0
#     cannot be taken as part of the file name expression
# x, y and found are locals.
function terminate(    x,y,found) {
    print_line("</BODY>")
    print_line("</HTML>")
    close (TMPFILE)
    end_toc()

    found = 0
    while ((getline x < TMPFILE) > 0) {
        if (x == "<H1>") {
            found = 1
            break
        }
        print x
    }

    while ((getline y < TOCFILE) > 0)
        print y
    close (TOCFILE)
    delete_file(TOCFILE)

    if (found)
        print x                 # the "<H1>" line held back above
    while ((getline x < TMPFILE) > 0)
        print x
    close (TMPFILE)
    delete_file(TMPFILE)
}
# Undo protect_quoted_args() for a single argument: turn each \177 back
# into a blank and drop one leading and one trailing double quote.
function unprotect_quoted_arg(s) {
    gsub(/\177/," ",s)          # restore spaces
    sub(/^"/,"",s)              # remove leading and
    sub(/"$/,"",s)              # trailing quotes
    return (s)
}
# Print message on /dev/stderr, prefixed with the current input file
# name, the record number within that file, and "%%".
function warning(message) {
    print FILENAME ":" FNR ":%%" message > "/dev/stderr"
}
### ==================================================================== | |
### @Awk-file{ | |
### author = "Nelson H. F. Beebe", | |
### version = "1.06", | |
### date = "24 October 1997", | |
### time = "21:34:34 MDT", | |
### filename = "man2html.awk", | |
### address = "Center for Scientific Computing | |
### University of Utah | |
### Department of Mathematics, 105 JWB | |
### 155 S 1400 E RM 233 | |
### Salt Lake City, UT 84112-0090 | |
### USA", | |
### telephone = "+1 801 581 5254", | |
### FAX = "+1 801 581 4148", | |
### URL = "http://www.math.utah.edu/~beebe", | |
### checksum = "01400 968 2975 23193", | |
### email = "[email protected] (Internet)", | |
### codetable = "ISO/ASCII", | |
### keywords = "nroff, troff, UNIX manual page", | |
### supported = "yes", | |
### docstring = "This program converts UNIX manual pages | |
### in nroff/troff markup to strictly-conformant | |
### HTML 2.0, 3.0, or 3.2. [Actually, only two | |
### HTML 3.x entities (`&nbsp;' and `&shy;') | |
### are used, and those rarely; otherwise, the | |
### syntax conforms strictly to HTML 2.0.] | |
### | |
### Usage: | |
### nawk -f man2html.awk [HTML=2|3|3.2] \ | |
### manpage-file >html-file | |
### | |
### The single option, HTML=2, HTML=3, or | |
### HTML=3.2, selects the HTML grammar level. | |
### The default is HTML=2. | |
### | |
### This program is normally run via a shell | |
### wrapper that offers an option for setting the | |
### output file name. It has been used to | |
### successfully convert entire man-page | |
### collections on several UNIX systems to HTML | |
### form for convenient World-Wide Web browser | |
### access. | |
### | |
### Of those nroff/troff commands defined in the | |
### -man format used for UNIX manual pages, only | |
### the most commonly-used ones are supported; | |
### unrecognized ones will be warned about, and | |
### preserved as HTML comments in the output. | |
### | |
### UNIX man pages tend to be written in a | |
### highly-stylized fashion that we apply | |
### heuristics to in order to recover high-level | |
### HTML structure from low-level nroff/troff | |
### markup. Deviations from conventional | |
### man-page writing practice will likely result | |
### in less-than-perfect translation to HTML. | |
### | |
### Although there are several other `man2html' | |
### translators available on the Internet, this | |
### one is entirely of my own authorship, with no | |
### code borrowing from anywhere else. | |
### | |
### The checksum field above contains a CRC-16 | |
### checksum as the first value, followed by the | |
### equivalent of the standard UNIX wc (word | |
### count) utility output of lines, words, and | |
### characters. This is produced by Robert | |
### Solovay's checksum utility.", | |
### } | |
### ==================================================================== | |
# ----------------------------------------------------------------------
# Main dispatch table.  Every input line is tested against the patterns
# below in the order written; each handler ends with "next", so the
# first matching rule wins.  The specific conditional-request patterns
# therefore have to stay ahead of the more general ones, and the
# catch-all /^[.][A-Za-z]/ rule has to stay after every other request
# pattern.  A line that matches no request pattern is ordinary text.
# ----------------------------------------------------------------------
BEGIN                   { initialize() }

/^[.]ie +t +[.]ds/      { getline }                     # fall through: next line should be .el
/^[.]el +[.]ds/         { define($3); next }            # dot is now matched literally, like every other pattern
/^[.']\\"/              { cmd_comment($0); next }       # save comments
/^[.]if +n *\\\{/       { cmd_comment($0); next }
/^[.]if +t *\\\{/       { cmd_comment_block($0); next } # convert troff directives to comments
/^ *\\\}/               { cmd_comment($0); next }
/^[.]if +t/             { cmd_comment($0); next }       # convert troff directives to comments
/^[.]if +n +[.]ds/      { define($4); next }
/^[.]if +n +[.]ti/      { cmd_comment($0); next }       # convert nroff spacing directives to comments
/^[.]ie +n +[.]ds/      { define($4); next }

/^[.]SH/                { cmd_SH(); next }
/^[.]SS/                { cmd_SS(); next }
/^[.]TH/                { cmd_TH(); next }
/^[.]B /                { cmd_B(); next }
/^[.]I /                { cmd_I(); next }
/^[.]IX /               { cmd_IX(); next }
/^[.]R /                { cmd_R(); next }
/^[.]ad/                { cmd_ad(); next }
/^[.][BIR]$/            { cmd_BIR(); next }
/^[.]BI /               { cmd_XY("B","I"); next }
/^[.]br/                { cmd_br(); next }
/^[.]BR /               { cmd_XY("B","R"); next }
/^[.]ce[ 0-9]*$/        { cmd_ce(); next }
/^[.]hw/                { cmd_hw(); next }
/^[.]IB /               { cmd_XY("I","B"); next }
/^[.]IR /               { cmd_XY("I","R"); next }
/^[.]ne/                { cmd_ne(); next }
/^[.]RB /               { cmd_XY("R","B"); next }
/^[.]RI /               { cmd_XY("R","I"); next }
/^[.]nf/                { cmd_nf(); next }
/^[.]fi/                { cmd_fi(); next }
/^[.]IP/                { cmd_IP(); next }
/^[.]LP/                { cmd_LP(); next }
/^[.]na/                { cmd_na(); next }
/^[.]PP/                { cmd_PP(); next }
/^[.]RE/                { cmd_RE(); next }
/^[.]RS/                { cmd_RS(); next }
/^[.]sp/                { cmd_sp(); next }
/^[.]TP/                { cmd_TP(); next }
/^[.]TS/                { cmd_TS(); next }
/^[.][A-Za-z]/          { cmd_unknown(); next }         # any other request: warn and keep as a comment

                        { print_line(strtohtml($0)) }   # ordinary text line

END                     { terminate() }
# The anchor() function is adapted from my bibtex-to-html.awk file | |
# anchor(s, type, pattern, offset, prefix, save_label)
#
# Return s with every substring that matches pattern wrapped in
# <A type="prefix...">...</A>.
#
#   s          text to scan
#   type       anchor attribute name ("HREF" or "NAME"); for "NAME" the
#              anchored text is additionally wrapped in <STRONG>
#   pattern    regular expression (as a string) locating the text
#   offset     number of leading characters of each match that are
#              context only and stay outside the anchored region
#   prefix     text placed before the matched string inside the
#              attribute value (e.g. "mailto:")
#   save_label passed through the recursion unchanged; not otherwise
#              used in this function
#
# The remaining names in the parameter list are locals.
function anchor(s,type,pattern,offset,prefix,save_label,   name,rstart,rlength,save) {
    # An empty match would make the recursion below restart on the same
    # text forever, so treat it like "no match".
    if (!match(s,pattern) || RLENGTH == 0)
        return (s)

    # Private copies of RSTART/RLENGTH are needed because the recursive
    # call runs match() again and overwrites the globals.
    rstart = RSTART + offset            # skip leading context in pattern
    rlength = RLENGTH - offset

    name = substr(s,rstart,rlength)
    sub(/ +at +/,"@",name)              # reduce "user at host" to "user@host"

    # Text before the match, the anchored match, then the (recursively
    # processed) remainder of the string.
    return (substr(s,1,rstart-1) \
            "<A " type "=\"" prefix name "\">" \
            ((type == "NAME") ? "<STRONG>" : "") \
            substr(s,rstart,rlength) \
            ((type == "NAME") ? "</STRONG>" : "") \
            "</A>" \
            anchor(substr(s,rstart+rlength),type,pattern,offset,prefix,save_label))
}
# Write the opening of the table of contents (heading plus the start of
# the top-level <UL>) to the TOC file and note that no TOC item is open.
function begin_toc() {
    In_TOC_Item = 0
    print_toc("<H1>")
    print_toc("Table of contents")
    print_toc("</H1>")
    print_toc("<UL>")
}
# .ad: turn on adjust (flush-left-and-right justification).
# There is no HTML equivalent, so the request is kept as a comment.
function cmd_ad() {
    cmd_comment($0)
}
# .B text: emit the request's text wrapped in <STRONG>...</STRONG>.
#
# Two input forms are handled:
#   .B "quoted text"   everything after the opening quote, with trailing
#                      quotes and blanks removed
#   .B word word ...   ALL words after the request (the earlier code used
#                      only $2 and silently dropped the rest)
# s is a local.
function cmd_B(    s) {
    end_font()
    if (match($0,/^[.]B *\"/)) {
        s = substr($0,RSTART+RLENGTH)
        gsub(/[" ]*$/,"",s)
    } else {
        s = substr($0,4)                # text following ".B "
        gsub(/^[ \t]+|[ \t]+$/,"",s)    # trim surrounding blanks
    }
    print_line("<STRONG>" strtohtml(s) "</STRONG>")
}
# Bare .B, .I or .R with no argument: close any open font, then let
# strtohtml() process the equivalent inline escape (\fB, \fI or \fR),
# built from the request's letter.
function cmd_BIR() {
    end_font()
    print_line(strtohtml("\\f" substr($0,2,1)))
}
# .br: handled exactly like a paragraph break by delegating to cmd_PP().
function cmd_br() {
    cmd_PP()
}
# .ce nnn: centre the next nnn input lines (nnn = 0 turns centring off).
#
# The request itself is preserved as an HTML comment.  Each of the next
# nnn lines is converted with strtohtml() and followed by <BR>; for
# HTML level 3.2 the group is wrapped in <CENTER>...</CENTER>.
#
# The HTML 3.2 grammar supports <CENTER> ... </CENTER> as a shorthand
# for the more general <DIV ALIGN=CENTER> ... </DIV> (CENTER can be
# replaced by LEFT or RIGHT).  However, except for amaya (W3C's testbed
# for HTML 3.2), none of the current browsers support DIV.  grail,
# hotjava, netscape all recognize CENTER.  arena, chimera, lynx, and
# xmosaic do not recognize it either.
#
# Changes from the earlier code:
#   * a bare ".ce" now centres one line (the troff default) instead of
#     being treated as zero
#   * reading stops at end of input instead of re-emitting the last line
# k and n are locals.
function cmd_ce(    k,n) {
    n = (NF >= 2) ? $2 + 0 : 1
    cmd_comment($0)
    if (n <= 0)
        return
    if (HTML == "3.2")
        print_line("<CENTER>")
    for (k = 1; k <= n; ++k) {
        if ((getline) <= 0)             # ran out of input
            break
        print_line(strtohtml($0) "<BR>")
    }
    if (HTML == "3.2")
        print_line("</CENTER>")
}
# Emit s as an HTML comment.
#
# A leading troff comment prefix (.\" or '\") is stripped first because
# it confuses html-pretty.  In_Comment is raised while strtohtml() runs
# so that it applies its comment-specific handling, then cleared.
function cmd_comment(s) {
    sub(/^[.']\\"/,"",s)
    In_Comment = 1
    print_line("<!-- " strtohtml(s) " -->")
    In_Comment = 0
}
# Turn a whole troff-only conditional block into HTML comments: the
# opening line s, then every following input line up to and including
# the one that closes the block with \}.  Lines are read into s, so $0
# is left alone.
function cmd_comment_block(s) {
    cmd_comment(s)
    In_Comment = 1
    for (;;) {
        if ((getline s) <= 0)
            break                       # end of input
        cmd_comment(s)
        if (s ~ /^ *\\\}/)
            break                       # found end of block
    }
    In_Comment = 0
}
# .I text: emit the request's text wrapped in <EM>...</EM>.
#
# Two input forms are handled:
#   .I "quoted text"   everything after the opening quote, with trailing
#                      quotes and blanks removed
#   .I word word ...   ALL words after the request (the earlier code used
#                      only $2 and silently dropped the rest)
# s is a local.
function cmd_I(    s) {
    end_font()
    if (match($0,/^[.]I *\"/)) {
        s = substr($0,RSTART+RLENGTH)
        gsub(/[" ]*$/,"",s)
    } else {
        s = substr($0,4)                # text following ".I "
        gsub(/^[ \t]+|[ \t]+$/,"",s)    # trim surrounding blanks
    }
    print_line("<EM>" strtohtml(s) "</EM>")
}
# .IX index entries are simply discarded.  While the current line ends
# in a backslash, the following (continuation) line is read and
# discarded as well.
function cmd_IX() {
    while (match($0,/\\$/)) {
        if ((getline) <= 0)
            break
    }
}
# .fi: resume fill mode.  Closes the <PRE> environment if one is open;
# otherwise the request is only recorded as a comment.
function cmd_fi() {
    end_font()
    if (!In_PRE) {
        cmd_comment($0)
        return
    }
    print_line("</PRE>")
    In_PRE = 0
}
# .hw word-hyph-en-a-tion ex-cep-tions: kept only as a comment.
function cmd_hw() {
    cmd_comment($0)
}
# .IP: start a new paragraph.
#
# Closes any open font, counts the paragraph in PP, and emits <P>.
# Inside a <PRE> environment a blank line is emitted instead, because
# <P> tags are illegal in <PRE>...</PRE>; this matches what cmd_PP()
# does (the earlier code emitted <P> unconditionally).
function cmd_IP() {
    end_font()
    PP++
    if (In_PRE)
        print_line("")
    else
        print_line("<P>")
}
# .LP: start a new paragraph.
#
# Closes any open font, counts the paragraph in PP, and emits <P>.
# Inside a <PRE> environment a blank line is emitted instead, because
# <P> tags are illegal in <PRE>...</PRE>; this matches what cmd_PP()
# does (the earlier code emitted <P> unconditionally).
function cmd_LP() {
    end_font()
    PP++
    if (In_PRE)
        print_line("")
    else
        print_line("<P>")
}
# cmd_na(): ".na" (no adjust) turns off flush-left-and-right
# justification, producing ragged-right text.  There is no HTML
# equivalent, so the request is kept only as an HTML comment.
function cmd_na()
{
    cmd_comment($0)
}
# cmd_ne(): ".ne dimen" asks for dimen of vertical space before the end
# of the page, forcing a page break otherwise (e.g. to prevent page
# breaks after headings).  The request is kept only as an HTML comment.
function cmd_ne()
{
    cmd_comment($0)
}
# cmd_nf(): handle ".nf" (no fill).
#
# Opens a <PRE> environment and sets In_PRE, unless one is already open,
# in which case the request is merely recorded as an HTML comment.
function cmd_nf()
{
    end_font()
    if (!In_PRE)
    {
        print_line("<PRE>")
        In_PRE = 1
        return
    }
    cmd_comment($0)
}
# cmd_PP(): handle ".PP" (new paragraph).
#
# Closes open font markup, counts the paragraph in PP, emits the
# paragraph break, and then closes the current .TP list level via
# end_TP().
function cmd_PP()
{
    end_font()
    PP++
    # <P> tags are illegal in <PRE>...</PRE> environments, so an empty
    # line stands in for the paragraph break there.
    print_line(In_PRE ? "" : "<P>")
    end_TP()
}
# cmd_R(): handle ".R text", emitting the text without font markup.
#
# For the quoted form (.R "some text") everything after the opening
# quote is used, with trailing quotes and spaces removed; otherwise only
# the second field is emitted.
#
# Local: text (the quoted argument).
function cmd_R(    text)
{
    end_font()
    if (!match($0, /^[.]R *\"/))
    {
        print_line(strtohtml($2))
        return
    }
    text = substr($0, RSTART + RLENGTH)
    gsub(/[" ]*$/, "", text)
    print_line(strtohtml(text))
}
# cmd_RE(): handle ".RE" (end of a relative indent started by .RS).
#
# Closes open font markup and any open <PRE>, unwinds .TP lists back to
# the list level recorded for the current .RS nesting level, and then
# closes the <BLOCKQUOTE>.
#
# An unmatched .RE (RSE_Level already 0) has no <BLOCKQUOTE> to close;
# emitting the end tag anyway would produce unbalanced HTML, so the
# request is recorded as an HTML comment instead.
function cmd_RE()
{
    end_font()
    if (In_PRE)         # should not happen, but some man pages
        cmd_fi()        # are irregular
    while (List_Level > RSE_List_Level[RSE_Level])
        end_TP()
    if (RSE_Level > 0)
    {
        RSE_Level--
        print_line("</BLOCKQUOTE>")
    }
    else
        cmd_comment($0)
}
# cmd_RS(): handle ".RS" (start of a relative indent).
#
# Pushes a new RSE_Level, remembers the list level in effect at that
# point in RSE_List_Level[], opens a fresh .TP list level, and emits
# <BLOCKQUOTE>.
function cmd_RS()
{
    end_font()
    RSE_Level++
    RSE_List_Level[RSE_Level] = List_Level
    List_Level++                    # new .TP level too
    print_line("<BLOCKQUOTE>")
}
# cmd_SH(): ".SH" section heading; rendered as an <H1> heading by
# cmd_SH_SS().  The local s is unused.
function cmd_SH(    s)
{
    cmd_SH_SS("H1")
}
# cmd_SS(): ".SS" subsection heading; rendered as an <H2> heading by
# cmd_SH_SS().  The local s is unused.
function cmd_SS(    s)
{
    cmd_SH_SS("H2")
}
# cmd_SH_SS(tag): emit a section (tag "H1") or subsection (tag "H2")
# heading, together with its table-of-contents entry.
#
# The heading text is taken from $0 starting at column 5, with one pair
# of surrounding double quotes removed.  Each heading gets an anchor
# named HDR.<H1>[.<H2>], and the same label is used for the link written
# to the table of contents.  Open fonts, .RS indents, and .TP lists are
# all closed before the heading is written.
#
# Globals updated: H1, H2, SH_SS_count, In_TOC_Item.
# Local: s (the converted heading text).
function cmd_SH_SS(tag,    s)
{
    if (!TH_seen)                   # should not happen, but some man pages are
        cmd_TH(substr($0,5))        # irregular
    end_font()
    while (RSE_Level > 0)
        cmd_RE()
    while (List_Level > 0)
        end_TP()

    if (tag == "H2")
        H2++
    else if (tag == "H1")
    {
        H1++
        if (H1 == 1)
            begin_toc()
        if (H2 > 0)
        {
            # the previous section had subsections: close their TOC list
            print_toc("</LI>")
            print_toc("</UL>")
        }
        H2 = 0
        if (H1 > 1)
            print_line("<HR>")      # a separating horizontal rule is a nice touch
    }

    s = substr($0,5)
    sub(/^ *\"/, "", s)
    sub(/\" *$/, "", s)
    s = strtohtml(s)

    SH_SS_count = "." H1 ((H2 > 0) ? ("." H2) : "")

    # heading in the document body
    print_line("<" tag ">")
    print_line("<A NAME=\"HDR" SH_SS_count "\">")
    print_line(s)
    print_line("</A>")
    print_line("</" tag ">")

    # matching entry in the table of contents: the first subsection of a
    # section opens a nested list, any other heading first closes the
    # previous item
    if (H2 == 1)
        print_toc("<UL>")
    else if (In_TOC_Item)
        print_toc("</LI>")
    In_TOC_Item = 1
    print_toc("<LI>")
    print_toc("<A HREF=\"#HDR" SH_SS_count "\">")
    print_toc(s)
    print_toc("</A>")
}
# cmd_sp(): ".sp nnn" requests vertical space.  There is no sensible
# HTML equivalent, so the request is kept only as an HTML comment.
function cmd_sp()
{
    cmd_comment($0)
}
# cmd_TH(): handle ".TH" by writing the HTML document prologue.
#
# Emits <HTML>, a <HEAD> whose <TITLE> is the current input line from
# column 4 onward (with backslash-continued lines joined), a LINK naming
# LOGNAME@HOSTNAME as the maker, and the opening <BODY>.  TH_seen is set
# so that the prologue is written only once.
#
# Local: line (the joined title line).  Any value a caller passes for it
# is ignored, because it is immediately overwritten with $0.
function cmd_TH(    line)
{
    end_font()
    print_line("<HTML>")
    print_line("<HEAD>")
    print_line("<TITLE>")
    line = $0
    while (line ~ /\\$/)
    {
        # At end of input getline leaves $0 unchanged, so without this
        # check a file ending in a continued line would loop forever.
        if ((getline) <= 0)
            break
        line = substr(line, 1, length(line) - 1) $0
    }
    print_line(strtohtml(substr(line, 4)))
    print_line("</TITLE>")
    print_line("<LINK REV=\"made\" HREF=\"mailto:" LOGNAME "@" HOSTNAME "\">")
    print_line("</HEAD>")
    print_line("")
    print_line("<BODY>")
    print_line("")
    TH_seen = 1
}
# cmd_TP(): handle ".TP" (tagged paragraph / list item).
#
# The line after the request is the item label, usually "\(bu" or a font
# request such as ".B ...".  When the current list level has no items
# yet, a new level is opened: a label of exactly "\(bu" selects a
# <UL>/<LI> list, anything else a <DL>/<DT> list.  For <DL> lists the
# label is emitted inside <DT> ... </DT> and a <DD> is opened for the
# item body; for <UL> lists only <LI> is emitted.  The previous item of
# the same list, if any, is closed first.
#
# Globals updated: List_Level, Item_Count[], List_Name[], List_Item[].
function cmd_TP()
{
    end_font()

    # Read the label line.  At end of input there is no label; returning
    # here avoids treating the .TP request itself as the label.
    if ((getline) <= 0)
        return

    if (Item_Count[List_Level] == 0)    # then first item of new list
    {
        List_Level++
        Item_Count[List_Level] = 0
        if ($0 == "\\(bu")
        {
            List_Name[List_Level] = "UL"
            List_Item[List_Level] = "LI"
        }
        else
        {
            List_Name[List_Level] = "DL"
            List_Item[List_Level] = "DT"
        }
        print_line("<" List_Name[List_Level] ">")
    }

    Item_Count[List_Level]++
    if (List_Name[List_Level] == "DL")
    {
        if (Item_Count[List_Level] > 1)
            print_line("</DD>")
        print_line("<DT>")
        if ($0 ~ /^[.]B /)
            cmd_B()
        else if ($0 ~ /^[.]I /)
            cmd_I()
        else if ($0 ~ /^[.]R /)
            cmd_R()
        else if ($0 ~ /^[.](BR|BI|IB|IR|RB|RI)/)
            # alternating-font request: the two letters after the dot
            # name the fonts for even- and odd-numbered fields
            cmd_XY(substr($0, 2, 1), substr($0, 3, 1))
        else
            print_line(strtohtml($0))
        end_font()
        if (In_PRE)         # should not happen, but some man pages
            cmd_fi()        # are irregular
        print_line("</DT>")
        print_line("<DD>")
    }
    else    # must be <UL> <LI> ... </LI> </UL> type list
    {
        if (Item_Count[List_Level] > 1)
            print_line("</LI>")
        print_line("<LI>")
    }
}
# cmd_TS(): handle a ".TS" ... ".TE" table.
#
# The table source, from the current line through the first line that
# starts with .TE (or end of input), is copied to TBLFILE.  That file is
# then formatted with tbl, nroff, and col, and the resulting text is
# included in the output as a <PRE> environment.  TBLFILE is removed
# afterwards.
#
# Local: pipeline (the shell command that formats the table).
function cmd_TS(    pipeline)
{
    # Copy the table to a temporary file
    print $0 > TBLFILE
    for (;;)
    {
        if ((getline) <= 0)
            break
        print $0 > TBLFILE
        if ($0 ~ /^[.]TE/)      # then end of table found
            break
    }
    close(TBLFILE)

    # Run tbl, nroff, and col to convert the table to formatted text
    pipeline = "tbl " TBLFILE " | nroff -man | col -b"
    print_line("<PRE>")
    while ((pipeline | getline) > 0)
        print_line(strtohtml($0))
    print_line("</PRE>")
    close(pipeline)

    delete_file(TBLFILE)
}
# cmd_unknown(): fallback for requests this program does not recognize.
#
# A warning naming the offending line is issued, and the line is kept in
# the output as an HTML comment.
function cmd_unknown()
{
    end_font()
    warning("Unrecognized nroff/troff command in [" $0 "] changed to comment")
    cmd_comment($0)
}
# cmd_XY(x, y): handle the alternating-font requests (.BR, .BI, .IB,
# .IR, .RB, .RI).
#
# x and y are troff font letters looked up in Font_Map.  Fields 2..NF of
# the request line are written to TMPFILE with no separator between
# them, each wrapped in the HTML tags of its font: even-numbered fields
# use x, odd-numbered fields use y.  Quoted arguments containing spaces
# are kept together by protect_quoted_args()/unprotect_quoted_arg().
#
# Locals: font (HTML tag name for the current field), k (field index).
function cmd_XY(x, y,    font, k)
{
    end_font()
    protect_quoted_args()
    k = 2
    while (k <= NF)
    {
        if (k % 2)
            font = Font_Map[y]
        else
            font = Font_Map[x]
        printf("%s%s%s", html_font_begintag(font), strtohtml(unprotect_quoted_arg($k)), \
               html_font_endtag(font)) > TMPFILE
        k++
    }
    print_line("")      # terminate the line assembled above
}
# define(name): record a troff string definition found on the current
# input line.
#
# Typical values:
#     .if n .ds Bi BibTeX
#     .el   .ds Bi BibTeX
# The string is used in the text as \*(Bi, so it is stored in Macro[]
# under a regular expression that matches that reference.  The value is
# the rest of $0 starting three characters after the first occurrence of
# name (that is, past a two-character name and one separator).
#
# Local: key (the regular expression used as the Macro[] index).
function define(name,    key)
{
    key = "\\\\\\*\\(" name
    Macro[key] = substr($0, index($0, name) + 3)
}
# delete_file(s): remove the file named s by running "/bin/rm -f".
# The name is pasted into the shell command as is, without quoting.
function delete_file(s)
{
    system("/bin/rm -f " s)
}
# end_font(): close every font that is still open.
#
# Emits the end tag for each entry of HTML_Font_Name[] from Font_Level
# down to 1, leaving Font_Level at 0.
function end_font()
{
    while (Font_Level > 0)
    {
        print_line(html_font_endtag(HTML_Font_Name[Font_Level]))
        Font_Level--
    }
}
# end_toc(): finish the table of contents.
#
# Closes the last item and the list, adds a horizontal rule, and closes
# TOCFILE so that it can be read back.
function end_toc()
{
    print_toc("</LI>")
    print_toc("</UL>")
    print_toc("<HR>")
    close(TOCFILE)
}
# end_TP(): close the current .TP list level.
#
# If the level holds any items, its last item and the list itself are
# closed (</DD></DL> for a DL list, </LI></UL> otherwise).  The item
# count of the level is reset, and List_Level is decremented unless it
# is already 0.
function end_TP()
{
    if (Item_Count[List_Level] > 0)
    {
        print_line((List_Name[List_Level] == "DL") ? "</DD>" : "</LI>")
        print_line((List_Name[List_Level] == "DL") ? "</DL>" : "</UL>")
    }
    Item_Count[List_Level] = 0
    if (List_Level > 0)
        List_Level--
}
# font_sub(s): replace troff font escapes (\fB, \fC, \fI, \fP, \fR, \fS,
# \fT) in s by HTML font tags, and return the result.
#
# \fP closes the current font.  Any other escape selects the font given
# by Font_Map[]; if a font is already open, it is closed first and
# replaced, so ...\fB...\fR... sequences stay properly nested.
#
# Globals updated: Font_Level, HTML_Font_Name[].
# Local: markup (the HTML text substituted for one escape).
function font_sub(s,    markup)
{
    while (match(s, /\\f[BCIPRST]/))
    {
        if (substr(s, RSTART + 2, 1) != "P")    # set explicit font
        {
            Font_Level++
            HTML_Font_Name[Font_Level] = Font_Map[substr(s, RSTART + 2, 1)]
            markup = html_font_begintag(HTML_Font_Name[Font_Level])
            if (Font_Level > 1)
            {
                # Handle ...\fB...\fR... style by ending previous font
                markup = html_font_endtag(HTML_Font_Name[Font_Level - 1]) markup
                HTML_Font_Name[Font_Level - 1] = HTML_Font_Name[Font_Level]
                Font_Level--
            }
        }
        else                                    # revert to previous font
        {
            markup = html_font_endtag(HTML_Font_Name[Font_Level])
            if (Font_Level > 0)
                Font_Level--
        }
        # splice the markup in place of the three-character escape
        s = substr(s, 1, RSTART - 1) markup substr(s, RSTART + 3)
    }
    return (s)
}
# html_font_begintag(name): return the start tag "<name>", or the empty
# string when name is empty (a font with no HTML markup).
function html_font_begintag(name)
{
    return (name == "") ? "" : ("<" name ">")
}
# html_font_endtag(name): return the end tag "</name>", or the empty
# string when name is empty (a font with no HTML markup).
function html_font_endtag(name)
{
    return (name == "") ? "" : ("</" name ">")
}
# initialize(): one-time setup of all global state, followed by output
# of the file header.
#
# Sets the version strings; obtains LOGNAME, HOSTNAME, and DATE from
# shell commands; validates the requested HTML level (2, 3, or 3.2,
# defaulting to 2); fills the troff-font-to-HTML-tag table Font_Map[]
# and the escape-to-replacement table Macro[]; names the temporary
# files; and defines the patterns used to turn URLs and e-mail
# addresses into anchors.
#
# Macro[] is indexed by regular expressions, and its values are used as
# gsub() replacement text.  An ampersand that is to appear literally in
# the output must therefore be written as \\& in the string constants
# below.
function initialize()
{
    # Change these two lines whenever the program is modified
    VERSION_NUMBER = "1.06"
    VERSION_DATE = "[24-Oct-1997]"
    VERSION = "Version " VERSION_NUMBER " " VERSION_DATE

    # Each command is run once; close it so the pipe is not left open
    # for the rest of the run.
    "echo $LOGNAME" | getline LOGNAME
    close("echo $LOGNAME")
    "hostname" | getline HOSTNAME
    close("hostname")
    "date" | getline DATE
    close("date")

    if (HTML == "")
        HTML = 2
    if ((HTML != 2) && (HTML != 3) && (HTML != "3.2"))
    {
        warning("Unsupported HTML level " HTML " requested: defaulting to HTML level 2")
        HTML = 2
    }

    Font_Map["B"] = "STRONG"
    Font_Map["C"] = "TT"
    Font_Map["I"] = "EM"
    Font_Map["R"] = ""
    Font_Map["S"] = ""              # cannot map symbol font yet
    Font_Map["T"] = "TT"

    Macro["\\\\e"] = "\\"
    if (HTML == 2)
        Macro["\\\\0"] = "\\&#160;" # change non-breakable space to numeric entity
    else if (HTML >= 3)
        Macro["\\\\0"] = "\\&nbsp;" # can finally use named entity
    else
        # warning() takes a single argument, so the level is appended
        # to the message rather than passed separately
        warning("No conversion implemented for \\\\0 (non-breakable space) in HTML level " HTML)

    # NOTE(review): these fixed names are shared by every run of the
    # program on the same machine, so concurrent runs would overwrite
    # each other's files.
    TOCFILE = "/tmp/man2html.toc"
    TBLFILE = "/tmp/man2html.tbl"
    TMPFILE = "/tmp/man2html.tmp"

    H1 = 0
    H2 = 0

    Macro["\\\\\\(bu"] = "\\&#164;" # bullet -> general currency sign (numeric code)
    Macro["\\\\\\(em"] = "---"
    Macro["\\\\\\(en"] = "--"

    # The following fragment for setting URL_xxx variables
    # is borrowed intact from my bibtex-to-html.awk file:
    #
    # According to Internet RFC 1614 (May 1994), a URL is
    # defined in the document T. Berners-Lee, ``Uniform
    # Resource Locators'', March 1993, available at URL
    # ftp://info.cern.ch/pub/ietf/url4.ps.  Unfortunately,
    # that address is no longer valid.  However, I was able to
    # track down pointers from http://www.w3.org/ to locate a
    # suitable description in Internet RFC 1630 (June 1994).
    # NB: We additionally disallow & in a URL because it is
    # needed in SGML entities "&name;".  We also disallow =
    # and | because these are commonly used in \path=...= and
    # \path|...| strings in BibTeX files.  These restrictions
    # could be removed if we went to the trouble of first
    # encoding these special characters in %xy hexadecimal
    # format, but they are rare enough that I am not going to
    # do so for now.  The worst that will happen from this
    # decision is that an occasional URL in a BibTeX file will
    # be missing a surrounding anchor.
    # Bug fix [24-Oct-1997]: Add < and > to the set of excluded
    # characters, to avoid incorrectly including SGML markup inside a
    # URL.  Before this fix, "\fChttp://www/\fP" got translated
    # incorrectly to
    #     <TT><A HREF="http://www/</TT>">http://www/</TT></A>
    # instead of the correct
    #     <TT><A HREF="http://www">http://www</A></TT>
    URL_PATTERN = "[A-Za-z]+://[^ \",&=|<>]+"
    URL_OFFSET = 0
    URL_PREFIX = ""
    URL_SAVE_LABEL = 0

    E_MAIL_PATTERN = "[A-Za-z0-9_-]+@[A-Za-z0-9-]+([.][A-Za-z0-9-]+)*"
    E_MAIL_OFFSET = 0
    E_MAIL_PREFIX = "mailto:"
    E_MAIL_SAVE_LABEL = 0

    print_header()
}
# print_header(): write the leading comment block and the DOCTYPE
# declaration that matches the selected HTML level.
function print_header()
{
    print_line("<!-- Warning: Do NOT edit this file. -->")
    print_line("<!-- It was created automatically by man2html.awk " VERSION " on " DATE " -->")
    print_line("<!-- from the file " strtohtml(FILENAME) " at " HOSTNAME " -->")
    print_line("")

    if (HTML == "3.2")      # HTML 3.2 released 5-Nov-1996 at http://www.w3.org/pub/WWW
        print_line("<!DOCTYPE HTML public \"-//W3C//DTD HTML 3.2//EN\">")
    else if (HTML == 3)     # We need level 3 HTML only because of our use of &nbsp; and &shy;
        print_line("<!DOCTYPE HTML public \"-//IETF//DTD HTML 3.0//EN\">")
    else if (HTML == 2)
        print_line("<!DOCTYPE HTML public \"-//IETF//DTD HTML//EN\">")
}
# print_line(s): append s as one line of the document body, which is
# accumulated in TMPFILE.
function print_line(s)
{
    print s > TMPFILE
}
# print_toc(s): append s as one line of the table of contents, which is
# accumulated in TOCFILE.
function print_toc(s)
{
    print s > TOCFILE
}
# protect_quoted_args(): make each double-quoted argument on the current
# input line a single awk field.
#
# Every space that lies between a pair of double quotes in $0 is
# replaced by the DEL character (\177), and $0 is reassigned so that the
# fields are split again.  The quotes themselves are kept;
# unprotect_quoted_arg() undoes the change for one field.  A line
# without any double quote is left untouched.
#
# Locals: piece (the line split at double quotes), n (number of pieces),
# k (piece index), s (the rebuilt line).
function protect_quoted_args(    piece, n, k, s)
{
    if (index($0, "\"") == 0)
        return

    # Splitting at the quotes puts the text outside quotes in the
    # odd-numbered pieces and the text inside quotes in the even ones.
    n = split($0, piece, "\"")
    s = piece[1]
    for (k = 2; k <= n; ++k)
    {
        if (k % 2 == 0)
            gsub(/ /, "\177", piece[k])
        s = s "\"" piece[k]
    }
    $0 = s
}
# strtohtml(s): convert one line of troff text to HTML and return it.
#
# Steps, in order:
#   1. remove or simplify troff escapes that have no HTML counterpart;
#   2. protect the characters that are special in SGML (&, <, >, and
#      outside comments also ") by turning them into entities;
#   3. translate \<space> and \% according to the HTML level;
#   4. inside comments, hide "--" pairs, which would end the comment;
#   5. expand the escapes and user-defined strings held in Macro[];
#   6. translate font escapes with font_sub();
#   7. outside comments, wrap URLs and e-mail addresses in anchors.
#
# In a gsub() replacement string a bare & stands for the matched text,
# so every literal ampersand of an entity is written as \\& below.
#
# Local: name (index variable for the loop over Macro[]).
function strtohtml(s,    name)
{
    gsub(/\\$/, "", s)          # discard backslash-newline
    gsub(/\\-/, "-", s)         # show troff minus as ASCII minus
    gsub(/\\[&]/, "", s)        # remove no-op macros
    # gsub(/\\[|]/," ",s)       # change thin space to space
    gsub(/\\[|]/, "", s)        # delete thin space (nroff does too)

    gsub(/[&]/, "\\&amp;", s)   # protect 3 or 4
    gsub(/</, "\\&lt;", s)      # special SGML
    gsub(/>/, "\\&gt;", s)      # characters

    if (HTML == 2)
    {
        gsub(/\\ /, "\\&#160;", s)  # represent literal space by numeric entity
        gsub(/\\%/, "", s)          # squeeze out discretionary hyphens
    }
    else if (HTML >= 3)
    {
        gsub(/\\ /, "\\&nbsp;", s)  # preserve literal spaces
        # NB: several browsers fail to implement soft hyphen properly:
        # they show it as an explicit hyphen when the word is not broken
        # at end of line, instead of discarding it.  We translate it
        # correctly, and hope that broken browsers eventually get fixed,
        # sigh...
        gsub(/\\%/, "\\&shy;", s)   # discretionary hyphen -> soft hyphen
    }

    if (In_Comment)
        gsub(/--/, "__", s)         # must hide -- pairs to avoid grammar error
    else if (HTML == "3.2")
        gsub(/\"/, "\\&#34;", s)    # &quot; was left out of HTML 3.2, sigh...
    else
        gsub(/\"/, "\\&quot;", s)   # but other versions, and SGML, have &quot;

    # It is curious that browsers can display a bullet, but there is no
    # HTML markup to represent it, and it is absent from the standard
    # ISO8859-1 fonts.  Bullets are therefore changed to the general
    # currency sign through the Macro[] entry for \(bu, using its
    # numeric code because xmosaic does not recognize the named entity.
    for (name in Macro)             # substitute macro names
        gsub(name, Macro[name], s)

    s = font_sub(s)
    gsub(/\\\\/, "\\", s)   # reduce troff doubled backslash to single HTML one
    # if (index(s,"\\") > 0)  # check for anything we missed
    #     warning("Possible unrecognized nroff/troff markup in [" s "]")

    if (!In_Comment)    # no link inside comment; otherwise, browser shows text
    {
        s = anchor(s, "HREF", URL_PATTERN, URL_OFFSET, URL_PREFIX, URL_SAVE_LABEL)
        s = anchor(s, "HREF", E_MAIL_PATTERN, E_MAIL_OFFSET, E_MAIL_PREFIX, \
                   E_MAIL_SAVE_LABEL)
    }
    return (s)
}
# terminate(): finish the document and assemble the final output.
#
# The body has been accumulated in TMPFILE and the table of contents in
# TOCFILE.  After both are completed and closed, the body is copied to
# standard output up to (but not including) the first line that is
# exactly "<H1>", the table of contents is inserted, and the rest of the
# body follows.  Both temporary files are then removed.
#
# Locals: x (current body line), y (current table-of-contents line),
# found (true once the first <H1> line has been seen).
function terminate(    x, y, found)
{
    print_line("</BODY>")
    print_line("</HTML>")
    close(TMPFILE)
    end_toc()

    # The getline expressions are parenthesized because POSIX leaves
    # "getline x < file > 0" without parentheses unspecified.
    found = 0
    while ((getline x < TMPFILE) > 0)
    {
        if (x == "<H1>")
        {
            found = 1
            break
        }
        print x
    }

    while ((getline y < TOCFILE) > 0)
        print y
    close(TOCFILE)
    delete_file(TOCFILE)

    # x holds the pending <H1> line only when the first loop stopped at
    # one.  Otherwise it is the last body line, already printed above,
    # and must not be printed a second time.
    if (found)
        print x
    while ((getline x < TMPFILE) > 0)
        print x
    close(TMPFILE)
    delete_file(TMPFILE)
}
# unprotect_quoted_arg(s): undo protect_quoted_args() for one argument.
#
# DEL characters (\177) are turned back into spaces, and one leading and
# one trailing double quote are removed.  Returns the restored text.
function unprotect_quoted_arg(s)
{
    gsub(/\177/, " ", s)    # restore spaces
    sub(/^"/, "", s)        # strip the opening quote
    sub(/"$/, "", s)        # strip the closing quote
    return s
}
# warning(message): report message on standard error, prefixed with the
# current input file name and line number in the form "file:line:%%".
function warning(message)
{
    print (FILENAME ":" FNR ":%%" message) > "/dev/stderr"
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment