7c0faeb3e81136caed9ed2d425ab3c76cd62c58a
[shellsnippets/shellsnippets.git] / mksh / roff2htm
1 # $MirOS: src/scripts/roff2htm,v 1.58 2009/02/17 12:55:22 tg Exp $
2 # $ekkoBSD: catman2html.sh,v 1.2 2004/03/07 03:02:53 stephen Exp $
3 #-
4 # Copyright (c) 2004, 2005, 2006, 2007
5 #       Thorsten “mirabilos” Glaser <tg@mirbsd.de>
6 # Original version for ekkoBSD by:
7 # Copyright (c) 2004
8 #       Stephen Paskaluk <sap@mirbsd.org>
9 # Parts of the regular expression set below are based upon work by:
10 # Copyright (c) 1995
11 #       Panagiotis J. Christias <christia@theseas.ntua.gr>
12 #
13 # Provided that these terms and disclaimer and all copyright notices
14 # are retained or reproduced in an accompanying document, permission
15 # is granted to deal in this work without restriction, including un-
16 # limited rights to use, publicly perform, distribute, sell, modify,
17 # merge, give away, or sublicence.
18 #
19 # Advertising materials mentioning features or use of this work must
20 # display the following acknowledgement:
21 #       This product includes material provided by Thorsten Glaser.
22 #
23 # This work is provided “AS IS” and WITHOUT WARRANTY of any kind, to
24 # the utmost extent permitted by applicable law, neither express nor
25 # implied; without malicious intent or gross negligence. In no event
26 # may a licensor, author or contributor be held liable for indirect,
27 # direct, other damage, loss, or other issues arising in any way out
28 # of dealing in the work, even if advised of the possibility of such
29 # damage or existence of a defect, except proven that it results out
30 # of said person's immediate fault when using the work as intended.
31 #-
32 # Routines for converting catman pages and nrcon(1)d papers to HTML.
33 # ATTENTION: this file contains embedded white-, backspace and high-
34 #            bit-on control characters! Use “jupp --asis $0” to edit
35 # Note: this file contains magic and can’t be edited as UTF-8 either.
36 # Note: this script assumes MirBSD filesystem interna: ino_t=uint32_t
37
# Gate on the shell version: the oldest acceptable shell is the mksh R31
# snapshot dated 2007/10/18, because the code below needs array indices
# that are uint32_t. $i is a scratch flag (0 = new enough, 1 = too old)
# and is removed again once the decision has been made.
if [[ $KSH_VERSION = @(\@\(#\)MIRBSD KSH R)@(3[2-9]|[4-9][0-9]|[1-9][0-9][0-9])\ +([0-9])/+([0-9])/+([0-9])?(\ *) ]]; then
        # release number 32 … 999: nothing further to check
        i=0
elif [[ $KSH_VERSION = @(\@\(#\)MIRBSD KSH R31)* ]]; then
        # R31: let sed turn the yyyy/mm/dd part of the version string
        # into "y=… m=… d=…" and compare that date with 2007/10/18
        eval $(print "$KSH_VERSION" | sed 's#^.*R31 \([0-9]*\)/\([0-9]*\)/\([0-9]*\)\( .*\)*$#y=\1 m=\2 d=\3#')
        if (( y > 2007 || (y == 2007 && (m > 10 || (m == 10 && d >= 18))) )); then
                i=0
        else
                i=1
        fi
        unset y m d
else
        # not an mksh we recognise at all
        i=1
fi
if (( i )); then
        print -u2 Error: your mksh is not recent enough.
        print -u2 Please upgrade to at least mksh R32.
        exit 1
fi
unset i

# Globals used by the functions below
set -A roff2htm_inodecache                      # inode number → target file, starts out empty
roff2htm_machine=$(uname -m)                    # machine name, e.g. i386 or sparc
roff2htm_gendate=$(date +"%F %T")               # time of this run, shown in every page footer
64
# do_convert [section]
#
# Filter: reads a formatted manual page on standard input and writes an
# HTML fragment to standard output. $1 is the section of the page being
# converted (0 if omitted). The work is done by a single sed(1) run
# whose output is post-processed by a small read loop.
#
# NOTE(review): the file header warns that this script contains embedded
# backspace and high-bit characters. The "\b" and "\80-¿" sequences in
# the expressions below are presumably how those literal bytes were
# rendered in this copy; confirm against a byte-exact original before
# changing any expression.
#
# The sed expressions, in order:
#  - remove trailing blanks from every line
#  - (the same expression twice) when a line ends in "-", append the
#    next line; if the break fell inside a "name(section)" reference,
#    join the two halves on the first line and put the indentation back
#    at the start of the following line
#  - map & < > to three placeholder characters; near the end these
#    placeholders become &#38; &#60; &#62;
#  - on lines starting with a capital letter followed by "\b", delete
#    every "character, \b" pair
#  - lines made only of capitals, digits, blanks and commas become <h2>;
#    the same kind of line indented by two blanks becomes <h3>; both
#    close and reopen the surrounding <pre>
#  - "_ \b X" becomes <i>X</i>, "X \b X" becomes <b>X</b>; adjacent
#    close/open pairs of the same tag are merged and leftover "\b"
#    sequences are dropped
#  - a line indented by two or three blanks that starts with a capital
#    and holds only letters, digits, blanks and commas is set in bold
#  - on lines starting with three blanks, "name(N…)" with N in 1-9,
#    with or without <i> around the name, becomes a link to
#    ../manN/name.htm; section names matching [PSU][MS][DM] and PAPERS
#    are linked the same way, and "(GNU)" links to ../manINFO/name.htm
#  - a bold "+" (optionally followed by blanks and a bold "o") becomes
#    a bullet
#  - further tag clean-up: merge same-tag runs separated only by
#    punctuation or blanks, move punctuation that sits directly before
#    an opening <b>/<i> to just after it, pull a "/" that follows </i>
#    into the italic text, tidy <h3> contents
#  - put <pre> before the first line and </pre> after the last one,
#    then remove empty "<pre></pre>" and "</pre><pre>" pairs
#  - strip the "../man<section>/" prefix from links that point into the
#    section given as $1, making them relative
#
# The read loop prints every non-empty line unchanged and prints one
# empty line for each run of empty lines; ws counts the empty lines
# seen since the last non-empty one.
65 function do_convert {
66         typeset -i ws=0
67         sed -e 's/[      ]*$//g'                                        \
68             -e '/-$/N
69 {
70 s/\([0-9A-z][-.,0-9A-z]*\)-\n\(  *\)\([0-9A-z][-.,0-9A-z]*([1-9][A-z]*)\)\([^ ]*\) /\1\3\4\
71 \2/
72 }'                                                                      \
73             -e '/-$/N
74 {
75 s/\([0-9A-z][-.,0-9A-z]*\)-\n\(  *\)\([0-9A-z][-.,0-9A-z]*([1-9][A-z]*)\)\([^ ]*\) /\1\3\4\
76 \2/
77 }'                                                                      \
78             -e 'y#&<>#Áþÿ#'                                             \
79                                                                         \
80             -e '/^[A-Z]\b/s#.\b##g'                                       \
81             -e 's#^[A-Z][ ,A-Z0-9]*$#</pre><h2>&</h2><pre>#'            \
82             -e 's#^  \([A-Z][ \b,A-Z0-9]*\)$#</pre><h3>\1</h3><pre>#'    \
83                                                                         \
84             -e 's#_\b\([^\80-¿][\80-¿]*\)#<i>\1</i>#g'                       \
85             -e 's#[^\80-¿][\80-¿]*\b\([^\80-¿][\80-¿]*\)#<b>\1</b>#g'            \
86                                                                         \
87             -e 's#</\([bi]\)><\1>##g'                                   \
88             -e 's#</b>\b<b>[^\80-¿][\80-¿]*##g'                              \
89             -e 's#</b>\b[^\80-¿][\80-¿]*<b>##g'                              \
90             -e 's#\b[^\80-¿][\80-¿]*##g'                                     \
91             -e 's#_</i<b><</b>i>##g'                                    \
92                                                                         \
93             -e 's#^\( \{2,3\}\)\([A-Z][ ,0-9A-z]*\)$#\1<b>\2</b>#'      \
94                                                                         \
95             -e '/^   /s#\(\([0-9A-z][-.,0-9A-z]*\)(\([1-9]\)[/0-9A-Za-z]*)\)#<a href=\"../man\3/\2.htm\">\1</a>#g' \
96             -e '/^   /s#\(<i>\([0-9A-z][-.,0-9A-z]*\)</i>(\([1-9]\)[/0-9A-Za-z]*)\)#<a href=\"../man\3/\2.htm\">\1</a>#g' \
97             -e '/^   /s#\(\([0-9A-z][-.,0-9A-z]*\)(\([PSU][MS][DM]\))\)#<a href=\"../man\3/\2.htm\">\1</a>#g' \
98             -e '/^   /s#\(<i>\([0-9A-z][-.,0-9A-z]*\)</i>(\([PSU][MS][DM]\))\)#<a href=\"../man\3/\2.htm\">\1</a>#g' \
99             -e '/^   /s#\(\([0-9A-z][-.,0-9A-z]*\)(\(PAPERS\))\)#<a href=\"../man\3/\2.htm\">\1</a>#g' \
100             -e '/^   /s#\(<i>\([0-9A-z][-.,0-9A-z]*\)</i>(\(PAPERS\))\)#<a href=\"../man\3/\2.htm\">\1</a>#g' \
101             -e '/^   /s#\(\([0-9A-z][-.,0-9A-z]*\)(GNU)\)#<a href=\"../manINFO/\2.htm\">\1</a>#g' \
102             -e '/^   /s#\(<i>\([0-9A-z][-.,0-9A-z]*\)</i>(GNU)\)#<a href=\"../manINFO/\2.htm\">\1</a>#g' \
103                                                                         \
104             -e 's#<b>+</b>\( *\)<b>o</b># \1•#'                               \
105             -e 's#<b>+</b>#•#'                                                \
106             -e 's#</\([bi]\)><\1>##g'                                   \
107             -e 's#</\([bi]\)>\([[:punct:][:space:]]*\)<\1>#\2#g'        \
108             -e 's#\([^[:punct:]]\)\([-!"#$%&'\''()*+,./:;=?@[\]^_`{|}~]*\)\(<[bi]>\)#\1\3\2#g'  \
109             -e 's#\(<i>[fh]t*p:[^<]*\)</i>/#\1/</i>#g'                  \
110             -e 's#\(<i>/[^<]*\)</i>/#\1/</i>#g'                         \
111             -e 's#<h3>*<b>*>#<h3>#g' -e 's#</b></h3>#</h3>#g'           \
112                                                                         \
113             -e 's/Á/\&#38;/g'                                           \
114             -e 's/þ/\&#60;/g'                                           \
115             -e 's/ÿ/\&#62;/g'                                           \
116                                                                         \
117             -e '1s#^#<pre>#'                                            \
118             -e '$s#$#</pre>#'                                           \
119             -e 's#<pre></pre>##g'                                       \
120             -e 's#</pre><pre>##g'                                       \
121             -e 's#<a href="../man'${1:-0}'/#<a href="#g'                \
122         | while IFS= read -r line; do
123                 if [[ -n $line ]]; then
124                         print -r -- "$line"
125                         ws=0
126                 else
127                         (( !ws++ )) && print
128                 fi
129         done
130 }
131
# output_header page section
#
# Write the XHTML 1.1 prologue for one manual page to standard output:
# XML declaration, doctype, <head> with title and canonical link, the
# opening <body> tag and the <h1> heading that links to the page.
#
#   $1  manual page name
#   $2  manual section
#
# Reads the global roff2htm_machine for the canonical URL.
#
# All expansions are quoted and print is used with -r, so a page name
# containing blanks, glob characters or backslashes is emitted exactly
# as given instead of being split, expanded or interpreted.
function output_header {
        typeset page="$1" sect="$2"

        print -r -- '<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
 "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head>
 <meta http-equiv="content-type" content="text/html; charset=utf-8" />
 <title>RTFM '"${page}(${sect})"'</title>
 <meta name="robots" content="index, follow" />
 <link rel="canonical" href="https://www.mirbsd.org/man'"${roff2htm_machine}/${page}.${sect}"'" />
</head><body>
<h1>MirOS Manual: <a href="../man'"${sect}/${page}"'.htm">'"${page}(${sect})"'</a></h1>'
}
144
# output_footer
#
# Write the closing part of a page to standard output: the "Generated
# on" line carrying the global roff2htm_gendate, the copyright and
# validity notices, and the closing </body></html>.
function output_footer {
        # everything that follows the generation date
        typeset tail='by
 <tt>$MirOS: src/scripts/roff2htm,v 1.58 2009/02/17 12:55:22 tg Exp $</tt></p>
<p>These manual pages are <a href="../man7/BSD-Licence.htm">copyrighted</a>
 by their respective writers; their source is available at our <a
 href="http://cvs.mirbsd.de/">CVSweb</a>, AnonCVS, and other mirrors.
 The rest is Copyright © 2002-2008 <a href="http://www.mirbsd.org/">The
 MirOS Project</a>, Germany. <br /><i style="font-size:3pt;">
 This product includes material provided by Thorsten Glaser.</i></p>
<p style="font-size:x-small;">This manual page’s HTML representation
 is supposed to be <a href="http://validator.w3.org/check/referer">valid
 XHTML/1.1</a>; if not, please send a bug report – diffs preferred.</p>
</body></html>'

        # the date is left unquoted on purpose: its words are passed as
        # separate arguments and print joins them with single blanks
        print '<hr /><p style="font-size:xx-small;">Generated on' $roff2htm_gendate "$tail"
}
160
# do_conversion [title [section]]
#
# Convert one manual page: header, converted body, footer. The page is
# read from standard input (by do_convert) and the complete document is
# written to standard output.
#
#   $1  page title, "missing-pagename" if absent or empty
#   $2  section, 0 if absent or empty
#
# The arguments are passed on quoted so that a title containing blanks
# or glob characters reaches the helpers as a single, unmodified word.
function do_conversion {
        output_header "${1:-missing-pagename}" "${2:-0}"
        do_convert "${2:-0}"
        output_footer
}
166
# do_conversion_verbose title section infile outfile
#
# Run do_conversion with standard input taken from infile and standard
# output sent to outfile, reporting progress on standard error:
# "infile → outfile" is printed before the conversion starts and the
# line is finished with a newline once it is done.
#
# Every argument is quoted, so file names with blanks or glob
# characters work both in the progress message and as redirection
# targets.
function do_conversion_verbose {
        print -nru2 -- "$3 → $4"
        do_conversion "$1" "$2" <"$3" >"$4"
        print -u2
}
173
# convert_page /path/to/man/catN/page.0 /targetpath
#
# Convert a single formatted manual page into /targetpath/manN/page.htm.
#
#   $1  source file; the page name is its basename without ".0", the
#       section is what follows "cat" in the name of its directory
#       (anything after a further "/" is dropped)
#   $2  base directory of the HTML tree
#
# Source files that are links to each other are converted only once.
# The global roff2htm_inodecache maps the inode number of a source file
# to the target written for it. When the inode of $1 is already in the
# cache, the new target becomes a hard link to the existing one and
# ed(1) adds the additional name to the <title> and <h1> of that file.
#
# All path expansions are quoted so that names containing blanks or
# glob characters are handled as single words. The ed script uses a
# plain here-document, so it does not depend on tab indentation.
function convert_page {
        typeset fn=$1 page sect tn
        # inode of the source, following symlinks; 0 if stat fails
        typeset -Uui ino=$(stat -Lf %i "$fn" 2>/dev/null)

        page=${fn##*/}                  # basename
        page=${page%.0}                 # manual page name
        sect=${fn%/*}                   # dirname
        sect=${sect##*/cat}             # section, possibly with /arch
        sect=${sect%%/*}                # section only
        tn=man${sect}/${page}.htm       # target, relative to $2

        if (( ino )) && [[ -n ${roff2htm_inodecache[ino]} ]]; then
                # the source is another name for a page already done
                print -ru2 -- "$tn ← ${roff2htm_inodecache[ino]}"
                ln -f "$2/${roff2htm_inodecache[ino]}" "$2/$tn"
                # patch in the additional name(s)
                ed -s "$2/$tn" <<EOF
/<title>/s#</title>#, $page($sect)&#
/<h1>/s#</h1>#, <a href="../$tn">$page($sect)</a>&#
wq
EOF
        else
                # remember the target for this inode; with ino == 0 the
                # entry lands in slot 0, which the lookup above ignores
                roff2htm_inodecache[ino]=$tn
                do_conversion_verbose "$page" "$sect" "$fn" "$2/$tn"
        fi
}
201
# output_htaccess >…/.htaccess
#
# Write the two web server directives placed in every section
# directory to standard output: a DirectoryIndex pointing at /dev/null
# and the content type (with UTF-8 charset) for files ending in "htm".
function output_htaccess {
        print "DirectoryIndex /dev/null
AddType 'text/html; charset=utf-8' htm"
}
207
# convert_all [/path/to/share/man [/targetpath]]
#
# Convert every formatted manual page (*.0) found below the cat1 … cat9
# and cat3p directories of a manual tree.
#
#   $1  root of the manual tree, /usr/share/man if absent or empty
#   $2  base directory of the HTML tree, "mbsdman" below the current
#       directory if absent or empty
#
# The list of pages is produced by find(1), sorted case-insensitively,
# in a co-process. While that runs, one manN directory with its
# .htaccess file is created per section. The page names are then read
# back from the co-process and handed to convert_page one at a time.
#
# Differences to a naive version: diagnostics from find go to /dev/null
# rather than to a closed descriptor, page names are read with an empty
# IFS and -r so that blanks and backslashes in them survive, and all
# path expansions are quoted.
function convert_all {
        typeset src=${1:-/usr/share/man}        # source basepath
        typeset tp=${2:-$(pwd)/mbsdman}         # target basepath
        typeset x f

        (find "$src"/cat{[1-9],3p} -name \*.0 2>/dev/null | sort -f) |&
        for x in 1 2 3 3p 4 5 6 7 8 9; do
                mkdir -p "$tp/man$x"    # one per section
                output_htaccess >"$tp/man$x/.htaccess"
        done
        while IFS= read -r -p f; do
                convert_page "$f" "$tp" # any subpages
        done
}