sync from MirBSD: fix over-zealous optimisation that was bogus
[shellsnippets/shellsnippets.git] / mksh / roff2htm
1 # $MirOS: src/scripts/roff2htm,v 1.74 2012/09/27 18:03:10 tg Exp $
2 # $ekkoBSD: catman2html.sh,v 1.2 2004/03/07 03:02:53 stephen Exp $
3 #-
4 # Copyright (c) 2004, 2005, 2006, 2007, 2011, 2012
5 #       Thorsten “mirabilos” Glaser <tg@mirbsd.org>
6 # Original version for ekkoBSD by:
7 # Copyright (c) 2004
8 #       Stephen Paskaluk <sap@mirbsd.org>
9 # Parts of the regular expression set below are based upon work by:
10 # Copyright (c) 1995
11 #       Panagiotis J. Christias <christia@theseas.ntua.gr>
12 #
13 # Provided that these terms and disclaimer and all copyright notices
14 # are retained or reproduced in an accompanying document, permission
15 # is granted to deal in this work without restriction, including un-
16 # limited rights to use, publicly perform, distribute, sell, modify,
17 # merge, give away, or sublicence.
18 #
19 # Advertising materials mentioning features or use of this work must
20 # display the following acknowledgement:
21 #       This product includes material provided by Thorsten Glaser.
22 #
23 # This work is provided “AS IS” and WITHOUT WARRANTY of any kind, to
24 # the utmost extent permitted by applicable law, neither express nor
25 # implied; without malicious intent or gross negligence. In no event
26 # may a licensor, author or contributor be held liable for indirect,
27 # direct, other damage, loss, or other issues arising in any way out
28 # of dealing in the work, even if advised of the possibility of such
29 # damage or existence of a defect, except proven that it results out
30 # of said person's immediate fault when using the work as intended.
31 #-
32 # Routines for converting catman pages and nrcon(1)d papers to HTML.
33 # ATTENTION: this file contains embedded white-, backspace and high-
34 #            bit-on control characters! Use “jupp --asis $0” to edit
35 # Note: this file contains magic and can’t be edited as UTF-8 either.
36 # Note: this script assumes MirBSD filesystem interna: ino_t=uint32_t
37
# Check that the running shell is mksh R31 2007/10/18 or newer.
# i is used as a scratch flag: 0 = recent enough, 1 = too old or not mksh.
if [[ $KSH_VERSION = @(\@\(#\)MIRBSD KSH R)@(3[2-9]|[4-9][0-9]|[1-9][0-9][0-9])\ +([0-9])/+([0-9])/+([0-9])?(\ *) ]]; then
	# R32 and up: any build date is fine
	i=0
elif [[ $KSH_VERSION = @(\@\(#\)MIRBSD KSH R31)\ +([0-9])/+([0-9])/+([0-9])?(\ *) ]]; then
	# R31: the build date decides. It is split with parameter expansion
	# only; the pattern above guarantees the "yyyy/mm/dd" shape, so no
	# part of $KSH_VERSION is ever passed to eval.
	d=${KSH_VERSION#*R31 }		# yyyy/mm/dd, maybe followed by more text
	d=${d%% *}			# yyyy/mm/dd
	y=${d%%/*}			# yyyy
	d=${d#*/}			# mm/dd
	m=${d%%/*}			# mm
	d=${d#*/}			# dd
	(( i = y < 2007 ? 1 :
	    y > 2007 ? 0 :
	    m < 10 ? 1 :
	    m > 10 ? 0 :
	    d < 18 ? 1 : 0 ))
	unset y m d
else
	# not mksh, too old, or an R31 version string without a usable date
	i=1
fi
# we need an mksh version with uint32_t array indices
if (( i )); then
	print -u2 Error: your mksh is not recent enough.
	print -u2 Please upgrade to at least mksh R32.
	exit 1
fi
unset i
59
# global state shared by the functions below
roff2htm_machine=$(uname -m)			# machine name (i386, sparc)
roff2htm_gendate=$(date '+%F %T')		# generation time, shown in the footer
set -A roff2htm_inodecache			# inode → target file cache, starts empty
64
# set_conversion_man
# Switch to manual page mode: (re)defines do_convert as a wrapper
# that hands all of its arguments to do_convert_man.
function set_conversion_man {
	function do_convert { do_convert_man "$@"; }
}
# set_conversion_paper
# Switch to paper mode: (re)defines do_convert as a wrapper
# that hands all of its arguments to do_convert_paper.
function set_conversion_paper {
	function do_convert { do_convert_paper "$@"; }
}
# manual page conversion is the default; call set_conversion_paper to switch
75 set_conversion_man
76
# do_convert_man
# Filter: reads a formatted (catman) manual page on standard input and
# writes the HTML body for it on standard output. Positional arguments
# are not used by the function body.
#
# The input goes through "col -x" and then one long sed program which,
# in this order:
#  - re-joins a page reference of the form name(section) that was
#    hyphenated across a line break;
#  - maps "&", "<" and ">" to three placeholder characters so that they
#    cannot be confused with the markup generated below; the placeholders
#    become &#38; &#60; &#62; near the end;
#  - turns lines starting in column 0 into <h2> and lines indented by
#    exactly two spaces into <h3> headings;
#  - rewrites overstrike sequences into <b>, <i> and (both) <G> markers,
#    merges/reorders adjacent markers, and finally expands <G> to <b><i>;
#  - on lines starting with a space, links name(section) references to
#    ../man<section>/<name>.htm (or ../manINFO/<name>.html for "(GNU)");
#  - wraps the whole text in <pre>…</pre>.
# NOTE(review): the "\b", "\80", "\85" and "\ 1" sequences in the patterns
# presumably stand for the literal control / high-bit characters that the
# file header warns about — verify against a pristine copy before editing.
#
# The trailing while loop squeezes runs of empty lines into a single one.
77 function do_convert_man {
# _nl: non-zero while at least one empty input line is pending
78         local -i _nl=0
79         col -x | sed                                                    \
80             -e '/-$/N
81 {
82 s/\([0-9A-z][-.,0-9A-z:]*\)-\n\(  *\)\([0-9A-z][-.,0-9A-z:]*([1-9][A-z]*)\)\([^ ]*\) /\1\3\4\
83 \2/
84 }'                                                                      \
85             -e 's#<\b_#≤#g' -e 's#>\b_#≥#g'                           \
86             -e 'y#&<>#Áþÿ#'                                             \
87                                                                         \
88             -e 's#[Oo]\b[Oo]\b+\b+#•#g'                                  \
89             -e 's#_\b|\b|#_\b|\b_\b|#g'                                      \
90             -e 's#+\b_#±#g'                                             \
91                                                                         \
92             -e 's#^[A-z][\b 0-9A-z]*$#</pre><h2>&</h2><pre>#'            \
93             -e 's#^  \([A-z][\b ,0-9A-z]*\)$#</pre><h3>\1</h3><pre>#'    \
94                                                                         \
95             -e 's#\([^~<>\80-¿][\80-¿]*\)\b~#\1Ì\85#g'                         \
96             -e 's#\([^\b]\)~\b_#\1_̅\b #g'                                        \
97             -e 's#\([^\b]\)\([^<>_\80-¿][\80-¿]*\)\([Ì\85]*\)\b_#\1_\2\b\3#g'    \
98             -e 's#\([^<>\80-¿]\)[\80-¿]*\([Ì\85]*\)\b\([^<>\80-¿][\80-¿]*\)#<\1<\3\2>#g'   \
99             -e 's#\(<_<\([^>_]*\)>\)\b\1#<G>\2</G>#g'                    \
100             -e 's#<_<\([^>_]*\)>#<i>\1</i>#g'                           \
101             -e 's#<.<\([^>]*\)>#<b>\1</b>#g'                            \
102             -e 's#\b##g'                                                 \
103                                                                         \
104             -e '/<h[23]/s#</*[biG]>##g'                                 \
105             -e 's#</\([biG]\)><\1>##g'                                  \
106             -e 's#</\([biG]\)>\([- -*./:;?@^_~]*\)<\1>#\2#g'            \
107             -e 's#\([\ 1- 0-9A-z]\)\([$/_-]*\)\(<[biG]>\)#\1\3\2#g'       \
108             -e 's#\(</[biG]>\)\([)$/_-]*\)\([\ 1- 0-9A-z]\)#\2\1\3#g'     \
109                                                                         \
110             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(\(3p\))#<a href="../man\2/\1.htm">&</a>#g' \
111             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(\([1-9]\)\(/[/0-9A-Za-z]*\)*)#<a href="../man\2/\1.htm">&</a>#g' \
112             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(\([PSU][MS][DM]\))#<a href="../man\2/\1.htm">&</a>#g' \
113             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(\(PAPERS\))#<a href="../man\2/\1.htm">&</a>#g' \
114             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(GNU)#<a href="../manINFO/\1.html">&</a>#g' \
115             -e 's#)\(</[biG]>\)\([\ 1- 0-9A-z]\)#\1)\2#g'                 \
116                                                                         \
117             -e 's/Á/\&#38;/g'                                           \
118             -e 's/þ/\&#60;/g'                                           \
119             -e 's/ÿ/\&#62;/g'                                           \
120                                                                         \
121             -e 's#<G>#<b><i>#g' -e 's#</G>#</i></b>#g'                  \
122             -e 's#</b><b>##g'                                           \
123                                                                         \
124             -e '1s#^#<pre>#'                                            \
125             -e '$s#$#</pre>#'                                           \
126             -e 's#<pre></pre>##g'                                       \
127             -e 's#</pre><pre>##g'                                       \
128         | while IFS= read -r line; do
# non-empty line: first emit one empty line for any pending run of
# empty lines (but not directly before a line starting with </pre>),
# then the line itself, unmodified
129                 if [[ -n $line ]]; then
130                         (( _nl )) && [[ $line != '</pre>'* ]] && print
131                         print -r -- "$line"
132                         _nl=0
133                 else
# empty line: do not print it now, just remember that one was seen
134                         _nl=1
135                 fi
136         done
137 }
138
# do_convert_paper
# Filter: reads a formatted paper on standard input and writes the HTML
# body for it on standard output. Positional arguments are not used by
# the function body.
#
# Same pipeline shape as do_convert_man ("col -x", one long sed program,
# empty-line squeezing loop) with these differences visible in the sed
# program: no <h2>/<h3> heading detection (and hence no tag stripping
# inside headings), no separate "(3p)" link rule, and the numeric
# section link rule accepts any trailing [/0-9A-Za-z]* after the digit.
# The remaining steps: re-join hyphenated name(section) references,
# protect "&", "<", ">" via placeholder characters (turned into &#38;
# &#60; &#62; near the end), convert overstrike sequences into <b>/<i>
# markup, link name(section) references on lines starting with a space,
# and wrap everything in <pre>…</pre>.
# NOTE(review): the "\b", "\80", "\85" and "\ 1" sequences in the patterns
# presumably stand for the literal control / high-bit characters that the
# file header warns about — verify against a pristine copy before editing.
139 function do_convert_paper {
# _nl: non-zero while at least one empty input line is pending
140         local -i _nl=0
141         col -x | sed                                                    \
142             -e '/-$/N
143 {
144 s/\([0-9A-z][-.,0-9A-z:]*\)-\n\(  *\)\([0-9A-z][-.,0-9A-z:]*([1-9][A-z]*)\)\([^ ]*\) /\1\3\4\
145 \2/
146 }'                                                                      \
147             -e 's#<\b_#≤#g' -e 's#>\b_#≥#g'                           \
148             -e 'y#&<>#Áþÿ#'                                             \
149                                                                         \
150             -e 's#[Oo]\b[Oo]\b+\b+#•#g'                                  \
151             -e 's#_\b|\b|#_\b|\b_\b|#g'                                      \
152             -e 's#+\b_#±#g'                                             \
153                                                                         \
154             -e 's#\([^~<>\80-¿][\80-¿]*\)\b~#\1Ì\85#g'                         \
155             -e 's#\([^\b]\)~\b_#\1_̅\b #g'                                        \
156             -e 's#\([^\b]\)\([^<>_\80-¿][\80-¿]*\)\([Ì\85]*\)\b_#\1_\2\b\3#g'    \
157             -e 's#\([^<>\80-¿]\)[\80-¿]*\([Ì\85]*\)\b\([^<>\80-¿][\80-¿]*\)#<\1<\3\2>#g'   \
158             -e 's#\(<_<\([^>_]*\)>\)\b\1#<G>\2</G>#g'                    \
159             -e 's#<_<\([^>_]*\)>#<i>\1</i>#g'                           \
160             -e 's#<.<\([^>]*\)>#<b>\1</b>#g'                            \
161             -e 's#\b##g'                                                 \
162                                                                         \
163             -e 's#</\([biG]\)><\1>##g'                                  \
164             -e 's#</\([biG]\)>\([- -*./:;?@^_~]*\)<\1>#\2#g'            \
165             -e 's#\([\ 1- 0-9A-z]\)\([$/_-]*\)\(<[biG]>\)#\1\3\2#g'       \
166             -e 's#\(</[biG]>\)\([)$/_-]*\)\([\ 1- 0-9A-z]\)#\2\1\3#g'     \
167                                                                         \
168             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(\([1-9]\)[/0-9A-Za-z]*)#<a href="../man\2/\1.htm">&</a>#g' \
169             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(\([PSU][MS][DM]\))#<a href="../man\2/\1.htm">&</a>#g' \
170             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(\(PAPERS\))#<a href="../man\2/\1.htm">&</a>#g' \
171             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(GNU)#<a href="../manINFO/\1.html">&</a>#g' \
172             -e 's#)\(</[biG]>\)\([\ 1- 0-9A-z]\)#\1)\2#g'                 \
173                                                                         \
174             -e 's/Á/\&#38;/g'                                           \
175             -e 's/þ/\&#60;/g'                                           \
176             -e 's/ÿ/\&#62;/g'                                           \
177                                                                         \
178             -e 's#<G>#<b><i>#g' -e 's#</G>#</i></b>#g'                  \
179             -e 's#</b><b>##g'                                           \
180                                                                         \
181             -e '1s#^#<pre>#'                                            \
182             -e '$s#$#</pre>#'                                           \
183             -e 's#<pre></pre>##g'                                       \
184             -e 's#</pre><pre>##g'                                       \
185         | while IFS= read -r line; do
# non-empty line: first emit one empty line for any pending run of
# empty lines (but not directly before a line starting with </pre>),
# then the line itself, unmodified
186                 if [[ -n $line ]]; then
187                         (( _nl )) && [[ $line != '</pre>'* ]] && print
188                         print -r -- "$line"
189                         _nl=0
190                 else
# empty line: do not print it now, just remember that one was seen
191                         _nl=1
192                 fi
193         done
194 }
195
# output_header page section
# Writes the XHTML prologue (doctype, <head> with title, canonical link
# and embedded style sheet) and the <h1> page heading to standard output.
#   $1  manual page name
#   $2  section
# Reads the global roff2htm_machine for the canonical link.
function output_header {
	# print -r and quoted expansions: page name, section and machine are
	# emitted verbatim, i.e. they are neither field-split nor globbed,
	# and print does not interpret backslashes in them
	print -r -- '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
 "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head>
 <meta http-equiv="content-type" content="text/html; charset=utf-8" />
 <title>RTFM '"$1($2)"'</title>
 <meta name="robots" content="index, follow" />
 <link rel="canonical" href="https://www.mirbsd.org/man'"$roff2htm_machine/$1.$2"'" />
 <style type="text/css">
        /* <![CDATA[ */
        body {
                background-color:#000000;
                color:#666666;
                font-family:serif;
        }
        a {
                color:inherit;
                text-decoration:none;
                border-bottom:1px dashed;
        }
        a:visited {
                text-decoration:none;
                border-bottom:1px dotted;
        }
        a:hover {
                text-decoration:none;
                border-bottom:1px double;
        }
        pre {
                line-height:112%;
                color:#FFBF00;
        }
        b {
                color:#FFEF00;
                font-weight:normal;
        }
        i {
                font-style:normal;
                border-bottom:1px solid #FFBF00;
        }
        b i,i b {
                color:#FFEF00;
                font-weight:normal;
                font-style:normal;
                border-bottom:1px solid #FFEF00;
        }
        h1 {
                color:#FFEF00;
                font-size:xx-large;
                font-family:serif;
        }
        h2 {
                color:#FFFFFF;
                font-size:x-large;
                font-family:sans-serif;
        }
        h3 {
                color:#CCCCCC;
                font-size:large;
                font-family:sans-serif;
        }
        /* ]]> */
 </style>
</head><body>
<h1>MirOS Manual: <a href="../man'"$2/$1"'.htm">'"$1($2)"'</a></h1>'
}
262
# output_footer
# Writes the page footer (generation note, copyright and validity
# paragraphs, closing </body></html>) to standard output.
# Reads the global roff2htm_gendate for the "Generated on" timestamp.
function output_footer {
	# the timestamp is quoted so that it is inserted verbatim instead of
	# being field-split and globbed; -r keeps print from interpreting
	# backslashes
	print -r -- '
<hr /><p style="font-size:xx-small;">Generated on '"$roff2htm_gendate"' by
 <tt>$MirOS: src/scripts/roff2htm,v 1.74 2012/09/27 18:03:10 tg Exp $</tt></p>
<p>These manual pages and other documentation are <a
 href="../man7/BSD-Licence.htm">copyrighted</a> by their respective writers;
 their source is available at our <a href="http://cvs.mirbsd.de/">CVSweb</a>,
 AnonCVS, and other mirrors. The rest is Copyright © 2002‒2011 <a
 href="https://www.mirbsd.org/">The MirOS Project</a>, Germany.<br /><span
 style="font-size:3pt; font-style:italic;">This product includes material
 provided by Thorsten Glaser.</span></p>
<p style="font-size:x-small;">This manual page’s HTML representation
 is supposed to be <a href="http://validator.w3.org/check/referer">valid
 XHTML/1.1</a>; if not, please send a bug report – diffs preferred.</p>
</body></html>'
}
279
# do_conversion [page [section]]
# Converts the document on standard input into a complete HTML page on
# standard output: header, converted body (via the currently selected
# do_convert), footer.
#   $1  page name, defaults to "missing-pagename"
#   $2  section, defaults to 0
function do_conversion {
	# quoted so that a name containing whitespace or glob characters
	# is still passed on as exactly one argument
	output_header "${1:-missing-pagename}" "${2:-0}"
	do_convert "${2:-0}"
	output_footer
}
285
# do_conversion_verbose title section infile outfile
# Runs do_conversion for title/section with infile as standard input and
# outfile as standard output. Progress goes to standard error:
# "infile → outfile" before the conversion, a newline after it.
function do_conversion_verbose {
	# all file names are quoted so that paths containing whitespace
	# or glob characters work
	print -nru2 -- "$3 → $4"
	do_conversion "$1" "$2" <"$3" >"$4"
	print -u2
}
292
# convert_page /path/to/man.cat1 /targetpath
# Converts one formatted manual page.
#   $1  source file; its name minus ".0" is the page name and the
#       directory component after ".../cat" (up to the next slash) is
#       the section
#   $2  target base directory; the result is $2/man<section>/<page>.htm
# Source files sharing an inode are converted only once: the first one
# is converted and remembered in roff2htm_inodecache, every later one
# becomes a hard link to that result, and the additional name is patched
# into <title> and <h1> with ed(1).
# NOTE(review): since the names are hard links and ed presumably rewrites
# the file in place, the patched title should show up under all names —
# confirm.
function convert_page {
	local fn=$1 page sect tn
	# inode number of the source (symlinks followed); 0 if stat fails
	local -Uui ino=$(stat -Lf %i "$fn" 2>/dev/null)
	page=${fn##*/}			# basename
	page=${page%.0}			# manual page name
	sect=${fn%/*}			# dirname
	sect=${sect##*/cat}		# archsection
	sect=${sect%%/*}		# section
	tn=man${sect}/${page}.htm	# target file

	if (( ino )) && [[ -n ${roff2htm_inodecache[ino]} ]]; then
		# source file is linked to a file we know
		print -ru2 -- "$tn ← ${roff2htm_inodecache[ino]}"
		ln -f "$2/${roff2htm_inodecache[ino]}" "$2/$tn"
		# patch in the additional name(s); the here-document is
		# deliberately not indented so that ed sees the commands
		# starting in column 0
		ed -s "$2/$tn" <<EOF
/<title>/s#</title>#, $page($sect)&#
/<h1>/s#</h1>#, <a href="../$tn">$page($sect)</a>&#
wq
EOF
	else
		# store target filename in the inode cache, unless the inode
		# is unknown (0): that slot is never looked up anyway
		(( ino )) && roff2htm_inodecache[ino]=$tn
		do_conversion_verbose "$page" "$sect" "$fn" "$2/$tn"
	fi
}
320
# output_htaccess >…/.htaccess
# Writes the two directives for a generated section directory to
# standard output: a DirectoryIndex line and an AddType line that
# declares *.htm as UTF-8 encoded text/html.
function output_htaccess {
	print -r -- "DirectoryIndex /dev/null
AddType 'text/html; charset=utf-8' htm"
}
326
# convert_all /path/to/share/man /targetpath
# Converts every *.0 file below the cat1 … cat9 and cat3p directories.
#   $1  manual page base directory, defaults to /usr/share/man
#   $2  target base directory, defaults to $(pwd)/mbsdman
# Creates $2/man<section> with a .htaccess file for each section, then
# runs convert_page on every source file in case-insensitively sorted
# order.
function convert_all {
	local tp=${2:-$(pwd)/mbsdman}		# target basepath
	local x f

	# The file list is produced by a co-process (|&) and consumed by
	# "read -p" below. find's diagnostics (e.g. for a cat directory
	# that does not exist) are discarded on purpose; /dev/null is used
	# rather than closing stderr so find keeps a valid descriptor.
	(find "${1:-/usr/share/man}"/cat{[1-9],3p} -name \*.0 2>/dev/null | sort -f) |&
	for x in 1 2 3 3p 4 5 6 7 8 9; do
		mkdir -p "$tp/man$x"		# one per section
		output_htaccess >"$tp/man$x/.htaccess"
	done
	# IFS= and -r: take each path exactly as find printed it
	while IFS= read -r -p f; do
		convert_page "$f" "$tp"		# any subpages
	done
}