update from latest MirBSD CVS
[shellsnippets/shellsnippets.git] / mksh / roff2htm
# RCS identification string of this script; output_footer embeds it in the
# <tt> element of the "Generated on" line of every generated page.
1 roff2htm_rcsid='$MirOS: src/scripts/roff2htm,v 1.90 2018/01/07 23:27:30 tg Exp $'
2 # $ekkoBSD: catman2html.sh,v 1.2 2004/03/07 03:02:53 stephen Exp $
3 #-
4 # Copyright (c) 2004, 2005, 2006, 2007, 2011, 2012, 2014, 2015,
5 #               2016, 2017
6 #       mirabilos <m@mirbsd.org>
7 # Original version for ekkoBSD by:
8 # Copyright (c) 2004
9 #       Stephen Paskaluk <sap@mirbsd.org>
10 # Parts of the regular expression set below are based upon work by:
11 # Copyright (c) 1995
12 #       Panagiotis J. Christias <christia@theseas.ntua.gr>
13 #
14 # Provided that these terms and disclaimer and all copyright notices
15 # are retained or reproduced in an accompanying document, permission
16 # is granted to deal in this work without restriction, including un-
17 # limited rights to use, publicly perform, distribute, sell, modify,
18 # merge, give away, or sublicence.
19 #
20 # Advertising materials mentioning features or use of this work must
21 # display the following acknowledgement:
22 #       This product includes material provided by mirabilos.
23 #
24 # This work is provided “AS IS” and WITHOUT WARRANTY of any kind, to
25 # the utmost extent permitted by applicable law, neither express nor
26 # implied; without malicious intent or gross negligence. In no event
27 # may a licensor, author or contributor be held liable for indirect,
28 # direct, other damage, loss, or other issues arising in any way out
29 # of dealing in the work, even if advised of the possibility of such
30 # damage or existence of a defect, except proven that it results out
31 # of said person's immediate fault when using the work as intended.
32 #-
33 # Routines for converting catman pages and nrcon(1)d papers to HTML.
34 # ATTENTION: this file contains embedded white-, backspace and high-
35 #            bit-on control characters! Use “jupp --asis $0” to edit
36 # Note: this file contains magic and can’t be edited as UTF-8 either.
37 # Note: this script assumes MirBSD filesystem interna: ino_t=uint32_t
38
39 # check if mksh R31:2007/10/18 or up
40 if [[ $KSH_VERSION = @(\@\(#\)MIRBSD KSH R)@(3[2-9]|[4-9][0-9]|[1-9][0-9]+([0-9]))\ +([0-9])/+([0-9])/+([0-9])?(\ *) ]]; then
41         i=0
42 elif [[ $KSH_VERSION = @(\@\(#\)MIRBSD KSH R31)* ]]; then
43         eval $(print "$KSH_VERSION" | sed 's#^.*R31 \([0-9]*\)/\([0-9]*\)/\([0-9]*\)\( .*\)*$#y=\1 m=\2 d=\3#')
44         (( i = y < 2007 ? 1 :
45             y > 2007 ? 0 :
46             m < 10 ? 1 :
47             m > 10 ? 0 :
48             d < 18 ? 1 : 0 ))
49         unset y m d
50 else
51         i=1
52 fi
53 # we need an mksh version with uint32_t array indicēs
54 if (( i )); then
55         print -u2 Error: your mksh is not recent enough.
56         print -u2 Please upgrade to at least mksh R32.
57         exit 1
58 fi
59 unset i
60
# Module-wide state used by the functions below.
set -A roff2htm_inodecache                      # inode cache, starts out empty
roff2htm_machine=$(uname -m)                    # machine name, e.g. i386, sparc
roff2htm_gendate=$(date '+%F %T')               # timestamp, taken once at load time
65
66 function set_conversion_man {
67         function do_convert {
68                 do_convert_man "$@"
69         }
70 }
71 function set_conversion_paper {
72         function do_convert {
73                 do_convert_paper "$@"
74         }
75 }
76 set_conversion_man
77
# Link-prefix selection.  roff2htm_rel is the prefix put in front of
# every generated "manN/page.htm" hyperlink: either the absolute site
# URL or a path relative to the directory the page lives in.
function set_target_absolute { roff2htm_rel='http://www.mirbsd.org/'; }
function set_target_relative { roff2htm_rel='../'; }
# relative links are the default
set_target_relative
85
# do_convert_man
#
# Filter: reads a formatted manual page on stdin and writes an HTML
# fragment (<pre>, <h2>, <h3>, <b>, <i>, <a>) to stdout.  Positional
# arguments are not referenced in this function.  Reads $roff2htm_rel.
#
# NOTE(review): in this listing control characters appear as escapes
# (\b, \80, \85, "\ 1"); \b presumably stands for a literal backspace
# and the others for high-bit/control bytes -- confirm against the
# pristine file before touching any expression.
#
# Order of processing (the sed expressions depend on this order):
#  1. Input is passed through "col -x".
#  2. A line ending in "-" pulls in the next line (N); if a name(section)
#     reference was hyphenated across the break, the two halves are
#     joined on the first line and the indentation stays on the second.
#  3. "<" and ">" overstruck with "_" become ≤ and ≥; afterwards the
#     remaining & < > are mapped (y command) to the placeholders Á þ ÿ,
#     which are turned into &#38; &#60; &#62; near the end, after all
#     markup has been generated.
#  4. o/O overstruck with "+" becomes •, "+" overstruck with "_" becomes ±.
#  5. A line made up only of characters from [\b 0-9A-z] that starts in
#     column 0 is wrapped as </pre><h2>…</h2><pre>; a line indented by
#     two spaces that matches the second heading expression is wrapped
#     as </pre><h3>…</h3><pre>.
#  6. Overstrike pairs are rewritten to an intermediate "<c1<c2>" form;
#     "<_<x>" then becomes <i>x</i>, any other "<c<x>" becomes <b>x</b>,
#     and two identical "<_<x>" joined by \b become <G>x</G>, which is
#     expanded to <b><i>x</i></b> later.  Leftover \b are deleted.
#     NOTE(review): the two "~" expressions appear to translate tilde
#     overstrikes into a combining overline -- TODO confirm.
#  7. Tag clean-up: b/i/G tags are removed from heading lines, a closing
#     tag directly followed by the same opening tag is dropped, and some
#     punctuation is moved across tag boundaries.
#  8. On lines beginning with a space, name(section) references are
#     wrapped in <a href="${roff2htm_rel}manSECTION/name.htm">; the
#     "(GNU)" form links to manINFO/name.html instead.
#  9. <pre> is prepended to the first line and </pre> appended to the
#     last; "<pre></pre>" and "</pre><pre>" pairs are removed.
# 10. The trailing while loop squeezes blank lines (see below).
86 function do_convert_man {
        # _nl is 1 while at least one empty line is pending output
87         local -i _nl=0
88         col -x | sed \
89             -e '/-$/N
90 {
91 s/\([0-9A-z][-.,0-9A-z:]*\)-\n\(  *\)\([0-9A-z][-.,0-9A-z:]*([1-9][A-z]*)\)\([^ ]*\) /\1\3\4\
92 \2/
93 }'                                                                      \
94             -e 's#<\b_#≤#g' -e 's#>\b_#≥#g'                           \
95             -e 'y#&<>#Áþÿ#'                                             \
96                                                                         \
97             -e 's#[Oo]\b[Oo]\b+\b+#•#g'                                  \
98             -e 's#_\b|\b|#_\b|\b_\b|#g'                                      \
99             -e 's#+\b_#±#g'                                             \
100                                                                         \
101             -e 's#^[A-z][\b 0-9A-z]*$#</pre><h2>&</h2><pre>#'            \
102             -e 's#^  \([A-z][\b -%'\''-;=?-~]*\)$#</pre><h3>\1</h3><pre>#'       \
103                                                                         \
104             -e 's#\([^~<>\80-¿][\80-¿]*\)\b~#\1Ì\85#g'                         \
105             -e 's#\([^\b]\)~\b_#\1_̅\b #g'                                        \
106             -e 's#\([^\b]\)\([^<>_\80-¿][\80-¿]*\)\([Ì\85]*\)\b_#\1_\2\b\3#g'    \
107             -e 's#\([^<>\80-¿]\)[\80-¿]*\([Ì\85]*\)\b\([^<>\80-¿][\80-¿]*\)#<\1<\3\2>#g'   \
108             -e 's#\(<_<\([^>_]*\)>\)\b\1#<G>\2</G>#g'                    \
109             -e 's#<_<\([^>_]*\)>#<i>\1</i>#g'                           \
110             -e 's#<.<\([^>]*\)>#<b>\1</b>#g'                            \
111             -e 's#\b##g'                                                 \
112                                                                         \
113             -e '/<h[23]/s#</*[biG]>##g'                                 \
114             -e 's#</\([biG]\)><\1>##g'                                  \
115             -e 's#</\([biG]\)>\([- -*./:;?@^_~]*\)<\1>#\2#g'            \
116             -e 's#\([\ 1- 0-9A-z]\)\([$/_-]*\)\(<[biG]>\)#\1\3\2#g'       \
117             -e 's#\(</[biG]>\)\([$/_-]*\)\([\ 1- 0-9A-z]\)#\2\1\3#g'      \
118             -e 's#\(</[biG]>\))\([\ 1- ,.0-9A-z]\)#)\1\2#g'               \
119                                                                         \
120             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(\(3p\))#<a href="'$roff2htm_rel'man\2/\1.htm">&</a>#g' \
121             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(\([1-9]\)\(/[/0-9A-Za-z]*\)*)#<a href="'$roff2htm_rel'man\2/\1.htm">&</a>#g' \
122             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(\([PSU][MS][DM]\))#<a href="'$roff2htm_rel'man\2/\1.htm">&</a>#g' \
123             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(\(PAPERS\))#<a href="'$roff2htm_rel'man\2/\1.htm">&</a>#g' \
124             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(GNU)#<a href="'$roff2htm_rel'manINFO/\1.html">&</a>#g' \
125             -e 's#)\(</[biG]>\)\([\ 1- 0-9A-z]\)#\1)\2#g'                 \
126                                                                         \
127             -e 's/Á/\&#38;/g'                                           \
128             -e 's/þ/\&#60;/g'                                           \
129             -e 's/ÿ/\&#62;/g'                                           \
130                                                                         \
131             -e 's#<G>#<b><i>#g' -e 's#</G>#</i></b>#g'                  \
132             -e 's#</b><b>##g'                                           \
133                                                                         \
134             -e '1s#^#<pre>#'                                            \
135             -e '$s#$#</pre>#'                                           \
136             -e 's#<pre></pre>##g'                                       \
137             -e 's#</pre><pre>##g'                                       \
138         | while IFS= read -r line; do
                # Blank-line squeezing: empty lines are only remembered;
                # a single empty line is emitted in front of the next
                # non-empty line, unless that line starts with </pre>.
                # Runs of empty lines thus collapse to one and empty
                # lines at the very end are dropped.
139                 if [[ -n $line ]]; then
140                         (( _nl )) && [[ $line != '</pre>'* ]] && print
141                         print -r -- "$line"
142                         _nl=0
143                 else
144                         _nl=1
145                 fi
146         done
147 }
148
# do_convert_paper
#
# Filter: reads a formatted paper on stdin and writes an HTML fragment
# (<pre>, <b>, <i>, <a>) to stdout.  Positional arguments are not
# referenced in this function.  Reads $roff2htm_rel.
#
# The pipeline is the one used by do_convert_man with these differences,
# all visible by comparing the two sed invocations:
#  - no expressions turning lines into <h2>/<h3> headings, and hence no
#    expression stripping b/i/G tags from heading lines;
#  - no dedicated "(3p)" link expression;
#  - the numeric-section link expression accepts any run of
#    [/0-9A-Za-z] after the section digit instead of requiring the
#    suffix to start with "/".
#
# NOTE(review): in this listing control characters appear as escapes
# (\b, \80, \85, "\ 1"); \b presumably stands for a literal backspace
# and the others for high-bit/control bytes -- confirm against the
# pristine file before touching any expression.
#
# Order of processing (the sed expressions depend on this order):
#  1. Input is passed through "col -x".
#  2. A line ending in "-" pulls in the next line (N); a name(section)
#     reference hyphenated across the break is joined on the first line.
#  3. "<"/">" overstruck with "_" become ≤/≥; the remaining & < > are
#     mapped to the placeholders Á þ ÿ and converted to &#38; &#60;
#     &#62; near the end.
#  4. o/O overstruck with "+" becomes •, "+" overstruck with "_" becomes ±.
#  5. Overstrike pairs are rewritten via the intermediate "<c1<c2>" form
#     into <i>, <b> and the temporary <G> (later <b><i>…</i></b>);
#     leftover \b are deleted.
#  6. Tag clean-up, then hyperlinks for name(section) references on
#     lines beginning with a space.
#  7. <pre> is prepended to the first line, </pre> appended to the last,
#     and "<pre></pre>" / "</pre><pre>" pairs are removed.
#  8. The trailing while loop squeezes blank lines (see below).
149 function do_convert_paper {
        # _nl is 1 while at least one empty line is pending output
150         local -i _nl=0
151         col -x | sed \
152             -e '/-$/N
153 {
154 s/\([0-9A-z][-.,0-9A-z:]*\)-\n\(  *\)\([0-9A-z][-.,0-9A-z:]*([1-9][A-z]*)\)\([^ ]*\) /\1\3\4\
155 \2/
156 }'                                                                      \
157             -e 's#<\b_#≤#g' -e 's#>\b_#≥#g'                           \
158             -e 'y#&<>#Áþÿ#'                                             \
159                                                                         \
160             -e 's#[Oo]\b[Oo]\b+\b+#•#g'                                  \
161             -e 's#_\b|\b|#_\b|\b_\b|#g'                                      \
162             -e 's#+\b_#±#g'                                             \
163                                                                         \
164             -e 's#\([^~<>\80-¿][\80-¿]*\)\b~#\1Ì\85#g'                         \
165             -e 's#\([^\b]\)~\b_#\1_̅\b #g'                                        \
166             -e 's#\([^\b]\)\([^<>_\80-¿][\80-¿]*\)\([Ì\85]*\)\b_#\1_\2\b\3#g'    \
167             -e 's#\([^<>\80-¿]\)[\80-¿]*\([Ì\85]*\)\b\([^<>\80-¿][\80-¿]*\)#<\1<\3\2>#g'   \
168             -e 's#\(<_<\([^>_]*\)>\)\b\1#<G>\2</G>#g'                    \
169             -e 's#<_<\([^>_]*\)>#<i>\1</i>#g'                           \
170             -e 's#<.<\([^>]*\)>#<b>\1</b>#g'                            \
171             -e 's#\b##g'                                                 \
172                                                                         \
173             -e 's#</\([biG]\)><\1>##g'                                  \
174             -e 's#</\([biG]\)>\([- -*./:;?@^_~]*\)<\1>#\2#g'            \
175             -e 's#\([\ 1- 0-9A-z]\)\([$/_-]*\)\(<[biG]>\)#\1\3\2#g'       \
176             -e 's#\(</[biG]>\)\([$/_-]*\)\([\ 1- 0-9A-z]\)#\2\1\3#g'      \
177             -e 's#\(</[biG]>\))\([\ 1- ,.0-9A-z]\)#)\1\2#g'               \
178                                                                         \
179             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(\([1-9]\)[/0-9A-Za-z]*)#<a href="'$roff2htm_rel'man\2/\1.htm">&</a>#g' \
180             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(\([PSU][MS][DM]\))#<a href="'$roff2htm_rel'man\2/\1.htm">&</a>#g' \
181             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(\(PAPERS\))#<a href="'$roff2htm_rel'man\2/\1.htm">&</a>#g' \
182             -e '/^ /s#\([0-9A-z][-.,0-9A-z:]*\)(GNU)#<a href="'$roff2htm_rel'manINFO/\1.html">&</a>#g' \
183             -e 's#)\(</[biG]>\)\([\ 1- 0-9A-z]\)#\1)\2#g'                 \
184                                                                         \
185             -e 's/Á/\&#38;/g'                                           \
186             -e 's/þ/\&#60;/g'                                           \
187             -e 's/ÿ/\&#62;/g'                                           \
188                                                                         \
189             -e 's#<G>#<b><i>#g' -e 's#</G>#</i></b>#g'                  \
190             -e 's#</b><b>##g'                                           \
191                                                                         \
192             -e '1s#^#<pre>#'                                            \
193             -e '$s#$#</pre>#'                                           \
194             -e 's#<pre></pre>##g'                                       \
195             -e 's#</pre><pre>##g'                                       \
196         | while IFS= read -r line; do
                # Blank-line squeezing: empty lines are only remembered;
                # a single empty line is emitted in front of the next
                # non-empty line, unless that line starts with </pre>.
                # Runs of empty lines thus collapse to one and empty
                # lines at the very end are dropped.
197                 if [[ -n $line ]]; then
198                         (( _nl )) && [[ $line != '</pre>'* ]] && print
199                         print -r -- "$line"
200                         _nl=0
201                 else
202                         _nl=1
203                 fi
204         done
205 }
206
207 function output_header {
208         print '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
209  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
210 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head>
211  <meta http-equiv="content-type" content="text/html; charset=utf-8" />
212  <meta name="MSSmartTagsPreventParsing" content="TRUE" />
213  <title>RTFM '$1\($2')</title>
214  <meta name="robots" content="index, follow" />
215  <link rel="canonical" href="http://www.mirbsd.org/man'$roff2htm_machine/$1.$2'" />
216  <style type="text/css"><!--/*--><![CDATA[/*><!--*/
217         body {
218                 background-color:#000000;
219                 color:#666666;
220                 font-family:serif;
221         }
222         a {
223                 color:inherit;
224                 text-decoration:none;
225                 border-bottom:1px dashed;
226         }
227         a:visited {
228                 text-decoration:none;
229                 border-bottom:1px dotted;
230         }
231         a:hover {
232                 text-decoration:none;
233                 border-bottom:1px double;
234         }
235         pre {
236                 line-height:1.12;
237                 color:#FFBF00;
238         }
239         b {
240                 color:#FFEF00;
241                 font-weight:normal;
242         }
243         i {
244                 font-style:normal;
245                 border-bottom:1px solid #FFBF00;
246         }
247         b i,i b {
248                 color:#FFEF00;
249                 font-weight:normal;
250                 font-style:normal;
251                 border-bottom:1px solid #FFEF00;
252         }
253         h1 {
254                 color:#FFEF00;
255                 font-size:xx-large;
256                 font-family:serif;
257         }
258         h2 {
259                 color:#FFFFFF;
260                 font-size:x-large;
261                 font-family:sans-serif;
262         }
263         h3 {
264                 color:#CCCCCC;
265                 font-size:large;
266                 font-family:sans-serif;
267         }
268  /*]]>*/--></style>
269 </head><body>
270 <h1>MirOS Manual: <a href="'$roff2htm_rel'man'$2/$1'.htm">'$1\($2')</a></h1>'
271 }
272
273 function output_footer {
274         print '
275 <hr /><p style="font-size:xx-small;">Generated on' $roff2htm_gendate by \
276  '<tt>'$roff2htm_rcsid'</tt></p>
277 <p>These manual pages and other documentation are <a
278  href="'$roff2htm_rel'man7/BSD-Licence.htm">copyrighted</a> by their respective writers;
279  their source is available at our <a href="http://cvs.mirbsd.de/">CVSweb</a>,
280  AnonCVS, and other mirrors. The rest is Copyright © 2002–2018 <a
281  href="http://www.mirbsd.org/">MirBSD</a>.<br /><span
282  style="font-size:3pt; font-style:italic;">This product includes material
283  provided by <b>mirabilos</b>.</span></p>
284 <p style="font-size:x-small;">This manual page’s HTML representation
285  is supposed to be <a href="http://validator.w3.org/check/referer">valid
286  XHTML/1.1</a>; if not, please send a bug report — diffs preferred.</p>
287 </body></html>'
288 }
289
290 function do_conversion {
291         output_header ${1:-missing-pagename} ${2:-0}
292         do_convert ${2:-0}
293         output_footer
294 }
295
296 # do_conversion_verbose title section infile outfile
297 function do_conversion_verbose {
298         print -nru2 -- $3 → $4
299         do_conversion $1 $2 <$3 >$4
300         print -u2
301 }
302
303 # convert_page /path/to/man.cat1 /targetpath
304 function convert_page {
305         local fn=$1 page sect tn
306         local -Uui ino=$(stat -Lf %i $fn 2>/dev/null)
307         page=${fn##*/}                  # basename
308         page=${page%.0}                 # manual page name
309         sect=${fn%/*}                   # dirname
310         sect=${sect##*/cat}             # archsection
311         sect=${sect%%/*}                # section
312         tn=man${sect}/${page}.htm       # target file
313
314         if (( ino )) && [[ -n ${roff2htm_inodecache[ino]} ]]; then
315                 # source file is linked to a file we know
316                 print -ru2 -- $tn ← ${roff2htm_inodecache[ino]}
317                 ln -f $2/${roff2htm_inodecache[ino]} $2/$tn
318                 # patch in the additional name(s)
319                 ed -s $2/$tn <<-EOF
320                         /<title>/s#</title>#, $page($sect)&#
321                         /<h1>/s#</h1>#, <a href="$roff2htm_rel$tn">$page($sect)</a>&#
322                         wq
323                 EOF
324         else
325                 # store target filename in the inode cache
326                 roff2htm_inodecache[ino]=$tn
327                 do_conversion_verbose $page $sect $fn $2/$tn
328         fi
329 }
330
331 # output_htaccess >…/.htaccess
332 function output_htaccess {
333         print DirectoryIndex /dev/null
334         print "AddType 'text/html; charset=utf-8' htm"
335 }
336
337 # convert_all /path/to/share/man /targetpath
338 function convert_all {
339         local tp=${2:-$(pwd)/mbsdman}           # target basepath
340         local x f
341
342         (find ${1:-/usr/share/man}/cat{[1-9],3p} -name \*.0 2>/dev/null | \
343             sort -f) |&
344         for x in 1 2 3 3p 4 5 6 7 8 9; do
345                 mkdir -p $tp/man$x      # one per section
346                 output_htaccess >$tp/man$x/.htaccess
347         done
348         while read -p f; do
349                 convert_page $f $tp     # any subpages
350         done
351 }