/MirOS/dist/jupp/joe-3.1jupp30.tgz
[alioth/jupp.git] / utf8.c
1 /* $MirOS: contrib/code/jupp/utf8.c,v 1.19 2017/01/11 21:48:58 tg Exp $ */
2 /*
3  *      UTF-8 Utilities
4  *      Copyright
5  *              (C) 2004 Joseph H. Allen
6  *              (c) 2004, 2006, 2011, 2013, 2014, 2017 Thorsten Glaser
7  *
8  *      This file is part of JOE (Joe's Own Editor)
9  */
10
11 #include "config.h"
12 #include "types.h"
13
14 #include <stdio.h>
15 #include <string.h>
16
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20
21 #ifdef __CYGWIN__
22 #include <cygwin/version.h>
23 #endif
24
25 #ifdef __MirBSD__
26 #include <sys/param.h>
27 #endif
28
29 #undef USE_CODEPAGE
30 #undef USE_LOCALE
31 #if defined(HAVE_SETLOCALE) && defined(HAVE_NL_LANGINFO)
32 #define USE_LOCALE
33 #endif
34
35 /* Cygwin before 1.7.2 did not have locale support */
36 #if defined(CYGWIN_VERSION_API_MAJOR) && (CYGWIN_VERSION_API_MAJOR < 1) && \
37     defined(CYGWIN_VERSION_API_MINOR) && (CYGWIN_VERSION_API_MINOR < 222)
38 #define USE_CODEPAGE
39 #undef USE_LOCALE
40 #endif
41
42 /* OpenBSD, ekkoBSD and old MirOS do not have real locale support */
43 #if defined(__OpenBSD__) && (!defined(MirBSD) || (MirBSD < 0x09A0))
44 #undef USE_LOCALE
45 #endif
46
47 #ifdef USE_LOCALE
48 #ifdef HAVE_LOCALE_H
49 #include <locale.h>
50 #endif
51 #ifdef HAVE_LANGINFO_H
52 #include <langinfo.h>
53 #endif
54 #endif
55
56 #ifndef CODESET
57 #undef USE_LOCALE
58 #endif
59
60 #ifdef USE_LOCALE
61 #undef USE_CODEPAGE
62 #endif
63
64 #include "rc.h"
65 #include "utf8.h"
66 #include "charmap.h"
67
/* UTF-8 Encoder
 *
 * c is the Unicode character to encode.
 * buf receives the UTF-8 coded character followed by a 0 terminator;
 * it must have room for 7 bytes (up to 6 coded bytes plus terminator).
 * Returns the coded length, not counting the terminator.
 *
 * Any value below 0x80 (this includes negative values) is stored as a
 * single byte.
 */

int utf8_encode(unsigned char *buf,int c)
{
	int len;		/* number of coded bytes */
	int lead;		/* marker bits of the first byte */
	int i;

	if (c < 0x80) {
		buf[0] = c;
		buf[1] = 0;
		return 1;
	}

	/* pick sequence length and lead-byte marker from the value range */
	if (c < 0x800) {
		len = 2;
		lead = 0xC0;
	} else if (c < 0x10000) {
		len = 3;
		lead = 0xE0;
	} else if (c < 0x200000) {
		len = 4;
		lead = 0xF0;
	} else if (c < 0x4000000) {
		len = 5;
		lead = 0xF8;
	} else {
		len = 6;
		lead = 0xFC;
	}

	buf[len] = 0;

	/* continuation bytes carry 6 bits each; fill them in from the tail */
	for (i = len - 1; i > 0; --i) {
		buf[i] = (0x80 | (c & 0x3F));
		c >>= 6;
	}

	/* whatever bits are left over go into the lead byte */
	buf[0] = (lead | c);

	return len;
}
118
119 /* UTF-8 Decoder
120  *
121  * Returns 0 - 7FFFFFFF: decoded character
122  *                   -1: character accepted, nothing decoded yet.
123  *                   -2: incomplete sequence
124  *                   -3: no sequence started, but character is between 128 - 191, 254 or 255
125  */
126
127 int utf8_decode(struct utf8_sm *utf8_sm,unsigned char c)
128 {
129         if (utf8_sm->state) {
130                 if ((c&0xC0)==0x80) {
131                         utf8_sm->buf[utf8_sm->ptr++] = c;
132                         --utf8_sm->state;
133                         utf8_sm->accu = ((utf8_sm->accu<<6)|(c&0x3F));
134                         if(!utf8_sm->state)
135                                 return utf8_sm->accu;
136                 } else {
137                         utf8_sm->state = 0;
138                         return -2;
139                 }
140         } else if ((c&0xE0)==0xC0) {
141                 /* 192 - 223 */
142                 utf8_sm->buf[0] = c;
143                 utf8_sm->ptr = 1;
144                 utf8_sm->state = 1;
145                 utf8_sm->accu = (c&0x1F);
146         } else if ((c&0xF0)==0xE0) {
147                 /* 224 - 239 */
148                 utf8_sm->buf[0] = c;
149                 utf8_sm->ptr = 1;
150                 utf8_sm->state = 2;
151                 utf8_sm->accu = (c&0x0F);
152         } else if ((c&0xF8)==0xF0) {
153                 /* 240 - 247 */
154                 utf8_sm->buf[0] = c;
155                 utf8_sm->ptr = 1;
156                 utf8_sm->state = 3;
157                 utf8_sm->accu = (c&0x07);
158         } else if ((c&0xFC)==0xF8) {
159                 /* 248 - 251 */
160                 utf8_sm->buf[0] = c;
161                 utf8_sm->ptr = 1;
162                 utf8_sm->state = 4;
163                 utf8_sm->accu = (c&0x03);
164         } else if ((c&0xFE)==0xFC) {
165                 /* 252 - 253 */
166                 utf8_sm->buf[0] = c;
167                 utf8_sm->ptr = 1;
168                 utf8_sm->state = 5;
169                 utf8_sm->accu = (c&0x01);
170         } else if ((c&0x80)==0x00) {
171                 /* 0 - 127 */
172                 utf8_sm->buf[0] = c;
173                 utf8_sm->ptr = 1;
174                 utf8_sm->state = 0;
175                 return c;
176         } else {
177                 /* 128 - 191, 254, 255 */
178                 utf8_sm->ptr = 0;
179                 utf8_sm->state = 0;
180                 return -3;
181         }
182         return -1;
183 }
184
185 /* Initialize state machine */
186
187 void utf8_init(struct utf8_sm *utf8_sm)
188 {
189         utf8_sm->ptr = 0;
190         utf8_sm->state = 0;
191 }
192
193 /* Decode an entire string */
194
195 int utf8_decode_string(unsigned char *s)
196 {
197         struct utf8_sm sm;
198         int x;
199         int c = 0;
200         utf8_init(&sm);
201         for(x=0;s[x];++x)
202                 c = utf8_decode(&sm,s[x]);
203         return c;
204 }
205
206 /* Decode and advance */
207
208 int utf8_decode_fwrd(unsigned char **p,int *plen)
209 {
210         struct utf8_sm sm;
211         unsigned char *s = *p;
212         int len = *plen;
213         int c = -2;
214
215         utf8_init(&sm);
216
217         while (len) {
218                 --len;
219                 c = utf8_decode(&sm,*s++);
220                 if (c >= 0)
221                         break;
222         }
223
224         *plen = len;
225         *p = s;
226
227         return c;
228 }
229
230 /* Initialize locale for JOE */
231
232 #ifdef USE_CODEPAGE
233 extern unsigned int cygwin32_get_cp(void);
234 #endif
235
236 struct charmap *locale_map;
237                         /* Character map of terminal */
238 struct charmap *utf8_map;
239                         /* Handy character map for UTF-8 */
240
241 void
242 joe_locale(void)
243 {
244         unsigned char *s;
245
246         s=(unsigned char *)getenv("JOECHARMAP");
247         locale_map = find_charmap(s);
248 #if !defined(USE_LOCALE)
249         if (!locale_map) {
250                 s=(unsigned char *)getenv("LC_ALL");
251                 if (!s) {
252                         s=(unsigned char *)getenv("LC_CTYPE");
253                         if (!s) {
254                                 s=(unsigned char *)getenv("LANG");
255                         }
256                 }
257 #ifdef USE_CODEPAGE
258                 /* if LC_* are unset, use codepage */
259                 if (!s) {
260                         char buf[16];
261
262                         joe_snprintf_1(buf, sizeof(buf), "cp%u", cygwin32_get_cp());
263                         locale_map = find_charmap(buf);
264                 }
265 #endif
266         }
267 #endif
268
269 #ifdef USE_LOCALE
270         if (!locale_map) {
271                 setlocale(LC_ALL,"");
272                 s = (unsigned char *)strdup(nl_langinfo(CODESET));
273
274                 locale_map = find_charmap(s);
275         }
276 #else
277         if (!locale_map && s) {
278                 unsigned char *t, *tt;
279
280                 if ((t = strrchr(s, '.')) != NULL) {
281                         if ((tt = strchr(++t, '@')) != NULL)
282                                 *tt = '\0';
283                         locale_map = find_charmap(t);
284                 }
285                 if (!locale_map)
286                         locale_map = find_charmap(s);
287         }
288 #endif
289         if (!locale_map)
290                 locale_map = find_charmap(US "ascii");
291         utf8_map = find_charmap(US "utf-8");
292
293 #ifndef TEST
294 #ifdef defutf8
295         fdefault.charmap = utf8_map;
296 #else
297         fdefault.charmap = locale_map;
298 #endif
299         pdefault.charmap = locale_map;
300 #endif
301 }
302
/*
 * Convert character c of character map "map" to UTF-8.
 *
 * c is translated with to_uni(); the result is written to s by
 * utf8_encode() (0-terminated).  If to_uni() returns -1, a '?' is
 * written instead.
 */
void to_utf8(struct charmap *map,unsigned char *s,int c)
{
	int uni = to_uni(map,c);

	utf8_encode(s, (uni == -1) ? '?' : uni);
}
312
/*
 * Convert the UTF-8 string s to a character of character map "map".
 *
 * s is decoded with utf8_decode_string() and the result translated
 * with from_uni().  Returns the translated character, or '?' if
 * from_uni() returns -1.
 */
int from_utf8(struct charmap *map,unsigned char *s)
{
	int local = from_uni(map, utf8_decode_string(s));

	return (local == -1) ? '?' : local;
}