another update from CVS HEAD, for QA
[alioth/jupp.git] / utf8.c
1 /*
2  *      UTF-8 Utilities
3  *      Copyright
4  *              (C) 2004 Joseph H. Allen
5  *              (c) 2004, 2006, 2011, 2013, 2014, 2017 Thorsten Glaser
6  *
7  *      This file is part of JOE (Joe's Own Editor)
8  */
9
10 #include "config.h"
11 #include "types.h"
12
13 __RCSID("$MirOS: contrib/code/jupp/utf8.c,v 1.22 2017/12/06 21:17:04 tg Exp $");
14
15 #include <stdlib.h>
16 #include <string.h>
17
18 #ifdef __CYGWIN__
19 #include <cygwin/version.h>
20 #endif
21
22 #ifdef __MirBSD__
23 #include <sys/param.h>
24 #endif
25
26 #undef USE_CODEPAGE
27 #undef USE_LOCALE
28 #if defined(HAVE_SETLOCALE) && defined(HAVE_NL_LANGINFO)
29 #define USE_LOCALE
30 #endif
31
32 /* Cygwin before 1.7.2 did not have locale support */
33 #if defined(CYGWIN_VERSION_API_MAJOR) && (CYGWIN_VERSION_API_MAJOR < 1) && \
34     defined(CYGWIN_VERSION_API_MINOR) && (CYGWIN_VERSION_API_MINOR < 222)
35 #define USE_CODEPAGE
36 #undef USE_LOCALE
37 #endif
38
39 /* OpenBSD, ekkoBSD and old MirOS do not have real locale support */
40 #if defined(__OpenBSD__) && (!defined(MirBSD) || (MirBSD < 0x09A0))
41 #undef USE_LOCALE
42 #endif
43
44 #ifdef USE_LOCALE
45 #ifdef HAVE_LOCALE_H
46 #include <locale.h>
47 #endif
48 #ifdef HAVE_LANGINFO_H
49 #include <langinfo.h>
50 #endif
51 #endif
52
53 #ifndef CODESET
54 #undef USE_LOCALE
55 #endif
56
57 #ifdef USE_LOCALE
58 #undef USE_CODEPAGE
59 #endif
60
61 #include "rc.h"
62 #include "charmap.h"
63
/*
 * UTF-8 encoder
 *
 * Stores the UTF-8 form of the code point c in buf, followed by a
 * terminating NUL byte.  Sequences of up to six bytes are produced,
 * so buf must provide room for 7 bytes.
 *
 * Returns the number of bytes written, the terminator not counted.
 */

int utf8_encode(unsigned char *buf,int c)
{
	/* marker bits of the lead byte, indexed by total sequence length */
	static const unsigned char lead[] = {
		0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
	};
	int len;
	int i;

	/*
	 * Anything below 0x80 is emitted as one byte; this also catches
	 * negative values, which are truncated to their low eight bits.
	 */
	if (c < 0x80) {
		buf[0] = c;
		buf[1] = 0;
		return 1;
	}

	/* pick the shortest length the ranges below allow */
	if (c < 0x800)
		len = 2;
	else if (c < 0x10000)
		len = 3;
	else if (c < 0x200000)
		len = 4;
	else if (c < 0x4000000)
		len = 5;
	else
		len = 6;

	/* continuation bytes hold six payload bits each; fill back to front */
	for (i = len - 1; i > 0; --i) {
		buf[i] = (0x80 | (c & 0x3F));
		c >>= 6;
	}

	/* the bits left over belong into the lead byte */
	buf[0] = (lead[len] | c);
	buf[len] = 0;
	return len;
}
114
115 /* UTF-8 Decoder
116  *
117  * Returns 0 - 7FFFFFFF: decoded character
118  *                   -1: character accepted, nothing decoded yet.
119  *                   -2: incomplete sequence
120  *                   -3: no sequence started, but character is between 128 - 191, 254 or 255
121  */
122
123 int utf8_decode(struct utf8_sm *utf8_sm,unsigned char c)
124 {
125         if (utf8_sm->state) {
126                 if ((c&0xC0)==0x80) {
127                         utf8_sm->buf[utf8_sm->ptr++] = c;
128                         --utf8_sm->state;
129                         utf8_sm->accu = ((utf8_sm->accu<<6)|(c&0x3F));
130                         if(!utf8_sm->state)
131                                 return utf8_sm->accu;
132                 } else {
133                         utf8_sm->state = 0;
134                         return -2;
135                 }
136         } else if ((c&0xE0)==0xC0) {
137                 /* 192 - 223 */
138                 utf8_sm->buf[0] = c;
139                 utf8_sm->ptr = 1;
140                 utf8_sm->state = 1;
141                 utf8_sm->accu = (c&0x1F);
142         } else if ((c&0xF0)==0xE0) {
143                 /* 224 - 239 */
144                 utf8_sm->buf[0] = c;
145                 utf8_sm->ptr = 1;
146                 utf8_sm->state = 2;
147                 utf8_sm->accu = (c&0x0F);
148         } else if ((c&0xF8)==0xF0) {
149                 /* 240 - 247 */
150                 utf8_sm->buf[0] = c;
151                 utf8_sm->ptr = 1;
152                 utf8_sm->state = 3;
153                 utf8_sm->accu = (c&0x07);
154         } else if ((c&0xFC)==0xF8) {
155                 /* 248 - 251 */
156                 utf8_sm->buf[0] = c;
157                 utf8_sm->ptr = 1;
158                 utf8_sm->state = 4;
159                 utf8_sm->accu = (c&0x03);
160         } else if ((c&0xFE)==0xFC) {
161                 /* 252 - 253 */
162                 utf8_sm->buf[0] = c;
163                 utf8_sm->ptr = 1;
164                 utf8_sm->state = 5;
165                 utf8_sm->accu = (c&0x01);
166         } else if ((c&0x80)==0x00) {
167                 /* 0 - 127 */
168                 utf8_sm->buf[0] = c;
169                 utf8_sm->ptr = 1;
170                 utf8_sm->state = 0;
171                 return c;
172         } else {
173                 /* 128 - 191, 254, 255 */
174                 utf8_sm->ptr = 0;
175                 utf8_sm->state = 0;
176                 return -3;
177         }
178         return -1;
179 }
180
181 /* Initialize state machine */
182
183 void utf8_init(struct utf8_sm *utf8_sm)
184 {
185         utf8_sm->ptr = 0;
186         utf8_sm->state = 0;
187 }
188
189 /* Decode an entire string */
190
191 int utf8_decode_string(unsigned char *s)
192 {
193         struct utf8_sm sm;
194         int x;
195         int c = 0;
196         utf8_init(&sm);
197         for(x=0;s[x];++x)
198                 c = utf8_decode(&sm,s[x]);
199         return c;
200 }
201
202 /* Decode and advance */
203
204 int utf8_decode_fwrd(unsigned char **p,int *plen)
205 {
206         struct utf8_sm sm;
207         unsigned char *s = *p;
208         int len = *plen;
209         int c = -2;
210
211         utf8_init(&sm);
212
213         while (len) {
214                 --len;
215                 c = utf8_decode(&sm,*s++);
216                 if (c >= 0)
217                         break;
218         }
219
220         *plen = len;
221         *p = s;
222
223         return c;
224 }
225
226 /* Initialize locale for JOE */
227
228 #ifdef USE_CODEPAGE
229 extern unsigned int cygwin32_get_cp(void);
230 #endif
231
232 struct charmap *locale_map;
233                         /* Character map of terminal */
234 struct charmap *utf8_map;
235                         /* Handy character map for UTF-8 */
236
237 void
238 joe_locale(void)
239 {
240         unsigned char *s;
241
242         s=(unsigned char *)getenv("JOECHARMAP");
243         locale_map = find_charmap(s);
244 #if !defined(USE_LOCALE)
245         if (!locale_map) {
246                 s=(unsigned char *)getenv("LC_ALL");
247                 if (!s) {
248                         s=(unsigned char *)getenv("LC_CTYPE");
249                         if (!s) {
250                                 s=(unsigned char *)getenv("LANG");
251                         }
252                 }
253 #ifdef USE_CODEPAGE
254                 /* if LC_* are unset, use codepage */
255                 if (!s) {
256                         char buf[16];
257
258                         joe_snprintf_1(buf, sizeof(buf), "cp%u", cygwin32_get_cp());
259                         locale_map = find_charmap(buf);
260                 }
261 #endif
262         }
263 #endif
264
265 #ifdef USE_LOCALE
266         if (!locale_map) {
267                 setlocale(LC_ALL,"");
268                 s = (unsigned char *)strdup(nl_langinfo(CODESET));
269
270                 locale_map = find_charmap(s);
271         }
272 #else
273         if (!locale_map && s) {
274                 unsigned char *t, *tt;
275
276                 if ((t = strrchr(s, '.')) != NULL) {
277                         if ((tt = strchr(++t, '@')) != NULL)
278                                 *tt = '\0';
279                         locale_map = find_charmap(t);
280                 }
281                 if (!locale_map)
282                         locale_map = find_charmap(s);
283         }
284 #endif
285         if (!locale_map)
286                 locale_map = find_charmap(US "ascii");
287         utf8_map = find_charmap(US "utf-8");
288
289 #ifndef TEST
290 #ifdef defutf8
291         fdefault.charmap = utf8_map;
292 #else
293         fdefault.charmap = locale_map;
294 #endif
295         pdefault.charmap = locale_map;
296 #endif
297 }
298
/*
 * Convert character c of character map map to UTF-8.
 *
 * The NUL-terminated result is written to s.  A character for which
 * to_uni() reports -1 is replaced by '?'.
 */
void to_utf8(struct charmap *map,unsigned char *s,int c)
{
	int uni = to_uni(map,c);

	utf8_encode(s, (uni == -1) ? '?' : uni);
}
308
/*
 * Convert the UTF-8 encoded character in the NUL-terminated string s
 * to character map map.
 *
 * Returns the converted character, or '?' if from_uni() reports -1.
 */
int from_utf8(struct charmap *map,unsigned char *s)
{
	int local = from_uni(map, utf8_decode_string(s));

	return (local == -1) ? '?' : local;
}