we’ll need to distinguish these for sarge/etch as well
[alioth/jupp.git] / utf8.c
1 /*
2  *      UTF-8 Utilities
3  *      Copyright
4  *              (C) 2004 Joseph H. Allen
5  *              (c) 2004, 2006, 2011, 2013, 2014, 2017 Thorsten Glaser
6  *
7  *      This file is part of JOE (Joe's Own Editor)
8  */
9
10 #include "config.h"
11 #include "types.h"
12
13 __RCSID("$MirOS: contrib/code/jupp/utf8.c,v 1.21 2017/12/02 04:32:43 tg Exp $");
14
15 #include <string.h>
16
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20
21 #ifdef __CYGWIN__
22 #include <cygwin/version.h>
23 #endif
24
25 #ifdef __MirBSD__
26 #include <sys/param.h>
27 #endif
28
29 #undef USE_CODEPAGE
30 #undef USE_LOCALE
31 #if defined(HAVE_SETLOCALE) && defined(HAVE_NL_LANGINFO)
32 #define USE_LOCALE
33 #endif
34
35 /* Cygwin before 1.7.2 did not have locale support */
36 #if defined(CYGWIN_VERSION_API_MAJOR) && (CYGWIN_VERSION_API_MAJOR < 1) && \
37     defined(CYGWIN_VERSION_API_MINOR) && (CYGWIN_VERSION_API_MINOR < 222)
38 #define USE_CODEPAGE
39 #undef USE_LOCALE
40 #endif
41
42 /* OpenBSD, ekkoBSD and old MirOS do not have real locale support */
43 #if defined(__OpenBSD__) && (!defined(MirBSD) || (MirBSD < 0x09A0))
44 #undef USE_LOCALE
45 #endif
46
47 #ifdef USE_LOCALE
48 #ifdef HAVE_LOCALE_H
49 #include <locale.h>
50 #endif
51 #ifdef HAVE_LANGINFO_H
52 #include <langinfo.h>
53 #endif
54 #endif
55
56 #ifndef CODESET
57 #undef USE_LOCALE
58 #endif
59
60 #ifdef USE_LOCALE
61 #undef USE_CODEPAGE
62 #endif
63
64 #include "rc.h"
65 #include "charmap.h"
66
/* UTF-8 Encoder
 *
 * Writes the UTF-8 form of the character c to buf, followed by a
 * terminating 0 byte.
 *
 * buf must provide room for 7 bytes: at most 6 encoded bytes plus the
 * terminator.
 *
 * Every value below 0x80 is stored as one single byte; since c is a
 * signed int this also applies to negative values, of which only the
 * low eight bits are kept.
 *
 * Returns the number of encoded bytes (the terminator is not counted).
 */

int utf8_encode(unsigned char *buf,int c)
{
	unsigned char lead;	/* marker bits of the first byte */
	int len;		/* number of encoded bytes */
	int pos;

	if (c < 0x80) {
		buf[0] = c;
		buf[1] = 0;
		return 1;
	}

	if (c < 0x800) {
		lead = 0xC0;
		len = 2;
	} else if (c < 0x10000) {
		lead = 0xE0;
		len = 3;
	} else if (c < 0x200000) {
		lead = 0xF0;
		len = 4;
	} else if (c < 0x4000000) {
		lead = 0xF8;
		len = 5;
	} else {
		lead = 0xFC;
		len = 6;
	}

	/* continuation bytes carry six bits each; fill them last to first */
	for (pos = len - 1; pos > 0; --pos) {
		buf[pos] = (0x80 | (c & 0x3F));
		c >>= 6;
	}

	/* the bits still left in c belong into the lead byte */
	buf[0] = (lead | c);
	buf[len] = 0;
	return len;
}
117
118 /* UTF-8 Decoder
119  *
120  * Returns 0 - 7FFFFFFF: decoded character
121  *                   -1: character accepted, nothing decoded yet.
122  *                   -2: incomplete sequence
123  *                   -3: no sequence started, but character is between 128 - 191, 254 or 255
124  */
125
126 int utf8_decode(struct utf8_sm *utf8_sm,unsigned char c)
127 {
128         if (utf8_sm->state) {
129                 if ((c&0xC0)==0x80) {
130                         utf8_sm->buf[utf8_sm->ptr++] = c;
131                         --utf8_sm->state;
132                         utf8_sm->accu = ((utf8_sm->accu<<6)|(c&0x3F));
133                         if(!utf8_sm->state)
134                                 return utf8_sm->accu;
135                 } else {
136                         utf8_sm->state = 0;
137                         return -2;
138                 }
139         } else if ((c&0xE0)==0xC0) {
140                 /* 192 - 223 */
141                 utf8_sm->buf[0] = c;
142                 utf8_sm->ptr = 1;
143                 utf8_sm->state = 1;
144                 utf8_sm->accu = (c&0x1F);
145         } else if ((c&0xF0)==0xE0) {
146                 /* 224 - 239 */
147                 utf8_sm->buf[0] = c;
148                 utf8_sm->ptr = 1;
149                 utf8_sm->state = 2;
150                 utf8_sm->accu = (c&0x0F);
151         } else if ((c&0xF8)==0xF0) {
152                 /* 240 - 247 */
153                 utf8_sm->buf[0] = c;
154                 utf8_sm->ptr = 1;
155                 utf8_sm->state = 3;
156                 utf8_sm->accu = (c&0x07);
157         } else if ((c&0xFC)==0xF8) {
158                 /* 248 - 251 */
159                 utf8_sm->buf[0] = c;
160                 utf8_sm->ptr = 1;
161                 utf8_sm->state = 4;
162                 utf8_sm->accu = (c&0x03);
163         } else if ((c&0xFE)==0xFC) {
164                 /* 252 - 253 */
165                 utf8_sm->buf[0] = c;
166                 utf8_sm->ptr = 1;
167                 utf8_sm->state = 5;
168                 utf8_sm->accu = (c&0x01);
169         } else if ((c&0x80)==0x00) {
170                 /* 0 - 127 */
171                 utf8_sm->buf[0] = c;
172                 utf8_sm->ptr = 1;
173                 utf8_sm->state = 0;
174                 return c;
175         } else {
176                 /* 128 - 191, 254, 255 */
177                 utf8_sm->ptr = 0;
178                 utf8_sm->state = 0;
179                 return -3;
180         }
181         return -1;
182 }
183
184 /* Initialize state machine */
185
186 void utf8_init(struct utf8_sm *utf8_sm)
187 {
188         utf8_sm->ptr = 0;
189         utf8_sm->state = 0;
190 }
191
192 /* Decode an entire string */
193
194 int utf8_decode_string(unsigned char *s)
195 {
196         struct utf8_sm sm;
197         int x;
198         int c = 0;
199         utf8_init(&sm);
200         for(x=0;s[x];++x)
201                 c = utf8_decode(&sm,s[x]);
202         return c;
203 }
204
205 /* Decode and advance */
206
207 int utf8_decode_fwrd(unsigned char **p,int *plen)
208 {
209         struct utf8_sm sm;
210         unsigned char *s = *p;
211         int len = *plen;
212         int c = -2;
213
214         utf8_init(&sm);
215
216         while (len) {
217                 --len;
218                 c = utf8_decode(&sm,*s++);
219                 if (c >= 0)
220                         break;
221         }
222
223         *plen = len;
224         *p = s;
225
226         return c;
227 }
228
229 /* Initialize locale for JOE */
230
231 #ifdef USE_CODEPAGE
232 extern unsigned int cygwin32_get_cp(void);
233 #endif
234
235 struct charmap *locale_map;
236                         /* Character map of terminal */
237 struct charmap *utf8_map;
238                         /* Handy character map for UTF-8 */
239
240 void
241 joe_locale(void)
242 {
243         unsigned char *s;
244
245         s=(unsigned char *)getenv("JOECHARMAP");
246         locale_map = find_charmap(s);
247 #if !defined(USE_LOCALE)
248         if (!locale_map) {
249                 s=(unsigned char *)getenv("LC_ALL");
250                 if (!s) {
251                         s=(unsigned char *)getenv("LC_CTYPE");
252                         if (!s) {
253                                 s=(unsigned char *)getenv("LANG");
254                         }
255                 }
256 #ifdef USE_CODEPAGE
257                 /* if LC_* are unset, use codepage */
258                 if (!s) {
259                         char buf[16];
260
261                         joe_snprintf_1(buf, sizeof(buf), "cp%u", cygwin32_get_cp());
262                         locale_map = find_charmap(buf);
263                 }
264 #endif
265         }
266 #endif
267
268 #ifdef USE_LOCALE
269         if (!locale_map) {
270                 setlocale(LC_ALL,"");
271                 s = (unsigned char *)strdup(nl_langinfo(CODESET));
272
273                 locale_map = find_charmap(s);
274         }
275 #else
276         if (!locale_map && s) {
277                 unsigned char *t, *tt;
278
279                 if ((t = strrchr(s, '.')) != NULL) {
280                         if ((tt = strchr(++t, '@')) != NULL)
281                                 *tt = '\0';
282                         locale_map = find_charmap(t);
283                 }
284                 if (!locale_map)
285                         locale_map = find_charmap(s);
286         }
287 #endif
288         if (!locale_map)
289                 locale_map = find_charmap(US "ascii");
290         utf8_map = find_charmap(US "utf-8");
291
292 #ifndef TEST
293 #ifdef defutf8
294         fdefault.charmap = utf8_map;
295 #else
296         fdefault.charmap = locale_map;
297 #endif
298         pdefault.charmap = locale_map;
299 #endif
300 }
301
/*
 * Convert the character c of the character map "map" to UTF-8.
 *
 * c is translated with to_uni() and the result is written to s by
 * utf8_encode(), so s must be large enough for that function's output.
 * If to_uni() returns -1, '?' is encoded instead.
 */
void to_utf8(struct charmap *map,unsigned char *s,int c)
{
	int uc = to_uni(map, c);

	utf8_encode(s, (uc == -1) ? '?' : uc);
}
311
/*
 * Convert the UTF-8 string s to a character of the character map "map".
 *
 * s is decoded with utf8_decode_string() and the result is translated
 * with from_uni(). Returns that translation, or '?' if from_uni()
 * returns -1.
 */
int from_utf8(struct charmap *map,unsigned char *s)
{
	int local = from_uni(map, utf8_decode_string(s));

	return (local == -1) ? '?' : local;
}