/MirOS/dist/jupp/joe-3.1jupp30.tgz
[alioth/jupp.git] / regex.c
1 /* $MirOS: contrib/code/jupp/regex.c,v 1.5 2012/12/22 00:06:13 tg Exp $ */
2 /*
3  *      Regular expression subroutines
4  *      Copyright
5  *              (C) 1992 Joseph H. Allen
6  *
7  *      This file is part of JOE (Joe's Own Editor)
8  */
9 #include "config.h"
10 #include "types.h"
11
12 #include <stdio.h> 
13
14 #include "b.h"
15 #include "utf8.h"
16 #include "charmap.h"
17 #include "vs.h"
18
/*
 * Decode one character, possibly written as a backslash escape, from a
 * counted string.
 *
 * utf8_  non-zero: characters that are not escapes are decoded with
 *        utf8_decode_fwrd(); zero: one byte is taken per character.
 * a      in/out: address of the read position; advanced past what was used.
 * b      in/out: address of the remaining byte count; reduced accordingly.
 *
 * Escapes recognised after a backslash (only when at least two bytes remain):
 *   \n=10 \t=9 \a=7 \b=8 \f=12 \e=27 \r=13, \8=8, \9=9,
 *   \o \oo \ooo   one to three octal digits,
 *   \x.. / \X..   zero, one or two hexadecimal digits.
 * Any other character following the backslash is returned as itself.
 * A backslash that is the last byte of the input is returned as an ordinary
 * character.
 *
 * Returns the decoded character code.
 *
 * NOTE: in byte mode the non-escape path reads *s without checking that the
 * count is non-zero; callers are expected to pass at least one byte.
 */
int escape(int utf8_,unsigned char **a, int *b)
{
        int c;
        int digits;
        unsigned char *s = *a;
        int l = *b;

        /* Test the length first so the escape check never inspects a byte
           the caller did not hand us. */
        if (l >= 2 && *s == '\\') {
                ++s; --l;
                switch (*s) {
                case 'n':
                        c = 10;
                        ++s; --l;
                        break;
                case 't':
                        c = 9;
                        ++s; --l;
                        break;
                case 'a':
                        c = 7;
                        ++s; --l;
                        break;
                case 'b':
                        c = 8;
                        ++s; --l;
                        break;
                case 'f':
                        c = 12;
                        ++s; --l;
                        break;
                case 'e':
                        c = 27;
                        ++s; --l;
                        break;
                case 'r':
                        c = 13;
                        ++s; --l;
                        break;
                case '8':
                        c = 8;
                        ++s; --l;
                        break;
                case '9':
                        c = 9;
                        ++s; --l;
                        break;
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                        c = *s - '0';
                        ++s; --l;
                        /* Up to two further octal digits.  The digit that is
                           accumulated is the one just range-checked (*s),
                           not the byte after it. */
                        for (digits = 0; digits < 2 && l > 0 && *s >= '0' && *s <= '7'; ++digits) {
                                c = c * 8 + *s - '0';
                                ++s; --l;
                        }
                        break;
                case 'x':
                case 'X':
                        c = 0;
                        ++s; --l;
                        /* Up to two hexadecimal digits; stop at the first
                           byte that is not one. */
                        for (digits = 0; digits < 2 && l > 0; ++digits) {
                                if (*s >= '0' && *s <= '9')
                                        c = c * 16 + *s - '0';
                                else if (*s >= 'A' && *s <= 'F')
                                        c = c * 16 + *s - 'A' + 10;
                                else if (*s >= 'a' && *s <= 'f')
                                        c = c * 16 + *s - 'a' + 10;
                                else
                                        break;
                                ++s; --l;
                        }
                        break;
                default:
                        /* Unknown escape: the escaped character stands for itself */
                        if (utf8_)
                                c = utf8_decode_fwrd(&s, &l);
                        else {
                                c = *s++;
                                --l;
                        }
                        break;
                }
        } else if (utf8_) {
                c = utf8_decode_fwrd(&s,&l);
        } else {
                c = *s++;
                --l;
        }
        *a = s;
        *b = l;
        return c;
}
128
/*
 * Test character c against a bracketed character set and step over the set.
 *
 * utf8_   passed through to escape() for decoding the set members.
 * a, la   in/out: position and remaining length of the pattern; on entry
 *         they address the first byte of the set body, on return the byte
 *         after the closing ']' (or the end of the pattern if none is found).
 * c       character to test.
 *
 * A leading '^' or '*' inverts the result.  A ']' appearing first (after the
 * optional inversion mark) is a literal member.  Members are single
 * characters or ranges "lo-hi"; a '-' directly before the closing ']' is a
 * plain member.
 *
 * Returns non-zero if c is accepted by the set, 0 otherwise.  With an empty
 * pattern (*la == 0) it returns 0 and leaves *a and *la untouched.
 */
static int brack(int utf8_,unsigned char **a, int *la, int c)
{
        unsigned char *pos = *a;
        int left = *la;
        int negate = 0;
        int hit = 0;

        if (left == 0)
                return 0;

        if (*pos == '^' || *pos == '*') {
                negate = 1;
                ++pos;
                --left;
        }

        /* A ']' in first position belongs to the set */
        if (left != 0 && *pos == ']') {
                if (c == ']')
                        hit = 1;
                ++pos;
                --left;
        }

        while (left != 0) {
                int lo, hi;

                if (*pos == ']') {
                        /* End of set: consume the bracket and stop */
                        ++pos;
                        --left;
                        break;
                }

                lo = escape(utf8_, &pos, &left);
                hi = lo;

                /* "lo-hi" range, unless the '-' is the last member */
                if (left >= 2 && pos[0] == '-' && pos[1] != ']') {
                        ++pos;
                        --left;
                        hi = escape(utf8_, &pos, &left);
                }

                if (lo <= c && c <= hi)
                        hit = 1;
        }

        *a = pos;
        *la = left;
        return negate ? !hit : hit;
}
175
/*
 * Store the single character c as saved piece number n.
 *
 * With utf8_ non-zero the character is encoded through utf8_encode();
 * otherwise it is stored as one byte.  A piece already present in
 * pieces[n] is released with vsrm() before the new string is installed.
 */
static void savec(int utf8_,unsigned char **pieces, int n, int c)
{
        unsigned char enc[16];
        unsigned char *fresh = NULL;
        int enclen = 1;

        if (utf8_)
                enclen = utf8_encode(enc, c);
        else
                enc[0] = c;

        if (pieces[n])
                vsrm(pieces[n]);

        /* Build a new string from the encoded bytes */
        fresh = vsncpy(fresh, 0, enc, enclen);
        pieces[n] = fresh;
}
194
195 #define MAX_REGEX_SAVED 16384 /* Largest regex string we will save */
196
197 static void saves(unsigned char **pieces, int n, P *p, long int szz)
198 {
199         if (szz > MAX_REGEX_SAVED)
200                 pieces[n] = vstrunc(pieces[n], 0);
201         else {
202                 pieces[n] = vstrunc(pieces[n], (int) szz);
203                 brmem(p, pieces[n], (int) szz);
204         }
205 }
206
207 /* Returns -1 (NO_MORE_DATA) for end of file.
208  * Returns -2 if we skipped a special sequence and didn't take the character
209  * after it (this happens for "strings").
210  * Otherwise returns character after sequence (character will be >=0).
211  */
212
213 static int skip_special(P *p)
214 {
215         int to, s;
216
217         switch (s = pgetc(p)) {
218         case '"':
219                 do {
220                         if ((s = pgetc(p)) == '\\') {
221                                 pgetc(p);
222                                 s = pgetc(p);
223                         }
224                 } while (s != NO_MORE_DATA && s != '"');
225                 if (s == '"')
226                         return -2;
227                 break;
228         case '\'':
229                 if ((s = pgetc(p)) == '\\') {
230                         pgetc(p);
231                         s = pgetc(p);
232                 }
233                 if (s == '\'')
234                         return -2;
235                 if ((s = pgetc(p)) == '\'')
236                         return -2;
237                 if ((s = pgetc(p)) == '\'')
238                         return -2;
239                 break;
240         case '[':
241                 to = ']';
242                 goto skip;
243         case '(':
244                 to = ')';
245                 goto skip;
246         case '{':
247                 to = '}';
248 skip:
249                 do {
250                         s = skip_special(p);
251                 } while (s != to && s != NO_MORE_DATA);
252                 if (s == to)
253                         return -2;
254                 break;
255         case '/':
256                 s = pgetc(p);
257                 if (s == '*')
258                         do {
259                                 s = pgetc(p);
260                                 while (s == '*')
261                                         if ((s = pgetc(p)) == '/')
262                                                 return -2;
263                         } while (s != NO_MORE_DATA);
264                 else if (s != NO_MORE_DATA)
265                         s = prgetc(p);
266                 else
267                         s = '/';
268                 break;
269         }
270         return s;
271 }
272
273 int pmatch(unsigned char **pieces, unsigned char *regex, int len, P *p, int n, int icase)
274 {
275         int c, d;
276         P *q = pdup(p);
277         P *o = NULL;
278         int utf8_ = p->b->o.charmap->type;
279         struct charmap *map = p->b->o.charmap;
280         struct utf8_sm sm;
281
282         utf8_init(&sm);
283
284         while (len) {
285                 if (utf8_) {
286                         do {
287                                 c = utf8_decode(&sm,*regex++);
288                                 --len;
289                         } while (len && c<0);
290                         if (c<0)
291                                 return 0;
292                 } else {
293                         c = *regex++;
294                         --len;
295                 }
296
297                 switch (c) {
298                 case '\\':
299                         if (!len--)
300                                 goto fail;
301                         switch (c = *regex++) {
302                         case '?':
303                                 d = pgetc(p);
304                                 if (d == NO_MORE_DATA)
305                                         goto fail;
306                                 savec(utf8_, pieces, n++, d);
307                                 break;
308                         case 'n':
309                         case 'r':
310                         case 'a':
311                         case 'f':
312                         case 'b':
313                         case 't':
314                         case 'e':
315                         case 'x':
316                         case 'X':
317                         case '0':
318                         case '1':
319                         case '2':
320                         case '3':
321                         case '4':
322                         case '5':
323                         case '6':
324                         case '7':
325                         case '8':
326                         case '9':
327                                 regex -= 2;
328                                 len += 2;
329                                 if (pgetc(p) != escape(utf8_, &regex, &len))
330                                         goto fail;
331                                 break;
332                         case '*':
333                                 /* Find shortest matching sequence */
334                                 o = pdup(p);
335                                 do {
336                                         long pb = p->byte;
337
338                                         if (pmatch(pieces, regex, len, p, n + 1, icase)) {
339                                                 saves(pieces, n, o, pb - o->byte);
340                                                 goto succeed;
341                                         }
342                                         c = pgetc(p);
343                                 } while (c != NO_MORE_DATA && c != '\n');
344                                 goto fail;
345                         case 'c':
346                                 o = pdup(p);
347                                 do {
348                                         long pb = p->byte;
349
350                                         if (pmatch(pieces, regex, len, p, n + 1, icase)) {
351                                                 saves(pieces, n, o, pb - o->byte);
352                                                 goto succeed;
353                                         }
354                                 } while (skip_special(p) != NO_MORE_DATA);
355                                 goto fail;
356                         case '[':
357                                 d = pgetc(p);
358                                 if (d == NO_MORE_DATA)
359                                         goto fail;
360                                 if (!brack(utf8_, &regex, &len, d))
361                                         goto fail;
362                                 savec(utf8_, pieces, n++, d);
363                                 break;
364                         case '+':
365                                 {
366                                         unsigned char *oregex = regex;  /* Point to character to skip */
367                                         int olen = len;
368
369                                         unsigned char *tregex;
370                                         int tlen;
371
372                                         int match;
373
374                                         P *r = NULL;
375
376                                         d = 0;
377
378                                         o = pdup(p);
379
380                                         /* Advance over character to skip.  Save character in d unless
381                                            we're skipping over a \[..] */
382                                         if (len >= 2 && regex[0] == '\\') {
383                                                 if (regex[1] == '[') {
384                                                         regex += 2;
385                                                         len -= 2;
386                                                         brack(utf8_, &regex, &len, 0);
387                                                 } else {
388                                                         d = escape(utf8_, &regex, &len);
389                                                         if (icase)
390                                                                 d = joe_tolower(map,d);
391                                                 }
392                                         } else if (utf8_) {
393                                                 if ((d = utf8_decode_fwrd(&regex, &len)) < 0)
394                                                         goto done;
395                                                 else if (icase)
396                                                         d = joe_tolower(map,d);
397                                         } else {
398                                                 if (len >= 1) {
399                                                         --len;
400                                                         d = *regex++;
401                                                         if (icase)
402                                                                 d = joe_tolower(map,d);
403                                                 } else
404                                                         goto done;
405                                         }
406
407                                         /* Now oregex/olen point to character to skip over and
408                                            regex/len point to sequence which follows */
409
410                                         do {
411                                                 P *z = pdup(p);
412
413                                                 if (pmatch(pieces, regex, len, p, n + 1, icase)) {
414                                                         saves(pieces, n, o, z->byte - o->byte);
415                                                         if (r)
416                                                                 prm(r);
417                                                         r = pdup(p);
418                                                 }
419                                                 pset(p, z);
420                                                 prm(z);
421                                                 c = pgetc(p);
422                                                 tregex = oregex;
423                                                 tlen = olen;
424                                                 if (*oregex == '\\') {
425                                                         if (oregex[1] == '[') {
426                                                                 tregex += 2;
427                                                                 tlen -= 2;
428                                                                 match = brack(utf8_, &tregex, &tlen, c);
429                                                         } else
430                                                                 match = (d == c);
431                                                 } else {
432                                                         if(icase)
433                                                                 match = (joe_tolower(map,c) == d);
434                                                         else
435                                                                 match = (c == d);
436                                                 }
437                                         } while (c != NO_MORE_DATA && match);
438
439                                       done:
440                                         if (r) {
441                                                 pset(p, r);
442                                                 prm(r);
443                                         }
444                                         if (r)
445                                                 goto succeed;
446                                         else
447                                                 goto fail;
448                                 }
449                         case '^':
450                                 if (!pisbol(p))
451                                         goto fail;
452                                 break;
453                         case '$':
454                                 if (!piseol(p))
455                                         goto fail;
456                                 break;
457                         case '<':
458                                 if (!pisbow(p))
459                                         goto fail;
460                                 break;
461                         case '>':
462                                 if (!piseow(p))
463                                         goto fail;
464                                 break;
465                         case '\\':
466                                 d = pgetc(p);
467                                 if (d != c)
468                                         goto fail;
469                                 break;
470                         default:
471                                 goto fail;
472                         }
473                         break;
474                 default:
475                         d = pgetc(p);
476                         if (icase) {
477                                 if (joe_tolower(map,d) != joe_tolower(map,c))
478                                         goto fail;
479                         } else {
480                                 if (d != c)
481                                         goto fail;
482                         }
483                 }
484         }
485 succeed:
486         if (o)
487                 prm(o);
488         prm(q);
489         return 1;
490
491 fail:
492         if (o)
493                 prm(o);
494         pset(p, q);
495         prm(q);
496         return 0;
497 }