another update from CVS HEAD, for QA
[alioth/jupp.git] / regex.c
1 /*
2  *      Regular expression subroutines
3  *      Copyright
4  *              (C) 1992 Joseph H. Allen
5  *
6  *      This file is part of JOE (Joe's Own Editor)
7  */
8 #include "config.h"
9 #include "types.h"
10
11 __RCSID("$MirOS: contrib/code/jupp/regex.c,v 1.10 2017/12/06 23:02:04 tg Exp $");
12
13 #include "b.h"
14 #include "charmap.h"
15 #include "regex.h"
16 #include "utils.h"
17 #include "vs.h"
18
/*
 * Decode one character from the counted string at *a (*b bytes left),
 * honouring a leading backslash escape when at least two bytes remain.
 *
 *   isutf8  nonzero: unescaped characters are read with utf8_decode_fwrd()
 *           zero:    unescaped characters are single bytes
 *   a, b    in/out cursor and remaining length; both are advanced past
 *           whatever was consumed
 *
 * Recognised escapes: \a \b \t \n \f \r \e, \8 and \9 (yield 8 and 9),
 * \0..\7 (handed to ustoc_oct) and \x / \X (handed to ustoc_hex).
 * Any other character after the backslash is returned as itself.
 *
 * Returns the decoded character value.
 */
int
escape(int isutf8, unsigned char **a, int *b)
{
	unsigned char *cur = *a;
	int left = *b;
	int ch, used;

	if (*cur != '\\' || left < 2) {
		/* not an escape sequence: take the next character verbatim */
		if (isutf8)
			ch = utf8_decode_fwrd(&cur, &left);
		else {
			ch = *cur++;
			--left;
		}
	} else {
		/* step over the backslash and inspect the selector byte */
		++cur;
		--left;

		/* most selectors consume exactly one byte */
		used = 1;

		switch (*cur) {
		case 'a':
			ch = 7;
			break;
		case 'b':
		case '8':
			ch = 8;
			break;
		case 't':
		case '9':
			ch = 9;
			break;
		case 'n':
			ch = 10;
			break;
		case 'f':
			ch = 12;
			break;
		case 'r':
			ch = 13;
			break;
		case 'e':
			ch = 27;
			break;
		case '0':
		case '1':
		case '2':
		case '3':
		case '4':
		case '5':
		case '6':
		case '7':
			/* the helper's return value is used as the advance count */
			used = ustoc_oct(cur, &ch, left);
			break;
		case 'x':
		case 'X':
			/* the helper's return value is used as the advance count */
			used = ustoc_hex(cur, &ch, left);
			break;
		default:
			if (isutf8) {
				/* the decoder advances cur/left on its own */
				ch = utf8_decode_fwrd(&cur, &left);
				used = 0;
			} else
				ch = *cur;
			break;
		}

		cur += used;
		left -= used;
	}

	*a = cur;
	*b = left;
	return ch;
}
102
/*
 * Test character c against the bracket set whose body starts at *a
 * (*la bytes left; the introducing "\[" has already been consumed).
 *
 * A leading '^' or '*' inverts the result.  A ']' directly after that
 * is a literal member of the set.  Members are read with escape(); the
 * form "x-y" (where y is not the closing ']') is an inclusive range.
 *
 * On return *a / *la point just past the closing ']' (or to the end of
 * the pattern if there was none).  When the set is empty on entry the
 * function returns 0 without touching *a / *la.
 *
 * Returns nonzero if c is accepted by the set, zero otherwise.
 */
static int brack(int isutf8,unsigned char **a, int *la, int c)
{
	unsigned char *cur = *a;
	int left = *la;
	int negate = 0;
	int hit = 0;

	if (left == 0)
		return 0;

	if (*cur == '^' || *cur == '*') {
		negate = 1;
		++cur;
		--left;
	}

	/* a ']' in first position stands for itself */
	if (left && *cur == ']') {
		++cur;
		--left;
		if (c == ']')
			hit = 1;
	}

	/* walk the members up to the closing bracket */
	while (left && *cur != ']') {
		int lo, hi;

		lo = escape(isutf8, &cur, &left);

		if (left >= 2 && cur[0] == '-' && cur[1] != ']') {
			/* range: skip the '-' and read the upper bound */
			++cur;
			--left;
			hi = escape(isutf8, &cur, &left);
			if (lo <= c && c <= hi)
				hit = 1;
		} else if (lo == c)
			hit = 1;
	}

	/* the loop stopped on ']' unless the pattern ran out: consume it */
	if (left) {
		++cur;
		--left;
	}

	*a = cur;
	*la = left;

	return negate ? !hit : hit;
}
149
/*
 * Store the single character c as saved piece number n.
 *
 * With isutf8 set the character is encoded via utf8_encode(); otherwise
 * it is stored as one byte.  An existing piece in that slot is released
 * with vsrm() before the new string is installed.
 */
static void savec(int isutf8,unsigned char **pieces, int n, int c)
{
	unsigned char enc[16];
	unsigned char *fresh = NULL;
	int enclen;

	if (!isutf8) {
		enc[0] = c;
		enclen = 1;
	} else
		enclen = utf8_encode(enc, c);

	if (pieces[n])
		vsrm(pieces[n]);

	/* build a new string from scratch holding the encoded bytes */
	fresh = vsncpy(fresh, 0, enc, enclen);
	pieces[n] = fresh;
}
168
169 #define MAX_REGEX_SAVED 16384 /* Largest regex string we will save */
170
171 static void saves(unsigned char **pieces, int n, P *p, long int szz)
172 {
173         if (szz > MAX_REGEX_SAVED)
174                 pieces[n] = vstrunc(pieces[n], 0);
175         else {
176                 pieces[n] = vstrunc(pieces[n], (int) szz);
177                 brmem(p, pieces[n], (int) szz);
178         }
179 }
180
181 /* Returns -1 (NO_MORE_DATA) for end of file.
182  * Returns -2 if we skipped a special sequence and didn't take the character
183  * after it (this happens for "strings").
184  * Otherwise returns character after sequence (character will be >=0).
185  */
186
187 static int skip_special(P *p)
188 {
189         int to, s;
190
191         switch (s = pgetc(p)) {
192         case '"':
193                 do {
194                         if ((s = pgetc(p)) == '\\') {
195                                 pgetc(p);
196                                 s = pgetc(p);
197                         }
198                 } while (s != NO_MORE_DATA && s != '"');
199                 if (s == '"')
200                         return -2;
201                 break;
202         case '\'':
203                 if ((s = pgetc(p)) == '\\') {
204                         pgetc(p);
205                         s = pgetc(p);
206                 }
207                 if (s == '\'')
208                         return -2;
209                 if ((s = pgetc(p)) == '\'')
210                         return -2;
211                 if ((s = pgetc(p)) == '\'')
212                         return -2;
213                 break;
214         case '[':
215                 to = ']';
216                 goto skip;
217         case '(':
218                 to = ')';
219                 goto skip;
220         case '{':
221                 to = '}';
222  skip:
223                 do {
224                         s = skip_special(p);
225                 } while (s != to && s != NO_MORE_DATA);
226                 if (s == to)
227                         return -2;
228                 break;
229         case '/':
230                 s = pgetc(p);
231                 if (s == '*')
232                         do {
233                                 s = pgetc(p);
234                                 while (s == '*')
235                                         if ((s = pgetc(p)) == '/')
236                                                 return -2;
237                         } while (s != NO_MORE_DATA);
238                 else if (s != NO_MORE_DATA)
239                         s = prgetc(p);
240                 else
241                         s = '/';
242                 break;
243         }
244         return s;
245 }
246
247 int pmatch(unsigned char **pieces, unsigned char *regex, int len, P *p, int n, int icase)
248 {
249         int c, d;
250         P *q = pdup(p);
251         P *o = NULL;
252         int isutf8 = p->b->o.charmap->type;
253         struct charmap *map = p->b->o.charmap;
254         struct utf8_sm sm;
255
256         utf8_init(&sm);
257
258         while (len) {
259                 if (isutf8) {
260                         do {
261                                 c = utf8_decode(&sm,*regex++);
262                                 --len;
263                         } while (len && c<0);
264                         if (c<0)
265                                 return 0;
266                 } else {
267                         c = *regex++;
268                         --len;
269                 }
270
271                 switch (c) {
272                 case '\\':
273                         if (!len--)
274                                 goto fail;
275                         switch (c = *regex++) {
276                         case '?':
277                                 d = pgetc(p);
278                                 if (d == NO_MORE_DATA)
279                                         goto fail;
280                                 savec(isutf8, pieces, n++, d);
281                                 break;
282                         case 'n':
283                         case 'r':
284                         case 'a':
285                         case 'f':
286                         case 'b':
287                         case 't':
288                         case 'e':
289                         case 'x':
290                         case 'X':
291                         case '0':
292                         case '1':
293                         case '2':
294                         case '3':
295                         case '4':
296                         case '5':
297                         case '6':
298                         case '7':
299                         case '8':
300                         case '9':
301                                 regex -= 2;
302                                 len += 2;
303                                 if (pgetc(p) != escape(isutf8, &regex, &len))
304                                         goto fail;
305                                 break;
306                         case '*':
307                                 /* Find shortest matching sequence */
308                                 o = pdup(p);
309                                 do {
310                                         long pb = p->byte;
311
312                                         if (pmatch(pieces, regex, len, p, n + 1, icase)) {
313                                                 saves(pieces, n, o, pb - o->byte);
314                                                 goto succeed;
315                                         }
316                                         c = pgetc(p);
317                                 } while (c != NO_MORE_DATA && c != '\n');
318                                 goto fail;
319                         case 'c':
320                                 o = pdup(p);
321                                 do {
322                                         long pb = p->byte;
323
324                                         if (pmatch(pieces, regex, len, p, n + 1, icase)) {
325                                                 saves(pieces, n, o, pb - o->byte);
326                                                 goto succeed;
327                                         }
328                                 } while (skip_special(p) != NO_MORE_DATA);
329                                 goto fail;
330                         case '[':
331                                 d = pgetc(p);
332                                 if (d == NO_MORE_DATA)
333                                         goto fail;
334                                 if (!brack(isutf8, &regex, &len, d))
335                                         goto fail;
336                                 savec(isutf8, pieces, n++, d);
337                                 break;
338                         case '+':
339                                 {
340                                         unsigned char *oregex = regex;  /* Point to character to skip */
341                                         int olen = len;
342
343                                         unsigned char *tregex;
344                                         int tlen;
345
346                                         int match;
347
348                                         P *r = NULL;
349
350                                         d = 0;
351
352                                         o = pdup(p);
353
354                                         /* Advance over character to skip.  Save character in d unless
355                                            we're skipping over a \[..] */
356                                         if (len >= 2 && regex[0] == '\\') {
357                                                 if (regex[1] == '[') {
358                                                         regex += 2;
359                                                         len -= 2;
360                                                         brack(isutf8, &regex, &len, 0);
361                                                 } else {
362                                                         d = escape(isutf8, &regex, &len);
363                                                         if (icase)
364                                                                 d = joe_tolower(map,d);
365                                                 }
366                                         } else if (isutf8) {
367                                                 if ((d = utf8_decode_fwrd(&regex, &len)) < 0)
368                                                         goto done;
369                                                 else if (icase)
370                                                         d = joe_tolower(map,d);
371                                         } else {
372                                                 if (len >= 1) {
373                                                         --len;
374                                                         d = *regex++;
375                                                         if (icase)
376                                                                 d = joe_tolower(map,d);
377                                                 } else
378                                                         goto done;
379                                         }
380
381                                         /* Now oregex/olen point to character to skip over and
382                                            regex/len point to sequence which follows */
383
384                                         do {
385                                                 P *z = pdup(p);
386
387                                                 if (pmatch(pieces, regex, len, p, n + 1, icase)) {
388                                                         saves(pieces, n, o, z->byte - o->byte);
389                                                         if (r)
390                                                                 prm(r);
391                                                         r = pdup(p);
392                                                 }
393                                                 pset(p, z);
394                                                 prm(z);
395                                                 c = pgetc(p);
396                                                 tregex = oregex;
397                                                 tlen = olen;
398                                                 if (*oregex == '\\') {
399                                                         if (oregex[1] == '[') {
400                                                                 tregex += 2;
401                                                                 tlen -= 2;
402                                                                 match = brack(isutf8, &tregex, &tlen, c);
403                                                         } else
404                                                                 match = (d == c);
405                                                 } else {
406                                                         if(icase)
407                                                                 match = (joe_tolower(map,c) == d);
408                                                         else
409                                                                 match = (c == d);
410                                                 }
411                                         } while (c != NO_MORE_DATA && match);
412  done:
413                                         if (r) {
414                                                 pset(p, r);
415                                                 prm(r);
416                                         }
417                                         if (r)
418                                                 goto succeed;
419                                         else
420                                                 goto fail;
421                                 }
422                         case '^':
423                                 if (!pisbol(p))
424                                         goto fail;
425                                 break;
426                         case '$':
427                                 if (!piseol(p))
428                                         goto fail;
429                                 break;
430                         case '<':
431                                 if (!pisbow(p))
432                                         goto fail;
433                                 break;
434                         case '>':
435                                 if (!piseow(p))
436                                         goto fail;
437                                 break;
438                         case '\\':
439                                 d = pgetc(p);
440                                 if (d != c)
441                                         goto fail;
442                                 break;
443                         default:
444                                 goto fail;
445                         }
446                         break;
447                 default:
448                         d = pgetc(p);
449                         if (icase) {
450                                 if (joe_tolower(map,d) != joe_tolower(map,c))
451                                         goto fail;
452                         } else {
453                                 if (d != c)
454                                         goto fail;
455                         }
456                 }
457         }
458  succeed:
459         if (o)
460                 prm(o);
461         prm(q);
462         return 1;
463
464  fail:
465         if (o)
466                 prm(o);
467         pset(p, q);
468         prm(q);
469         return 0;
470 }