we’ll need to distinguish these for sarge/etch as well
[alioth/jupp.git] / regex.c
1 /*
2  *      Regular expression subroutines
3  *      Copyright
4  *              (C) 1992 Joseph H. Allen
5  *
6  *      This file is part of JOE (Joe's Own Editor)
7  */
8 #include "config.h"
9 #include "types.h"
10
11 __RCSID("$MirOS: contrib/code/jupp/regex.c,v 1.7 2017/12/02 04:32:41 tg Exp $");
12
13 #include "b.h"
14 #include "charmap.h"
15 #include "vs.h"
16
/*
 * Decode one character of a pattern, expanding a backslash escape if
 * one is present.
 *
 * utf8_  non-zero: characters that are not escapes are decoded with
 *        utf8_decode_fwrd(); zero: one byte is one character.
 * a      in/out: read pointer; advanced past everything consumed.
 * b      in/out: number of bytes left; reduced by the same amount.
 *
 * An escape is only recognised when at least two bytes remain.
 * Escapes handled here:
 *   \n \t \a \b \f \e \r   the usual control characters
 *   \8 \9                  yield 8 and 9 respectively
 *   \o \oo \ooo            one to three octal digits (first is 0-7)
 *   \x \xH \xHH            zero to two hex digits (\X likewise)
 *   \<anything else>       that character itself
 *
 * Returns the resulting character code (for the non-escape UTF-8 paths
 * this is whatever utf8_decode_fwrd() returns).
 */
int escape(int utf8_, unsigned char **a, int *b)
{
        int c;
        int digits;
        unsigned char *s = *a;
        int l = *b;

        /* check the length first so *s is only read when it is needed */
        if (l >= 2 && *s == '\\') {
                ++s; --l;
                switch (*s) {
                case 'n':
                        c = 10;
                        ++s; --l;
                        break;
                case 't':
                        c = 9;
                        ++s; --l;
                        break;
                case 'a':
                        c = 7;
                        ++s; --l;
                        break;
                case 'b':
                        c = 8;
                        ++s; --l;
                        break;
                case 'f':
                        c = 12;
                        ++s; --l;
                        break;
                case 'e':
                        c = 27;
                        ++s; --l;
                        break;
                case 'r':
                        c = 13;
                        ++s; --l;
                        break;
                case '8':
                        c = 8;
                        ++s; --l;
                        break;
                case '9':
                        c = 9;
                        ++s; --l;
                        break;
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                        c = *s - '0';
                        ++s; --l;
                        /*
                         * Up to two further octal digits.  The digit that
                         * was range-checked (*s) is the one accumulated;
                         * this used to read s[1], giving a wrong value and
                         * looking one byte past the checked position.
                         */
                        for (digits = 0;
                            digits < 2 && l > 0 && *s >= '0' && *s <= '7';
                            ++digits) {
                                c = c * 8 + (*s - '0');
                                ++s; --l;
                        }
                        break;
                case 'x':
                case 'X':
                        c = 0;
                        ++s; --l;
                        /* at most two hex digits; stop at the first non-digit */
                        for (digits = 0; digits < 2 && l > 0; ++digits) {
                                int v;

                                if (*s >= '0' && *s <= '9')
                                        v = *s - '0';
                                else if (*s >= 'A' && *s <= 'F')
                                        v = *s - 'A' + 10;
                                else if (*s >= 'a' && *s <= 'f')
                                        v = *s - 'a' + 10;
                                else
                                        break;
                                c = c * 16 + v;
                                ++s; --l;
                        }
                        break;
                default:
                        /* unknown escape: the escaped character stands for itself */
                        if (utf8_)
                                c = utf8_decode_fwrd(&s, &l);
                        else {
                                c = *s++;
                                --l;
                        }
                        break;
                }
        } else if (utf8_) {
                c = utf8_decode_fwrd(&s,&l);
        } else {
                c = *s++;
                --l;
        }
        *a = s;
        *b = l;
        return c;
}
126
/*
 * Test character c against a bracket expression and step over it.
 *
 * On entry *a points just behind the opening "\[" and *la holds the
 * number of pattern bytes left.  A leading '^' or '*' inverts the set,
 * a ']' directly after that is a literal member, "x-y" is an inclusive
 * range, and members are decoded with escape().  The whole expression
 * is always scanned (up to and including the closing ']' or until the
 * pattern runs out) and *a / *la are updated to point behind it.
 *
 * With no pattern bytes left 0 is returned and *a / *la are untouched.
 * Otherwise returns non-zero if c is accepted by the expression.
 */
static int brack(int utf8_,unsigned char **a, int *la, int c)
{
        unsigned char *pat = *a;
        int left = *la;
        int negate = 0;
        int hit = 0;

        if (left == 0)
                return 0;

        if (*pat == '^' || *pat == '*') {
                negate = 1;
                ++pat;
                --left;
        }

        /* a ']' in first position is a set member, not the terminator */
        if (left != 0 && *pat == ']') {
                ++pat;
                --left;
                if (c == ']')
                        hit = 1;
        }

        while (left != 0) {
                int lo, hi;

                if (*pat == ']') {
                        /* end of the expression */
                        ++pat;
                        --left;
                        break;
                }

                lo = escape(utf8_, &pat, &left);

                if (left >= 2 && pat[0] == '-' && pat[1] != ']') {
                        /* range lo-hi; a '-' just before ']' is literal */
                        ++pat;
                        --left;
                        hi = escape(utf8_, &pat, &left);
                        if (lo <= c && c <= hi)
                                hit = 1;
                } else if (lo == c) {
                        hit = 1;
                }
        }

        *a = pat;
        *la = left;
        return negate ? !hit : hit;
}
173
174 static void savec(int utf8_,unsigned char **pieces, int n, int c)
175 {
176         unsigned char buf[16];
177         int len;
178         unsigned char *s = NULL;
179
180         if (utf8_)
181                 len = utf8_encode(buf,c);
182         else {
183                 buf[0] = c;
184                 len = 1;
185         }
186
187         if (pieces[n])
188                 vsrm(pieces[n]);
189         s = vsncpy(s, 0, buf, len);
190         pieces[n] = s;
191 }
192
193 #define MAX_REGEX_SAVED 16384 /* Largest regex string we will save */
194
195 static void saves(unsigned char **pieces, int n, P *p, long int szz)
196 {
197         if (szz > MAX_REGEX_SAVED)
198                 pieces[n] = vstrunc(pieces[n], 0);
199         else {
200                 pieces[n] = vstrunc(pieces[n], (int) szz);
201                 brmem(p, pieces[n], (int) szz);
202         }
203 }
204
205 /* Returns -1 (NO_MORE_DATA) for end of file.
206  * Returns -2 if we skipped a special sequence and didn't take the character
207  * after it (this happens for "strings").
208  * Otherwise returns character after sequence (character will be >=0).
209  */
210
211 static int skip_special(P *p)
212 {
213         int to, s;
214
215         switch (s = pgetc(p)) {
216         case '"':
217                 do {
218                         if ((s = pgetc(p)) == '\\') {
219                                 pgetc(p);
220                                 s = pgetc(p);
221                         }
222                 } while (s != NO_MORE_DATA && s != '"');
223                 if (s == '"')
224                         return -2;
225                 break;
226         case '\'':
227                 if ((s = pgetc(p)) == '\\') {
228                         pgetc(p);
229                         s = pgetc(p);
230                 }
231                 if (s == '\'')
232                         return -2;
233                 if ((s = pgetc(p)) == '\'')
234                         return -2;
235                 if ((s = pgetc(p)) == '\'')
236                         return -2;
237                 break;
238         case '[':
239                 to = ']';
240                 goto skip;
241         case '(':
242                 to = ')';
243                 goto skip;
244         case '{':
245                 to = '}';
246 skip:
247                 do {
248                         s = skip_special(p);
249                 } while (s != to && s != NO_MORE_DATA);
250                 if (s == to)
251                         return -2;
252                 break;
253         case '/':
254                 s = pgetc(p);
255                 if (s == '*')
256                         do {
257                                 s = pgetc(p);
258                                 while (s == '*')
259                                         if ((s = pgetc(p)) == '/')
260                                                 return -2;
261                         } while (s != NO_MORE_DATA);
262                 else if (s != NO_MORE_DATA)
263                         s = prgetc(p);
264                 else
265                         s = '/';
266                 break;
267         }
268         return s;
269 }
270
271 int pmatch(unsigned char **pieces, unsigned char *regex, int len, P *p, int n, int icase)
272 {
273         int c, d;
274         P *q = pdup(p);
275         P *o = NULL;
276         int utf8_ = p->b->o.charmap->type;
277         struct charmap *map = p->b->o.charmap;
278         struct utf8_sm sm;
279
280         utf8_init(&sm);
281
282         while (len) {
283                 if (utf8_) {
284                         do {
285                                 c = utf8_decode(&sm,*regex++);
286                                 --len;
287                         } while (len && c<0);
288                         if (c<0)
289                                 return 0;
290                 } else {
291                         c = *regex++;
292                         --len;
293                 }
294
295                 switch (c) {
296                 case '\\':
297                         if (!len--)
298                                 goto fail;
299                         switch (c = *regex++) {
300                         case '?':
301                                 d = pgetc(p);
302                                 if (d == NO_MORE_DATA)
303                                         goto fail;
304                                 savec(utf8_, pieces, n++, d);
305                                 break;
306                         case 'n':
307                         case 'r':
308                         case 'a':
309                         case 'f':
310                         case 'b':
311                         case 't':
312                         case 'e':
313                         case 'x':
314                         case 'X':
315                         case '0':
316                         case '1':
317                         case '2':
318                         case '3':
319                         case '4':
320                         case '5':
321                         case '6':
322                         case '7':
323                         case '8':
324                         case '9':
325                                 regex -= 2;
326                                 len += 2;
327                                 if (pgetc(p) != escape(utf8_, &regex, &len))
328                                         goto fail;
329                                 break;
330                         case '*':
331                                 /* Find shortest matching sequence */
332                                 o = pdup(p);
333                                 do {
334                                         long pb = p->byte;
335
336                                         if (pmatch(pieces, regex, len, p, n + 1, icase)) {
337                                                 saves(pieces, n, o, pb - o->byte);
338                                                 goto succeed;
339                                         }
340                                         c = pgetc(p);
341                                 } while (c != NO_MORE_DATA && c != '\n');
342                                 goto fail;
343                         case 'c':
344                                 o = pdup(p);
345                                 do {
346                                         long pb = p->byte;
347
348                                         if (pmatch(pieces, regex, len, p, n + 1, icase)) {
349                                                 saves(pieces, n, o, pb - o->byte);
350                                                 goto succeed;
351                                         }
352                                 } while (skip_special(p) != NO_MORE_DATA);
353                                 goto fail;
354                         case '[':
355                                 d = pgetc(p);
356                                 if (d == NO_MORE_DATA)
357                                         goto fail;
358                                 if (!brack(utf8_, &regex, &len, d))
359                                         goto fail;
360                                 savec(utf8_, pieces, n++, d);
361                                 break;
362                         case '+':
363                                 {
364                                         unsigned char *oregex = regex;  /* Point to character to skip */
365                                         int olen = len;
366
367                                         unsigned char *tregex;
368                                         int tlen;
369
370                                         int match;
371
372                                         P *r = NULL;
373
374                                         d = 0;
375
376                                         o = pdup(p);
377
378                                         /* Advance over character to skip.  Save character in d unless
379                                            we're skipping over a \[..] */
380                                         if (len >= 2 && regex[0] == '\\') {
381                                                 if (regex[1] == '[') {
382                                                         regex += 2;
383                                                         len -= 2;
384                                                         brack(utf8_, &regex, &len, 0);
385                                                 } else {
386                                                         d = escape(utf8_, &regex, &len);
387                                                         if (icase)
388                                                                 d = joe_tolower(map,d);
389                                                 }
390                                         } else if (utf8_) {
391                                                 if ((d = utf8_decode_fwrd(&regex, &len)) < 0)
392                                                         goto done;
393                                                 else if (icase)
394                                                         d = joe_tolower(map,d);
395                                         } else {
396                                                 if (len >= 1) {
397                                                         --len;
398                                                         d = *regex++;
399                                                         if (icase)
400                                                                 d = joe_tolower(map,d);
401                                                 } else
402                                                         goto done;
403                                         }
404
405                                         /* Now oregex/olen point to character to skip over and
406                                            regex/len point to sequence which follows */
407
408                                         do {
409                                                 P *z = pdup(p);
410
411                                                 if (pmatch(pieces, regex, len, p, n + 1, icase)) {
412                                                         saves(pieces, n, o, z->byte - o->byte);
413                                                         if (r)
414                                                                 prm(r);
415                                                         r = pdup(p);
416                                                 }
417                                                 pset(p, z);
418                                                 prm(z);
419                                                 c = pgetc(p);
420                                                 tregex = oregex;
421                                                 tlen = olen;
422                                                 if (*oregex == '\\') {
423                                                         if (oregex[1] == '[') {
424                                                                 tregex += 2;
425                                                                 tlen -= 2;
426                                                                 match = brack(utf8_, &tregex, &tlen, c);
427                                                         } else
428                                                                 match = (d == c);
429                                                 } else {
430                                                         if(icase)
431                                                                 match = (joe_tolower(map,c) == d);
432                                                         else
433                                                                 match = (c == d);
434                                                 }
435                                         } while (c != NO_MORE_DATA && match);
436
437                                       done:
438                                         if (r) {
439                                                 pset(p, r);
440                                                 prm(r);
441                                         }
442                                         if (r)
443                                                 goto succeed;
444                                         else
445                                                 goto fail;
446                                 }
447                         case '^':
448                                 if (!pisbol(p))
449                                         goto fail;
450                                 break;
451                         case '$':
452                                 if (!piseol(p))
453                                         goto fail;
454                                 break;
455                         case '<':
456                                 if (!pisbow(p))
457                                         goto fail;
458                                 break;
459                         case '>':
460                                 if (!piseow(p))
461                                         goto fail;
462                                 break;
463                         case '\\':
464                                 d = pgetc(p);
465                                 if (d != c)
466                                         goto fail;
467                                 break;
468                         default:
469                                 goto fail;
470                         }
471                         break;
472                 default:
473                         d = pgetc(p);
474                         if (icase) {
475                                 if (joe_tolower(map,d) != joe_tolower(map,c))
476                                         goto fail;
477                         } else {
478                                 if (d != c)
479                                         goto fail;
480                         }
481                 }
482         }
483 succeed:
484         if (o)
485                 prm(o);
486         prm(q);
487         return 1;
488
489 fail:
490         if (o)
491                 prm(o);
492         pset(p, q);
493         prm(q);
494         return 0;
495 }