2 * Regular expression subroutines
4 * (C) 1992 Joseph H. Allen
6 * This file is part of JOE (Joe's Own Editor)
11 __RCSID("$MirOS: contrib/code/jupp/regex.c,v 1.7 2017/12/02 04:32:41 tg Exp $");
17 int escape(int utf8_,unsigned char **a, int *b)
20 unsigned char *s = *a;
23 if (*s == '\\' && l >= 2) {
72 if (l > 0 && *s >= '0' && *s <= '7') {
73 c = c * 8 + s[1] - '0';
76 if (l > 0 && *s >= '0' && *s <= '7') {
77 c = c * 8 + s[1] - '0';
85 if (l > 0 && *s >= '0' && *s <= '9') {
86 c = c * 16 + *s - '0';
88 } else if (l > 0 && *s >= 'A' && *s <= 'F') {
89 c = c * 16 + *s - 'A' + 10;
91 } else if (l > 0 && *s >= 'a' && *s <= 'f') {
92 c = c * 16 + *s - 'a' + 10;
96 if (l > 0 && *s >= '0' && *s <= '9') {
97 c = c * 16 + *s - '0';
99 } else if (l > 0 && *s >= 'A' && *s <= 'F') {
100 c = c * 16 + *s - 'A' + 10;
102 } else if (l > 0 && *s >= 'a' && *s <= 'f') {
103 c = c * 16 + *s - 'a' + 10;
109 c = utf8_decode_fwrd(&s, &l);
117 c = utf8_decode_fwrd(&s,&l);
127 static int brack(int utf8_,unsigned char **a, int *la, int c)
131 unsigned char *s = *a;
136 if (*s == '^' || *s == '*') {
141 if (l && *s == ']') {
155 cl = escape(utf8_, &s, &l);
157 if (l >= 2 && s[0] == '-' && s[1] != ']') {
160 cr = escape(utf8_, &s, &l);
161 if (c >= cl && c <= cr)
174 static void savec(int utf8_,unsigned char **pieces, int n, int c)
176 unsigned char buf[16];
178 unsigned char *s = NULL;
181 len = utf8_encode(buf,c);
189 s = vsncpy(s, 0, buf, len);
193 #define MAX_REGEX_SAVED 16384 /* Largest regex string we will save */
195 static void saves(unsigned char **pieces, int n, P *p, long int szz)
197 if (szz > MAX_REGEX_SAVED)
198 pieces[n] = vstrunc(pieces[n], 0);
200 pieces[n] = vstrunc(pieces[n], (int) szz);
201 brmem(p, pieces[n], (int) szz);
205 /* Returns -1 (NO_MORE_DATA) for end of file.
206 * Returns -2 if we skipped a special sequence and didn't take the character
207 * after it (this happens for "strings").
208 * Otherwise returns character after sequence (character will be >=0).
211 static int skip_special(P *p)
215 switch (s = pgetc(p)) {
218 if ((s = pgetc(p)) == '\\') {
222 } while (s != NO_MORE_DATA && s != '"');
227 if ((s = pgetc(p)) == '\\') {
233 if ((s = pgetc(p)) == '\'')
235 if ((s = pgetc(p)) == '\'')
249 } while (s != to && s != NO_MORE_DATA);
259 if ((s = pgetc(p)) == '/')
261 } while (s != NO_MORE_DATA);
262 else if (s != NO_MORE_DATA)
271 int pmatch(unsigned char **pieces, unsigned char *regex, int len, P *p, int n, int icase)
276 int utf8_ = p->b->o.charmap->type;
277 struct charmap *map = p->b->o.charmap;
285 c = utf8_decode(&sm,*regex++);
287 } while (len && c<0);
299 switch (c = *regex++) {
302 if (d == NO_MORE_DATA)
304 savec(utf8_, pieces, n++, d);
327 if (pgetc(p) != escape(utf8_, &regex, &len))
331 /* Find shortest matching sequence */
336 if (pmatch(pieces, regex, len, p, n + 1, icase)) {
337 saves(pieces, n, o, pb - o->byte);
341 } while (c != NO_MORE_DATA && c != '\n');
348 if (pmatch(pieces, regex, len, p, n + 1, icase)) {
349 saves(pieces, n, o, pb - o->byte);
352 } while (skip_special(p) != NO_MORE_DATA);
356 if (d == NO_MORE_DATA)
358 if (!brack(utf8_, &regex, &len, d))
360 savec(utf8_, pieces, n++, d);
364 unsigned char *oregex = regex; /* Point to character to skip */
367 unsigned char *tregex;
378 /* Advance over character to skip. Save character in d unless
379 we're skipping over a \[..] */
380 if (len >= 2 && regex[0] == '\\') {
381 if (regex[1] == '[') {
384 brack(utf8_, &regex, &len, 0);
386 d = escape(utf8_, &regex, &len);
388 d = joe_tolower(map,d);
391 if ((d = utf8_decode_fwrd(&regex, &len)) < 0)
394 d = joe_tolower(map,d);
400 d = joe_tolower(map,d);
405 /* Now oregex/olen point to character to skip over and
406 regex/len point to sequence which follows */
411 if (pmatch(pieces, regex, len, p, n + 1, icase)) {
412 saves(pieces, n, o, z->byte - o->byte);
422 if (*oregex == '\\') {
423 if (oregex[1] == '[') {
426 match = brack(utf8_, &tregex, &tlen, c);
431 match = (joe_tolower(map,c) == d);
435 } while (c != NO_MORE_DATA && match);
475 if (joe_tolower(map,d) != joe_tolower(map,c))