another update from CVS HEAD, for QA
[alioth/jupp.git] / syntax.c
1 /*
2  *      Syntax highlighting DFA interpreter
3  *      Copyright
4  *              (C) 2004 Joseph H. Allen
5  *
6  *      This file is part of JOE (Joe's Own Editor)
7  */
8
9 #include "config.h"
10 #include "types.h"
11
12 __RCSID("$MirOS: contrib/code/jupp/syntax.c,v 1.21 2017/12/08 02:00:41 tg Exp $");
13
14 #include <limits.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include "b.h"
18 #include "scrn.h"
19 #include "utils.h"
20 #include "hash.h"
21 #include "path.h"
22 #include "charmap.h"
23 #include "syntax.h"
24
/*
 * Octet-stream state shared by utfoctet() and octetutf().
 * parse() clears it with memset() before reading a line.
 */
static struct {
	/* octets queued for hand-out (raw/replaced sequence, or utf8_encode output) */
	unsigned char buf[7];
	/* index of the next octet in buf to return (or to store while filling) */
	unsigned char start;
	/* number of octets in buf; 0 means nothing is queued */
	unsigned char limit;
	/* utfoctet: a sequence has been read completely and is being handed out */
	unsigned int eaten : 1;
	/* utfoctet: input ran dry mid-sequence; report NO_MORE_DATA after buf */
	unsigned int ebbed : 1;
	/* utfoctet: buf[limit] holds an octet that must be re-examined next */
	unsigned int unget : 1;
	/* the octet just returned begins a new character */
	unsigned int first : 1;
} utfstate;
34
35 static int
36 utfoctet(P *p)
37 {
38         int c;
39
40         utfstate.first = 0;
41         if (utfstate.eaten) {
42  ate:
43                 if (utfstate.start < utfstate.limit)
44                         return (utfstate.buf[utfstate.start++]);
45                 if (utfstate.ebbed)
46                         return (NO_MORE_DATA);
47                 utfstate.eaten = utfstate.limit = 0;
48         }
49         if (!utfstate.limit) {
50                 utfstate.first = 1;
51                 if (utfstate.unget) {
52                         c = utfstate.buf[utfstate.start];
53                         utfstate.unget = 0;
54                 } else
55                         c = pgetb(p);
56                 if ((c == NO_MORE_DATA) || (c < 0x80))
57                         return (c);
58                 if ((c < 0xC2) || (c >= 0xFE))
59                         return (0xFF);
60                 utfstate.start = 0;
61                 utfstate.buf[utfstate.start++] = (unsigned char)c;
62                 utfstate.limit = (c < 0xE0) ? 2 : (c < 0xF0) ? 3 :
63                     (c < 0xF8) ? 4 : (c < 0xFC) ? 5 : 6;
64         }
65         while (utfstate.start < utfstate.limit) {
66                 if (((c = pgetb(p)) == NO_MORE_DATA) || ((c ^ 0x80) > 0x3F)) {
67                         /* invalid follow byte, invalidate all previous ones */
68                         utfstate.limit = 0;
69                         while (utfstate.limit < utfstate.start)
70                                 utfstate.buf[utfstate.limit++] = 0xFF;
71                         /* append this as ungetch unless the well is dry */
72                         if (c == NO_MORE_DATA)
73                                 utfstate.ebbed = 1;
74                         else {
75                                 utfstate.buf[utfstate.limit] = (unsigned char)c;
76                                 utfstate.unget = 1;
77                         }
78                         /* now return those bytes */
79                         break;
80                 }
81                 utfstate.buf[utfstate.start++] = (unsigned char)c;
82         }
83         utfstate.start = 0;
84         utfstate.eaten = 1;
85         goto ate;
86 }
87
88 static int
89 octetutf(P *p)
90 {
91         int c;
92
93         utfstate.first = 0;
94         if (!(utfstate.start < utfstate.limit)) {
95                 if ((c = pgetb(p)) == NO_MORE_DATA)
96                         return (NO_MORE_DATA);
97
98                 utfstate.limit = utf8_encode(utfstate.buf,
99                     to_uni(p->b->o.charmap, c));
100                 utfstate.start = 0;
101                 utfstate.first = 1;
102         }
103         return (utfstate.buf[utfstate.start++]);
104 }
105
106 /* Parse one line.  Returns new state.
107    'syntax' is the loaded syntax definition for this buffer.
108    'line' is advanced to start of next line.
109    Global array 'attr_buf' end up with coloring for each character of line.
110    'state' is initial parser state for the line (0 is initial state).
111 */
112
113 int *attr_buf = 0;
114 int attr_size = 0;
115
116 int parse(struct high_syntax *syntax, P *line, int state)
117 {
118         struct high_state *h = syntax->states[state];
119                         /* Current state */
120         unsigned char buf[20];  /* Name buffer (trunc after 19 characters) */
121         int buf_idx = 0;        /* Index into buffer */
122         int buf_len = 0;        /* counts only starting characters */
123         int buf_en = 0;         /* Set for name buffering */
124         int *attr_end = attr_buf+attr_size;
125         int *attr = attr_buf;
126         int c;                  /* Current character */
127         int ofst = 0;   /* record length after we've stopped buffering */
128         int (*getoctet)(P *) = line->b->o.charmap->type ? utfoctet : octetutf;
129
130         memset(&utfstate, 0, sizeof(utfstate));
131         buf[0] = 0;
132
133         /* Get next character */
134         while((c = getoctet(line)) != NO_MORE_DATA) {
135                 struct high_cmd *cmd, *kw_cmd;
136                 int x;
137
138                 /* Expand attribute array if necessary */
139                 if(attr==attr_end) {
140                         attr_buf = realloc(attr_buf,
141                             sizeof(int) * (attr_size * 2));
142                         attr = attr_buf + attr_size;
143                         attr_size *= 2;
144                         attr_end = attr_buf + attr_size;
145                 }
146
147                 /* Advance to next attribute position (note attr[-1] below) */
148                 if (utfstate.first)
149                         attr++;
150
151                 /* Loop while noeat */
152                 do {
153                         /* Color with current state */
154                         attr[-1] = h->color;
155                         /* Get command for this character */
156                         cmd = h->cmd[c];
157                         /* Determine new state */
158                         if (cmd->keywords && (cmd->ignore ?
159                             (kw_cmd = htfind(cmd->keywords, joe_strtolower(buf))) :
160                             (kw_cmd = htfind(cmd->keywords, buf)))) {
161                                 cmd = kw_cmd;
162                                 h = cmd->new_state;
163                                 /* Recolor keyword */
164                                 for (x = -(buf_len + 1); x < -1; ++x)
165                                         attr[x - ofst] = h->color;
166                         } else {
167                                 h = cmd->new_state;
168                         }
169                         /* Recolor if necessary */
170                         x = cmd->recolor;
171                         while (&attr[x] < attr_buf)
172                                 ++x;
173                         while (x < 0)
174                                 attr[x++] = h->color;
175
176                         /* Start buffering? */
177                         if (cmd->start_buffering) {
178                                 buf_idx = 0;
179                                 buf_len = 0;
180                                 buf_en = 1;
181                                 ofst = 0;
182                         }
183
184                         /* Stop buffering? */
185                         if (cmd->stop_buffering)
186                                 buf_en = 0;
187                 } while(cmd->noeat);
188
189                 /* Save character in buffer */
190                 if (!buf_en)
191                         ofst += utfstate.first;
192                 else if (buf_idx < 19) {
193                         buf[buf_idx++] = c;
194                         buf[buf_idx] = 0;
195                         buf_len += utfstate.first;
196                 }
197
198                 if (c == '\n')
199                         break;
200         }
201         /* Return new state number */
202         return h->no;
203 }
204
205 /* Subroutines for load_dfa() */
206
207 static struct high_state *find_state(struct high_syntax *syntax, const unsigned char *name)
208 {
209         int x;
210         struct high_state *state;
211
212         /* Find state */
213         for(x=0;x!=syntax->nstates;++x)
214                 if(!strcmp(syntax->states[x]->name,name))
215                         break;
216
217         /* It doesn't exist, so create it */
218         if(x==syntax->nstates) {
219                 int y;
220                 state = malloc(sizeof(struct high_state));
221                 state->name=(const unsigned char *)strdup((const char *)name);
222                 state->no=syntax->nstates;
223                 state->color=FG_WHITE;
224                 if(!syntax->nstates)
225                         /* We're the first state */
226                         syntax->default_cmd.new_state = state;
227                 if(syntax->nstates==syntax->szstates)
228                         syntax->states = realloc(syntax->states,
229                            sizeof(struct high_state *) * (syntax->szstates *= 2));
230                 syntax->states[syntax->nstates++]=state;
231                 for(y=0; y!=256; ++y)
232                         state->cmd[y] = &syntax->default_cmd;
233         } else
234                 state = syntax->states[x];
235         return state;
236 }
237
238 /* Load syntax file */
239
240 struct high_syntax *syntax_list;
241
242 struct high_syntax *load_dfa(const unsigned char *name)
243 {
244         unsigned char buf[1024];
245         unsigned char bf[256];
246         unsigned char bf1[256];
247         int clist[256];
248         unsigned char *p;
249         int c;
250         FILE *f = NULL;
251         struct high_state *state=0;     /* Current state */
252         struct high_syntax *syntax;     /* New syntax table */
253         int line = 0;
254         void *np;
255
256         if (!name)
257                 return NULL;
258
259         if(!attr_buf) {
260                 attr_size = 1024;
261                 attr_buf = calloc(attr_size, sizeof(int));
262         }
263
264         /* Find syntax table */
265
266         /* Already loaded? */
267         for(syntax=syntax_list;syntax;syntax=syntax->next)
268                 if(!strcmp(syntax->name,name))
269                         return syntax;
270
271         /* Load it */
272         p = (unsigned char *)getenv("HOME");
273         if (p) {
274                 joe_snprintf_2((char *)buf,sizeof(buf),"%s/.jupp/syntax/%s.jsf",p,name);
275                 f = fopen((char *)buf,"r");
276         }
277
278         if (!f && has_JOERC) {
279                 joe_snprintf_2((char *)buf,sizeof(buf),"%ssyntax/%s.jsf",get_JOERC,name);
280                 f = fopen((char *)buf,"r");
281         }
282         if(!f)
283                 return 0;
284
285         /* Create new one */
286         syntax = calloc(1, sizeof(struct high_syntax));
287         syntax->name = (const unsigned char *)strdup((const char *)name);
288         syntax->next = syntax_list;
289         syntax_list = syntax;
290         syntax->states = malloc(sizeof(struct high_state *) * (syntax->szstates = 64));
291         syntax->sync_lines = 120;
292
293         memset(clist, 0, sizeof(clist));
294
295         /* Parse file */
296         while(fgets((char *)buf,1023,f)) {
297                 ++line;
298                 p = buf;
299                 parse_ws(&p,'#');
300                 if(!parse_char(&p, ':')) {
301                         if(!parse_ident(&p, bf, 255)) {
302
303                                 state = find_state(syntax,bf);
304
305                                 parse_ws(&p,'#');
306                                 if(!parse_ident(&p,bf,255)) {
307                                         struct high_color *color;
308                                         for(color=syntax->color;color;color=color->next)
309                                                 if(!strcmp(color->name,bf))
310                                                         break;
311                                         if(color)
312                                                 state->color=color->color;
313                                         else {
314                                                 state->color=0;
315                                                 fprintf(stderr,"%s:%d: Unknown class '%s'\n", name, line, bf);
316                                         }
317                                 } else
318                                         fprintf(stderr,"%s:%d: Missing color for state definition\n", name, line);
319                         } else
320                                 fprintf(stderr,"%s:%d: Missing state name\n", name, line);
321                 } else if(!parse_char(&p, '=')) {
322                         if(!parse_ident(&p, bf, 255)) {
323                                 struct high_color *color;
324
325                                 /* Find color */
326                                 for(color=syntax->color;color;color=color->next)
327                                         if(!strcmp(color->name,bf))
328                                                 break;
329                                 /* If it doesn't exist, create it */
330                                 if(!color) {
331                                         color = calloc(1, sizeof(struct high_color));
332                                         color->name = (unsigned char *)strdup((char *)bf);
333                                         color->next = syntax->color;
334                                         syntax->color = color;
335                                 } else {
336                                         fprintf(stderr,"%s:%d: Class '%s' already defined\n", name, line, bf);
337                                 }
338
339                                 /* Parse color definition */
340                                 while(parse_ws(&p,'#'), !parse_ident(&p,bf,255)) {
341                                         color->color |= meta_color(bf);
342                                 }
343                         }
344                 } else if(!parse_char(&p, '-')) { /* No. sync lines */
345                         syntax->sync_lines = (int)ustolb(p, &np,
346                             INT_MIN, INT_MAX, USTOL_TRIM);
347                         if (!np)
348                                 syntax->sync_lines = -1;
349                         else
350                                 p = np;
351                 } else {
352                         c = parse_ws(&p,'#');
353
354                         if (!c) {
355                         } else if (c=='"' || c=='*') {
356                                 if (state) {
357                                         struct high_cmd *cmd;
358                                         if(!parse_field(&p, US "*")) {
359                                                 int z;
360                                                 for(z=0;z!=256;++z)
361                                                         clist[z] = 1;
362                                         } else {
363                                                 c = parse_string(&p, bf, 255);
364                                                 if(c)
365                                                         fprintf(stderr,"%s:%d: Bad string\n", name, line);
366                                                 else {
367                                                         int z;
368                                                         int first, second;
369                                                         unsigned char *t = bf;
370                                                         for(z=0;z!=256;++z)
371                                                                 clist[z] = 0;
372                                                         while(!parse_range(&t, &first, &second)) {
373                                                                 if(first>second)
374                                                                         second = first;
375                                                                 while(first<=second)
376                                                                         clist[first++] = 1;
377                                                         }
378                                                 }
379                                         }
380                                         /* Create command */
381                                         cmd = calloc(1, sizeof(struct high_cmd));
382                                         parse_ws(&p,'#');
383                                         if(!parse_ident(&p,bf,255)) {
384                                                 int z;
385                                                 cmd->new_state = find_state(syntax,bf);
386
387                                                 /* Parse options */
388                                                 while (parse_ws(&p,'#'), !parse_ident(&p,bf,255))
389                                                         if(!strcmp(bf,"buffer")) {
390                                                                 cmd->start_buffering = 1;
391                                                         } else if(!strcmp(bf,"hold")) {
392                                                                 cmd->stop_buffering = 1;
393                                                         } else if(!strcmp(bf,"recolor")) {
394                                                                 parse_ws(&p,'#');
395                                                                 if(!parse_char(&p,'=')) {
396                                                                         parse_ws(&p,'#');
397                                                                         cmd->recolor = (int)ustolb(p, &np,
398                                                                             INT_MIN, INT_MAX, USTOL_TRIM);
399                                                                         if (!np)
400                                                                                 fprintf(stderr,"%s:%d: Missing value for option %s\n", name, line, bf);
401                                                                         else
402                                                                                 p = np;
403                                                                 } else
404                                                                         fprintf(stderr,"%s:%d: Missing value for option %s\n", name, line, bf);
405                                                         } else if(!strcmp(bf,"strings") || !strcmp(bf,"istrings")) {
406                                                                 if (bf[0]=='i')
407                                                                         cmd->ignore = 1;
408                                                                 while(fgets((char *)buf,1023,f)) {
409                                                                         ++line;
410                                                                         p = buf;
411                                                                         parse_ws(&p,'#');
412                                                                         if (*p) {
413                                                                                 if(!parse_field(&p,US "done"))
414                                                                                         break;
415                                                                                 if(!parse_string(&p,bf,255)) {
416                                                                                         parse_ws(&p,'#');
417                                                                                         if (cmd->ignore)
418                                                                                                 joe_strtolower(bf);
419                                                                                         if(!parse_ident(&p,bf1,255)) {
420                                                                                                 struct high_cmd *kw_cmd = calloc(1, sizeof(struct high_cmd));
421                                                                                                 kw_cmd->noeat=1;
422                                                                                                 kw_cmd->new_state = find_state(syntax,bf1);
423                                                                                                 if(!cmd->keywords)
424                                                                                                         cmd->keywords = htmk(64);
425                                                                                                 htadd(cmd->keywords,(unsigned char *)strdup((char *)bf),kw_cmd);
426                                                                                                 while (parse_ws(&p,'#'), !parse_ident(&p,bf,255))
427                                                                                                         if(!strcmp(bf,"buffer")) {
428                                                                                                                 kw_cmd->start_buffering = 1;
429                                                                                                         } else if(!strcmp(bf,"hold")) {
430                                                                                                                 kw_cmd->stop_buffering = 1;
431                                                                                                         } else if(!strcmp(bf,"recolor")) {
432                                                                                                                 parse_ws(&p,'#');
433                                                                                                                 if(!parse_char(&p,'=')) {
434                                                                                                                         parse_ws(&p,'#');
435                                                                                                                         kw_cmd->recolor = (int)ustolb(p, &np,
436                                                                                                                             INT_MIN, INT_MAX, USTOL_TRIM);
437                                                                                                                         if (!np)
438                                                                                                                                 fprintf(stderr,"%s:%d: Missing value for option %s\n", name, line, bf);
439                                                                                                                         else
440                                                                                                                                 p = np;
441                                                                                                                 } else
442                                                                                                                         fprintf(stderr,"%s:%d: Missing value for option %s\n", name, line, bf);
443                                                                                                         } else
444                                                                                                                 fprintf(stderr,"%s:%d: Unknown option '%s'\n", name, line, bf);
445                                                                                         } else
446                                                                                                 fprintf(stderr,"%s:%d: Missing state name\n", name, line);
447                                                                                 } else
448                                                                                         fprintf(stderr,"%s:%d: Missing string\n", name, line);
449                                                                         }
450                                                                 }
451                                                         } else if(!strcmp(bf,"noeat")) {
452                                                                 cmd->noeat = 1;
453                                                         } else if(!strcmp(bf,"mark")) {
454                                                                 /* not implemented yet */ ;
455                                                         } else if(!strcmp(bf,"markend")) {
456                                                                 /* not implemented yet */ ;
457                                                         } else if(!strcmp(bf,"recolormark")) {
458                                                                 /* not implemented yet */ ;
459                                                         } else
460                                                                 fprintf(stderr,"%s:%d: Unknown option '%s'\n", name, line, bf);
461
462                                                 /* Install command */
463                                                 for(z=0;z!=256;++z)
464                                                         if(clist[z])
465                                                                 state->cmd[z]=cmd;
466                                         } else
467                                                 fprintf(stderr,"%s:%d: Missing jump\n", name, line);
468                                 } else
469                                         fprintf(stderr,"%s:%d: No state\n", name, line);
470                         } else
471                                 fprintf(stderr,"%s:%d: Unknown character\n", name, line);
472                 }
473         }
474
475         fclose(f);
476
477         return syntax;
478 }