we’ll need to distinguish these for sarge/etch as well
[alioth/jupp.git] / syntax.c
1 /*
2  *      Syntax highlighting DFA interpreter
3  *      Copyright
4  *              (C) 2004 Joseph H. Allen
5  *
6  *      This file is part of JOE (Joe's Own Editor)
7  */
8
9 #include "config.h"
10 #include "types.h"
11
12 __RCSID("$MirOS: contrib/code/jupp/syntax.c,v 1.20 2017/12/02 04:36:56 tg Exp $");
13
14 #include <limits.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include "b.h"
18 #include "scrn.h"
19 #include "utils.h"
20 #include "hash.h"
21 #include "path.h"
22 #include "charmap.h"
23 #include "syntax.h"
24
/*
 * Shared decoder state for the octet readers below; parse() zeroes it
 * before scanning a line.
 */
static struct {
	/* octets waiting to be handed out (utfoctet also parks a pushed-back octet here) */
	unsigned char buf[7];
	/* index of the next octet in buf to read (or, while collecting, to write) */
	unsigned char start;
	/* number of octets in buf that belong to the current sequence */
	unsigned char limit;
	/* buf holds a collected sequence that is being drained */
	unsigned int eaten : 1;
	/* input ran out in the middle of a sequence */
	unsigned int ebbed : 1;
	/* buf[start] holds an octet to be re-read as a lead octet */
	unsigned int unget : 1;
	/* the octet returned last starts a new character */
	unsigned int first : 1;
} utfstate;
34
35 static int
36 utfoctet(P *p)
37 {
38         int c;
39
40         utfstate.first = 0;
41         if (utfstate.eaten) {
42  ate:
43                 if (utfstate.start < utfstate.limit)
44                         return (utfstate.buf[utfstate.start++]);
45                 if (utfstate.ebbed)
46                         return (NO_MORE_DATA);
47                 utfstate.eaten = utfstate.limit = 0;
48         }
49         if (!utfstate.limit) {
50                 utfstate.first = 1;
51                 if (utfstate.unget) {
52                         c = utfstate.buf[utfstate.start];
53                         utfstate.unget = 0;
54                 } else
55                         c = pgetb(p);
56                 if ((c == NO_MORE_DATA) || (c < 0x80))
57                         return (c);
58                 if ((c < 0xC2) || (c >= 0xFE))
59                         return (0xFF);
60                 utfstate.start = 0;
61                 utfstate.buf[utfstate.start++] = (unsigned char)c;
62                 utfstate.limit = (c < 0xE0) ? 2 : (c < 0xF0) ? 3 :
63                     (c < 0xF8) ? 4 : (c < 0xFC) ? 5 : 6;
64         }
65         while (utfstate.start < utfstate.limit) {
66                 if (((c = pgetb(p)) == NO_MORE_DATA) || ((c ^ 0x80) > 0x3F)) {
67                         /* invalid follow byte, invalidate all previous ones */
68                         utfstate.limit = 0;
69                         while (utfstate.limit < utfstate.start)
70                                 utfstate.buf[utfstate.limit++] = 0xFF;
71                         /* append this as ungetch unless the well is dry */
72                         if (c == NO_MORE_DATA)
73                                 utfstate.ebbed = 1;
74                         else {
75                                 utfstate.buf[utfstate.limit] = (unsigned char)c;
76                                 utfstate.unget = 1;
77                         }
78                         /* now return those bytes */
79                         break;
80                 }
81                 utfstate.buf[utfstate.start++] = (unsigned char)c;
82         }
83         utfstate.start = 0;
84         utfstate.eaten = 1;
85         goto ate;
86 }
87
88 static int
89 octetutf(P *p)
90 {
91         int c;
92
93         utfstate.first = 0;
94         if (!(utfstate.start < utfstate.limit)) {
95                 if ((c = pgetb(p)) == NO_MORE_DATA)
96                         return (NO_MORE_DATA);
97
98                 utfstate.limit = utf8_encode(utfstate.buf,
99                     to_uni(p->b->o.charmap, c));
100                 utfstate.start = 0;
101                 utfstate.first = 1;
102         }
103         return (utfstate.buf[utfstate.start++]);
104 }
105
/* Parse one line.  Returns the new state.
   'syntax' is the loaded syntax definition for this buffer.
   'line' is advanced to the start of the next line.
   The global array 'attr_buf' ends up with the coloring for each character of the line.
   'state' is the initial parser state for the line (0 is the initial state).
*/
112
/* Attribute array written by parse(), and the number of ints it can hold */
int *attr_buf = NULL;
int attr_size = 0;
115
116 int parse(struct high_syntax *syntax, P *line, int state)
117 {
118         struct high_state *h = syntax->states[state];
119                         /* Current state */
120         unsigned char buf[20];  /* Name buffer (trunc after 19 characters) */
121         int buf_idx = 0;        /* Index into buffer */
122         int buf_len = 0;        /* counts only starting characters */
123         int buf_en = 0;         /* Set for name buffering */
124         int *attr_end = attr_buf+attr_size;
125         int *attr = attr_buf;
126         int c;                  /* Current character */
127         int ofst = 0;   /* record length after we've stopped buffering */
128         int (*getoctet)(P *) = line->b->o.charmap->type ? utfoctet : octetutf;
129
130         memset(&utfstate, 0, sizeof(utfstate));
131         buf[0] = 0;
132
133         /* Get next character */
134         while((c = getoctet(line)) != NO_MORE_DATA) {
135                 struct high_cmd *cmd, *kw_cmd;
136                 int x;
137
138                 /* Expand attribute array if necessary */
139                 if(attr==attr_end) {
140                         attr_buf = realloc(attr_buf,sizeof(int)*(attr_size*2));
141                         attr = attr_buf + attr_size;
142                         attr_size *= 2;
143                         attr_end = attr_buf + attr_size;
144                 }
145
146                 /* Advance to next attribute position (note attr[-1] below) */
147                 if (utfstate.first)
148                         attr++;
149
150                 /* Loop while noeat */
151                 do {
152                         /* Color with current state */
153                         attr[-1] = h->color;
154                         /* Get command for this character */
155                         cmd = h->cmd[c];
156                         /* Determine new state */
157                         if (cmd->keywords && (cmd->ignore ?
158                             (kw_cmd = htfind(cmd->keywords, joe_strtolower(buf))) :
159                             (kw_cmd = htfind(cmd->keywords, buf)))) {
160                                 cmd = kw_cmd;
161                                 h = cmd->new_state;
162                                 /* Recolor keyword */
163                                 for (x = -(buf_len + 1); x < -1; ++x)
164                                         attr[x - ofst] = h->color;
165                         } else {
166                                 h = cmd->new_state;
167                         }
168                         /* Recolor if necessary */
169                         x = cmd->recolor;
170                         while (&attr[x] < attr_buf)
171                                 ++x;
172                         while (x < 0)
173                                 attr[x++] = h->color;
174
175                         /* Start buffering? */
176                         if (cmd->start_buffering) {
177                                 buf_idx = 0;
178                                 buf_len = 0;
179                                 buf_en = 1;
180                                 ofst = 0;
181                         }
182
183                         /* Stop buffering? */
184                         if (cmd->stop_buffering)
185                                 buf_en = 0;
186                 } while(cmd->noeat);
187
188                 /* Save character in buffer */
189                 if (!buf_en)
190                         ofst += utfstate.first;
191                 else if (buf_idx < 19) {
192                         buf[buf_idx++] = c;
193                         buf[buf_idx] = 0;
194                         buf_len += utfstate.first;
195                 }
196
197                 if (c == '\n')
198                         break;
199         }
200         /* Return new state number */
201         return h->no;
202 }
203
204 /* Subroutines for load_dfa() */
205
206 static struct high_state *find_state(struct high_syntax *syntax, const unsigned char *name)
207 {
208         int x;
209         struct high_state *state;
210
211         /* Find state */
212         for(x=0;x!=syntax->nstates;++x)
213                 if(!strcmp(syntax->states[x]->name,name))
214                         break;
215
216         /* It doesn't exist, so create it */
217         if(x==syntax->nstates) {
218                 int y;
219                 state=malloc(sizeof(struct high_state));
220                 state->name=(const unsigned char *)strdup((const char *)name);
221                 state->no=syntax->nstates;
222                 state->color=FG_WHITE;
223                 if(!syntax->nstates)
224                         /* We're the first state */
225                         syntax->default_cmd.new_state = state;
226                 if(syntax->nstates==syntax->szstates)
227                         syntax->states=realloc(syntax->states,sizeof(struct high_state *)*(syntax->szstates*=2));
228                 syntax->states[syntax->nstates++]=state;
229                 for(y=0; y!=256; ++y)
230                         state->cmd[y] = &syntax->default_cmd;
231         } else
232                 state = syntax->states[x];
233         return state;
234 }
235
236 /* Load syntax file */
237
/* Head of the list of loaded syntax definitions, chained through ->next;
   load_dfa() searches it by name and prepends newly loaded entries */
struct high_syntax *syntax_list;
239
240 struct high_syntax *load_dfa(const unsigned char *name)
241 {
242         unsigned char buf[1024];
243         unsigned char bf[256];
244         unsigned char bf1[256];
245         int clist[256];
246         unsigned char *p;
247         int c;
248         FILE *f = NULL;
249         struct high_state *state=0;     /* Current state */
250         struct high_syntax *syntax;     /* New syntax table */
251         int line = 0;
252         void *np;
253
254         if (!name)
255                 return NULL;
256
257         if(!attr_buf) {
258                 attr_size = 1024;
259                 attr_buf = malloc(sizeof(int)*attr_size);
260         }
261
262         /* Find syntax table */
263
264         /* Already loaded? */
265         for(syntax=syntax_list;syntax;syntax=syntax->next)
266                 if(!strcmp(syntax->name,name))
267                         return syntax;
268
269         /* Load it */
270         p = (unsigned char *)getenv("HOME");
271         if (p) {
272                 joe_snprintf_2((char *)buf,sizeof(buf),"%s/.jupp/syntax/%s.jsf",p,name);
273                 f = fopen((char *)buf,"r");
274         }
275
276         if (!f && has_JOERC) {
277                 joe_snprintf_2((char *)buf,sizeof(buf),"%ssyntax/%s.jsf",get_JOERC,name);
278                 f = fopen((char *)buf,"r");
279         }
280         if(!f)
281                 return 0;
282
283         /* Create new one */
284         syntax = calloc(1, sizeof(struct high_syntax));
285         syntax->name = (const unsigned char *)strdup((const char *)name);
286         syntax->next = syntax_list;
287         syntax_list = syntax;
288         syntax->states = malloc(sizeof(struct high_state *)*(syntax->szstates=64));
289         syntax->sync_lines = 120;
290
291         memset(clist, 0, sizeof(clist));
292
293         /* Parse file */
294         while(fgets((char *)buf,1023,f)) {
295                 ++line;
296                 p = buf;
297                 parse_ws(&p,'#');
298                 if(!parse_char(&p, ':')) {
299                         if(!parse_ident(&p, bf, 255)) {
300
301                                 state = find_state(syntax,bf);
302
303                                 parse_ws(&p,'#');
304                                 if(!parse_ident(&p,bf,255)) {
305                                         struct high_color *color;
306                                         for(color=syntax->color;color;color=color->next)
307                                                 if(!strcmp(color->name,bf))
308                                                         break;
309                                         if(color)
310                                                 state->color=color->color;
311                                         else {
312                                                 state->color=0;
313                                                 fprintf(stderr,"%s:%d: Unknown class '%s'\n", name, line, bf);
314                                         }
315                                 } else
316                                         fprintf(stderr,"%s:%d: Missing color for state definition\n", name, line);
317                         } else
318                                 fprintf(stderr,"%s:%d: Missing state name\n", name, line);
319                 } else if(!parse_char(&p, '=')) {
320                         if(!parse_ident(&p, bf, 255)) {
321                                 struct high_color *color;
322
323                                 /* Find color */
324                                 for(color=syntax->color;color;color=color->next)
325                                         if(!strcmp(color->name,bf))
326                                                 break;
327                                 /* If it doesn't exist, create it */
328                                 if(!color) {
329                                         color = calloc(1, sizeof(struct high_color));
330                                         color->name = (unsigned char *)strdup((char *)bf);
331                                         color->next = syntax->color;
332                                         syntax->color = color;
333                                 } else {
334                                         fprintf(stderr,"%s:%d: Class '%s' already defined\n", name, line, bf);
335                                 }
336
337                                 /* Parse color definition */
338                                 while(parse_ws(&p,'#'), !parse_ident(&p,bf,255)) {
339                                         color->color |= meta_color(bf);
340                                 }
341                         }
342                 } else if(!parse_char(&p, '-')) { /* No. sync lines */
343                         syntax->sync_lines = (int)ustolb(p, &np,
344                             INT_MIN, INT_MAX, USTOL_TRIM);
345                         if (!np)
346                                 syntax->sync_lines = -1;
347                         else
348                                 p = np;
349                 } else {
350                         c = parse_ws(&p,'#');
351
352                         if (!c) {
353                         } else if (c=='"' || c=='*') {
354                                 if (state) {
355                                         struct high_cmd *cmd;
356                                         if(!parse_field(&p, US "*")) {
357                                                 int z;
358                                                 for(z=0;z!=256;++z)
359                                                         clist[z] = 1;
360                                         } else {
361                                                 c = parse_string(&p, bf, 255);
362                                                 if(c)
363                                                         fprintf(stderr,"%s:%d: Bad string\n", name, line);
364                                                 else {
365                                                         int z;
366                                                         int first, second;
367                                                         unsigned char *t = bf;
368                                                         for(z=0;z!=256;++z)
369                                                                 clist[z] = 0;
370                                                         while(!parse_range(&t, &first, &second)) {
371                                                                 if(first>second)
372                                                                         second = first;
373                                                                 while(first<=second)
374                                                                         clist[first++] = 1;
375                                                         }
376                                                 }
377                                         }
378                                         /* Create command */
379                                         cmd = calloc(1, sizeof(struct high_cmd));
380                                         parse_ws(&p,'#');
381                                         if(!parse_ident(&p,bf,255)) {
382                                                 int z;
383                                                 cmd->new_state = find_state(syntax,bf);
384
385                                                 /* Parse options */
386                                                 while (parse_ws(&p,'#'), !parse_ident(&p,bf,255))
387                                                         if(!strcmp(bf,"buffer")) {
388                                                                 cmd->start_buffering = 1;
389                                                         } else if(!strcmp(bf,"hold")) {
390                                                                 cmd->stop_buffering = 1;
391                                                         } else if(!strcmp(bf,"recolor")) {
392                                                                 parse_ws(&p,'#');
393                                                                 if(!parse_char(&p,'=')) {
394                                                                         parse_ws(&p,'#');
395                                                                         cmd->recolor = (int)ustolb(p, &np,
396                                                                             INT_MIN, INT_MAX, USTOL_TRIM);
397                                                                         if (!np)
398                                                                                 fprintf(stderr,"%s:%d: Missing value for option %s\n", name, line, bf);
399                                                                         else
400                                                                                 p = np;
401                                                                 } else
402                                                                         fprintf(stderr,"%s:%d: Missing value for option %s\n", name, line, bf);
403                                                         } else if(!strcmp(bf,"strings") || !strcmp(bf,"istrings")) {
404                                                                 if (bf[0]=='i')
405                                                                         cmd->ignore = 1;
406                                                                 while(fgets((char *)buf,1023,f)) {
407                                                                         ++line;
408                                                                         p = buf;
409                                                                         parse_ws(&p,'#');
410                                                                         if (*p) {
411                                                                                 if(!parse_field(&p,US "done"))
412                                                                                         break;
413                                                                                 if(!parse_string(&p,bf,255)) {
414                                                                                         parse_ws(&p,'#');
415                                                                                         if (cmd->ignore)
416                                                                                                 joe_strtolower(bf);
417                                                                                         if(!parse_ident(&p,bf1,255)) {
418                                                                                                 struct high_cmd *kw_cmd = calloc(1, sizeof(struct high_cmd));
419                                                                                                 kw_cmd->noeat=1;
420                                                                                                 kw_cmd->new_state = find_state(syntax,bf1);
421                                                                                                 if(!cmd->keywords)
422                                                                                                         cmd->keywords = htmk(64);
423                                                                                                 htadd(cmd->keywords,(unsigned char *)strdup((char *)bf),kw_cmd);
424                                                                                                 while (parse_ws(&p,'#'), !parse_ident(&p,bf,255))
425                                                                                                         if(!strcmp(bf,"buffer")) {
426                                                                                                                 kw_cmd->start_buffering = 1;
427                                                                                                         } else if(!strcmp(bf,"hold")) {
428                                                                                                                 kw_cmd->stop_buffering = 1;
429                                                                                                         } else if(!strcmp(bf,"recolor")) {
430                                                                                                                 parse_ws(&p,'#');
431                                                                                                                 if(!parse_char(&p,'=')) {
432                                                                                                                         parse_ws(&p,'#');
433                                                                                                                         kw_cmd->recolor = (int)ustolb(p, &np,
434                                                                                                                             INT_MIN, INT_MAX, USTOL_TRIM);
435                                                                                                                         if (!np)
436                                                                                                                                 fprintf(stderr,"%s:%d: Missing value for option %s\n", name, line, bf);
437                                                                                                                         else
438                                                                                                                                 p = np;
439                                                                                                                 } else
440                                                                                                                         fprintf(stderr,"%s:%d: Missing value for option %s\n", name, line, bf);
441                                                                                                         } else
442                                                                                                                 fprintf(stderr,"%s:%d: Unknown option '%s'\n", name, line, bf);
443                                                                                         } else
444                                                                                                 fprintf(stderr,"%s:%d: Missing state name\n", name, line);
445                                                                                 } else
446                                                                                         fprintf(stderr,"%s:%d: Missing string\n", name, line);
447                                                                         }
448                                                                 }
449                                                         } else if(!strcmp(bf,"noeat")) {
450                                                                 cmd->noeat = 1;
451                                                         } else if(!strcmp(bf,"mark")) {
452                                                                 /* not implemented yet */ ;
453                                                         } else if(!strcmp(bf,"markend")) {
454                                                                 /* not implemented yet */ ;
455                                                         } else if(!strcmp(bf,"recolormark")) {
456                                                                 /* not implemented yet */ ;
457                                                         } else
458                                                                 fprintf(stderr,"%s:%d: Unknown option '%s'\n", name, line, bf);
459
460                                                 /* Install command */
461                                                 for(z=0;z!=256;++z)
462                                                         if(clist[z])
463                                                                 state->cmd[z]=cmd;
464                                         } else
465                                                 fprintf(stderr,"%s:%d: Missing jump\n", name, line);
466                                 } else
467                                         fprintf(stderr,"%s:%d: No state\n", name, line);
468                         } else
469                                 fprintf(stderr,"%s:%d: Unknown character\n", name, line);
470                 }
471         }
472
473         fclose(f);
474
475         return syntax;
476 }