IDZEBRA 2.2.8
dfa.c
Go to the documentation of this file.
1/* This file is part of the Zebra server.
2 Copyright (C) Index Data
3
4Zebra is free software; you can redistribute it and/or modify it under
5the terms of the GNU General Public License as published by the Free
6Software Foundation; either version 2, or (at your option) any later
7version.
8
9Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10WARRANTY; without even the implied warranty of MERCHANTABILITY or
11FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12for more details.
13
14You should have received a copy of the GNU General Public License
15along with this program; if not, write to the Free Software
16Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17
18*/
19
20
21#if HAVE_CONFIG_H
22#include <config.h>
23#endif
24#include <stdio.h>
25#include <assert.h>
26
27#include <stdlib.h>
28#include <string.h>
29#include <ctype.h>
30
31#include <yaz/snprintf.h>
32
33#include <idzebra/util.h>
34#include "dfap.h"
35#include "imalloc.h"
36
37#define CAT 16000
38#define OR 16001
39#define STAR 16002
40#define PLUS 16003
41#define EPSILON 16004
42
43struct Tnode {
44 union {
45 struct Tnode *p[2]; /* CAT,OR,STAR,PLUS (left,right) */
46 short ch[2]; /* if ch[0] >= 0 then this Tnode represents */
47 /* a character in range [ch[0]..ch[1]] */
48 /* otherwise ch[0] represents */
49 /* accepting no (-acceptno) */
50 } u;
51 unsigned pos : 15; /* CAT/OR/STAR/EPSILON or non-neg. position */
52 unsigned nullable : 1;
53 DFASet firstpos; /* first positions */
54 DFASet lastpos; /* last positions */
55};
56
57struct Tblock { /* Tnode bucket (block) */
58 struct Tblock *next; /* pointer to next bucket */
59 struct Tnode *tarray; /* array of nodes in bucket */
60};
61
62#define TADD 64
63#define STATE_HASH 199
64#define POSET_CHUNK 100
65
70
71static struct Tnode *mk_Tnode (struct DFA_parse *parse_info);
72static struct Tnode *mk_Tnode_cset (struct DFA_parse *parse_info,
73 BSet charset);
74static void term_Tnode (struct DFA_parse *parse_info);
75
76static void
77 del_followpos (struct DFA_parse *parse_info),
78 init_pos (struct DFA_parse *parse_info),
79 del_pos (struct DFA_parse *parse_info),
80 mk_dfa_tran (struct DFA_parse *parse_info, struct DFA_states *dfas),
81 add_follow (struct DFA_parse *parse_info, DFASet lastpos, DFASet firstpos),
82 dfa_trav (struct DFA_parse *parse_info, struct Tnode *n),
83 init_followpos (struct DFA_parse *parse_info),
84 pr_tran (struct DFA_parse *parse_info, struct DFA_states *dfas),
85 pr_verbose (struct DFA_parse *parse_info, struct DFA_states *dfas),
86 pr_followpos (struct DFA_parse *parse_info),
87 out_char (int c),
88 lex (struct DFA_parse *parse_info);
89
90static int
91 nextchar (struct DFA_parse *parse_info, int *esc),
92 read_charset (struct DFA_parse *parse_info);
93
94static const char
95 *str_char (unsigned c);
96
97#define L_LP 1
98#define L_RP 2
99#define L_CHAR 3
100#define L_CHARS 4
101#define L_ANY 5
102#define L_ALT 6
103#define L_ANYZ 7
104#define L_WILD 8
105#define L_QUEST 9
106#define L_CLOS1 10
107#define L_CLOS0 11
108#define L_END 12
109#define L_START 13
110
111
112static struct Tnode *expr_1 (struct DFA_parse *parse_info),
113 *expr_2 (struct DFA_parse *parse_info),
114 *expr_3 (struct DFA_parse *parse_info),
115 *expr_4 (struct DFA_parse *parse_info);
116
117static struct Tnode *expr_1 (struct DFA_parse *parse_info)
118{
119 struct Tnode *t1, *t2, *tn;
120
121 if (!(t1 = expr_2 (parse_info)))
122 return t1;
123 while (parse_info->lookahead == L_ALT)
124 {
125 lex (parse_info);
126 if (!(t2 = expr_2 (parse_info)))
127 return t2;
128
129 tn = mk_Tnode (parse_info);
130 tn->pos = OR;
131 tn->u.p[0] = t1;
132 tn->u.p[1] = t2;
133 t1 = tn;
134 }
135 return t1;
136}
137
138static struct Tnode *expr_2 (struct DFA_parse *parse_info)
139{
140 struct Tnode *t1, *t2, *tn;
141
142 if (!(t1 = expr_3 (parse_info)))
143 return t1;
144 while (parse_info->lookahead == L_WILD ||
145 parse_info->lookahead == L_ANYZ ||
146 parse_info->lookahead == L_ANY ||
147 parse_info->lookahead == L_LP ||
148 parse_info->lookahead == L_CHAR ||
149 parse_info->lookahead == L_CHARS)
150 {
151 if (!(t2 = expr_3 (parse_info)))
152 return t2;
153
154 tn = mk_Tnode (parse_info);
155 tn->pos = CAT;
156 tn->u.p[0] = t1;
157 tn->u.p[1] = t2;
158 t1 = tn;
159 }
160 return t1;
161}
162
163static struct Tnode *expr_3 (struct DFA_parse *parse_info)
164{
165 struct Tnode *t1, *tn;
166
167 if (!(t1 = expr_4 (parse_info)))
168 return t1;
169 if (parse_info->lookahead == L_CLOS0)
170 {
171 lex (parse_info);
172 tn = mk_Tnode (parse_info);
173 tn->pos = STAR;
174 tn->u.p[0] = t1;
175 t1 = tn;
176 }
177 else if (parse_info->lookahead == L_CLOS1)
178 {
179 lex (parse_info);
180 tn = mk_Tnode (parse_info);
181 tn->pos = PLUS;
182 tn->u.p[0] = t1;
183 t1 = tn;
184 }
185 else if (parse_info->lookahead == L_QUEST)
186 {
187 lex (parse_info);
188 tn = mk_Tnode(parse_info);
189 tn->pos = OR;
190 tn->u.p[0] = t1;
191 tn->u.p[1] = mk_Tnode(parse_info);
192 tn->u.p[1]->pos = EPSILON;
193 t1 = tn;
194 }
195 return t1;
196}
197
198static struct Tnode *expr_4 (struct DFA_parse *parse_info)
199{
200 struct Tnode *t1;
201
202 switch (parse_info->lookahead)
203 {
204 case L_LP:
205 lex (parse_info);
206 if (!(t1 = expr_1 (parse_info)))
207 return t1;
208 if (parse_info->lookahead == L_RP)
209 lex (parse_info);
210 else
211 return NULL;
212 break;
213 case L_CHAR:
214 t1 = mk_Tnode(parse_info);
215 t1->pos = ++parse_info->position;
216 t1->u.ch[1] = t1->u.ch[0] = parse_info->look_ch;
217 lex (parse_info);
218 break;
219 case L_CHARS:
220 t1 = mk_Tnode_cset (parse_info, parse_info->look_chars);
221 lex (parse_info);
222 break;
223 case L_ANY:
224 t1 = mk_Tnode_cset (parse_info, parse_info->anyset);
225 lex (parse_info);
226 break;
227 case L_ANYZ:
228 t1 = mk_Tnode(parse_info);
229 t1->pos = OR;
230 t1->u.p[0] = mk_Tnode_cset (parse_info, parse_info->anyset);
231 t1->u.p[1] = mk_Tnode(parse_info);
232 t1->u.p[1]->pos = EPSILON;
233 lex (parse_info);
234 break;
235 case L_WILD:
236 t1 = mk_Tnode(parse_info);
237 t1->pos = STAR;
238 t1->u.p[0] = mk_Tnode_cset (parse_info, parse_info->anyset);
239 lex (parse_info);
240 default:
241 t1 = NULL;
242 }
243 return t1;
244}
245
246static void do_parse (struct DFA_parse *parse_info, const char **s,
247 struct Tnode **tnp)
248{
249 int start_anchor_flag = 0;
250 struct Tnode *t1, *t2, *tn;
251
252 parse_info->err_code = 0;
253 parse_info->expr_ptr = * (const unsigned char **) s;
254
255 parse_info->inside_string = 0;
256 lex (parse_info);
257 if (parse_info->lookahead == L_START)
258 {
260 lex (parse_info);
261 }
262 if (parse_info->lookahead == L_END)
263 {
264 t1 = mk_Tnode (parse_info);
265 t1->pos = ++parse_info->position;
266 t1->u.ch[1] = t1->u.ch[0] = '\n';
267 lex (parse_info);
268 }
269 else
270 {
271 t1 = expr_1 (parse_info);
272 if (t1 && parse_info->lookahead == L_END)
273 {
274 t2 = mk_Tnode (parse_info);
275 t2->pos = ++parse_info->position;
276 t2->u.ch[1] = t2->u.ch[0] = '\n';
277
278 tn = mk_Tnode (parse_info);
279 tn->pos = CAT;
280 tn->u.p[0] = t1;
281 tn->u.p[1] = t2;
282 t1 = tn;
283
284 lex (parse_info);
285 }
286 }
287 if (t1 && parse_info->lookahead == 0)
288 {
289 t2 = mk_Tnode(parse_info);
290 t2->pos = ++parse_info->position;
291 t2->u.ch[0] = -(++parse_info->rule);
292 t2->u.ch[1] = start_anchor_flag ? 0 : -(parse_info->rule);
293
294 *tnp = mk_Tnode(parse_info);
295 (*tnp)->pos = CAT;
296 (*tnp)->u.p[0] = t1;
297 (*tnp)->u.p[1] = t2;
298 }
299 else
300 {
301 if (!parse_info->err_code)
302 {
303 if (parse_info->lookahead == L_RP)
304 parse_info->err_code = DFA_ERR_RP;
305 else if (parse_info->lookahead == L_LP)
306 parse_info->err_code = DFA_ERR_LP;
307 else
308 parse_info->err_code = DFA_ERR_SYNTAX;
309 }
310 }
311 *s = (const char *) parse_info->expr_ptr;
312}
313
314static int nextchar (struct DFA_parse *parse_info, int *esc)
315{
316 *esc = 0;
317 if (*parse_info->expr_ptr == '\0')
318 return 0;
319 else if (*parse_info->expr_ptr != '\\')
320 return *parse_info->expr_ptr++;
321 *esc = 1;
322 switch (*++parse_info->expr_ptr)
323 {
324 case '\r':
325 case '\n':
326 case '\0':
327 return '\\';
328 case '\t':
329 ++parse_info->expr_ptr;
330 return ' ';
331 case 'n':
332 ++parse_info->expr_ptr;
333 return '\n';
334 case 't':
335 ++parse_info->expr_ptr;
336 return '\t';
337 case 'r':
338 ++parse_info->expr_ptr;
339 return '\r';
340 case 'f':
341 ++parse_info->expr_ptr;
342 return '\f';
343 default:
344 return *parse_info->expr_ptr++;
345 }
346}
347
348static int nextchar_set (struct DFA_parse *parse_info, int *esc)
349{
350 if (*parse_info->expr_ptr == ' ')
351 {
352 *esc = 0;
353 return *parse_info->expr_ptr++;
354 }
355 return nextchar (parse_info, esc);
356}
357
358static int read_charset (struct DFA_parse *parse_info)
359{
360 int i, ch0, esc0, cc = 0;
361 parse_info->look_chars = mk_BSet (&parse_info->charset);
362 res_BSet (parse_info->charset, parse_info->look_chars);
363
364 ch0 = nextchar_set (parse_info, &esc0);
365 if (!esc0 && ch0 == '^')
366 {
367 cc = 1;
368 ch0 = nextchar_set (parse_info, &esc0);
369 }
374 while (ch0 != 0)
375 {
376 int ch1, esc1;
377 if (!esc0 && ch0 == ']')
378 break;
379 if (!esc0 && ch0 == '-')
380 {
381 ch1 = ch0;
382 esc1 = esc0;
383 ch0 = 1;
384 add_BSet (parse_info->charset, parse_info->look_chars, ch0);
385 }
386 else
387 {
388 if (ch0 == 1)
389 {
390 ch0 = nextchar(parse_info, &esc0);
391 }
392 else
393 {
394 if (parse_info->cmap)
395 {
396 const char **mapto;
397 char mapfrom[2];
398 const char *mcp = mapfrom;
399 mapfrom[0] = ch0;
400 mapto = parse_info->cmap(parse_info->cmap_data, &mcp, 1);
401 assert (mapto);
402 ch0 = mapto[0][0];
403 }
404 }
405 add_BSet (parse_info->charset, parse_info->look_chars, ch0);
406 ch1 = nextchar_set (parse_info, &esc1);
407 }
408 if (!esc1 && ch1 == '-')
409 {
410 int open_range = 0;
411 if ((ch1 = nextchar_set (parse_info, &esc1)) == 0)
412 break;
413 if (!esc1 && ch1 == ']')
414 {
415 ch1 = 255;
416 open_range = 1;
417 }
418 else if (ch1 == 1)
419 {
420 ch1 = nextchar(parse_info, &esc1);
421 }
422 else if (parse_info->cmap)
423 {
424 const char **mapto;
425 char mapfrom[2];
426 const char *mcp = mapfrom;
427 mapfrom[0] = ch1;
428 mapto = (*parse_info->cmap) (parse_info->cmap_data, &mcp, 1);
429 assert (mapto);
430 ch1 = mapto[0][0];
431 }
432 for (i = ch0; ++i <= ch1;)
433 add_BSet (parse_info->charset, parse_info->look_chars, i);
434
435 if (open_range)
436 break;
437 ch0 = nextchar_set (parse_info, &esc0);
438 }
439 else
440 {
441 esc0 = esc1;
442 ch0 = ch1;
443 }
444 }
445 if (cc)
446 com_BSet (parse_info->charset, parse_info->look_chars);
447 return L_CHARS;
448}
449
450static int map_l_char (struct DFA_parse *parse_info)
451{
452 const char **mapto;
453 const char *cp0 = (const char *) (parse_info->expr_ptr-1);
454 int i = 0, len = strlen(cp0);
455
456 if (cp0[0] == 1 && cp0[1])
457 {
458 parse_info->expr_ptr++;
459 parse_info->look_ch = ((unsigned char *) cp0)[1];
460 return L_CHAR;
461 }
462 if (!parse_info->cmap)
463 return L_CHAR;
464
465 mapto = (*parse_info->cmap) (parse_info->cmap_data, &cp0, len);
466 assert (mapto);
467
468 parse_info->expr_ptr = (const unsigned char *) cp0;
469 parse_info->look_ch = ((unsigned char **) mapto)[i][0];
470 yaz_log (YLOG_DEBUG, "map from %c to %d", parse_info->expr_ptr[-1], parse_info->look_ch);
471 return L_CHAR;
472}
473
474static int lex_sub(struct DFA_parse *parse_info)
475{
476 int esc;
477 while ((parse_info->look_ch = nextchar (parse_info, &esc)) != 0)
478 if (parse_info->look_ch == '\"')
479 {
480 if (esc)
481 return map_l_char (parse_info);
482 parse_info->inside_string = !parse_info->inside_string;
483 }
484 else if (esc || parse_info->inside_string)
485 return map_l_char (parse_info);
486 else if (parse_info->look_ch == '[')
487 return read_charset(parse_info);
488 else
489 {
490 const int *cc;
491 for (cc = parse_info->charMap; *cc; cc += 2)
492 if (*cc == (int) (parse_info->look_ch))
493 {
494 if (!cc[1])
495 --parse_info->expr_ptr;
496 return cc[1];
497 }
498 return map_l_char (parse_info);
499 }
500 return 0;
501}
502
503static void lex (struct DFA_parse *parse_info)
504{
505 parse_info->lookahead = lex_sub (parse_info);
506}
507
508static const char *str_char (unsigned c)
509{
510 static char s[6];
511 s[0] = '\\';
512 if (c < 32 || c >= 127)
513 switch (c)
514 {
515 case '\r':
516 strcpy (s+1, "r");
517 break;
518 case '\n':
519 strcpy (s+1, "n");
520 break;
521 case '\t':
522 strcpy (s+1, "t");
523 break;
524 default:
525 yaz_snprintf(s + 1, sizeof(s) - 1, "x%02x", c);
526 break;
527 }
528 else
529 switch (c)
530 {
531 case '\"':
532 strcpy (s+1, "\"");
533 break;
534 case '\'':
535 strcpy (s+1, "\'");
536 break;
537 case '\\':
538 strcpy (s+1, "\\");
539 break;
540 default:
541 s[0] = c;
542 s[1] = '\0';
543 }
544 return s;
545}
546
547static void out_char (int c)
548{
549 printf ("%s", str_char (c));
550}
551
552static void term_Tnode (struct DFA_parse *parse_info)
553{
554 struct Tblock *t, *t1;
555 for (t = parse_info->start; (t1 = t);)
556 {
557 t=t->next;
558 ifree (t1->tarray);
559 ifree (t1);
560 }
561}
562
563static struct Tnode *mk_Tnode (struct DFA_parse *parse_info)
564{
565 struct Tblock *tnew;
566
567 if (parse_info->use_Tnode == parse_info->max_Tnode)
568 {
569 tnew = (struct Tblock *) imalloc (sizeof(struct Tblock));
570 tnew->tarray = (struct Tnode *) imalloc (TADD*sizeof(struct Tnode));
571 if (!tnew->tarray)
572 return NULL;
573 if (parse_info->use_Tnode == 0)
574 parse_info->start = tnew;
575 else
576 parse_info->end->next = tnew;
577 parse_info->end = tnew;
578 tnew->next = NULL;
579 parse_info->max_Tnode += TADD;
580 }
581 return parse_info->end->tarray+(parse_info->use_Tnode++ % TADD);
582}
583
584struct Tnode *mk_Tnode_cset (struct DFA_parse *parse_info, BSet charset)
585{
586 struct Tnode *tn1, *tn0 = mk_Tnode(parse_info);
587 int ch1, ch0 = trav_BSet (parse_info->charset, charset, 0);
588 if (ch0 == -1)
589 tn0->pos = EPSILON;
590 else
591 {
592 tn0->u.ch[0] = ch0;
593 tn0->pos = ++parse_info->position;
594 ch1 = travi_BSet (parse_info->charset, charset, ch0+1) ;
595 if (ch1 == -1)
596 tn0->u.ch[1] = parse_info->charset->size;
597 else
598 {
599 tn0->u.ch[1] = ch1-1;
600 while ((ch0=trav_BSet (parse_info->charset, charset, ch1)) != -1)
601 {
602 tn1 = mk_Tnode(parse_info);
603 tn1->pos = OR;
604 tn1->u.p[0] = tn0;
605 tn0 = tn1;
606 tn1 = tn0->u.p[1] = mk_Tnode(parse_info);
607 tn1->u.ch[0] = ch0;
608 tn1->pos = ++(parse_info->position);
609 if ((ch1 = travi_BSet (parse_info->charset, charset,
610 ch0+1)) == -1)
611 {
612 tn1->u.ch[1] = parse_info->charset->size;
613 break;
614 }
615 tn1->u.ch[1] = ch1-1;
616 }
617 }
618 }
619 return tn0;
620}
621
622static void del_followpos (struct DFA_parse *parse_info)
623{
624 ifree (parse_info->followpos);
625}
626
627static void init_pos (struct DFA_parse *parse_info)
628{
629 parse_info->posar = (struct Tnode **) imalloc (sizeof(struct Tnode*)
630 * (1+parse_info->position));
631}
632
633static void del_pos (struct DFA_parse *parse_info)
634{
635 ifree (parse_info->posar);
636}
637
638static void add_follow (struct DFA_parse *parse_info,
640{
641 while (lastpos)
642 {
643 parse_info->followpos[lastpos->value] =
644 union_DFASet (parse_info->poset,
645 parse_info->followpos[lastpos->value], firstpos);
647 }
648}
649
650static void dfa_trav (struct DFA_parse *parse_info, struct Tnode *n)
651{
652 struct Tnode **posar = parse_info->posar;
653 DFASetType poset = parse_info->poset;
654
655 switch (n->pos)
656 {
657 case CAT:
658 dfa_trav (parse_info, n->u.p[0]);
659 dfa_trav (parse_info, n->u.p[1]);
660 n->nullable = n->u.p[0]->nullable & n->u.p[1]->nullable;
661 n->firstpos = mk_DFASet (poset);
662 n->firstpos = union_DFASet (poset, n->firstpos, n->u.p[0]->firstpos);
663 if (n->u.p[0]->nullable)
664 n->firstpos = union_DFASet (poset, n->firstpos, n->u.p[1]->firstpos);
665 n->lastpos = mk_DFASet (poset);
666 n->lastpos = union_DFASet (poset, n->lastpos, n->u.p[1]->lastpos);
667 if (n->u.p[1]->nullable)
668 n->lastpos = union_DFASet (poset, n->lastpos, n->u.p[0]->lastpos);
669
670 add_follow (parse_info, n->u.p[0]->lastpos, n->u.p[1]->firstpos);
671
672 n->u.p[0]->firstpos = rm_DFASet (poset, n->u.p[0]->firstpos);
673 n->u.p[0]->lastpos = rm_DFASet (poset, n->u.p[0]->lastpos);
674 n->u.p[1]->firstpos = rm_DFASet (poset, n->u.p[1]->firstpos);
675 n->u.p[1]->lastpos = rm_DFASet (poset, n->u.p[1]->lastpos);
676 if (debug_dfa_trav)
677 printf ("CAT");
678 break;
679 case OR:
680 dfa_trav (parse_info, n->u.p[0]);
681 dfa_trav (parse_info, n->u.p[1]);
682 n->nullable = n->u.p[0]->nullable | n->u.p[1]->nullable;
683
684 n->firstpos = merge_DFASet (poset, n->u.p[0]->firstpos,
685 n->u.p[1]->firstpos);
686 n->lastpos = merge_DFASet (poset, n->u.p[0]->lastpos,
687 n->u.p[1]->lastpos);
688 n->u.p[0]->firstpos = rm_DFASet (poset, n->u.p[0]->firstpos);
689 n->u.p[0]->lastpos = rm_DFASet (poset, n->u.p[0]->lastpos);
690 n->u.p[1]->firstpos = rm_DFASet (poset, n->u.p[1]->firstpos);
691 n->u.p[1]->lastpos = rm_DFASet (poset, n->u.p[1]->lastpos);
692 if (debug_dfa_trav)
693 printf ("OR");
694 break;
695 case PLUS:
696 dfa_trav (parse_info, n->u.p[0]);
697 n->nullable = n->u.p[0]->nullable;
698 n->firstpos = n->u.p[0]->firstpos;
699 n->lastpos = n->u.p[0]->lastpos;
700 add_follow (parse_info, n->lastpos, n->firstpos);
701 if (debug_dfa_trav)
702 printf ("PLUS");
703 break;
704 case STAR:
705 dfa_trav (parse_info, n->u.p[0]);
706 n->nullable = 1;
707 n->firstpos = n->u.p[0]->firstpos;
708 n->lastpos = n->u.p[0]->lastpos;
709 add_follow (parse_info, n->lastpos, n->firstpos);
710 if (debug_dfa_trav)
711 printf ("STAR");
712 break;
713 case EPSILON:
714 n->nullable = 1;
715 n->lastpos = mk_DFASet (poset);
716 n->firstpos = mk_DFASet (poset);
717 if (debug_dfa_trav)
718 printf ("EPSILON");
719 break;
720 default:
721 posar[n->pos] = n;
722 n->nullable = 0;
723 n->firstpos = mk_DFASet (poset);
724 n->firstpos = add_DFASet (poset, n->firstpos, n->pos);
725 n->lastpos = mk_DFASet (poset);
726 n->lastpos = add_DFASet (poset, n->lastpos, n->pos);
727 if (debug_dfa_trav)
728 {
729 if (n->u.ch[0] < 0)
730 printf ("#%d (n#%d)", -n->u.ch[0], -n->u.ch[1]);
731 else if (n->u.ch[1] > n->u.ch[0])
732 {
733 putchar ('[');
734 out_char (n->u.ch[0]);
735 if (n->u.ch[1] > n->u.ch[0]+1)
736 putchar ('-');
737 out_char (n->u.ch[1]);
738 putchar (']');
739 }
740 else
741 out_char (n->u.ch[0]);
742 }
743 }
744 if (debug_dfa_trav)
745 {
746 printf ("\n nullable : %c\n", n->nullable ? '1' : '0');
747 printf (" firstpos :");
748 pr_DFASet (poset, n->firstpos);
749 printf (" lastpos :");
750 pr_DFASet (poset, n->lastpos);
751 }
752}
753
754static void init_followpos (struct DFA_parse *parse_info)
755{
756 DFASet *fa;
757 int i;
758 parse_info->followpos = fa =
759 (DFASet *) imalloc (sizeof(DFASet) * (1+parse_info->position));
760 for (i = parse_info->position+1; --i >= 0; fa++)
761 *fa = mk_DFASet (parse_info->poset);
762}
763
764static void mk_dfa_tran (struct DFA_parse *parse_info, struct DFA_states *dfas)
765{
766 int i, j, c;
767 int max_char;
768 short *pos, *pos_i;
770 int char_0, char_1;
771 struct DFA_state *dfa_from, *dfa_to;
772 struct Tnode **posar;
773 DFASetType poset;
774 DFASet *followpos;
775
776 assert (parse_info);
777
778 posar = parse_info->posar;
779 max_char = parse_info->charset->size;
780 pos = (short *) imalloc (sizeof(*pos) * (parse_info->position+1));
781
782 tran_set = cp_DFASet (parse_info->poset, parse_info->root->firstpos);
784 assert (i);
785 dfa_from->rule_no = 0;
786 poset = parse_info->poset;
787 followpos = parse_info->followpos;
788 while ((dfa_from = get_DFA_state (dfas)))
789 {
790 pos_i = pos;
791 j = i = 0;
792 for (tran_set = dfa_from->set; tran_set; tran_set = tran_set->next)
793 if ((c = posar[tran_set->value]->u.ch[0]) >= 0 && c <= max_char)
794 *pos_i++ = tran_set->value;
795 else if (c < 0)
796 {
797 if (i == 0 || c > i)
798 i = c;
799 c = posar[tran_set->value]->u.ch[1];
800 if (j == 0 || c > j)
801 j = c;
802 }
803 *pos_i = -1;
804 dfa_from->rule_no = -i;
805 dfa_from->rule_nno = -j;
806
807 for (char_1 = 0; char_1 <= max_char; char_1++)
808 {
809 char_0 = max_char+1;
810 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
811 if (posar[i]->u.ch[1] >= char_1
812 && (c=posar[i]->u.ch[0]) < char_0)
813 {
814 if (c < char_1)
815 char_0 = char_1;
816 else
817 char_0 = c;
818 }
819
820 if (char_0 > max_char)
821 break;
822
824
825 tran_set = mk_DFASet (poset);
826 for (pos_i = pos; (i = *pos_i) != -1; ++pos_i)
827 {
828 if ((c=posar[i]->u.ch[0]) > char_0 && c <= char_1)
829 char_1 = c - 1; /* forward chunk */
830 else if ((c=posar[i]->u.ch[1]) >= char_0 && c < char_1)
831 char_1 = c; /* backward chunk */
832 if (posar[i]->u.ch[1] >= char_0 && posar[i]->u.ch[0] <= char_0)
833 tran_set = union_DFASet (poset, tran_set, followpos[i]);
834 }
835 if (tran_set)
836 {
839 }
840 }
841 }
842 ifree (pos);
844}
845
846static void pr_tran (struct DFA_parse *parse_info, struct DFA_states *dfas)
847{
848 struct DFA_state *s;
849 struct DFA_tran *tran;
850 int prev_no, i, c, no;
851
852 for (no=0; no < dfas->no; ++no)
853 {
854 s = dfas->sortarray[no];
855 assert (s->no == no);
856 printf ("DFA state %d", no);
857 if (s->rule_no)
858 printf ("#%d", s->rule_no);
859 if (s->rule_nno)
860 printf (" #%d", s->rule_nno);
861
862 putchar (':');
863 pr_DFASet (parse_info->poset, s->set);
864 prev_no = -1;
865 for (i=s->tran_no, tran=s->trans; --i >= 0; tran++)
866 {
867 if (prev_no != tran->to)
868 {
869 if (prev_no != -1)
870 printf ("]\n");
871 printf (" goto %d on [", tran->to);
872 prev_no = tran->to;
873 }
874 for (c = tran->ch[0]; c <= tran->ch[1]; c++)
875 printf ("%s", str_char(c));
876 }
877 if (prev_no != -1)
878 printf ("]\n");
879 putchar ('\n');
880 }
881}
882
883static void pr_verbose (struct DFA_parse *parse_info, struct DFA_states *dfas)
884{
885 long i, j;
886 int k;
887 printf ("%d/%d tree nodes used, %ld bytes each\n",
888 parse_info->use_Tnode, parse_info->max_Tnode, (long) sizeof (struct Tnode));
889 k = inf_BSetHandle (parse_info->charset, &i, &j);
890 printf ("%ld/%ld character sets, %ld bytes each\n",
891 i/k, j/k, (long) k*sizeof(BSetWord));
892 k = inf_DFASetType (parse_info->poset, &i, &j);
893 printf ("%ld/%ld poset items, %d bytes each\n", i, j, k);
894 printf ("%d DFA states\n", dfas->no);
895}
896
897static void pr_followpos (struct DFA_parse *parse_info)
898{
899 struct Tnode **posar = parse_info->posar;
900 int i;
901 printf ("\nfollowsets:\n");
902 for (i=1; i <= parse_info->position; i++)
903 {
904 printf ("%3d:", i);
905 pr_DFASet (parse_info->poset, parse_info->followpos[i]);
906 putchar ('\t');
907
908 if (posar[i]->u.ch[0] < 0)
909 printf ("#%d", -posar[i]->u.ch[0]);
910 else if (posar[i]->u.ch[1] > posar[i]->u.ch[0])
911 {
912 putchar ('[');
913 out_char (posar[i]->u.ch[0]);
914 if (posar[i]->u.ch[1] > posar[i]->u.ch[0]+1)
915 putchar ('-');
916 out_char (posar[i]->u.ch[1]);
917 putchar (']');
918 }
919 else
920 out_char (posar[i]->u.ch[0]);
921 putchar ('\n');
922 }
923 putchar ('\n');
924}
925
927{
928 struct DFA_parse *dfa = d->parse_info;
929
930 assert (dfa);
931 if (!dfa->charMap)
932 {
933 dfa->charMapSize = 7;
934 dfa->charMap = (int *)
935 imalloc (dfa->charMapSize * sizeof(*dfa->charMap));
936 }
937 dfa->charMap[0] = 0;
938}
939
940void dfa_parse_cmap_new (struct DFA *d, const int *cmap)
941{
942 struct DFA_parse *dfa = d->parse_info;
943 const int *cp;
944 int size;
945
946 assert (dfa);
947 for (cp = cmap; *cp; cp += 2)
948 ;
949 size = cp - cmap + 1;
950 if (size > dfa->charMapSize)
951 {
952 if (dfa->charMap)
953 ifree (dfa->charMap);
954 dfa->charMapSize = size;
955 dfa->charMap = (int *) imalloc (size * sizeof(*dfa->charMap));
956 }
957 memcpy (dfa->charMap, cmap, size * sizeof(*dfa->charMap));
958}
959
960void dfa_parse_cmap_del (struct DFA *d, int from)
961{
962 struct DFA_parse *dfa = d->parse_info;
963 int *cc;
964
965 assert (dfa);
966 for (cc = dfa->charMap; *cc; cc += 2)
967 if (*cc == from)
968 {
969 while ((cc[0] = cc[2]))
970 {
971 cc[1] = cc[3];
972 cc += 2;
973 }
974 break;
975 }
976}
977
978void dfa_parse_cmap_add (struct DFA *d, int from, int to)
979{
980 struct DFA_parse *dfa = d->parse_info;
981 int *cc;
982 int indx, size;
983
984 assert (dfa);
985 for (cc = dfa->charMap; *cc; cc += 2)
986 if (*cc == from)
987 {
988 cc[1] = to;
989 return ;
990 }
991 indx = cc - dfa->charMap;
992 size = dfa->charMapSize;
993 if (indx >= size)
994 {
995 int *cn = (int *) imalloc ((size+16) * sizeof(*dfa->charMap));
996 memcpy (cn, dfa->charMap, indx*sizeof(*dfa->charMap));
997 ifree (dfa->charMap);
998 dfa->charMap = cn;
999 dfa->charMapSize = size+16;
1000 }
1001 dfa->charMap[indx] = from;
1002 dfa->charMap[indx+1] = to;
1003 dfa->charMap[indx+2] = 0;
1004}
1005
1007{
1008 static int thompson_chars[] =
1009 {
1010 '*', L_CLOS0,
1011 '+', L_CLOS1,
1012 '|', L_ALT,
1013 '^', L_START,
1014 '$', L_END,
1015 '?', L_QUEST,
1016 '.', L_ANY,
1017 '(', L_LP,
1018 ')', L_RP,
1019 ' ', 0,
1020 '\t', 0,
1021 '\n', 0,
1022 0
1023 };
1024 dfa_parse_cmap_new (d, thompson_chars);
1025}
1026
1027static struct DFA_parse *dfa_parse_init (void)
1028{
1029 struct DFA_parse *parse_info =
1030 (struct DFA_parse *) imalloc (sizeof (struct DFA_parse));
1031
1032 parse_info->charset = mk_BSetHandle (255, 20);
1033 parse_info->position = 0;
1034 parse_info->rule = 0;
1035 parse_info->root = NULL;
1036
1037 /* initialize the anyset which by default does not include \n */
1038 parse_info->anyset = mk_BSet (&parse_info->charset);
1039 res_BSet (parse_info->charset, parse_info->anyset);
1040 add_BSet (parse_info->charset, parse_info->anyset, '\n');
1041 com_BSet (parse_info->charset, parse_info->anyset);
1042
1043 parse_info->use_Tnode = parse_info->max_Tnode = 0;
1044 parse_info->start = parse_info->end = NULL;
1045 parse_info->charMap = NULL;
1046 parse_info->charMapSize = 0;
1047 parse_info->cmap = NULL;
1048 return parse_info;
1049}
1050
1051static void rm_dfa_parse (struct DFA_parse **dfap)
1052{
1053 struct DFA_parse *parse_info = *dfap;
1054 assert (parse_info);
1055 term_Tnode(parse_info);
1056 rm_BSetHandle (&parse_info->charset);
1057 ifree (parse_info->charMap);
1058 ifree (parse_info);
1059 *dfap = NULL;
1060}
1061
1062static struct DFA_states *mk_dfas (struct DFA_parse *dfap, int poset_chunk)
1063{
1064 struct DFA_states *dfas;
1065 struct DFA_parse *parse_info = dfap;
1066
1067 assert (poset_chunk > 10);
1068 assert (dfap);
1069
1070 parse_info->poset = mk_DFASetType (poset_chunk);
1071 init_pos(parse_info);
1072 init_followpos(parse_info);
1073 assert (parse_info->root);
1074 dfa_trav (parse_info, parse_info->root);
1075
1077 pr_followpos(parse_info);
1078 init_DFA_states (&dfas, parse_info->poset, (int) (STATE_HASH));
1079 mk_dfa_tran (parse_info, dfas);
1080 if (debug_dfa_tran)
1081 {
1082 pr_tran (parse_info, dfas);
1083 }
1084 if (dfa_verbose)
1085 pr_verbose (parse_info, dfas);
1086 del_pos(parse_info);
1087 del_followpos(parse_info);
1088 rm_DFASetType (parse_info->poset);
1089 return dfas;
1090}
1091
1092struct DFA *dfa_init (void)
1093{
1094 struct DFA *dfa;
1095
1096 dfa = (struct DFA *) imalloc (sizeof(*dfa));
1097 dfa->parse_info = dfa_parse_init ();
1098 dfa->state_info = NULL;
1099 dfa->states = NULL;
1101 return dfa;
1102}
1103
1105{
1106 add_BSet (dfa->parse_info->charset, dfa->parse_info->anyset, '\n');
1107}
1108
1109void dfa_set_cmap (struct DFA *dfa, void *vp,
1110 const char **(*cmap)(void *vp, const char **from, int len))
1111{
1112 dfa->parse_info->cmap = cmap;
1113 dfa->parse_info->cmap_data = vp;
1114}
1115
1116int dfa_get_last_rule (struct DFA *dfa)
1117{
1118 return dfa->parse_info->rule;
1119}
1120
1121int dfa_parse (struct DFA *dfa, const char **pattern)
1122{
1123 struct Tnode *top;
1124 struct DFA_parse *parse_info;
1125
1126 assert (dfa);
1127 assert (dfa->parse_info);
1128 parse_info = dfa->parse_info;
1129
1130 do_parse (parse_info, pattern, &top);
1131 if (parse_info->err_code)
1132 return parse_info->err_code;
1133 if ( !dfa->parse_info->root )
1134 dfa->parse_info->root = top;
1135 else
1136 {
1137 struct Tnode *n;
1138
1139 n = mk_Tnode (parse_info);
1140 n->pos = OR;
1141 n->u.p[0] = dfa->parse_info->root;
1142 n->u.p[1] = top;
1143 dfa->parse_info->root = n;
1144 }
1145 return 0;
1146}
1147
1148void dfa_mkstate (struct DFA *dfa)
1149{
1150 assert (dfa);
1151
1153 dfa->no_states = dfa->state_info->no;
1154 dfa->states = dfa->state_info->sortarray;
1155 rm_dfa_parse (&dfa->parse_info);
1156}
1157
1158void dfa_delete (struct DFA **dfap)
1159{
1160 assert (dfap);
1161 assert (*dfap);
1162 if ((*dfap)->parse_info)
1163 rm_dfa_parse (&(*dfap)->parse_info);
1164 if ((*dfap)->state_info)
1165 rm_DFA_states (&(*dfap)->state_info);
1166 ifree (*dfap);
1167 *dfap = NULL;
1168}
1169/*
1170 * Local variables:
1171 * c-basic-offset: 4
1172 * c-file-style: "Stroustrup"
1173 * indent-tabs-mode: nil
1174 * End:
1175 * vim: shiftwidth=4 tabstop=8 expandtab
1176 */
1177
int trav_BSet(BSetHandle *sh, BSet src, unsigned member)
Definition bset.c:183
void add_BSet(BSetHandle *sh, BSet dst, unsigned member)
Definition bset.c:110
unsigned short BSetWord
Definition bset.h:27
void res_BSet(BSetHandle *sh, BSet dst)
Definition bset.c:135
BSetHandle * mk_BSetHandle(int size, int chunk)
Definition bset.c:37
BSet mk_BSet(BSetHandle **shp)
Definition bset.c:86
void rm_BSetHandle(BSetHandle **shp)
Definition bset.c:55
int inf_BSetHandle(BSetHandle *sh, long *used, long *alloc)
Definition bset.c:70
BSetWord * BSet
Definition bset.h:28
int travi_BSet(BSetHandle *sh, BSet src, unsigned member)
Definition bset.c:210
void com_BSet(BSetHandle *sh, BSet dst)
Definition bset.c:162
#define EPSILON
Definition dfa.c:41
#define L_END
Definition dfa.c:108
int dfa_get_last_rule(struct DFA *dfa)
Definition dfa.c:1116
static void del_followpos(struct DFA_parse *parse_info)
Definition dfa.c:622
static void init_followpos(struct DFA_parse *parse_info)
Definition dfa.c:754
static void add_follow(struct DFA_parse *parse_info, DFASet lastpos, DFASet firstpos)
Definition dfa.c:638
static void term_Tnode(struct DFA_parse *parse_info)
Definition dfa.c:552
#define PLUS
Definition dfa.c:40
void dfa_parse_cmap_add(struct DFA *d, int from, int to)
Definition dfa.c:978
void dfa_parse_cmap_thompson(struct DFA *d)
Definition dfa.c:1006
void dfa_parse_cmap_del(struct DFA *d, int from)
Definition dfa.c:960
static void mk_dfa_tran(struct DFA_parse *parse_info, struct DFA_states *dfas)
Definition dfa.c:764
static void pr_tran(struct DFA_parse *parse_info, struct DFA_states *dfas)
Definition dfa.c:846
static void lex(struct DFA_parse *parse_info)
Definition dfa.c:503
static int lex_sub(struct DFA_parse *parse_info)
Definition dfa.c:474
static struct DFA_parse * dfa_parse_init(void)
Definition dfa.c:1027
void dfa_anyset_includes_nl(struct DFA *dfa)
Definition dfa.c:1104
#define OR
Definition dfa.c:38
#define STATE_HASH
Definition dfa.c:63
#define L_CHARS
Definition dfa.c:100
#define L_LP
Definition dfa.c:97
static int read_charset(struct DFA_parse *parse_info)
Definition dfa.c:358
static struct Tnode * mk_Tnode_cset(struct DFA_parse *parse_info, BSet charset)
Definition dfa.c:584
#define TADD
Definition dfa.c:62
int dfa_parse(struct DFA *dfa, const char **pattern)
Definition dfa.c:1121
static const char * str_char(unsigned c)
Definition dfa.c:508
void dfa_mkstate(struct DFA *dfa)
Definition dfa.c:1148
#define L_CLOS0
Definition dfa.c:107
static struct DFA_states * mk_dfas(struct DFA_parse *dfap, int poset_chunk)
Definition dfa.c:1062
void dfa_parse_cmap_new(struct DFA *d, const int *cmap)
Definition dfa.c:940
static void del_pos(struct DFA_parse *parse_info)
Definition dfa.c:633
#define CAT
Definition dfa.c:37
#define L_CLOS1
Definition dfa.c:106
static struct Tnode * expr_2(struct DFA_parse *parse_info)
Definition dfa.c:138
#define STAR
Definition dfa.c:39
static int nextchar_set(struct DFA_parse *parse_info, int *esc)
Definition dfa.c:348
static void init_pos(struct DFA_parse *parse_info)
Definition dfa.c:627
static struct Tnode * expr_1(struct DFA_parse *parse_info)
Definition dfa.c:117
static void out_char(int c)
Definition dfa.c:547
static int nextchar(struct DFA_parse *parse_info, int *esc)
Definition dfa.c:314
static void pr_verbose(struct DFA_parse *parse_info, struct DFA_states *dfas)
Definition dfa.c:883
#define L_ANY
Definition dfa.c:101
static struct Tnode * expr_4(struct DFA_parse *parse_info)
Definition dfa.c:198
#define L_WILD
Definition dfa.c:104
#define L_ANYZ
Definition dfa.c:103
void dfa_parse_cmap_clean(struct DFA *d)
Definition dfa.c:926
#define L_CHAR
Definition dfa.c:99
static void do_parse(struct DFA_parse *parse_info, const char **s, struct Tnode **tnp)
Definition dfa.c:246
int debug_dfa_followpos
Definition dfa.c:68
static struct Tnode * expr_3(struct DFA_parse *parse_info)
Definition dfa.c:163
static int map_l_char(struct DFA_parse *parse_info)
Definition dfa.c:450
int debug_dfa_trav
Definition dfa.c:66
static void dfa_trav(struct DFA_parse *parse_info, struct Tnode *n)
Definition dfa.c:650
#define L_START
Definition dfa.c:109
void dfa_delete(struct DFA **dfap)
Definition dfa.c:1158
struct DFA * dfa_init(void)
Definition dfa.c:1092
#define POSET_CHUNK
Definition dfa.c:64
int dfa_verbose
Definition dfa.c:69
#define L_QUEST
Definition dfa.c:105
static struct Tnode * mk_Tnode(struct DFA_parse *parse_info)
Definition dfa.c:563
int debug_dfa_tran
Definition dfa.c:67
void dfa_set_cmap(struct DFA *dfa, void *vp, const char **(*cmap)(void *vp, const char **from, int len))
Definition dfa.c:1109
#define L_ALT
Definition dfa.c:102
#define L_RP
Definition dfa.c:98
static void pr_followpos(struct DFA_parse *parse_info)
Definition dfa.c:897
static void rm_dfa_parse(struct DFA_parse **dfap)
Definition dfa.c:1051
#define DFA_ERR_LP
Definition dfa.h:98
#define DFA_ERR_RP
Definition dfa.h:99
#define DFA_ERR_SYNTAX
Definition dfa.h:97
void sort_DFA_states(struct DFA_states *dfas)
Definition states.c:188
void add_DFA_tran(struct DFA_states *, struct DFA_state *, int, int, int)
Definition states.c:144
int rm_DFA_states(struct DFA_states **dfasp)
Definition states.c:70
int add_DFA_state(struct DFA_states *dfas, DFASet *s, struct DFA_state **sp)
Definition states.c:98
struct DFA_state * get_DFA_state(struct DFA_states *dfas)
Definition states.c:174
int init_DFA_states(struct DFA_states **dfasp, DFASetType st, int hash)
Definition states.c:36
DFASetType mk_DFASetType(int chunk)
Definition set.c:35
DFASet add_DFASet(DFASetType st, DFASet s, int value)
Definition set.c:135
DFASet cp_DFASet(DFASetType st, DFASet s)
Definition set.c:189
void pr_DFASet(DFASetType st, DFASet s)
Definition set.c:227
DFASet mk_DFASet(DFASetType st)
Definition set.c:73
DFASet merge_DFASet(DFASetType st, DFASet s1, DFASet s2)
Definition set.c:194
DFASet union_DFASet(DFASetType st, DFASet s1, DFASet s2)
Definition set.c:152
DFASet rm_DFASet(DFASetType st, DFASet s)
Definition set.c:115
int inf_DFASetType(DFASetType st, long *used, long *allocated)
Definition set.c:50
DFASetType rm_DFASetType(DFASetType st)
Definition set.c:61
void ifree(void *p)
Definition imalloc.c:93
void * imalloc(size_t size)
Definition imalloc.c:43
unsigned size
Definition bset.h:31
struct DFASetElement_ * next
Definition dfaset.h:28
int rule
Definition dfap.h:34
BSet anyset
Definition dfap.h:36
const char **(* cmap)(void *vp, const char **from, int len)
Definition dfap.h:57
int max_Tnode
Definition dfap.h:38
struct Tnode * root
Definition dfap.h:32
unsigned look_ch
Definition dfap.h:45
int charMapSize
Definition dfap.h:42
BSet look_chars
Definition dfap.h:47
int lookahead
Definition dfap.h:46
int err_code
Definition dfap.h:48
struct Tblock * start
Definition dfap.h:39
BSetHandle * charset
Definition dfap.h:35
int position
Definition dfap.h:33
int * charMap
Definition dfap.h:41
DFASet * followpos
Definition dfap.h:55
DFASetType poset
Definition dfap.h:54
struct Tnode ** posar
Definition dfap.h:52
int inside_string
Definition dfap.h:49
struct Tblock * end
Definition dfap.h:40
void * cmap_data
Definition dfap.h:43
const unsigned char * expr_ptr
Definition dfap.h:50
int use_Tnode
Definition dfap.h:37
short rule_no
Definition dfa.h:49
short tran_no
Definition dfa.h:48
short rule_nno
Definition dfa.h:50
DFASet set
Definition dfa.h:46
struct DFA_tran * trans
Definition dfa.h:45
short no
Definition dfa.h:47
int no
Definition dfap.h:70
struct DFA_state ** sortarray
Definition dfap.h:74
Definition dfa.h:30
unsigned short to
Definition dfa.h:32
unsigned char ch[2]
Definition dfa.h:31
Definition dfa.h:53
struct DFA_states * state_info
Definition dfa.h:56
struct DFA_state ** states
Definition dfa.h:55
struct DFA_parse * parse_info
Definition dfa.h:57
int no_states
Definition dfa.h:54
Definition dfa.c:57
struct Tblock * next
Definition dfa.c:58
struct Tnode * tarray
Definition dfa.c:59
Definition dfa.c:43
unsigned pos
Definition dfa.c:51
short ch[2]
Definition dfa.c:46
DFASet firstpos
Definition dfa.c:53
struct Tnode * p[2]
Definition dfa.c:45
unsigned nullable
Definition dfa.c:52
DFASet lastpos
Definition dfa.c:54
union Tnode::@13 u