IDZEBRA 2.2.8
mod_grs_regx.c
Go to the documentation of this file.
1/* This file is part of the Zebra server.
2 Copyright (C) Index Data
3
4Zebra is free software; you can redistribute it and/or modify it under
5the terms of the GNU General Public License as published by the Free
6Software Foundation; either version 2, or (at your option) any later
7version.
8
9Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10WARRANTY; without even the implied warranty of MERCHANTABILITY or
11FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12for more details.
13
14You should have received a copy of the GNU General Public License
15along with this program; if not, write to the Free Software
16Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17
18*/
19
20#if HAVE_CONFIG_H
21#include <config.h>
22#endif
23#include <stdio.h>
24#include <stdlib.h>
25#include <assert.h>
26#include <string.h>
27#include <ctype.h>
28
29#include <yaz/tpath.h>
30#include <yaz/snprintf.h>
31#include <idzebra/util.h>
32#include <dfa.h>
33#include <idzebra/recgrs.h>
34
35#if HAVE_TCL_H
36#include <tcl.h>
37
38#if MAJOR_VERSION >= 8
39#define HAVE_TCL_OBJECTS
40#endif
41#endif
42
43#define REGX_DEBUG 0
44
45#define F_WIN_EOF 2000000000
46#define F_WIN_READ 1
47
48#define REGX_EOF 0
49#define REGX_PATTERN 1
50#define REGX_BODY 2
51#define REGX_BEGIN 3
52#define REGX_END 4
53#define REGX_CODE 5
54#define REGX_CONTEXT 6
55#define REGX_INIT 7
56
/* A code fragment attached to a rule action (REGX_CODE). */
57struct regxCode {
/* NUL-terminated private copy of the fragment text (see regxCodeMk) */
58 char *str;
59#if HAVE_TCL_OBJECTS
/* Tcl string object built from the same text; reference-counted */
60 Tcl_Obj *tcl_obj;
61#endif
62};
63
65 int which;
66 union {
67 struct {
68 struct DFA *dfa; /* REGX_PATTERN */
69 int body;
71 struct regxCode *code; /* REGX_CODE */
72 } u;
74};
75
77 int no;
79};
80
/* One rule in a context's singly linked rule list.
 * NOTE(review): the listing omits at least one member line here (code below
 * uses rp->info) - confirm against the full source. */
81struct lexRule {
/* next rule in the list, or NULL */
83 struct lexRule *next;
84};
85
99
101 int max;
102 char *buf;
103};
104
/* State of one regx filter specification plus the input window used while
 * scanning a record.
 * NOTE(review): several member lines are missing from this listing (code
 * below also uses dh, context, stream, d1_stack, arg_start, ...) - confirm
 * against the full source. */
105struct lexSpec {
/* heap copy of the filter name given to lexSpecCreate */
106 char *name;
108
112
/* NMEM handle used for node and attribute allocations (see execData) */
114 NMEM m;
116#if HAVE_TCL_H
117 Tcl_Interp *tcl_interp;
118#endif
/* stream callback; not called in the visible code - TODO confirm purpose */
120 off_t (*f_win_ef)(struct ZebraRecStream *s, off_t *);
121
122 int f_win_start; /* first byte of buffer is this file offset */
123 int f_win_end; /* last byte of buffer is this offset - 1 */
124 int f_win_size; /* size of buffer */
125 char *f_win_buf; /* buffer itself */
/* read callback: f_win_get uses it to fill f_win_buf */
126 int (*f_win_rf)(struct ZebraRecStream *, char *, size_t);
/* seek callback: f_win_get calls it with the requested start offset */
127 off_t (*f_win_sf)(struct ZebraRecStream *, off_t);
128
134
/* input position; cmd_tcl_unread rewinds it to a register start + offset */
138 int ptr;
139};
140
/* Holder pairing a lexSpec with a type string.
 * NOTE(review): not used in the visible part of the file; presumably the
 * filter instance data - confirm against the callers. */
141struct lexSpecs {
142 struct lexSpec *spec;
143 char type[256];
144};
145
146static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
147 int *size)
148{
149 int i, r, off = start_pos - spec->f_win_start;
150
151 if (off >= 0 && end_pos <= spec->f_win_end)
152 {
153 *size = end_pos - start_pos;
154 return spec->f_win_buf + off;
155 }
156 if (off < 0 || start_pos >= spec->f_win_end)
157 {
158 (*spec->f_win_sf)(spec->stream, start_pos);
159 spec->f_win_start = start_pos;
160
161 if (!spec->f_win_buf)
162 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
163 *size = (*spec->f_win_rf)(spec->stream, spec->f_win_buf,
164 spec->f_win_size);
165 spec->f_win_end = spec->f_win_start + *size;
166
167 if (*size > end_pos - start_pos)
168 *size = end_pos - start_pos;
169 return spec->f_win_buf;
170 }
171 for (i = 0; i<spec->f_win_end - start_pos; i++)
172 spec->f_win_buf[i] = spec->f_win_buf[i + off];
173 r = (*spec->f_win_rf)(spec->stream,
174 spec->f_win_buf + i,
175 spec->f_win_size - i);
176 spec->f_win_start = start_pos;
177 spec->f_win_end += r;
178 *size = i + r;
179 if (*size > end_pos - start_pos)
180 *size = end_pos - start_pos;
181 return spec->f_win_buf;
182}
183
184static int f_win_advance (struct lexSpec *spec, int *pos)
185{
186 int size;
187 char *buf;
188
189 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
190 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
191 if (*pos == F_WIN_EOF)
192 return 0;
193 buf = f_win_get (spec, *pos, *pos+1, &size);
194 if (size == 1)
195 {
196 (*pos)++;
197 return *buf;
198 }
199 *pos = F_WIN_EOF;
200 return 0;
201}
202
203static void regxCodeDel (struct regxCode **pp)
204{
205 struct regxCode *p = *pp;
206 if (p)
207 {
208#if HAVE_TCL_OBJECTS
209 if (p->tcl_obj)
210 Tcl_DecrRefCount (p->tcl_obj);
211#endif
212 xfree (p->str);
213 xfree (p);
214 *pp = NULL;
215 }
216}
217
218static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
219{
220 struct regxCode *p;
221
222 p = (struct regxCode *) xmalloc (sizeof(*p));
223 p->str = (char *) xmalloc (len+1);
224 memcpy (p->str, buf, len);
225 p->str[len] = '\0';
226#if HAVE_TCL_OBJECTS
227 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
228 if (p->tcl_obj)
229 Tcl_IncrRefCount (p->tcl_obj);
230#endif
231 *pp = p;
232}
233
/*
 * lexSpecDFA: create a DFA whose parser character map has the entries for
 * space and tab removed and an entry for '/' added with value 0.
 * Presumably this makes blanks ordinary pattern characters and lets '/'
 * terminate a pattern - confirm against the dfa module.
 */
static struct DFA *lexSpecDFA (void)
{
    struct DFA *res = dfa_init ();

    dfa_parse_cmap_del (res, ' ');
    dfa_parse_cmap_del (res, '\t');
    dfa_parse_cmap_add (res, '/', 0);
    return res;
}
244
245static void actionListDel (struct lexRuleAction **rap)
246{
247 struct lexRuleAction *ra1, *ra;
248
249 for (ra = *rap; ra; ra = ra1)
250 {
251 ra1 = ra->next;
252 switch (ra->which)
253 {
254 case REGX_PATTERN:
255 dfa_delete (&ra->u.pattern.dfa);
256 break;
257 case REGX_CODE:
258 regxCodeDel (&ra->u.code);
259 break;
260 }
261 xfree (ra);
262 }
263 *rap = NULL;
264}
265
266static struct lexContext *lexContextCreate (const char *name)
267{
268 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
269
270 p->name = xstrdup (name);
271 p->ruleNo = 1;
272 p->initFlag = 0;
273 p->dfa = lexSpecDFA ();
274 p->rules = NULL;
275 p->fastRule = NULL;
276 p->beginActionList = NULL;
277 p->endActionList = NULL;
278 p->initActionList = NULL;
279 p->next = NULL;
280 return p;
281}
282
/*
 * lexContextDestroy: release a context - its DFA, the fastRule table, each
 * node of the rule list, the name and the context itself.
 *
 * NOTE(review): the embedded listing numbers skip 292 and 295-297; the
 * per-rule and per-context action lists are presumably released on those
 * lines - confirm against the full source before editing this function.
 */
283static void lexContextDestroy (struct lexContext *p)
284{
285 struct lexRule *rp, *rp1;
286
287 dfa_delete (&p->dfa);
288 xfree (p->fastRule);
289 for (rp = p->rules; rp; rp = rp1)
290 {
/* fetch the successor before rp is freed */
291 rp1 = rp->next;
293 xfree (rp);
294 }
298 xfree (p->name);
299 xfree (p);
300}
301
302static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
303{
304 struct lexSpec *p;
305 int i;
306
307 p = (struct lexSpec *) xmalloc (sizeof(*p));
308 p->name = (char *) xmalloc (strlen(name)+1);
309 strcpy (p->name, name);
310
311#if HAVE_TCL_H
312 p->tcl_interp = 0;
313#endif
314 p->dh = dh;
315 p->context = NULL;
316 p->context_stack_size = 100;
317 p->context_stack = (struct lexContext **)
318 xmalloc (sizeof(*p->context_stack) * p->context_stack_size);
319 p->f_win_buf = NULL;
320
321 p->maxLevel = 128;
322 p->concatBuf = (struct lexConcatBuf *)
323 xmalloc (sizeof(*p->concatBuf) * p->maxLevel);
324 for (i = 0; i < p->maxLevel; i++)
325 {
326 p->concatBuf[i].max = 0;
327 p->concatBuf[i].buf = 0;
328 }
329 p->d1_stack = (data1_node **) xmalloc (sizeof(*p->d1_stack) * p->maxLevel);
330 p->d1_level = 0;
331 return p;
332}
333
/*
 * lexSpecDestroy: release the specification *pp and everything it owns
 * (concatenation buffers, Tcl interpreter when present, name, window
 * buffer, context stack, data1 stack), then set *pp to NULL.
 * pp itself must be non-NULL; a NULL *pp is accepted and ignored.
 *
 * NOTE(review): the embedded listing numbers skip 353 inside the context
 * loop; presumably each context is destroyed there - confirm against the
 * full source.
 */
334static void lexSpecDestroy (struct lexSpec **pp)
335{
336 struct lexSpec *p;
337 struct lexContext *lt;
338 int i;
339
340 assert (pp);
341 p = *pp;
342 if (!p)
343 return ;
344
345 for (i = 0; i < p->maxLevel; i++)
346 xfree (p->concatBuf[i].buf);
347 xfree (p->concatBuf);
348
349 lt = p->context;
350 while (lt)
351 {
/* read the link first so the walk survives release of lt */
352 struct lexContext *lt_next = lt->next;
354 lt = lt_next;
355 }
/* NOTE(review): tcl_interp is declared under HAVE_TCL_H but released under
   HAVE_TCL_OBJECTS - confirm the two guards are meant to differ */
356#if HAVE_TCL_OBJECTS
357 if (p->tcl_interp)
358 Tcl_DeleteInterp (p->tcl_interp);
359#endif
360 xfree (p->name);
361 xfree (p->f_win_buf);
362 xfree (p->context_stack);
363 xfree (p->d1_stack);
364 xfree (p);
365 *pp = NULL;
366}
367
368static int readParseToken (const char **cpp, int *len)
369{
370 const char *cp = *cpp;
371 char cmd[32];
372 int i, level;
373
374 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
375 cp++;
376 switch (*cp)
377 {
378 case '\0':
379 return 0;
380 case '/':
381 *cpp = cp+1;
382 return REGX_PATTERN;
383 case '{':
384 *cpp = cp+1;
385 level = 1;
386 while (*++cp)
387 {
388 if (*cp == '{')
389 level++;
390 else if (*cp == '}')
391 {
392 level--;
393 if (level == 0)
394 break;
395 }
396 }
397 *len = cp - *cpp;
398 return REGX_CODE;
399 default:
400 i = 0;
401 while (1)
402 {
403 if (*cp >= 'a' && *cp <= 'z')
404 cmd[i] = *cp;
405 else if (*cp >= 'A' && *cp <= 'Z')
406 cmd[i] = *cp + 'a' - 'A';
407 else
408 break;
409 if (i < (int) sizeof(cmd)-2)
410 i++;
411 cp++;
412 }
413 cmd[i] = '\0';
414 if (i == 0)
415 {
416 yaz_log (YLOG_WARN, "bad character %d %c", *cp, *cp);
417 cp++;
418 while (*cp && *cp != ' ' && *cp != '\t' &&
419 *cp != '\n' && *cp != '\r')
420 cp++;
421 *cpp = cp;
422 return 0;
423 }
424 *cpp = cp;
425 if (!strcmp (cmd, "begin"))
426 return REGX_BEGIN;
427 else if (!strcmp (cmd, "end"))
428 return REGX_END;
429 else if (!strcmp (cmd, "body"))
430 return REGX_BODY;
431 else if (!strcmp (cmd, "context"))
432 return REGX_CONTEXT;
433 else if (!strcmp (cmd, "init"))
434 return REGX_INIT;
435 else
436 {
437 yaz_log (YLOG_WARN, "bad command %s", cmd);
438 return 0;
439 }
440 }
441}
442
443static int actionListMk (struct lexSpec *spec, const char *s,
444 struct lexRuleAction **ap)
445{
446 int r, tok, len;
447 int bodyMark = 0;
448 const char *s0;
449
450 while ((tok = readParseToken (&s, &len)))
451 {
452 switch (tok)
453 {
454 case REGX_BODY:
455 bodyMark = 1;
456 continue;
457 case REGX_CODE:
458 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
459 (*ap)->which = tok;
460 regxCodeMk (&(*ap)->u.code, s, len);
461 s += len+1;
462 break;
463 case REGX_PATTERN:
464 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
465 (*ap)->which = tok;
466 (*ap)->u.pattern.body = bodyMark;
467 bodyMark = 0;
468 (*ap)->u.pattern.dfa = lexSpecDFA ();
469 s0 = s;
470 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
471 if (r || *s != '/')
472 {
473 int pos = s - s0;
474 xfree (*ap);
475 *ap = NULL;
476 yaz_log(YLOG_WARN, "regular expression error '%.*s'", pos, s0);
477 return -1;
478 }
479 else
480 {
481 int pos = s - s0;
482 if (debug_dfa_tran)
483 printf("pattern: %.*s\n", pos, s0);
484 dfa_mkstate((*ap)->u.pattern.dfa);
485 s++;
486 }
487 break;
488 case REGX_BEGIN:
489 yaz_log (YLOG_WARN, "cannot use BEGIN here");
490 continue;
491 case REGX_INIT:
492 yaz_log (YLOG_WARN, "cannot use INIT here");
493 continue;
494 case REGX_END:
495 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
496 (*ap)->which = tok;
497 break;
498 }
499 ap = &(*ap)->next;
500 }
501 *ap = NULL;
502 return 0;
503}
504
/*
 * readOneSpec: parse one logical line `s` of a filter specification and
 * add its content to `spec`.
 *
 * A line is either "CONTEXT {name}", which creates a new context and makes
 * it the head of spec->context, or a BEGIN / END / INIT action list, or a
 * /pattern/ followed by an action list, all of which apply to the current
 * head context ("main" is created when no context exists yet).
 *
 * Returns -1 when the rule pattern fails to parse or is not terminated by
 * '/', otherwise 0 - including for a CONTEXT line that lacks a name.
 * The return value of actionListMk is not examined.
 *
 * NOTE(review): the embedded listing numbers skip 536, 540 and 544, just
 * before each actionListMk call; presumably the previous list is released
 * there - confirm against the full source.
 */
505int readOneSpec (struct lexSpec *spec, const char *s)
506{
507 int len, r, tok;
508 struct lexRule *rp;
509 struct lexContext *lc;
510
511 tok = readParseToken (&s, &len);
512 if (tok == REGX_CONTEXT)
513 {
514 char context_name[32];
515 tok = readParseToken (&s, &len);
516 if (tok != REGX_CODE)
517 {
518 yaz_log (YLOG_WARN, "missing name after CONTEXT keyword");
519 return 0;
520 }
/* the name is silently truncated to fit context_name */
521 if (len > 31)
522 len = 31;
523 memcpy (context_name, s, len);
524 context_name[len] = '\0';
525 lc = lexContextCreate (context_name);
/* the new context becomes the head, i.e. the target of following lines */
526 lc->next = spec->context;
527 spec->context = lc;
528 return 0;
529 }
530 if (!spec->context)
531 spec->context = lexContextCreate ("main");
532
533 switch (tok)
534 {
535 case REGX_BEGIN:
537 actionListMk (spec, s, &spec->context->beginActionList);
538 break;
539 case REGX_END:
541 actionListMk (spec, s, &spec->context->endActionList);
542 break;
543 case REGX_INIT:
545 actionListMk (spec, s, &spec->context->initActionList);
546 break;
547 case REGX_PATTERN:
548#if REGX_DEBUG
549 yaz_log (YLOG_LOG, "rule %d %s", spec->context->ruleNo, s);
550#endif
/* the rule pattern is parsed into the context's shared DFA */
551 r = dfa_parse (spec->context->dfa, &s);
552 if (r)
553 {
554 yaz_log (YLOG_WARN, "regular expression error. r=%d", r);
555 return -1;
556 }
557 if (*s != '/')
558 {
559 yaz_log (YLOG_WARN, "expects / at end of pattern. got %c", *s);
560 return -1;
561 }
562 s++;
/* number the rule and push it on the front of the context's rule list */
563 rp = (struct lexRule *) xmalloc (sizeof(*rp));
564 rp->info.no = spec->context->ruleNo++;
565 rp->next = spec->context->rules;
566 spec->context->rules = rp;
567 actionListMk (spec, s, &rp->info.actionList);
568 }
569 return 0;
570}
571
/*
 * readFileSpec: load the filter specification file for `spec`.
 *
 * The file is "<name>.tflt" when a Tcl interpreter is attached and that
 * file can be opened, otherwise "<name>.flt"; both are opened through
 * data1_path_fopen. Each logical line is handed to readOneSpec; afterwards
 * every context gets its fastRule table (indexed by rule number) and its
 * DFA states are built.
 *
 * Returns -1 when no file can be opened, otherwise 0.
 */
572int readFileSpec (struct lexSpec *spec)
573{
574 struct lexContext *lc;
/* NOTE(review): `errors` is never incremented in the visible code (the
   result of readOneSpec is ignored), so the check at the end cannot fire */
575 int c, i, errors = 0;
576 FILE *spec_inf = 0;
577 WRBUF lineBuf;
578 char fname[256];
579
580#if HAVE_TCL_H
581 if (spec->tcl_interp)
582 {
583 yaz_snprintf(fname, sizeof(fname), "%s.tflt", spec->name);
584 spec_inf = data1_path_fopen (spec->dh, fname, "r");
585 }
586#endif
587 if (!spec_inf)
588 {
589 yaz_snprintf(fname, sizeof(fname), "%s.flt", spec->name);
590 spec_inf = data1_path_fopen (spec->dh, fname, "r");
591 }
592 if (!spec_inf)
593 {
/* note: the message shows the bare spec name, not the file name tried */
594 yaz_log(YLOG_ERRNO|YLOG_WARN, "cannot read spec file %s", spec->name);
595 return -1;
596 }
597 yaz_log(YLOG_LOG, "reading regx filter %s", fname);
598#if HAVE_TCL_H
599 if (spec->tcl_interp)
600 yaz_log(YLOG_LOG, "Tcl enabled");
601#endif
602
603#if 0
604 debug_dfa_trav = 0;
605 debug_dfa_tran = 1;
607 dfa_verbose = 1;
608#endif
609
610 lineBuf = wrbuf_alloc();
611 spec->lineNo = 0;
612 c = getc (spec_inf);
613 while (c != EOF)
614 {
615 wrbuf_rewind (lineBuf);
/* a line starting with '#' or white space at this point is skipped whole */
616 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
617 {
618 while (c != '\n' && c != EOF)
619 c = getc (spec_inf);
620 spec->lineNo++;
621 if (c == '\n')
622 c = getc (spec_inf);
623 }
624 else
625 {
626 int addLine = 0;
627
/* gather one logical line: a physical line plus every following line that
   starts with a space or tab (continuation); '\r' is dropped */
628 while (1)
629 {
630 int c1 = c;
631 wrbuf_putc(lineBuf, c);
632 c = getc (spec_inf);
633 while (c == '\r')
634 c = getc (spec_inf);
635 if (c == EOF)
636 break;
637 if (c1 == '\n')
638 {
639 if (c != ' ' && c != '\t')
640 break;
641 addLine++;
642 }
643 }
644 wrbuf_putc(lineBuf, '\0');
645 readOneSpec (spec, wrbuf_buf(lineBuf));
646 spec->lineNo += addLine;
647 }
648 }
649 fclose (spec_inf);
650 wrbuf_destroy(lineBuf);
651
652 for (lc = spec->context; lc; lc = lc->next)
653 {
654 struct lexRule *rp;
/* rule numbers start at 1 and ruleNo is one past the last number used, so
   ruleNo slots cover every rule; slot 0 stays NULL */
655 lc->fastRule = (struct lexRuleInfo **)
656 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
657 for (i = 0; i < lc->ruleNo; i++)
658 lc->fastRule[i] = NULL;
659 for (rp = lc->rules; rp; rp = rp->next)
660 lc->fastRule[rp->info.no] = &rp->info;
661 dfa_mkstate (lc->dfa);
662 }
663 if (errors)
664 return -1;
665
666 return 0;
667}
668
669#if 0
670static struct lexSpec *curLexSpec = NULL;
671#endif
672
673static void execData (struct lexSpec *spec,
674 const char *ebuf, int elen, int formatted_text,
675 const char *attribute_str, int attribute_len)
676{
677 struct data1_node *res, *parent;
678 int org_len;
679
680 if (elen == 0) /* shouldn't happen, but it does! */
681 return ;
682#if REGX_DEBUG
683 if (elen > 80)
684 yaz_log (YLOG_LOG, "data(%d bytes) %.40s ... %.*s", elen,
685 ebuf, 40, ebuf + elen-40);
686 else if (elen == 1 && ebuf[0] == '\n')
687 {
688 yaz_log (YLOG_LOG, "data(new line)");
689 }
690 else if (elen > 0)
691 yaz_log (YLOG_LOG, "data(%d bytes) %.*s", elen, elen, ebuf);
692 else
693 yaz_log (YLOG_LOG, "data(%d bytes)", elen);
694#endif
695
696 if (spec->d1_level <= 1)
697 return;
698
699 parent = spec->d1_stack[spec->d1_level -1];
700 assert (parent);
701
702 if (attribute_str)
703 {
704 data1_xattr **ap;
705 res = parent;
706 if (res->which != DATA1N_tag)
707 return;
708 /* sweep through exising attributes.. */
709 for (ap = &res->u.tag.attributes; *ap; ap = &(*ap)->next)
710 if (strlen((*ap)->name) == attribute_len &&
711 !memcmp((*ap)->name, attribute_str, attribute_len))
712 break;
713 if (!*ap)
714 {
715 /* new attribute. Create it with name + value */
716 *ap = nmem_malloc(spec->m, sizeof(**ap));
717
718 (*ap)->name = nmem_malloc(spec->m, attribute_len+1);
719 memcpy((*ap)->name, attribute_str, attribute_len);
720 (*ap)->name[attribute_len] = '\0';
721
722 (*ap)->value = nmem_malloc(spec->m, elen+1);
723 memcpy((*ap)->value, ebuf, elen);
724 (*ap)->value[elen] = '\0';
725 (*ap)->next = 0;
726 }
727 else
728 {
729 /* append to value if attribute already exists */
730 char *nv = nmem_malloc(spec->m, elen + 1 + strlen((*ap)->value));
731 strcpy(nv, (*ap)->value);
732 memcpy (nv + strlen(nv), ebuf, elen);
733 nv[strlen(nv)+elen] = '\0';
734 (*ap)->value = nv;
735 }
736 }
737 else
738 {
739 if ((res = spec->d1_stack[spec->d1_level]) &&
740 res->which == DATA1N_data)
741 org_len = res->u.data.len;
742 else
743 {
744 org_len = 0;
745
746 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_data, parent);
747 res->u.data.what = DATA1I_text;
748 res->u.data.len = 0;
749 res->u.data.formatted_text = formatted_text;
750 res->u.data.data = 0;
751
752 if (spec->d1_stack[spec->d1_level])
753 spec->d1_stack[spec->d1_level]->next = res;
754 spec->d1_stack[spec->d1_level] = res;
755 }
756 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
757 {
758 char *old_buf, *new_buf;
759
760 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
761 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
762 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
763 {
764 memcpy (new_buf, old_buf, org_len);
765 xfree (old_buf);
766 }
767 spec->concatBuf[spec->d1_level].buf = new_buf;
768 }
769 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
770 res->u.data.len += elen;
771 }
772}
773
/*
 * execDataP: add plain data text (no attribute target) at the current
 * level; shorthand for execData with a NULL attribute.
 */
static void execDataP (struct lexSpec *spec,
                       const char *ebuf, int elen, int formatted_text)
{
    const char *no_attribute = 0;

    execData (spec, ebuf, elen, formatted_text, no_attribute, 0);
}
779
780static void tagDataRelease (struct lexSpec *spec)
781{
782 data1_node *res;
783
784 if ((res = spec->d1_stack[spec->d1_level]) &&
785 res->which == DATA1N_data &&
786 res->u.data.what == DATA1I_text)
787 {
788 assert (!res->u.data.data);
789 assert (res->u.data.len > 0);
790 if (res->u.data.len > DATA1_LOCALDATA)
791 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
792 else
793 res->u.data.data = res->lbuf;
794 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
795 res->u.data.len);
796 }
797}
798
/*
 * variantBegin: open a variant node of the given class/type with the given
 * value below the current position in the node stack.
 *
 * class_str/type_str are truncated to DATA1_MAX_SYMBOL-1 bytes and
 * value_str to DATA1_LOCALDATA-1 bytes. If a variant of the same type is
 * already open directly above in the stack, the stack is unwound to it so
 * the new variant replaces it.
 *
 * NOTE(review): the embedded listing numbers skip 831 and 835, so the
 * variant-type lookup call and the condition guarding the first block are
 * not visible here - confirm against the full source.
 */
799static void variantBegin (struct lexSpec *spec,
800 const char *class_str, int class_len,
801 const char *type_str, int type_len,
802 const char *value_str, int value_len)
803{
/* NOTE(review): this reads d1_stack[d1_level-1] before the d1_level == 0
   check below, i.e. d1_stack[-1] when the stack is empty */
804 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
805 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
806 data1_vartype *tp;
807 int i;
808 data1_node *res;
809
810 if (spec->d1_level == 0)
811 {
812 yaz_log (YLOG_WARN, "in variant begin. No record type defined");
813 return ;
814 }
815 if (class_len >= DATA1_MAX_SYMBOL)
816 class_len = DATA1_MAX_SYMBOL-1;
817 memcpy (tclass, class_str, class_len);
818 tclass[class_len] = '\0';
819
820 if (type_len >= DATA1_MAX_SYMBOL)
821 type_len = DATA1_MAX_SYMBOL-1;
822 memcpy (ttype, type_str, type_len);
823 ttype[type_len] = '\0';
824
825#if REGX_DEBUG
826 yaz_log (YLOG_LOG, "variant begin(%s,%s,%d)", tclass, ttype,
827 spec->d1_level);
828#endif
829
830 if (!(tp =
832 tclass, ttype)))
833 return;
834
836 {
837 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
838 if (spec->d1_stack[spec->d1_level])
839 tagDataRelease (spec);
840 spec->d1_stack[spec->d1_level] = res;
841 spec->d1_stack[++(spec->d1_level)] = NULL;
842 }
/* look up through the variant nodes on top of the stack for one of the
   same type; if found, drop back to its level */
843 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
844 if (spec->d1_stack[i]->u.variant.type == tp)
845 {
846 spec->d1_level = i;
847 break;
848 }
849
850#if REGX_DEBUG
851 yaz_log (YLOG_LOG, "variant node(%d)", spec->d1_level);
852#endif
853 parent = spec->d1_stack[spec->d1_level-1];
854 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
855 res->u.variant.type = tp;
856
857 if (value_len >= DATA1_LOCALDATA)
858 value_len =DATA1_LOCALDATA-1;
859 memcpy (res->lbuf, value_str, value_len);
860 res->lbuf[value_len] = '\0';
861
/* the value lives in the node's own local buffer */
862 res->u.variant.value = res->lbuf;
863
864 if (spec->d1_stack[spec->d1_level])
865 tagDataRelease (spec);
866 spec->d1_stack[spec->d1_level] = res;
867 spec->d1_stack[++(spec->d1_level)] = NULL;
868}
869
/*
 * tagStrip: trim leading and trailing white space from the byte range
 * (*tag, *len) in place, by advancing *tag and shrinking *len. The bytes
 * themselves are not modified and the range need not be NUL-terminated.
 *
 * A NULL *tag with *len == 0 is accepted and left unchanged.
 */
static void tagStrip (const char **tag, int *len)
{
    int i;

    /* <ctype.h> functions need an unsigned char value: a plain char with
       the high bit set would be passed as a negative int */
    for (i = *len; i > 0 && isspace((unsigned char) (*tag)[i-1]); --i)
        ;
    *len = i;
    for (i = 0; i < *len && isspace((unsigned char) (*tag)[i]); i++)
        ;
    if (i > 0)
    {
        *tag += i;
        *len -= i;
    }
}
882
883static void tagBegin (struct lexSpec *spec,
884 const char *tag, int len)
885{
886 if (spec->d1_level == 0)
887 {
888 yaz_log (YLOG_WARN, "in element begin. No record type defined");
889 return ;
890 }
891 tagStrip (&tag, &len);
892 if (spec->d1_stack[spec->d1_level])
893 tagDataRelease (spec);
894
895#if REGX_DEBUG
896 yaz_log (YLOG_LOG, "begin tag(%.*s, %d)", len, tag, spec->d1_level);
897#endif
898
899 spec->d1_stack[spec->d1_level] = data1_mk_tag_n (
900 spec->dh, spec->m, tag, len, 0, spec->d1_stack[spec->d1_level -1]);
901 spec->d1_stack[++(spec->d1_level)] = NULL;
902}
903
904static void tagEnd (struct lexSpec *spec, int min_level,
905 const char *tag, int len)
906{
907 tagStrip (&tag, &len);
908 while (spec->d1_level > min_level)
909 {
910 tagDataRelease (spec);
911 (spec->d1_level)--;
912 if (spec->d1_level == 0)
913 break;
914 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
915 (!tag ||
916 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
917 (size_t) len &&
918 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
919 break;
920 }
921#if REGX_DEBUG
922 yaz_log (YLOG_LOG, "end tag(%d)", spec->d1_level);
923#endif
924}
925
926
/*
 * tryMatch: scan the input from offset *pptr for the next match of `dfa`.
 *
 * On a match returns 1, with *mptr set to the offset where the match
 * starts and *pptr to the offset just past its end. Returns 0 when end of
 * input is reached without a match; *pptr and *mptr are then unchanged.
 *
 * The longest match is taken: scanning continues after an accepting state
 * until no transition exists, then the last accepting position is
 * reported. When a scan attempt fails, it restarts one character further
 * on. `greedy` is not referenced in this function.
 */
927static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
928 struct DFA *dfa, int greedy)
929{
930 struct DFA_state *state = dfa->states[0];
931 struct DFA_tran *t;
932 unsigned char c = 0;
933 unsigned char c_prev = 0;
934 int ptr = *pptr; /* current pointer */
935 int start_ptr = *pptr; /* first char of match */
936 int last_ptr = 0; /* last char of match */
937 int last_rule = 0; /* rule number of current match */
938 int restore_ptr = 0;
939 int i;
940
/* load the character just before the start so c_prev is meaningful for
   the first attempt */
941 if (ptr)
942 {
943 --ptr;
944 c = f_win_advance (spec, &ptr);
945 }
946 while (1)
947 {
/* at the start state: remember the preceding character and where this
   attempt begins, for a possible restart */
948 if (dfa->states[0] == state)
949 {
950 c_prev = c;
951 restore_ptr = ptr;
952 }
953 c = f_win_advance (spec, &ptr);
954
955 if (ptr == F_WIN_EOF)
956 {
957 if (last_rule)
958 {
959 *mptr = start_ptr;
960 *pptr = last_ptr;
961 return 1;
962 }
963 break;
964 }
965
/* search the state's transitions, last to first in count, for c */
966 t = state->trans;
967 i = state->tran_no;
968 while (1)
969 if (--i < 0) /* no transition for character c */
970 {
971 if (last_rule)
972 {
973 *mptr = start_ptr; /* match starts here */
974 *pptr = last_ptr; /* match end here (+1) */
975 return 1;
976 }
977 state = dfa->states[0];
978
/* restart one character after the beginning of the failed attempt */
979 ptr = restore_ptr;
980 c = f_win_advance (spec, &ptr);
981
982 start_ptr = ptr;
983
984 break;
985 }
986 else if (c >= t->ch[0] && c <= t->ch[1])
987 {
988 state = dfa->states[t->to];
/* rule_no is honoured only when the attempt began right after a newline;
   rule_nno otherwise. Presumably these are line-anchored and unanchored
   rule numbers - confirm against the dfa module. */
989 if (state->rule_no && c_prev == '\n')
990 {
991 last_rule = state->rule_no;
992 last_ptr = ptr;
993 }
994 else if (state->rule_nno)
995 {
996 last_rule = state->rule_nno;
997 last_ptr = ptr;
998 }
999 break;
1000 }
1001 else
1002 t++;
1003 }
1004 return 0;
1005}
1006
1007static int execTok (struct lexSpec *spec, const char **src,
1008 const char **tokBuf, int *tokLen)
1009{
1010 const char *s = *src;
1011
1012 while (*s == ' ' || *s == '\t')
1013 s++;
1014 if (!*s)
1015 return 0;
1016 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1017 {
1018 int n = 0;
1019 s++;
1020 while (*s >= '0' && *s <= '9')
1021 n = n*10 + (*s++ -'0');
1022 if (spec->arg_no == 0)
1023 {
1024 *tokBuf = "";
1025 *tokLen = 0;
1026 }
1027 else
1028 {
1029 if (n >= spec->arg_no)
1030 n = spec->arg_no-1;
1031 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1032 tokLen);
1033 }
1034 }
1035 else if (*s == '\"')
1036 {
1037 *tokBuf = ++s;
1038 while (*s && *s != '\"')
1039 s++;
1040 *tokLen = s - *tokBuf;
1041 if (*s)
1042 s++;
1043 *src = s;
1044 }
1045 else if (*s == '\n' || *s == ';')
1046 {
1047 *src = s+1;
1048 return 1;
1049 }
1050 else if (*s == '-')
1051 {
1052 *tokBuf = s++;
1053 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1054 *s != ';')
1055 s++;
1056 *tokLen = s - *tokBuf;
1057 *src = s;
1058 return 3;
1059 }
1060 else
1061 {
1062 *tokBuf = s++;
1063 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1064 *s != ';')
1065 s++;
1066 *tokLen = s - *tokBuf;
1067 }
1068 *src = s;
1069 return 2;
1070}
1071
/*
 * regxStrz: copy at most 63 bytes of (src, len) into str and NUL-terminate
 * it. str must have room for 64 bytes. Returns str.
 */
static char *regxStrz (const char *src, int len, char *str)
{
    int n = (len > 63) ? 63 : len;

    memcpy (str, src, n);
    str[n] = '\0';
    return str;
}
1080
1081#if HAVE_TCL_H
1082static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
1083 int argc, const char **argv)
1084{
1085 struct lexSpec *spec = (struct lexSpec *) clientData;
1086 if (argc < 2)
1087 return TCL_ERROR;
1088 if (!strcmp(argv[1], "record") && argc == 3)
1089 {
1090 const char *absynName = argv[2];
1091 data1_node *res;
1092
1093#if REGX_DEBUG
1094 yaz_log (YLOG_LOG, "begin record %s", absynName);
1095#endif
1096 res = data1_mk_root (spec->dh, spec->m, absynName);
1097
1098 spec->d1_level = 0;
1099
1100 spec->d1_stack[spec->d1_level++] = res;
1101
1102 res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);
1103
1104 spec->d1_stack[spec->d1_level++] = res;
1105
1106 spec->d1_stack[spec->d1_level] = NULL;
1107 }
1108 else if (!strcmp(argv[1], "element") && argc == 3)
1109 {
1110 tagBegin (spec, argv[2], strlen(argv[2]));
1111 }
1112 else if (!strcmp (argv[1], "variant") && argc == 5)
1113 {
1114 variantBegin (spec, argv[2], strlen(argv[2]),
1115 argv[3], strlen(argv[3]),
1116 argv[4], strlen(argv[4]));
1117 }
1118 else if (!strcmp (argv[1], "context") && argc == 3)
1119 {
1120 struct lexContext *lc = spec->context;
1121#if REGX_DEBUG
1122 yaz_log (YLOG_LOG, "begin context %s",argv[2]);
1123#endif
1124 while (lc && strcmp (argv[2], lc->name))
1125 lc = lc->next;
1126 if (lc)
1127 {
1128 spec->context_stack[++(spec->context_stack_top)] = lc;
1129 }
1130 else
1131 yaz_log (YLOG_WARN, "unknown context %s", argv[2]);
1132 }
1133 else
1134 return TCL_ERROR;
1135 return TCL_OK;
1136}
1137
1138static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
1139 int argc, const char **argv)
1140{
1141 struct lexSpec *spec = (struct lexSpec *) clientData;
1142 if (argc < 2)
1143 return TCL_ERROR;
1144
1145 if (!strcmp (argv[1], "record"))
1146 {
1147 while (spec->d1_level)
1148 {
1149 tagDataRelease (spec);
1150 (spec->d1_level)--;
1151 }
1152#if REGX_DEBUG
1153 yaz_log (YLOG_LOG, "end record");
1154#endif
1155 spec->stop_flag = 1;
1156 }
1157 else if (!strcmp (argv[1], "element"))
1158 {
1159 int min_level = 2;
1160 const char *element = 0;
1161 if (argc >= 3 && !strcmp(argv[2], "-record"))
1162 {
1163 min_level = 0;
1164 if (argc == 4)
1165 element = argv[3];
1166 }
1167 else
1168 if (argc == 3)
1169 element = argv[2];
1170 tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
1171 if (spec->d1_level <= 1)
1172 {
1173#if REGX_DEBUG
1174 yaz_log (YLOG_LOG, "end element end records");
1175#endif
1176 spec->stop_flag = 1;
1177 }
1178 }
1179 else if (!strcmp (argv[1], "context"))
1180 {
1181#if REGX_DEBUG
1182 yaz_log (YLOG_LOG, "end context");
1183#endif
1184 if (spec->context_stack_top)
1185 (spec->context_stack_top)--;
1186 }
1187 else
1188 return TCL_ERROR;
1189 return TCL_OK;
1190}
1191
1192static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
1193 int argc, const char **argv)
1194{
1195 int argi = 1;
1196 int textFlag = 0;
1197 const char *element = 0;
1198 const char *attribute = 0;
1199 struct lexSpec *spec = (struct lexSpec *) clientData;
1200
1201 while (argi < argc)
1202 {
1203 if (!strcmp("-text", argv[argi]))
1204 {
1205 textFlag = 1;
1206 argi++;
1207 }
1208 else if (!strcmp("-element", argv[argi]))
1209 {
1210 argi++;
1211 if (argi < argc)
1212 element = argv[argi++];
1213 }
1214 else if (!strcmp("-attribute", argv[argi]))
1215 {
1216 argi++;
1217 if (argi < argc)
1218 attribute = argv[argi++];
1219 }
1220 else
1221 break;
1222 }
1223 if (element)
1224 tagBegin (spec, element, strlen(element));
1225
1226 while (argi < argc)
1227 {
1228#if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0)
1229 Tcl_DString ds;
1230 char *native = Tcl_UtfToExternalDString(0, argv[argi], -1, &ds);
1231 execData (spec, native, strlen(native), textFlag, attribute,
1232 attribute ? strlen(attribute) : 0);
1233 Tcl_DStringFree (&ds);
1234#else
1235 execData (spec, argv[argi], strlen(argv[argi]), textFlag, attribute,
1236 attribute ? strlen(attribute) : 0);
1237#endif
1238 argi++;
1239 }
1240 if (element)
1241 tagEnd (spec, 2, NULL, 0);
1242 return TCL_OK;
1243}
1244
1245static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
1246 int argc, const char **argv)
1247{
1248 struct lexSpec *spec = (struct lexSpec *) clientData;
1249 int argi = 1;
1250 int offset = 0;
1251 int no;
1252
1253 while (argi < argc)
1254 {
1255 if (!strcmp("-offset", argv[argi]))
1256 {
1257 argi++;
1258 if (argi < argc)
1259 {
1260 offset = atoi(argv[argi]);
1261 argi++;
1262 }
1263 }
1264 else
1265 break;
1266 }
1267 if (argi != argc-1)
1268 return TCL_ERROR;
1269 no = atoi(argv[argi]);
1270 if (no >= spec->arg_no)
1271 no = spec->arg_no - 1;
1272 spec->ptr = spec->arg_start[no] + offset;
1273 return TCL_OK;
1274}
1275
/*
 * execTcl: run the Tcl code fragment `code` in the spec's interpreter.
 *
 * Before evaluation each match register i is exported as the Tcl variable
 * named by its decimal index ("0", "1", ...). The code is evaluated at
 * global level - from the Tcl object when HAVE_TCL_OBJECTS is set, else
 * from the string. A Tcl error is logged at YLOG_FATAL level with the
 * error line, the result string and errorInfo; nothing is returned to the
 * caller.
 */
1276static void execTcl (struct lexSpec *spec, struct regxCode *code)
1277{
1278 int i;
1279 int ret;
1280 for (i = 0; i < spec->arg_no; i++)
1281 {
1282 char var_name[10], *var_buf;
1283 int var_len, ch;
1284
1285 yaz_snprintf(var_name, sizeof(var_name), "%d", i);
1286 var_buf = f_win_get(spec, spec->arg_start[i], spec->arg_end[i],
1287 &var_len);
1288 if (var_buf)
1289 {
/* temporarily NUL-terminate the register text inside the window buffer,
   then restore the overwritten byte.
   NOTE(review): var_buf[var_len] may lie one past the end of the window
   buffer when the register ends exactly at the buffer end - confirm */
1290 ch = var_buf[var_len];
1291 var_buf[var_len] = '\0';
1292 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1293 var_buf[var_len] = ch;
1294 }
1295 }
1296#if HAVE_TCL_OBJECTS
1297 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1298#else
1299 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1300#endif
1301 if (ret != TCL_OK)
1302 {
1303 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1304 yaz_log(YLOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
/* Tcl 8.5 and earlier: read the interpreter fields directly; later
   versions go through the accessor functions */
1305#if TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION <= 5
1306 spec->tcl_interp->errorLine,
1307 spec->tcl_interp->result,
1308#else
1309 Tcl_GetErrorLine(spec->tcl_interp),
1310 Tcl_GetStringResult(spec->tcl_interp),
1311#endif
1312 err ? err : "[NO ERRORINFO]");
1313 }
1314}
1315/* HAVE_TCL_H */
1316#endif
1317
1318static void execCode (struct lexSpec *spec, struct regxCode *code)
1319{
1320 const char *s = code->str;
1321 int cmd_len, r;
1322 const char *cmd_str;
1323
1324 r = execTok (spec, &s, &cmd_str, &cmd_len);
1325 while (r)
1326 {
1327 char *p, ptmp[64];
1328
1329 if (r == 1)
1330 {
1331 r = execTok (spec, &s, &cmd_str, &cmd_len);
1332 continue;
1333 }
1334 p = regxStrz (cmd_str, cmd_len, ptmp);
1335 if (!strcmp (p, "begin"))
1336 {
1337 r = execTok (spec, &s, &cmd_str, &cmd_len);
1338 if (r < 2)
1339 {
1340 yaz_log (YLOG_WARN, "missing keyword after 'begin'");
1341 continue;
1342 }
1343 p = regxStrz (cmd_str, cmd_len, ptmp);
1344 if (!strcmp (p, "record"))
1345 {
1346 r = execTok (spec, &s, &cmd_str, &cmd_len);
1347 if (r < 2)
1348 continue;
1349 if (spec->d1_level <= 1)
1350 {
1351 static char absynName[64];
1352 data1_node *res;
1353
1354 if (cmd_len > 63)
1355 cmd_len = 63;
1356 memcpy (absynName, cmd_str, cmd_len);
1357 absynName[cmd_len] = '\0';
1358#if REGX_DEBUG
1359 yaz_log (YLOG_LOG, "begin record %s", absynName);
1360#endif
1361 res = data1_mk_root (spec->dh, spec->m, absynName);
1362
1363 spec->d1_level = 0;
1364
1365 spec->d1_stack[spec->d1_level++] = res;
1366
1367 res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);
1368
1369 spec->d1_stack[spec->d1_level++] = res;
1370
1371 spec->d1_stack[spec->d1_level] = NULL;
1372 }
1373 r = execTok (spec, &s, &cmd_str, &cmd_len);
1374 }
1375 else if (!strcmp (p, "element"))
1376 {
1377 r = execTok (spec, &s, &cmd_str, &cmd_len);
1378 if (r < 2)
1379 continue;
1380 tagBegin (spec, cmd_str, cmd_len);
1381 r = execTok (spec, &s, &cmd_str, &cmd_len);
1382 }
1383 else if (!strcmp (p, "variant"))
1384 {
1385 int class_len;
1386 const char *class_str = NULL;
1387 int type_len;
1388 const char *type_str = NULL;
1389 int value_len;
1390 const char *value_str = NULL;
1391 r = execTok (spec, &s, &cmd_str, &cmd_len);
1392 if (r < 2)
1393 continue;
1394 class_str = cmd_str;
1395 class_len = cmd_len;
1396 r = execTok (spec, &s, &cmd_str, &cmd_len);
1397 if (r < 2)
1398 continue;
1399 type_str = cmd_str;
1400 type_len = cmd_len;
1401
1402 r = execTok (spec, &s, &cmd_str, &cmd_len);
1403 if (r < 2)
1404 continue;
1405 value_str = cmd_str;
1406 value_len = cmd_len;
1407
1408 variantBegin (spec, class_str, class_len,
1409 type_str, type_len, value_str, value_len);
1410
1411
1412 r = execTok (spec, &s, &cmd_str, &cmd_len);
1413 }
1414 else if (!strcmp (p, "context"))
1415 {
1416 if (r > 1)
1417 {
1418 struct lexContext *lc = spec->context;
1419 r = execTok (spec, &s, &cmd_str, &cmd_len);
1420 p = regxStrz (cmd_str, cmd_len, ptmp);
1421#if REGX_DEBUG
1422 yaz_log (YLOG_LOG, "begin context %s", p);
1423#endif
1424 while (lc && strcmp (p, lc->name))
1425 lc = lc->next;
1426 if (lc)
1427 spec->context_stack[++(spec->context_stack_top)] = lc;
1428 else
1429 yaz_log (YLOG_WARN, "unknown context %s", p);
1430
1431 }
1432 r = execTok (spec, &s, &cmd_str, &cmd_len);
1433 }
1434 else
1435 {
1436 yaz_log (YLOG_WARN, "bad keyword '%s' after begin", p);
1437 }
1438 }
1439 else if (!strcmp (p, "end"))
1440 {
1441 r = execTok (spec, &s, &cmd_str, &cmd_len);
1442 if (r < 2)
1443 {
1444 yaz_log (YLOG_WARN, "missing keyword after 'end'");
1445 continue;
1446 }
1447 p = regxStrz (cmd_str, cmd_len, ptmp);
1448 if (!strcmp (p, "record"))
1449 {
1450 while (spec->d1_level)
1451 {
1452 tagDataRelease (spec);
1453 (spec->d1_level)--;
1454 }
1455 r = execTok (spec, &s, &cmd_str, &cmd_len);
1456#if REGX_DEBUG
1457 yaz_log (YLOG_LOG, "end record");
1458#endif
1459 spec->stop_flag = 1;
1460 }
1461 else if (!strcmp (p, "element"))
1462 {
1463 int min_level = 2;
1464 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1465 {
1466 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1467 min_level = 0;
1468 }
1469 if (r > 2)
1470 {
1471 tagEnd (spec, min_level, cmd_str, cmd_len);
1472 r = execTok (spec, &s, &cmd_str, &cmd_len);
1473 }
1474 else
1475 tagEnd (spec, min_level, NULL, 0);
1476 if (spec->d1_level <= 1)
1477 {
1478#if REGX_DEBUG
1479 yaz_log (YLOG_LOG, "end element end records");
1480#endif
1481 spec->stop_flag = 1;
1482 }
1483
1484 }
1485 else if (!strcmp (p, "context"))
1486 {
1487#if REGX_DEBUG
1488 yaz_log (YLOG_LOG, "end context");
1489#endif
1490 if (spec->context_stack_top)
1491 (spec->context_stack_top)--;
1492 r = execTok (spec, &s, &cmd_str, &cmd_len);
1493 }
1494 else
1495 yaz_log (YLOG_WARN, "bad keyword '%s' after end", p);
1496 }
1497 else if (!strcmp (p, "data"))
1498 {
1499 int textFlag = 0;
1500 int element_len;
1501 const char *element_str = NULL;
1502 int attribute_len;
1503 const char *attribute_str = NULL;
1504
1505 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1506 {
1507 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1508 textFlag = 1;
1509 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1510 {
1511 r = execTok (spec, &s, &element_str, &element_len);
1512 if (r < 2)
1513 break;
1514 }
1515 else if (cmd_len==10 && !memcmp ("-attribute", cmd_str,
1516 cmd_len))
1517 {
1518 r = execTok (spec, &s, &attribute_str, &attribute_len);
1519 if (r < 2)
1520 break;
1521 }
1522 else
1523 yaz_log (YLOG_WARN, "bad data option: %.*s",
1524 cmd_len, cmd_str);
1525 }
1526 if (r != 2)
1527 {
1528 yaz_log (YLOG_WARN, "missing data item after data");
1529 continue;
1530 }
1531 if (element_str)
1532 tagBegin (spec, element_str, element_len);
1533 do
1534 {
1535 execData (spec, cmd_str, cmd_len, textFlag,
1536 attribute_str, attribute_len);
1537 r = execTok (spec, &s, &cmd_str, &cmd_len);
1538 } while (r > 1);
1539 if (element_str)
1540 tagEnd (spec, 2, NULL, 0);
1541 }
1542 else if (!strcmp (p, "unread"))
1543 {
1544 int no, offset;
1545 r = execTok (spec, &s, &cmd_str, &cmd_len);
1546 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1547 {
1548 r = execTok (spec, &s, &cmd_str, &cmd_len);
1549 if (r < 2)
1550 {
1551 yaz_log (YLOG_WARN, "missing number after -offset");
1552 continue;
1553 }
1554 p = regxStrz (cmd_str, cmd_len, ptmp);
1555 offset = atoi (p);
1556 r = execTok (spec, &s, &cmd_str, &cmd_len);
1557 }
1558 else
1559 offset = 0;
1560 if (r < 2)
1561 {
1562 yaz_log (YLOG_WARN, "missing index after unread command");
1563 continue;
1564 }
1565 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1566 {
1567 yaz_log (YLOG_WARN, "bad index after unread command");
1568 continue;
1569 }
1570 else
1571 {
1572 no = *cmd_str - '0';
1573 if (no >= spec->arg_no)
1574 no = spec->arg_no - 1;
1575 spec->ptr = spec->arg_start[no] + offset;
1576 }
1577 r = execTok (spec, &s, &cmd_str, &cmd_len);
1578 }
1579 else if (!strcmp (p, "context"))
1580 {
1581 if (r > 1)
1582 {
1583 struct lexContext *lc = spec->context;
1584 r = execTok (spec, &s, &cmd_str, &cmd_len);
1585 p = regxStrz (cmd_str, cmd_len, ptmp);
1586
1587 while (lc && strcmp (p, lc->name))
1588 lc = lc->next;
1589 if (lc)
1590 spec->context_stack[spec->context_stack_top] = lc;
1591 else
1592 yaz_log (YLOG_WARN, "unknown context %s", p);
1593
1594 }
1595 r = execTok (spec, &s, &cmd_str, &cmd_len);
1596 }
1597 else
1598 {
1599 yaz_log (YLOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1600 r = execTok (spec, &s, &cmd_str, &cmd_len);
1601 continue;
1602 }
1603 if (r > 1)
1604 {
1605 yaz_log (YLOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1606 do {
1607 r = execTok (spec, &s, &cmd_str, &cmd_len);
1608 } while (r > 1);
1609 }
1610 }
1611}
1612
1613
1614static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1615 int start_ptr, int *pptr)
1616{
1617 int sptr;
1618 int arg_start[20];
1619 int arg_end[20];
1620 int arg_no = 1;
1621
1622 if (!ap)
1623 return 1;
1624 arg_start[0] = start_ptr;
1625 arg_end[0] = *pptr;
1626 spec->arg_start = arg_start;
1627 spec->arg_end = arg_end;
1628
1629 while (ap)
1630 {
1631 switch (ap->which)
1632 {
1633 case REGX_PATTERN:
1634 if (ap->u.pattern.body)
1635 {
1636 arg_start[arg_no] = *pptr;
1637 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 0))
1638 {
1639 arg_end[arg_no] = F_WIN_EOF;
1640 arg_no++;
1641 arg_start[arg_no] = F_WIN_EOF;
1642 arg_end[arg_no] = F_WIN_EOF;
1643 yaz_log(YLOG_DEBUG, "Pattern match rest of record");
1644 *pptr = F_WIN_EOF;
1645 }
1646 else
1647 {
1648 arg_end[arg_no] = sptr;
1649 arg_no++;
1650 arg_start[arg_no] = sptr;
1651 arg_end[arg_no] = *pptr;
1652 }
1653 }
1654 else
1655 {
1656 arg_start[arg_no] = *pptr;
1657 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 1))
1658 return 1;
1659 if (sptr != arg_start[arg_no])
1660 return 1;
1661 arg_end[arg_no] = *pptr;
1662 }
1663 arg_no++;
1664 break;
1665 case REGX_CODE:
1666 spec->arg_no = arg_no;
1667 spec->ptr = *pptr;
1668#if HAVE_TCL_H
1669 if (spec->tcl_interp)
1670 execTcl(spec, ap->u.code);
1671 else
1672 execCode (spec, ap->u.code);
1673#else
1674 execCode (spec, ap->u.code);
1675#endif
1676 *pptr = spec->ptr;
1677 if (spec->stop_flag)
1678 return 0;
1679 break;
1680 case REGX_END:
1681 arg_start[arg_no] = *pptr;
1682 arg_end[arg_no] = F_WIN_EOF;
1683 arg_no++;
1684 *pptr = F_WIN_EOF;
1685 }
1686 ap = ap->next;
1687 }
1688 return 1;
1689}
1690
1691static int execRule (struct lexSpec *spec, struct lexContext *context,
1692 int ruleNo, int start_ptr, int *pptr)
1693{
1694#if REGX_DEBUG
1695 yaz_log (YLOG_LOG, "exec rule %d", ruleNo);
1696#endif
1697 return execAction (spec, context->fastRule[ruleNo]->actionList,
1698 start_ptr, pptr);
1699}
1700
1701int lexNode (struct lexSpec *spec, int *ptr)
1702{
1703 struct lexContext *context = spec->context_stack[spec->context_stack_top];
1704 struct DFA_state *state = context->dfa->states[0];
1705 struct DFA_tran *t;
1706 unsigned char c;
1707 unsigned char c_prev = '\n';
1708 int i;
1709 int last_rule = 0; /* rule number of current match */
1710 int last_ptr = *ptr; /* last char of match */
1711 int start_ptr = *ptr; /* first char of match */
1712 int skip_ptr = *ptr; /* first char of run */
1713 int more = 0;
1714
1715 while (1)
1716 {
1717 c = f_win_advance (spec, ptr);
1718 if (*ptr == F_WIN_EOF)
1719 {
1720 /* end of file met */
1721 if (last_rule)
1722 {
1723 /* there was a match */
1724 if (skip_ptr < start_ptr)
1725 {
1726 /* deal with chars that didn't match */
1727 int size;
1728 char *buf;
1729 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1730 execDataP (spec, buf, size, 0);
1731 }
1732 /* restore pointer */
1733 *ptr = last_ptr;
1734 /* execute rule */
1735 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1736 return more;
1737 /* restore skip pointer */
1738 skip_ptr = *ptr;
1739 last_rule = 0;
1740 }
1741 else if (skip_ptr < *ptr)
1742 {
1743 /* deal with chars that didn't match */
1744 int size;
1745 char *buf;
1746 buf = f_win_get (spec, skip_ptr, *ptr, &size);
1747 execDataP (spec, buf, size, 0);
1748 }
1749 state = context->dfa->states[0];
1750 if (*ptr == F_WIN_EOF)
1751 return more;
1752 }
1753 t = state->trans;
1754 i = state->tran_no;
1755 while (1)
1756 if (--i < 0)
1757 { /* no transition for character c ... */
1758 if (last_rule)
1759 {
1760 if (skip_ptr < start_ptr)
1761 {
1762 /* deal with chars that didn't match */
1763 int size;
1764 char *buf;
1765 buf = f_win_get (spec, skip_ptr, start_ptr, &size);
1766 execDataP (spec, buf, size, 0);
1767 }
1768 /* restore pointer */
1769 *ptr = last_ptr;
1770 if (!execRule (spec, context, last_rule, start_ptr, ptr))
1771 {
1772 if (spec->f_win_ef && *ptr != F_WIN_EOF)
1773 {
1774 off_t end_offset = *ptr;
1775#if REGX_DEBUG
1776 yaz_log (YLOG_LOG, "regx: endf ptr=%d", *ptr);
1777#endif
1778 (*spec->f_win_ef)(spec->stream, &end_offset);
1779 }
1780 return more;
1781 }
1782 context = spec->context_stack[spec->context_stack_top];
1783 skip_ptr = *ptr;
1784 last_rule = 0;
1785 last_ptr = start_ptr = *ptr;
1786 if (start_ptr > 0)
1787 {
1788 --start_ptr;
1789 c_prev = f_win_advance (spec, &start_ptr);
1790 }
1791 }
1792 else
1793 {
1794 c_prev = f_win_advance (spec, &start_ptr);
1795 *ptr = start_ptr;
1796 }
1797 state = context->dfa->states[0];
1798 break;
1799 }
1800 else if (c >= t->ch[0] && c <= t->ch[1])
1801 { /* transition ... */
1802 state = context->dfa->states[t->to];
1803 if (state->rule_no)
1804 {
1805 if (c_prev == '\n')
1806 {
1807 last_rule = state->rule_no;
1808 last_ptr = *ptr;
1809 }
1810 else if (state->rule_nno)
1811 {
1812 last_rule = state->rule_nno;
1813 last_ptr = *ptr;
1814 }
1815 more = 1;
1816 }
1817 break;
1818 }
1819 else
1820 t++;
1821 }
1822 return more;
1823}
1824
1825static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1826 const char *context_name)
1827{
1828 struct lexContext *lt = spec->context;
1829 int ptr = offset;
1830 int ret;
1831
1832 spec->stop_flag = 0;
1833 spec->d1_level = 0;
1834 spec->context_stack_top = 0;
1835 while (lt)
1836 {
1837 if (!strcmp (lt->name, context_name))
1838 break;
1839 lt = lt->next;
1840 }
1841 if (!lt)
1842 {
1843 yaz_log (YLOG_WARN, "cannot find context %s", context_name);
1844 return NULL;
1845 }
1846 spec->context_stack[spec->context_stack_top] = lt;
1847 spec->d1_stack[spec->d1_level] = NULL;
1848#if 1
1849 if (!lt->initFlag)
1850 {
1851 lt->initFlag = 1;
1852 execAction (spec, lt->initActionList, ptr, &ptr);
1853 }
1854#endif
1855 execAction (spec, lt->beginActionList, ptr, &ptr);
1856
1857 ret = lexNode (spec, &ptr);
1858 while (spec->d1_level)
1859 {
1860 tagDataRelease (spec);
1861 (spec->d1_level)--;
1862 }
1863 if (!ret)
1864 return 0;
1865 execAction (spec, lt->endActionList, ptr, &ptr);
1866 return spec->d1_stack[0];
1867}
1868
1869void grs_destroy(void *clientData)
1870{
1871 struct lexSpecs *specs = (struct lexSpecs *) clientData;
1872 if (specs->spec)
1873 {
1874 lexSpecDestroy(&specs->spec);
1875 }
1876 xfree (specs);
1877}
1878
1880{
1881 struct lexSpecs *specs = (struct lexSpecs *) xmalloc (sizeof(*specs));
1882 specs->spec = 0;
1883 strcpy(specs->type, "");
1884 return specs;
1885}
1886
1887
1888ZEBRA_RES grs_config(void *clientData, Res res, const char *args)
1889{
1890 struct lexSpecs *specs = (struct lexSpecs *) clientData;
1891 if (strlen(args) < sizeof(specs->type))
1892 strcpy(specs->type, args);
1893 return ZEBRA_OK;
1894}
1895
1897{
1898 int res;
1899 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
1900 struct lexSpec **curLexSpec = &specs->spec;
1901 off_t start_offset;
1902
1903#if REGX_DEBUG
1904 yaz_log (YLOG_LOG, "grs_read_regx");
1905#endif
1906 if (!*curLexSpec || strcmp ((*curLexSpec)->name, specs->type))
1907 {
1908 if (*curLexSpec)
1909 lexSpecDestroy (curLexSpec);
1910 *curLexSpec = lexSpecCreate (specs->type, p->dh);
1911 res = readFileSpec (*curLexSpec);
1912 if (res)
1913 {
1914 lexSpecDestroy (curLexSpec);
1915 return NULL;
1916 }
1917 }
1918 (*curLexSpec)->dh = p->dh;
1919 start_offset = p->stream->tellf(p->stream);
1920 if (start_offset == 0)
1921 {
1922 (*curLexSpec)->f_win_start = 0;
1923 (*curLexSpec)->f_win_end = 0;
1924 (*curLexSpec)->f_win_rf = p->stream->readf;
1925 (*curLexSpec)->f_win_sf = p->stream->seekf;
1926 (*curLexSpec)->stream = p->stream;
1927 (*curLexSpec)->f_win_ef = p->stream->endf;
1928 (*curLexSpec)->f_win_size = 500000;
1929 }
1930 (*curLexSpec)->m = p->mem;
1931 return lexRoot (*curLexSpec, start_offset, "main");
1932}
1933
1934static int extract_regx(void *clientData, struct recExtractCtrl *ctrl)
1935{
1936 return zebra_grs_extract(clientData, ctrl, grs_read_regx);
1937}
1938
1939static int retrieve_regx(void *clientData, struct recRetrieveCtrl *ctrl)
1940{
1941 return zebra_grs_retrieve(clientData, ctrl, grs_read_regx);
1942}
1943
1944static struct recType regx_type = {
1945 0,
1946 "grs.regx",
1947 grs_init,
1948 grs_config,
1952};
1953
1954
1955#if HAVE_TCL_H
1956data1_node *grs_read_tcl (struct grs_read_info *p)
1957{
1958 int res;
1959 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
1960 struct lexSpec **curLexSpec = &specs->spec;
1961 off_t start_offset;
1962
1963#if REGX_DEBUG
1964 yaz_log (YLOG_LOG, "grs_read_tcl");
1965#endif
1966 if (!*curLexSpec || strcmp ((*curLexSpec)->name, specs->type))
1967 {
1968 Tcl_Interp *tcl_interp;
1969 if (*curLexSpec)
1970 lexSpecDestroy (curLexSpec);
1971 *curLexSpec = lexSpecCreate (specs->type, p->dh);
1972 Tcl_FindExecutable("");
1973 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
1974 Tcl_Init(tcl_interp);
1975 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
1976 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
1977 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
1978 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
1979 *curLexSpec, 0);
1980 res = readFileSpec (*curLexSpec);
1981 if (res)
1982 {
1983 lexSpecDestroy (curLexSpec);
1984 return NULL;
1985 }
1986 }
1987 (*curLexSpec)->dh = p->dh;
1988 start_offset = p->stream->tellf(p->stream);
1989 if (start_offset == 0)
1990 {
1991 (*curLexSpec)->f_win_start = 0;
1992 (*curLexSpec)->f_win_end = 0;
1993 (*curLexSpec)->f_win_rf = p->stream->readf;
1994 (*curLexSpec)->f_win_sf = p->stream->seekf;
1995 (*curLexSpec)->stream = p->stream;
1996 (*curLexSpec)->f_win_ef = p->stream->endf;
1997 (*curLexSpec)->f_win_size = 500000;
1998 }
1999 (*curLexSpec)->m = p->mem;
2000 return lexRoot (*curLexSpec, start_offset, "main");
2001}
2002
2003static int extract_tcl(void *clientData, struct recExtractCtrl *ctrl)
2004{
2005 return zebra_grs_extract(clientData, ctrl, grs_read_tcl);
2006}
2007
2008static int retrieve_tcl(void *clientData, struct recRetrieveCtrl *ctrl)
2009{
2010 return zebra_grs_retrieve(clientData, ctrl, grs_read_tcl);
2011}
2012
2013static struct recType tcl_type = {
2014 0,
2015 "grs.tcl",
2016 grs_init,
2017 grs_config,
2019 extract_tcl,
2020 retrieve_tcl,
2021};
2022
2023#endif
2024
2025RecType
2026#if IDZEBRA_STATIC_GRS_REGX
2027idzebra_filter_grs_regx
2028#else
2030#endif
2031
2032[] = {
2033 &regx_type,
2034#if HAVE_TCL_H
2035 &tcl_type,
2036#endif
2037 0,
2038};
2039/*
2040 * Local variables:
2041 * c-basic-offset: 4
2042 * c-file-style: "Stroustrup"
2043 * indent-tabs-mode: nil
2044 * End:
2045 * vim: shiftwidth=4 tabstop=8 expandtab
2046 */
2047
#define DATA1N_variant
Definition data1.h:280
data1_node * data1_mk_root(data1_handle dh, NMEM nmem, const char *name)
Definition d1_read.c:174
data1_node * data1_mk_tag_n(data1_handle dh, NMEM nmem, const char *tag, size_t len, const char **attr, data1_node *at)
Definition d1_read.c:259
#define DATA1N_tag
Definition data1.h:276
#define DATA1N_data
Definition data1.h:278
FILE * data1_path_fopen(data1_handle dh, const char *file, const char *mode)
Definition d1_handle.c:147
data1_vartype * data1_getvartypeby_absyn(data1_handle dh, data1_absyn *absyn, char *zclass, char *type)
Definition d1_varset.c:50
data1_node * data1_mk_node2(data1_handle dh, NMEM m, int type, data1_node *parent)
Definition d1_read.c:146
#define DATA1_LOCALDATA
Definition data1.h:338
#define DATA1_MAX_SYMBOL
Definition data1.h:38
data1_node * data1_mk_tag(data1_handle dh, NMEM nmem, const char *tag, const char **attr, data1_node *at)
Definition d1_read.c:295
#define DATA1I_text
Definition data1.h:314
void dfa_parse_cmap_add(struct DFA *d, int from, int to)
Definition dfa.c:978
void dfa_parse_cmap_del(struct DFA *d, int from)
Definition dfa.c:960
int dfa_parse(struct DFA *, const char **)
Definition dfa.c:1121
void dfa_mkstate(struct DFA *)
Definition dfa.c:1148
void dfa_delete(struct DFA **)
Definition dfa.c:1158
int debug_dfa_followpos
Definition dfa.c:68
int debug_dfa_trav
Definition dfa.c:66
struct DFA * dfa_init(void)
Definition dfa.c:1092
int dfa_verbose
Definition dfa.c:69
int debug_dfa_tran
Definition dfa.c:67
static void tagEnd(struct lexSpec *spec, int min_level, const char *tag, int len)
#define REGX_BODY
static void lexContextDestroy(struct lexContext *p)
static void execData(struct lexSpec *spec, const char *ebuf, int elen, int formatted_text, const char *attribute_str, int attribute_len)
static int execRule(struct lexSpec *spec, struct lexContext *context, int ruleNo, int start_ptr, int *pptr)
static int extract_regx(void *clientData, struct recExtractCtrl *ctrl)
static struct DFA * lexSpecDFA(void)
RecType idzebra_filter[]
static void lexSpecDestroy(struct lexSpec **pp)
static char * f_win_get(struct lexSpec *spec, off_t start_pos, off_t end_pos, int *size)
static int readParseToken(const char **cpp, int *len)
static int execTok(struct lexSpec *spec, const char **src, const char **tokBuf, int *tokLen)
static void tagDataRelease(struct lexSpec *spec)
static int execAction(struct lexSpec *spec, struct lexRuleAction *ap, int start_ptr, int *pptr)
#define REGX_CONTEXT
#define REGX_END
#define REGX_BEGIN
static void variantBegin(struct lexSpec *spec, const char *class_str, int class_len, const char *type_str, int type_len, const char *value_str, int value_len)
static int f_win_advance(struct lexSpec *spec, int *pos)
static void regxCodeDel(struct regxCode **pp)
static data1_node * lexRoot(struct lexSpec *spec, off_t offset, const char *context_name)
static void actionListDel(struct lexRuleAction **rap)
static void tagStrip(const char **tag, int *len)
static int actionListMk(struct lexSpec *spec, const char *s, struct lexRuleAction **ap)
static struct lexContext * lexContextCreate(const char *name)
static struct lexSpec * lexSpecCreate(const char *name, data1_handle dh)
int readFileSpec(struct lexSpec *spec)
static void execCode(struct lexSpec *spec, struct regxCode *code)
static void tagBegin(struct lexSpec *spec, const char *tag, int len)
#define REGX_INIT
#define F_WIN_EOF
static void regxCodeMk(struct regxCode **pp, const char *buf, int len)
static char * regxStrz(const char *src, int len, char *str)
ZEBRA_RES grs_config(void *clientData, Res res, const char *args)
int readOneSpec(struct lexSpec *spec, const char *s)
static void execDataP(struct lexSpec *spec, const char *ebuf, int elen, int formatted_text)
void * grs_init(Res res, RecType recType)
int lexNode(struct lexSpec *spec, int *ptr)
data1_node * grs_read_regx(struct grs_read_info *p)
void grs_destroy(void *clientData)
static int tryMatch(struct lexSpec *spec, int *pptr, int *mptr, struct DFA *dfa, int greedy)
#define REGX_PATTERN
static int retrieve_regx(void *clientData, struct recRetrieveCtrl *ctrl)
static struct recType regx_type
#define REGX_CODE
int zebra_grs_retrieve(void *clientData, struct recRetrieveCtrl *p, data1_node *(*grs_read)(struct grs_read_info *))
Definition recgrs.c:1072
int zebra_grs_extract(void *clientData, struct recExtractCtrl *p, data1_node *(*grs_read)(struct grs_read_info *))
Definition recgrs.c:936
short rule_no
Definition dfa.h:49
short tran_no
Definition dfa.h:48
short rule_nno
Definition dfa.h:50
struct DFA_tran * trans
Definition dfa.h:45
Definition dfa.h:30
unsigned short to
Definition dfa.h:32
unsigned char ch[2]
Definition dfa.h:31
Definition dfa.h:53
struct DFA_state ** states
Definition dfa.h:55
record reader stream
Definition recctrl.h:71
off_t(* seekf)(struct ZebraRecStream *s, off_t offset)
seek function
Definition recctrl.h:77
off_t(* endf)(struct ZebraRecStream *s, off_t *offset)
set and get of record position
Definition recctrl.h:81
int(* readf)(struct ZebraRecStream *s, char *buf, size_t count)
read function
Definition recctrl.h:75
off_t(* tellf)(struct ZebraRecStream *s)
tell function
Definition recctrl.h:79
char lbuf[DATA1_LOCALDATA]
Definition data1.h:339
struct data1_node::@2::@3 root
char * type
Definition data1.h:290
struct data1_node * parent
Definition data1.h:343
char * tag
Definition data1.h:296
char * data
Definition data1.h:307
struct data1_node * next
Definition data1.h:340
int len
Definition data1.h:308
struct data1_absyn * absyn
Definition data1.h:291
unsigned formatted_text
Definition data1.h:322
union data1_node::@2 u
int which
Definition data1.h:285
struct data1_node::@2::@6 variant
char * value
Definition data1.h:328
struct data1_xattr * next
Definition data1.h:262
data1_handle dh
Definition recgrs.h:31
struct ZebraRecStream * stream
Definition recgrs.h:28
void * clientData
Definition recgrs.h:29
struct lexRuleAction * endActionList
char * name
struct lexRuleInfo ** fastRule
struct lexRule * rules
struct lexRuleAction * beginActionList
struct DFA * dfa
struct lexRuleAction * initActionList
struct lexContext * next
struct lexRuleAction::@22::@23 pattern
struct lexRuleAction * next
struct regxCode * code
union lexRuleAction::@22 u
struct DFA * dfa
struct lexRuleAction * actionList
struct lexRule * next
struct lexRuleInfo info
char * f_win_buf
int * arg_end
data1_handle dh
int f_win_size
int f_win_end
int context_stack_size
off_t(* f_win_ef)(struct ZebraRecStream *s, off_t *)
struct lexContext * context
off_t(* f_win_sf)(struct ZebraRecStream *, off_t)
int stop_flag
struct ZebraRecStream * stream
data1_node ** d1_stack
struct lexContext ** context_stack
int f_win_start
int(* f_win_rf)(struct ZebraRecStream *, char *, size_t)
int context_stack_top
char * name
int * arg_start
struct lexConcatBuf * concatBuf
char type[256]
struct lexSpec * spec
record extract for indexing
Definition recctrl.h:101
char * str
#define ZEBRA_OK
Definition util.h:82
short ZEBRA_RES
Common return type for Zebra API.
Definition util.h:80