YAZ  5.34.0
tokenizer.c
Go to the documentation of this file.
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) Index Data
3  * See the file LICENSE for details.
4  */
9 #if HAVE_CONFIG_H
10 #include <config.h>
11 #endif
12 
13 #include <assert.h>
14 #include <stdio.h>
15 #include <string.h>
16 #include <yaz/log.h>
17 #include <yaz/wrbuf.h>
18 #include <yaz/tokenizer.h>
19 
20 struct yaz_tok_parse {
23  int look;
24 
28 };
29 
/**
 * Tokenizer configuration, shared between parsers through ref_count.
 * Every string member is a heap copy owned by the configuration.
 */
struct yaz_tok_cfg {
    int ref_count;            /* number of owners; freed when it reaches 0 */
    char *comment;            /* characters that start a comment */
    char *white_space;        /* characters skipped between tokens */
    char *single_tokens;      /* characters that each form a token on their own */
    char *quote_tokens_begin; /* characters that open a quoted string */
    char *quote_tokens_end;   /* closing character for the opener at the same index */
};
38 
39 void yaz_tok_cfg_single_tokens(yaz_tok_cfg_t t, const char *simple)
40 {
41  xfree(t->single_tokens);
42  t->single_tokens = xstrdup(simple);
43 }
44 
46 {
47  yaz_tok_cfg_t t = (yaz_tok_cfg_t) xmalloc(sizeof(*t));
48  t->white_space = xstrdup(" \t\r\n");
49  t->single_tokens = xstrdup("");
50  t->quote_tokens_begin = xstrdup("\"");
51  t->quote_tokens_end = xstrdup("\"");
52  t->comment = xstrdup("#");
53  t->ref_count = 1;
54  return t;
55 }
56 
58 {
59  t->ref_count--;
60  if (t->ref_count == 0)
61  {
62  xfree(t->white_space);
63  xfree(t->single_tokens);
66  xfree(t->comment);
67  xfree(t);
68  }
69 }
70 
/**
 * Byte source that reads from a NUL-terminated buffer.
 *
 * \param vp address of the cursor; *vp points at the next unread character
 * \returns the character at the cursor, or 0 when the terminating NUL is
 *          reached. The cursor advances only when a non-zero character is
 *          returned, so repeated calls at the end keep returning 0.
 */
static int read_buf(void **vp)
{
    /* Convert the pointer value instead of reinterpreting the void * object
       as a const char * object. */
    const char *cp = (const char *) *vp;
    int ch = *cp;

    if (ch)
        *vp = (void *) (cp + 1);
    return ch;
}
82 
84 {
85  return yaz_tok_parse_create(t, read_buf, (void *) buf);
86 }
87 
88 static int get_byte(yaz_tok_parse_t tp)
89 {
90  int ch = tp->unget_byte;
91  assert(tp->get_byte_func);
92  if (ch)
93  tp->unget_byte = 0;
94  else
95  ch = tp->get_byte_func(&tp->get_byte_data);
96  return ch;
97 }
98 
99 static void unget_byte(yaz_tok_parse_t tp, int ch)
100 {
101  tp->unget_byte = ch;
102 }
103 
106  void *vp)
107 {
108  yaz_tok_parse_t tp = (yaz_tok_parse_t) xmalloc(sizeof(*tp));
109 
110  tp->cfg = t;
111  tp->cfg->ref_count++;
112  tp->get_byte_func = h;
113  tp->get_byte_data = vp;
114 
115  tp->look = YAZ_TOK_ERROR;
116  tp->unget_byte = 0;
117 
118  tp->wr_string = wrbuf_alloc();
119  return tp;
120 }
121 
122 
124 {
127  xfree(tp);
128 }
129 
131 {
132  yaz_tok_cfg_t t = tp->cfg;
133  const char *cp;
134  int ch = get_byte(tp);
135 
136  /* skip white space */
137  while (ch && strchr(t->white_space, ch))
138  ch = get_byte(tp);
139  if (!ch)
140  ch = YAZ_TOK_EOF;
141  else if (strchr(t->comment, ch))
142  ch = YAZ_TOK_EOF;
143  else if ((cp = strchr(t->single_tokens, ch)))
144  ch = *cp; /* single token match */
145  else if ((cp = strchr(t->quote_tokens_begin, ch)))
146  { /* quoted string */
147  int end_ch = t->quote_tokens_end[cp - t->quote_tokens_begin];
148  ch = get_byte(tp);
149  wrbuf_rewind(tp->wr_string);
150  while (ch && ch != end_ch)
151  wrbuf_putc(tp->wr_string, ch);
152  if (!ch)
153  ch = YAZ_TOK_ERROR;
154  else
155  ch = YAZ_TOK_QSTRING;
156  }
157  else
158  { /* unquoted string */
159  wrbuf_rewind(tp->wr_string);
160  while (ch && !strchr(t->white_space, ch)
161  && !strchr(t->single_tokens, ch)
162  && !strchr(t->comment, ch))
163  {
164  wrbuf_putc(tp->wr_string, ch);
165  ch = get_byte(tp);
166  }
167  unget_byte(tp, ch);
168  ch = YAZ_TOK_STRING;
169  }
170  tp->look = ch;
171  return ch;
172 }
173 
175 {
176  return wrbuf_cstr(tp->wr_string);
177 }
178 
179 /*
180  * Local variables:
181  * c-basic-offset: 4
182  * c-file-style: "Stroustrup"
183  * indent-tabs-mode: nil
184  * End:
185  * vim: shiftwidth=4 tabstop=8 expandtab
186  */
187 
Logging utility.
string buffer
Definition: wrbuf.h:43
char * quote_tokens_end
Definition: tokenizer.c:36
char * white_space
Definition: tokenizer.c:33
char * quote_tokens_begin
Definition: tokenizer.c:35
char * single_tokens
Definition: tokenizer.c:34
int ref_count
Definition: tokenizer.c:31
char * comment
Definition: tokenizer.c:32
yaz_tok_get_byte_t get_byte_func
Definition: tokenizer.c:26
int unget_byte
Definition: tokenizer.c:21
WRBUF wr_string
Definition: tokenizer.c:22
void * get_byte_data
Definition: tokenizer.c:27
yaz_tok_cfg_t cfg
Definition: tokenizer.c:25
yaz_tok_parse_t yaz_tok_parse_buf(yaz_tok_cfg_t t, const char *buf)
Definition: tokenizer.c:83
yaz_tok_cfg_t yaz_tok_cfg_create(void)
Definition: tokenizer.c:45
static void unget_byte(yaz_tok_parse_t tp, int ch)
Definition: tokenizer.c:99
const char * yaz_tok_parse_string(yaz_tok_parse_t tp)
Definition: tokenizer.c:174
static int get_byte(yaz_tok_parse_t tp)
Definition: tokenizer.c:88
static int read_buf(void **vp)
Definition: tokenizer.c:71
void yaz_tok_parse_destroy(yaz_tok_parse_t tp)
Definition: tokenizer.c:123
void yaz_tok_cfg_single_tokens(yaz_tok_cfg_t t, const char *simple)
Definition: tokenizer.c:39
void yaz_tok_cfg_destroy(yaz_tok_cfg_t t)
Definition: tokenizer.c:57
int yaz_tok_move(yaz_tok_parse_t tp)
Definition: tokenizer.c:130
yaz_tok_parse_t yaz_tok_parse_create(yaz_tok_cfg_t t, yaz_tok_get_byte_t h, void *vp)
Definition: tokenizer.c:104
Header with public definitions about YAZ' tokenizer.
struct yaz_tok_parse * yaz_tok_parse_t
Definition: tokenizer.h:44
#define YAZ_TOK_QSTRING
Definition: tokenizer.h:41
#define YAZ_TOK_EOF
Definition: tokenizer.h:38
#define YAZ_TOK_STRING
Definition: tokenizer.h:40
struct yaz_tok_cfg * yaz_tok_cfg_t
Definition: tokenizer.h:43
int(* yaz_tok_get_byte_t)(void **vp)
Definition: tokenizer.h:46
#define YAZ_TOK_ERROR
Definition: tokenizer.h:39
void wrbuf_destroy(WRBUF b)
destroy WRBUF and its buffer
Definition: wrbuf.c:38
void wrbuf_rewind(WRBUF b)
empty WRBUF content (length of buffer set to 0)
Definition: wrbuf.c:47
WRBUF wrbuf_alloc(void)
construct WRBUF
Definition: wrbuf.c:25
const char * wrbuf_cstr(WRBUF b)
returns WRBUF content as C-string
Definition: wrbuf.c:281
Header for WRBUF (growing buffer)
#define wrbuf_putc(b, c)
Definition: wrbuf.h:268
#define xstrdup(s)
utility macro which calls xstrdup_f
Definition: xmalloc.h:55
#define xfree(x)
utility macro which calls xfree_f
Definition: xmalloc.h:53
#define xmalloc(x)
utility macro which calls malloc_f
Definition: xmalloc.h:49