IDZEBRA 2.2.8
mod_grs_xml.c
Go to the documentation of this file.
1/* This file is part of the Zebra server.
2 Copyright (C) Index Data
3
4Zebra is free software; you can redistribute it and/or modify it under
5the terms of the GNU General Public License as published by the Free
6Software Foundation; either version 2, or (at your option) any later
7version.
8
9Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10WARRANTY; without even the implied warranty of MERCHANTABILITY or
11FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12for more details.
13
14You should have received a copy of the GNU General Public License
15along with this program; if not, write to the Free Software
16Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17
18*/
19
20#if HAVE_CONFIG_H
21#include <config.h>
22#endif
23#if HAVE_EXPAT_H
24
25#include <assert.h>
26#include <stdio.h>
27#include <stdlib.h>
28#if HAVE_ICONV_H
29#include <errno.h>
30#include <iconv.h>
31#endif
32
33#include <yaz/log.h>
34#include <yaz/snprintf.h>
35#include <yaz/log.h>
36#include <yaz/xmalloc.h>
37#include <idzebra/recgrs.h>
38
39#include <expat.h>
40
41#define XML_CHUNK 1024
42
43struct user_info {
44 data1_node *d1_stack[256];
45 int level;
46 data1_handle dh;
47 NMEM nmem;
48 int loglevel;
49};
50
51static void report_xml_error(XML_Parser parser)
52{
53 zint line = XML_GetCurrentLineNumber(parser);
54 zint col = XML_GetCurrentColumnNumber(parser);
55 yaz_log(YLOG_WARN, ZINT_FORMAT ":" ZINT_FORMAT ":XML error: %s",
56 line, col, XML_ErrorString(XML_GetErrorCode(parser)));
57}
58
59static void cb_start(void *user, const char *el, const char **attr)
60{
61 struct user_info *ui = (struct user_info*) user;
62 if (ui->level == 1)
63 data1_set_root (ui->dh, ui->d1_stack[0], ui->nmem, el);
64 ui->d1_stack[ui->level] = data1_mk_tag(ui->dh, ui->nmem, el, attr,
65 ui->d1_stack[ui->level-1]);
66 ui->level++;
67 yaz_log (ui->loglevel, "cb_start %s", el);
68}
69
70static void cb_end (void *user, const char *el)
71{
72 struct user_info *ui = (struct user_info*) user;
73
74 ui->level--;
75 yaz_log(ui->loglevel, "cb_end %s", el);
76}
77
78static void cb_chardata(void *user, const char *s, int len)
79{
80 struct user_info *ui = (struct user_info*) user;
81#if 0
82 yaz_log (ui->loglevel, "cb_chardata %.*s", len, s);
83#endif
84 ui->d1_stack[ui->level] = data1_mk_text_n(ui->dh, ui->nmem, s, len,
85 ui->d1_stack[ui->level -1]);
86}
87
88static void cb_decl(void *user, const char *version, const char *encoding,
89 int standalone)
90{
91 struct user_info *ui = (struct user_info*) user;
92 const char *attr_list[7];
93
94 attr_list[0] = "version";
95 attr_list[1] = version;
96
97 attr_list[2] = "encoding";
98 attr_list[3] = "UTF-8"; /* internally it's always UTF-8 */
99
100 attr_list[4] = "standalone";
101 attr_list[5] = standalone ? "yes" : "no";
102
103 attr_list[6] = 0;
104
105 data1_mk_preprocess(ui->dh, ui->nmem, "xml", attr_list,
106 ui->d1_stack[ui->level-1]);
107#if 0
108 yaz_log (YLOG_LOG, "decl version=%s encoding=%s",
109 version ? version : "null",
110 encoding ? encoding : "null");
111#endif
112}
113
114static void cb_processing(void *user, const char *target,
115 const char *data)
116{
117 struct user_info *ui = (struct user_info*) user;
118 data1_node *res =
119 data1_mk_preprocess(ui->dh, ui->nmem, target, 0,
120 ui->d1_stack[ui->level-1]);
121 data1_mk_text_nf(ui->dh, ui->nmem, data, strlen(data), res);
122
123 yaz_log(ui->loglevel, "decl processing target=%s data=%s",
124 target ? target : "null",
125 data ? data : "null");
126}
127
128static void cb_comment(void *user, const char *data)
129{
130 struct user_info *ui = (struct user_info*) user;
131 yaz_log(ui->loglevel, "decl comment data=%s", data ? data : "null");
132 data1_mk_comment(ui->dh, ui->nmem, data, ui->d1_stack[ui->level-1]);
133}
134
135static void cb_doctype_start(void *userData, const char *doctypeName,
136 const char *sysid, const char *pubid,
137 int has_internal_subset)
138{
139 struct user_info *ui = (struct user_info*) userData;
140 yaz_log(ui->loglevel, "doctype start doctype=%s sysid=%s pubid=%s",
141 doctypeName, sysid, pubid);
142}
143
144static void cb_doctype_end(void *userData)
145{
146 struct user_info *ui = (struct user_info*) userData;
147 yaz_log(ui->loglevel, "doctype end");
148}
149
150
151static void cb_entity_decl(void *userData, const char *entityName,
152 int is_parameter_entity,
153 const char *value, int value_length,
154 const char *base, const char *systemId,
155 const char *publicId, const char *notationName)
156{
157 struct user_info *ui = (struct user_info*) userData;
158 yaz_log(ui->loglevel,
159 "entity decl %s is_para_entry=%d value=%.*s base=%s systemId=%s"
160 " publicId=%s notationName=%s",
161 entityName, is_parameter_entity, value_length, value,
162 base, systemId, publicId, notationName);
163
164}
165
166static int cb_external_entity(XML_Parser pparser,
167 const char *context,
168 const char *base,
169 const char *systemId,
170 const char *publicId)
171{
172 struct user_info *ui = (struct user_info*) XML_GetUserData(pparser);
173 FILE *inf;
174 int done = 0;
175 XML_Parser parser;
176
177 yaz_log(ui->loglevel,
178 "external entity context=%s base=%s systemid=%s publicid=%s",
179 context, base, systemId, publicId);
180 if (!systemId)
181 return 1;
182
183 if (!(inf = fopen(systemId, "rb")))
184 {
185 yaz_log (YLOG_WARN|YLOG_ERRNO, "fopen %s", systemId);
186 return 0;
187 }
188
189 parser = XML_ExternalEntityParserCreate(pparser, "", 0);
190 while (!done)
191 {
192 int r;
193 void *buf = XML_GetBuffer(parser, XML_CHUNK);
194 if (!buf)
195 {
196 yaz_log(YLOG_WARN, "XML_GetBuffer fail");
197 break;
198 }
199 r = fread(buf, 1, XML_CHUNK, inf);
200 if (r == 0)
201 {
202 if (ferror(inf))
203 {
204 yaz_log(YLOG_WARN|YLOG_ERRNO, "fread %s", systemId);
205 break;
206 }
207 done = 1;
208 }
209 if (!XML_ParseBuffer(parser, r, done))
210 {
211 done = 1;
212 report_xml_error(parser);
213 }
214 }
215 fclose (inf);
216 XML_ParserFree(parser);
217 return done;
218}
219
220
221#if HAVE_ICONV_H
222static int cb_encoding_convert(void *data, const char *s)
223{
224 iconv_t t = (iconv_t) data;
225 size_t ret;
226 size_t outleft = 2;
227 char outbuf_[2], *outbuf = outbuf_;
228 size_t inleft = 4;
229 char *inbuf = (char *) s;
230 unsigned short code;
231
232#if 1
233 yaz_log(YLOG_LOG, "------------------------- cb_encoding_convert --- ");
234#endif
235 ret = iconv(t, &inbuf, &inleft, &outbuf, &outleft);
236 if (ret == (size_t) (-1) && errno != E2BIG)
237 {
238 iconv (t, 0, 0, 0, 0);
239 return -1;
240 }
241 if (outleft != 0)
242 return -1;
243 memcpy (&code, outbuf_, sizeof(short));
244 return code;
245}
246
/* Release callback installed in XML_Encoding: closes the iconv
   descriptor carried in data. */
static void cb_encoding_release(void *data)
{
    iconv_close((iconv_t) data);
}
252
253static int cb_encoding_handler(void *userData, const char *name,
254 XML_Encoding *info)
255{
256 int i = 0;
257 int no_ok = 0;
258 struct user_info *ui = (struct user_info*) userData;
259
260 iconv_t t = iconv_open("UNICODE", name);
261 if (t == (iconv_t) (-1))
262 return 0;
263
264 info->data = 0; /* signal that multibyte is not in use */
265 yaz_log(ui->loglevel, "Encoding handler of %s", name);
266 for (i = 0; i<256; i++)
267 {
268 size_t ret;
269 char outbuf_[5];
270 char inbuf_[5];
271 char *inbuf = inbuf_;
272 char *outbuf = outbuf_;
273 size_t inleft = 1;
274 size_t outleft = 2;
275 inbuf_[0] = i;
276
277 iconv (t, 0, 0, 0, 0); /* reset iconv */
278
279 ret = iconv(t, &inbuf, &inleft, &outbuf, &outleft);
280 if (ret == (size_t) (-1))
281 {
282 if (errno == EILSEQ)
283 {
284 yaz_log(ui->loglevel, "Encoding %d: invalid sequence", i);
285 info->map[i] = -1; /* invalid sequence */
286 }
287 if (errno == EINVAL)
288 { /* multi byte input */
289 int len = 2;
290 int j = 0;
291 info->map[i] = -1;
292
293 while (len <= 4)
294 {
295 inbuf = inbuf_;
296 inleft = len;
297 outbuf = outbuf_;
298 outleft = 2;
299
300 inbuf_[len-1] = j;
301 iconv (t, 0,0,0,0);
302
303 assert (i >= 0 && i<255);
304
305 ret = iconv(t, &inbuf, &inleft, &outbuf, &outleft);
306 if (ret == (size_t) (-1))
307 {
308 if (errno == EILSEQ || errno == E2BIG)
309 {
310 j++;
311 if (j > 255)
312 break;
313 }
314 else if (errno == EINVAL)
315 {
316 len++;
317 j = 7;
318 }
319 }
320 else if (outleft == 0)
321 {
322 info->map[i] = -len;
323 info->data = t; /* signal that multibyte is in use */
324 break;
325 }
326 else
327 {
328 break;
329 }
330 }
331 if (info->map[i] < -1)
332 yaz_log(ui->loglevel, "Encoding %d: multibyte input %d",
333 i, -info->map[i]);
334 else
335 yaz_log(ui->loglevel, "Encoding %d: multibyte input failed",
336 i);
337 }
338 if (errno == E2BIG)
339 {
340 info->map[i] = -1; /* no room for output */
341 if (i != 0)
342 yaz_log(YLOG_WARN, "Encoding %d: no room for output",
343 i);
344 }
345 }
346 else if (outleft == 0)
347 {
348 unsigned short code;
349 memcpy (&code, outbuf_, sizeof(short));
350 info->map[i] = code;
351 no_ok++;
352 }
353 else
354 { /* should never happen */
355 info->map[i] = -1;
356 yaz_log (YLOG_DEBUG, "Encoding %d: bad state", i);
357 }
358 }
359 if (info->data)
360 { /* at least one multi byte */
361 info->convert = cb_encoding_convert;
362 info->release = cb_encoding_release;
363 }
364 else
365 {
366 /* no multi byte - we no longer need iconv handler */
367 iconv_close(t);
368 info->convert = 0;
369 info->release = 0;
370 }
371 if (!no_ok)
372 return 0;
373 return 1;
374}
375/* HAVE_ICONV_H */
376#endif
377
378static void cb_ns_start(void *userData, const char *prefix, const char *uri)
379{
380 struct user_info *ui = (struct user_info*) userData;
381 if (prefix && uri)
382 yaz_log(ui->loglevel, "cb_ns_start %s %s", prefix, uri);
383}
384
385static void cb_ns_end(void *userData, const char *prefix)
386{
387 struct user_info *ui = (struct user_info*) userData;
388 if (prefix)
389 yaz_log(ui->loglevel, "cb_ns_end %s", prefix);
390}
391
392data1_node *zebra_read_xml(data1_handle dh,
393 struct ZebraRecStream *stream,
394 NMEM m)
395{
396 XML_Parser parser;
397 struct user_info uinfo;
398 int done = 0;
399 data1_node *first_node;
400 int no_read = 0;
401
402 uinfo.loglevel = YLOG_DEBUG;
403 uinfo.level = 1;
404 uinfo.dh = dh;
405 uinfo.nmem = m;
406 uinfo.d1_stack[0] = data1_mk_node2 (dh, m, DATA1N_root, 0);
407 uinfo.d1_stack[1] = 0; /* indicate no children (see end of routine) */
408
409 parser = XML_ParserCreate (0 /* encoding */);
410
411 XML_SetElementHandler(parser, cb_start, cb_end);
412 XML_SetCharacterDataHandler(parser, cb_chardata);
413 XML_SetXmlDeclHandler(parser, cb_decl);
414 XML_SetProcessingInstructionHandler(parser, cb_processing);
415 XML_SetUserData(parser, &uinfo);
416 XML_SetCommentHandler(parser, cb_comment);
417 XML_SetDoctypeDeclHandler(parser, cb_doctype_start, cb_doctype_end);
418 XML_SetEntityDeclHandler(parser, cb_entity_decl);
419 XML_SetExternalEntityRefHandler(parser, cb_external_entity);
420 XML_SetNamespaceDeclHandler(parser, cb_ns_start, cb_ns_end);
421#if HAVE_ICONV_H
422 XML_SetUnknownEncodingHandler(parser, cb_encoding_handler, &uinfo);
423#endif
424 while (!done)
425 {
426 int r;
427 void *buf = XML_GetBuffer(parser, XML_CHUNK);
428 if (!buf)
429 {
430 /* error */
431 yaz_log(YLOG_WARN, "XML_GetBuffer fail");
432 break;
433 }
434 r = stream->readf(stream, buf, XML_CHUNK);
435 if (r < 0)
436 {
437 /* error */
438 yaz_log(YLOG_WARN, "XML read fail");
439 break;
440 }
441 else if (r == 0)
442 done = 1;
443 else
444 no_read += r;
445 if (no_read && !XML_ParseBuffer(parser, r, done))
446 {
447 done = 1;
448 report_xml_error(parser);
449 }
450 }
451 XML_ParserFree(parser);
452 if (no_read == 0)
453 return 0;
454 if (!uinfo.d1_stack[1] || !done)
455 return 0;
456 /* insert XML header if not present .. */
457 first_node = uinfo.d1_stack[0]->child;
458 if (first_node->which != DATA1N_preprocess ||
459 strcmp(first_node->u.preprocess.target, "xml"))
460 {
461 const char *attr_list[5];
462
463 attr_list[0] = "version";
464 attr_list[1] = "1.0";
465
466 attr_list[2] = "encoding";
467 attr_list[3] = "UTF-8"; /* encoding */
468
469 attr_list[4] = 0;
470
471 data1_insert_preprocess(uinfo.dh, uinfo.nmem, "xml", attr_list,
472 uinfo.d1_stack[0]);
473 }
474 return uinfo.d1_stack[0];
475}
476
477struct xml_info {
478 XML_Expat_Version expat_version;
479};
480
481static data1_node *grs_read_xml(struct grs_read_info *p)
482{
483 return zebra_read_xml(p->dh, p->stream, p->mem);
484}
485
486static void *filter_init(Res res, RecType recType)
487{
488 struct xml_info *p = (struct xml_info *) xmalloc (sizeof(*p));
489
490 p->expat_version = XML_ExpatVersionInfo();
491
492 return p;
493}
494
/* Filter destroy hook: frees the private data allocated by
   filter_init. */
static void filter_destroy(void *clientData)
{
    struct xml_info *info = (struct xml_info *) clientData;

    xfree(info);
}
501
502static int filter_extract(void *clientData, struct recExtractCtrl *ctrl)
503{
504 return zebra_grs_extract(clientData, ctrl, grs_read_xml);
505}
506
507static int filter_retrieve(void *clientData, struct recRetrieveCtrl *ctrl)
508{
509 return zebra_grs_retrieve(clientData, ctrl, grs_read_xml);
510}
511
512static struct recType filter_type = {
513 0,
514 "grs.xml",
516 0,
520};
521
523#if IDZEBRA_STATIC_GRS_XML
524idzebra_filter_grs_xml
525#else
527#endif
528
529[] = {
531 0,
532};
533
534#endif
535
536/*
537 * Local variables:
538 * c-basic-offset: 4
539 * c-file-style: "Stroustrup"
540 * indent-tabs-mode: nil
541 * End:
542 * vim: shiftwidth=4 tabstop=8 expandtab
543 */
544
data1_node * data1_insert_preprocess(data1_handle dh, NMEM nmem, const char *target, const char **attr, data1_node *at)
Definition d1_read.c:239
data1_node * data1_mk_comment(data1_handle dh, NMEM mem, const char *buf, data1_node *parent)
Definition d1_read.c:362
data1_node * data1_mk_preprocess(data1_handle dh, NMEM nmem, const char *target, const char **attr, data1_node *at)
Definition d1_read.c:219
data1_node * data1_mk_text_nf(data1_handle dh, NMEM mem, const char *buf, size_t len, data1_node *parent)
Definition d1_read.c:339
data1_node * data1_mk_text_n(data1_handle dh, NMEM mem, const char *buf, size_t len, data1_node *parent)
Definition d1_read.c:331
#define DATA1N_root
Definition data1.h:274
data1_node * data1_mk_node2(data1_handle dh, NMEM m, int type, data1_node *parent)
Definition d1_read.c:146
#define DATA1N_preprocess
Definition data1.h:284
data1_node * data1_mk_tag(data1_handle dh, NMEM nmem, const char *tag, const char **attr, data1_node *at)
Definition d1_read.c:295
void data1_set_root(data1_handle dh, data1_node *res, NMEM nmem, const char *name)
Definition d1_read.c:191
static void filter_destroy(void *clientData)
Definition mod_alvis.c:334
static void * filter_init(Res res, RecType recType)
Definition mod_alvis.c:126
RecType idzebra_filter[]
Definition mod_alvis.c:723
static struct recType filter_type
Definition mod_alvis.c:706
static FILE * inf
Definition readfile.c:37
int zebra_grs_retrieve(void *clientData, struct recRetrieveCtrl *p, data1_node *(*grs_read)(struct grs_read_info *))
Definition recgrs.c:1072
int zebra_grs_extract(void *clientData, struct recExtractCtrl *p, data1_node *(*grs_read)(struct grs_read_info *))
Definition recgrs.c:936
record reader stream
Definition recctrl.h:71
int(* readf)(struct ZebraRecStream *s, char *buf, size_t count)
read function
Definition recctrl.h:75
struct data1_node::@2::@7 preprocess
struct data1_node * child
Definition data1.h:341
char * target
Definition data1.h:333
union data1_node::@2 u
int which
Definition data1.h:285
data1_handle dh
Definition recgrs.h:31
struct ZebraRecStream * stream
Definition recgrs.h:28
record extract for indexing
Definition recctrl.h:101
long zint
Zebra integer.
Definition util.h:66
#define ZINT_FORMAT
Definition util.h:72