IDZEBRA  2.2.7
mod_dom.c
Go to the documentation of this file.
1 /* This file is part of the Zebra server.
2  Copyright (C) Index Data
3 
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8 
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 
18 */
19 
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23 #include <stdio.h>
24 #include <assert.h>
25 #include <ctype.h>
26 #include <stdarg.h>
27 
28 #include <yaz/diagbib1.h>
29 #include <yaz/tpath.h>
30 #include <yaz/snprintf.h>
31 
32 #include <libxml/xmlversion.h>
33 #include <libxml/parser.h>
34 #include <libxml/tree.h>
35 #include <libxml/xmlIO.h>
36 #include <libxml/xmlreader.h>
37 #include <libxslt/transform.h>
38 #include <libxslt/xsltutils.h>
39 
40 #if YAZ_HAVE_EXSLT
41 #include <libexslt/exslt.h>
42 #endif
43 
44 #include <idzebra/util.h>
45 #include <idzebra/recctrl.h>
46 #include <yaz/oid_db.h>
47 
48 /* DOM filter style indexing */
49 #define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
50 static const char *zebra_dom_ns = ZEBRA_DOM_NS;
51 
52 /* DOM filter style indexing */
53 #define ZEBRA_PI_NAME "zebra-2.0"
54 static const char *zebra_pi_name = ZEBRA_PI_NAME;
55 
59 };
60 
61 struct convert_xslt {
62  const char *stylesheet;
63  xsltStylesheetPtr stylesheet_xsp;
64 };
65 
66 struct convert_meta {
67  int dummy;
68 };
69 
70 struct convert_s {
71  enum convert_type which;
72  union {
73  struct convert_xslt xslt;
74  struct convert_meta meta;
75  } u;
76  struct convert_s *next;
77 };
78 
80  const char *name;
81  struct convert_s *convert;
82 };
83 
84 struct filter_store {
85  struct convert_s *convert;
86 };
87 
89  const char *name;
90  const char *identifier;
91  struct convert_s *convert;
93 };
94 
95 #define DOM_INPUT_XMLREADER 1
96 #define DOM_INPUT_MARC 2
97 struct filter_input {
98  const char *syntax;
99  const char *name;
101  int type;
102  union {
103  struct {
104  xmlTextReaderPtr reader;
107  struct {
108  const char *input_charset;
109  yaz_marc_t handle;
110  yaz_iconv_t iconv;
111  } marc;
112  } u;
114 };
115 
116 struct filter_info {
117  char *fname;
118  char *full_name;
119  const char *profile_path;
122  xmlDocPtr doc_config;
128 };
129 
130 
131 
132 #define XML_STRCMP(a,b) strcmp((char*)a, b)
133 #define XML_STRLEN(a) strlen((char*)a)
134 
135 
136 #define FOR_EACH_ELEMENT(ptr) for (; ptr; ptr = ptr->next) if (ptr->type == XML_ELEMENT_NODE)
137 
138 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
139  const char *fmt, ...)
140 #ifdef __GNUC__
141  __attribute__ ((format (printf, 4, 5)))
142 #endif
143  ;
144 
145 static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr,
146  const char *fmt, ...)
147 {
148  va_list ap;
149  char buf[4096];
150 
151  va_start(ap, fmt);
152  yaz_vsnprintf(buf, sizeof(buf)-1, fmt, ap);
153  if (ptr)
154  {
155  yaz_log(level, "%s:%ld: %s", tinfo->fname ? tinfo->fname : "none",
156  xmlGetLineNo(ptr), buf);
157  }
158  else
159  {
160  yaz_log(level, "%s: %s", tinfo->fname ? tinfo->fname : "none", buf);
161  }
162  va_end(ap);
163 }
164 
165 
166 static void set_param_str(const char **params, const char *name,
167  const char *value, NMEM nmem)
168 {
169  char *quoted = nmem_malloc(nmem, 3 + strlen(value));
170  yaz_snprintf(quoted, 3 + strlen(value), "'%s'", value);
171  while (*params)
172  params++;
173  params[0] = name;
174  params[1] = quoted;
175  params[2] = 0;
176 }
177 
178 static void set_param_int(const char **params, const char *name,
179  zint value, NMEM nmem)
180 {
181  char *quoted = nmem_malloc(nmem, 30); /* 25 digits enough for 2^64 */
182  while (*params)
183  params++;
184  yaz_snprintf(quoted, 30, "'" ZINT_FORMAT "'", value);
185  params[0] = name;
186  params[1] = quoted;
187  params[2] = 0;
188 }
189 
190 static void *filter_init(Res res, RecType recType)
191 {
192  struct filter_info *tinfo = (struct filter_info *) xmalloc(sizeof(*tinfo));
193  tinfo->fname = 0;
194  tinfo->full_name = 0;
195  tinfo->profile_path = 0;
196  tinfo->nmem_record = nmem_create();
197  tinfo->nmem_config = nmem_create();
198  tinfo->extract = 0;
199  tinfo->retrieve_list = 0;
200  tinfo->input_list = 0;
201  tinfo->store = 0;
202  tinfo->doc_config = 0;
203  tinfo->record_info_invoked = 0;
204 
205 #if YAZ_HAVE_EXSLT
206  exsltRegisterAll();
207 #endif
208 
209  return tinfo;
210 }
211 
212 static int attr_content(struct _xmlAttr *attr, const char *name,
213  const char **dst_content)
214 {
215  if (!XML_STRCMP(attr->name, name) && attr->children
216  && attr->children->type == XML_TEXT_NODE)
217  {
218  *dst_content = (const char *)(attr->children->content);
219  return 1;
220  }
221  return 0;
222 }
223 
224 static void destroy_xsp(struct convert_s *c)
225 {
226  while (c)
227  {
228  if (c->which == convert_xslt_type)
229  {
230  if (c->u.xslt.stylesheet_xsp)
231  xsltFreeStylesheet(c->u.xslt.stylesheet_xsp);
232  }
233  c = c->next;
234  }
235 }
236 
237 static void destroy_dom(struct filter_info *tinfo)
238 {
239  if (tinfo->extract)
240  {
241  destroy_xsp(tinfo->extract->convert);
242  tinfo->extract = 0;
243  }
244  if (tinfo->store)
245  {
246  destroy_xsp(tinfo->store->convert);
247  tinfo->store = 0;
248  }
249  if (tinfo->input_list)
250  {
251  struct filter_input *i_ptr;
252  for (i_ptr = tinfo->input_list; i_ptr; i_ptr = i_ptr->next)
253  {
254  switch(i_ptr->type)
255  {
256  case DOM_INPUT_XMLREADER:
257  if (i_ptr->u.xmlreader.reader)
258  xmlFreeTextReader(i_ptr->u.xmlreader.reader);
259  break;
260  case DOM_INPUT_MARC:
261  yaz_iconv_close(i_ptr->u.marc.iconv);
262  yaz_marc_destroy(i_ptr->u.marc.handle);
263  break;
264  }
265  destroy_xsp(i_ptr->convert);
266  }
267  tinfo->input_list = 0;
268  }
269  if (tinfo->retrieve_list)
270  {
271  struct filter_retrieve *r_ptr;
272  for (r_ptr = tinfo->retrieve_list; r_ptr; r_ptr = r_ptr->next)
273  destroy_xsp(r_ptr->convert);
274  tinfo->retrieve_list = 0;
275  }
276 
277  if (tinfo->doc_config)
278  {
279  xmlFreeDoc(tinfo->doc_config);
280  tinfo->doc_config = 0;
281  }
282  nmem_reset(tinfo->nmem_config);
283 }
284 
285 static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr,
286  struct convert_s **l)
287 {
288  *l = 0;
289  FOR_EACH_ELEMENT(ptr) {
290  if (!XML_STRCMP(ptr->name, "xslt"))
291  {
292  struct _xmlAttr *attr;
293  struct convert_s *p = nmem_malloc(tinfo->nmem_config, sizeof(*p));
294 
295  p->next = 0;
297  p->u.xslt.stylesheet = 0;
298  p->u.xslt.stylesheet_xsp = 0;
299 
300  for (attr = ptr->properties; attr; attr = attr->next)
301  if (attr_content(attr, "stylesheet", &p->u.xslt.stylesheet))
302  ;
303  else
304  {
305  dom_log(YLOG_WARN, tinfo, ptr,
306  "bad attribute @%s", attr->name);
307  }
308  if (p->u.xslt.stylesheet)
309  {
310  char tmp_xslt_full_name[1024];
311  if (!yaz_filepath_resolve(p->u.xslt.stylesheet,
312  tinfo->profile_path,
313  NULL,
314  tmp_xslt_full_name))
315  {
316  dom_log(YLOG_WARN, tinfo, 0,
317  "stylesheet %s not found in "
318  "path %s",
319  p->u.xslt.stylesheet,
320  tinfo->profile_path);
321  return ZEBRA_FAIL;
322  }
323 
325  = xsltParseStylesheetFile((const xmlChar*)
326  tmp_xslt_full_name);
327  if (!p->u.xslt.stylesheet_xsp)
328  {
329  dom_log(YLOG_WARN, tinfo, 0,
330  "could not parse xslt stylesheet %s",
331  tmp_xslt_full_name);
332  return ZEBRA_FAIL;
333  }
334  }
335  else
336  {
337  dom_log(YLOG_WARN, tinfo, ptr,
338  "missing attribute 'stylesheet'");
339  return ZEBRA_FAIL;
340  }
341  *l = p;
342  l = &p->next;
343  }
344  else if (!XML_STRCMP(ptr->name, "process-meta"))
345  {
346  struct _xmlAttr *attr;
347  struct convert_s *p = nmem_malloc(tinfo->nmem_config, sizeof(*p));
348 
349  p->next = 0;
351 
352  for (attr = ptr->properties; attr; attr = attr->next)
353  dom_log(YLOG_WARN, tinfo, ptr,
354  "bad attribute @%s", attr->name);
355  *l = p;
356  l = &p->next;
357  }
358  else
359  {
360  dom_log(YLOG_WARN, tinfo, ptr,
361  "bad element '%s', expected <xslt>", ptr->name);
362  return ZEBRA_FAIL;
363  }
364  }
365  return ZEBRA_OK;
366 }
367 
368 static int process_meta(struct filter_info *tinfo, xmlDocPtr doc, xmlNodePtr node,
369  struct recRetrieveCtrl *retctr)
370 {
371 
372  if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href &&
373  0 == XML_STRCMP(node->ns->href, zebra_dom_ns))
374  {
375  if (0 == XML_STRCMP(node->name, "meta"))
376  {
377  const char *element_set_name = 0;
378 
379  struct _xmlAttr *attr;
380  for (attr = node->properties; attr; attr = attr->next)
381  {
382  if (attr_content(attr, "name", &element_set_name))
383  ;
384  else
385  {
386  dom_log(YLOG_WARN, tinfo, node,
387  "bad attribute @%s, expected @name", attr->name);
388  }
389  }
390  if (element_set_name)
391  {
392  WRBUF result = wrbuf_alloc();
393  WRBUF addinfo = wrbuf_alloc();
394  const Odr_oid *input_format = yaz_oid_recsyn_xml;
395  const Odr_oid *output_format = 0;
396  int ret;
397 
398  ret = retctr->special_fetch(retctr->handle,
399  element_set_name,
400  input_format, &output_format,
401  result, addinfo);
402  if (ret == 0)
403  {
404  xmlDocPtr sub_doc =
405  xmlParseMemory(wrbuf_buf(result), wrbuf_len(result));
406  if (sub_doc)
407  {
408  xmlNodePtr t = xmlDocGetRootElement(sub_doc);
409  xmlReplaceNode(node, xmlCopyNode(t, 1));
410  xmlFreeDoc(sub_doc);
411  }
412  }
413  wrbuf_destroy(result);
414  wrbuf_destroy(addinfo);
415  }
416  }
417  }
418  for (node = node->children; node; node = node->next)
419  process_meta(tinfo, doc, node, retctr);
420  return 0;
421 }
422 
423 static ZEBRA_RES perform_convert(struct filter_info *tinfo,
424  struct recExtractCtrl *extctr,
425  struct recRetrieveCtrl *retctr,
426  struct convert_s *convert,
427  const char **params,
428  xmlDocPtr *doc,
429  xsltStylesheetPtr *last_xsp)
430 {
431  for (; convert; convert = convert->next)
432  {
433  if (convert->which == convert_xslt_type)
434  {
435  xmlChar *buf_out = 0;
436  int len_out = 0;
437  xmlDocPtr res_doc = xsltApplyStylesheet(convert->u.xslt.stylesheet_xsp,
438  *doc, params);
439  if (last_xsp)
440  *last_xsp = convert->u.xslt.stylesheet_xsp;
441 
442  if (!res_doc)
443  break;
444 
445  /* now saving into buffer and re-reading into DOM to avoid annoing
446  XSLT problem with thrown-out indentation text nodes */
447  xsltSaveResultToString(&buf_out, &len_out, res_doc,
448  convert->u.xslt.stylesheet_xsp);
449  xmlFreeDoc(res_doc);
450 
451  xmlFreeDoc(*doc);
452 
453  *doc = xmlParseMemory((const char *) buf_out, len_out);
454 
455  /* writing debug info out */
456  if (extctr && extctr->flagShowRecords)
457  yaz_log(YLOG_LOG, "%s: XSLT %s\n %.*s",
458  tinfo->fname ? tinfo->fname : "(none)",
459  convert->u.xslt.stylesheet,
460  len_out, buf_out);
461 
462  xmlFree(buf_out);
463  }
464  else if (convert->which == convert_meta_type)
465  {
466  if (retctr) /* only execute meta on retrieval */
467  {
468  process_meta(tinfo, *doc, xmlDocGetRootElement(*doc), retctr);
469 
470  /* last stylesheet absent */
471  if (last_xsp)
472  *last_xsp = 0;
473  }
474  }
475  }
476  return ZEBRA_OK;
477 }
478 
479 static struct filter_input *new_input(struct filter_info *tinfo, int type)
480 {
481  struct filter_input *p;
482  struct filter_input **np = &tinfo->input_list;
483  for (;*np; np = &(*np)->next)
484  ;
485  p = *np = nmem_malloc(tinfo->nmem_config, sizeof(*p));
486  p->next = 0;
487  p->syntax = 0;
488  p->name = 0;
489  p->convert = 0;
490  p->type = type;
491  return p;
492 }
493 
494 static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr,
495  const char *syntax, const char *name)
496 {
497  FOR_EACH_ELEMENT(ptr) {
498  if (!XML_STRCMP(ptr->name, "marc"))
499  {
500  yaz_iconv_t iconv = 0;
501  const char *input_charset = "marc-8";
502  struct _xmlAttr *attr;
503 
504  for (attr = ptr->properties; attr; attr = attr->next)
505  {
506  if (attr_content(attr, "inputcharset", &input_charset))
507  ;
508  else
509  {
510  dom_log(YLOG_WARN, tinfo, ptr,
511  "bad attribute @%s, expected @inputcharset",
512  attr->name);
513  }
514  }
515  iconv = yaz_iconv_open("utf-8", input_charset);
516  if (!iconv)
517  {
518  dom_log(YLOG_WARN, tinfo, ptr,
519  "unsupported @charset '%s'", input_charset);
520  return ZEBRA_FAIL;
521  }
522  else
523  {
524  struct filter_input *p
525  = new_input(tinfo, DOM_INPUT_MARC);
526  p->u.marc.handle = yaz_marc_create();
527  p->u.marc.iconv = iconv;
528 
529  yaz_marc_iconv(p->u.marc.handle, p->u.marc.iconv);
530 
531  ptr = ptr->next;
532 
533  parse_convert(tinfo, ptr, &p->convert);
534  }
535  break;
536 
537  }
538  else if (!XML_STRCMP(ptr->name, "xmlreader"))
539  {
540  struct filter_input *p
541  = new_input(tinfo, DOM_INPUT_XMLREADER);
542  struct _xmlAttr *attr;
543  const char *level_str = 0;
544 
545  p->u.xmlreader.split_level = 0;
546  p->u.xmlreader.reader = 0;
547 
548  for (attr = ptr->properties; attr; attr = attr->next)
549  {
550  if (attr_content(attr, "level", &level_str))
551  ;
552  else
553  {
554  dom_log(YLOG_WARN, tinfo, ptr,
555  "bad attribute @%s, expected @level",
556  attr->name);
557  }
558  }
559  if (level_str)
560  p->u.xmlreader.split_level = atoi(level_str);
561 
562  ptr = ptr->next;
563 
564  parse_convert(tinfo, ptr, &p->convert);
565  break;
566  }
567  else
568  {
569  dom_log(YLOG_WARN, tinfo, ptr,
570  "bad element <%s>, expected <marc>|<xmlreader>",
571  ptr->name);
572  return ZEBRA_FAIL;
573  }
574  }
575  return ZEBRA_OK;
576 }
577 
578 static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
579 {
580  char tmp_full_name[1024];
581  xmlNodePtr ptr;
582  xmlDocPtr doc;
583 
584  tinfo->fname = nmem_strdup(tinfo->nmem_config, fname);
585 
586  if (yaz_filepath_resolve(tinfo->fname, tinfo->profile_path,
587  NULL, tmp_full_name))
588  tinfo->full_name = nmem_strdup(tinfo->nmem_config, tmp_full_name);
589  else
590  tinfo->full_name = nmem_strdup(tinfo->nmem_config, tinfo->fname);
591 
592  yaz_log(YLOG_LOG, "%s dom filter: "
593  "loading config file %s", tinfo->fname, tinfo->full_name);
594 
595  doc = xmlParseFile(tinfo->full_name);
596  if (!doc)
597  {
598  yaz_log(YLOG_WARN, "%s: dom filter: "
599  "failed to parse config file %s",
600  tinfo->fname, tinfo->full_name);
601  return ZEBRA_FAIL;
602  }
603  /* save because we store ptrs to the content */
604  tinfo->doc_config = doc;
605 
606  ptr = xmlDocGetRootElement(doc);
607  if (!ptr || ptr->type != XML_ELEMENT_NODE
608  || XML_STRCMP(ptr->name, "dom"))
609  {
610  dom_log(YLOG_WARN, tinfo, ptr,
611  "bad root element <%s>, expected root element <dom>",
612  ptr->name);
613  return ZEBRA_FAIL;
614  }
615 
616  ptr = ptr->children;
617  FOR_EACH_ELEMENT(ptr) {
618  if (!XML_STRCMP(ptr->name, "extract"))
619  {
620  /*
621  <extract name="index">
622  <xslt stylesheet="first.xsl"/>
623  <xslt stylesheet="second.xsl"/>
624  </extract>
625  */
626  struct _xmlAttr *attr;
627  struct filter_extract *f =
628  nmem_malloc(tinfo->nmem_config, sizeof(*f));
629 
630  tinfo->extract = f;
631  f->name = 0;
632  f->convert = 0;
633  for (attr = ptr->properties; attr; attr = attr->next)
634  {
635  if (attr_content(attr, "name", &f->name))
636  ;
637  else
638  {
639  dom_log(YLOG_WARN, tinfo, ptr,
640  "bad attribute @%s, expected @name",
641  attr->name);
642  }
643  }
644  parse_convert(tinfo, ptr->children, &f->convert);
645  }
646  else if (!XML_STRCMP(ptr->name, "retrieve"))
647  {
648  /*
649  <retrieve name="F">
650  <xslt stylesheet="some.xsl"/>
651  <xslt stylesheet="some.xsl"/>
652  </retrieve>
653  */
654  struct _xmlAttr *attr;
655  struct filter_retrieve **fp = &tinfo->retrieve_list;
656  struct filter_retrieve *f =
657  nmem_malloc(tinfo->nmem_config, sizeof(*f));
658 
659  while (*fp)
660  fp = &(*fp)->next;
661 
662  *fp = f;
663  f->name = 0;
664  f->identifier = 0;
665  f->convert = 0;
666  f->next = 0;
667 
668  for (attr = ptr->properties; attr; attr = attr->next)
669  {
670  if (attr_content(attr, "identifier",
671  &f->identifier))
672  ;
673  else if (attr_content(attr, "name", &f->name))
674  ;
675  else
676  {
677  dom_log(YLOG_WARN, tinfo, ptr,
678  "bad attribute @%s, expected @identifier|@name",
679  attr->name);
680  }
681  }
682  parse_convert(tinfo, ptr->children, &f->convert);
683  }
684  else if (!XML_STRCMP(ptr->name, "store"))
685  {
686  /*
687  <store name="F">
688  <xslt stylesheet="some.xsl"/>
689  <xslt stylesheet="some.xsl"/>
690  </retrieve>
691  */
692  struct filter_store *f =
693  nmem_malloc(tinfo->nmem_config, sizeof(*f));
694 
695  tinfo->store = f;
696  f->convert = 0;
697  parse_convert(tinfo, ptr->children, &f->convert);
698  }
699  else if (!XML_STRCMP(ptr->name, "input"))
700  {
701  /*
702  <input syntax="xml">
703  <xmlreader level="1"/>
704  </input>
705  <input syntax="usmarc">
706  <marc inputcharset="marc-8"/>
707  </input>
708  */
709  struct _xmlAttr *attr;
710  const char *syntax = 0;
711  const char *name = 0;
712  for (attr = ptr->properties; attr; attr = attr->next)
713  {
714  if (attr_content(attr, "syntax", &syntax))
715  ;
716  else if (attr_content(attr, "name", &name))
717  ;
718  else
719  {
720  dom_log(YLOG_WARN, tinfo, ptr,
721  "bad attribute @%s, expected @syntax|@name",
722  attr->name);
723  }
724  }
725  parse_input(tinfo, ptr->children, syntax, name);
726  }
727  else
728  {
729  dom_log(YLOG_WARN, tinfo, ptr,
730  "bad element <%s>, "
731  "expected <extract>|<input>|<retrieve>|<store>",
732  ptr->name);
733  return ZEBRA_FAIL;
734  }
735  }
736  if (!tinfo->input_list)
737  {
738  struct filter_input *p
739  = new_input(tinfo, DOM_INPUT_XMLREADER);
740  p->u.xmlreader.split_level = 0;
741  p->u.xmlreader.reader = 0;
742  }
743  return ZEBRA_OK;
744 }
745 
746 static struct filter_retrieve *lookup_retrieve(struct filter_info *tinfo,
747  const char *est)
748 {
749  struct filter_retrieve *f = tinfo->retrieve_list;
750 
751  /* return first schema if no est is provided */
752  if (!est)
753  return f;
754  for (; f; f = f->next)
755  {
756  /* find requested schema */
757  if (est)
758  {
759  if (f->identifier && !strcmp(f->identifier, est))
760  return f;
761  if (f->name && !strcmp(f->name, est))
762  return f;
763  }
764  }
765  return 0;
766 }
767 
768 static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
769 {
770  struct filter_info *tinfo = clientData;
771  if (!args || !*args)
772  {
773  yaz_log(YLOG_WARN, "dom filter: need config file");
774  return ZEBRA_FAIL;
775  }
776 
777  if (tinfo->fname && !strcmp(args, tinfo->fname))
778  return ZEBRA_OK;
779 
780  tinfo->profile_path = res_get(res, "profilePath");
781 
782  destroy_dom(tinfo);
783  return parse_dom(tinfo, args);
784 }
785 
786 static void filter_destroy(void *clientData)
787 {
788  struct filter_info *tinfo = clientData;
789  destroy_dom(tinfo);
790  nmem_destroy(tinfo->nmem_config);
791  nmem_destroy(tinfo->nmem_record);
792  xfree(tinfo);
793 }
794 
795 static int ioread_ex(void *context, char *buffer, int len)
796 {
797  struct recExtractCtrl *p = context;
798  return p->stream->readf(p->stream, buffer, len);
799 }
800 
/* Close callback for the XML reader: does nothing and reports success (0). */
static int ioclose_ex(void *context)
{
    (void) context;
    return 0;
}
805 
806 
807 
808 /* DOM filter style indexing */
809 static void index_value_of(struct filter_info *tinfo,
810  struct recExtractCtrl *extctr,
811  RecWord* recword,
812  xmlNodePtr node,
813  const char *index_p)
814 {
815  if (tinfo->record_info_invoked == 1)
816  {
817  xmlChar *text = xmlNodeGetContent(node);
818  size_t text_len = strlen((const char *)text);
819 
820  /* if there is no text, we do not need to proceed */
821  if (text_len)
822  {
823  /* keep seqno base so that all text will have
824  identical seqno's for multiple fields , e.g
825  <z:index name="title:w any:w title:p">.. */
826 
827  zint seqno_base = recword->seqno;
828  zint seqno_max = recword->seqno;
829 
830 
831  const char *look = index_p;
832  const char *bval;
833  const char *eval;
834 
835  xmlChar index[256];
836  xmlChar type[256];
837 
838  /* assingning text to be indexed */
839  recword->term_buf = (const char *)text;
840  recword->term_len = text_len;
841 
842  /* parsing all index name/type pairs */
843  /* may not start with ' ' or ':' */
844  while (*look && ' ' != *look && ':' != *look)
845  {
846  /* setting name and type to zero */
847  *index = '\0';
848  *type = '\0';
849 
850  /* parsing one index name */
851  bval = look;
852  while (*look && ':' != *look && ' ' != *look)
853  {
854  look++;
855  }
856  eval = look;
857  strncpy((char *)index, (const char *)bval, eval - bval);
858  index[eval - bval] = '\0';
859 
860 
861  /* parsing one index type, if existing */
862  if (':' == *look)
863  {
864  look++;
865 
866  bval = look;
867  while (*look && ' ' != *look)
868  {
869  look++;
870  }
871  eval = look;
872  strncpy((char *)type, (const char *)bval, eval - bval);
873  type[eval - bval] = '\0';
874  }
875 
876  /* actually indexing the text given */
877 
878  recword->seqno = seqno_base;
879  recword->index_name = (const char *)index;
880  if (*type)
881  recword->index_type = (const char *) type;
882 
883  /* writing debug out */
884  if (extctr->flagShowRecords)
885  dom_log(YLOG_LOG, tinfo, 0,
886  "INDEX '%s:%s' '%s'",
887  (const char *) index,
888  (const char *) type,
889  (const char *) text);
890 
891  (extctr->tokenAdd)(recword);
892 
893  if (seqno_max < recword->seqno)
894  seqno_max = recword->seqno;
895 
896  /* eat whitespaces */
897  if (*look && ' ' == *look)
898  {
899  look++;
900  }
901  }
902  recword->seqno = seqno_max;
903  }
904  xmlFree(text);
905  }
906 }
907 
908 
909 /* DOM filter style indexing */
910 static void set_record_info(struct filter_info *tinfo,
911  struct recExtractCtrl *extctr,
912  xmlNodePtr node,
913  const char * id_p,
914  const char * rank_p,
915  const char * type_p)
916 {
917  /* writing debug info out */
918  if (extctr && extctr->flagShowRecords)
919  dom_log(YLOG_LOG, tinfo, node,
920  "RECORD id=%s rank=%s type=%s",
921  id_p ? (const char *) id_p : "(null)",
922  rank_p ? (const char *) rank_p : "(null)",
923  type_p ? (const char *) type_p : "(null)");
924 
925 
926  if (id_p && *id_p)
927  {
928  size_t l = strlen(id_p);
929  if (l >= sizeof(extctr->match_criteria))
930  l = sizeof(extctr->match_criteria)-1;
931  memcpy(extctr->match_criteria, id_p, l);
932  extctr->match_criteria[l] = '\0';
933  }
934 
935  if (rank_p && *rank_p)
936  extctr->staticrank = atozint((const char *)rank_p);
937 
938  if (type_p && *type_p)
939  {
941  if (!strcmp(type_p, "insert"))
943  else if (!strcmp(type_p, "delete"))
945  else if (!strcmp(type_p, "replace"))
947  else if (!strcmp(type_p, "update"))
949  else if (!strcmp(type_p, "adelete"))
951  else
952  dom_log(YLOG_WARN, tinfo, node, "bad @type value: %s", type_p);
953  extctr->action = action;
954  }
955 
956  if (tinfo->record_info_invoked == 1)
957  {
958  /* warn about multiple only once */
959  dom_log(YLOG_WARN, tinfo, node, "multiple record elements");
960  }
961  tinfo->record_info_invoked++;
962 
963 }
964 
965 
966 static void process_xml_element_node(struct filter_info *tinfo,
967  struct recExtractCtrl *extctr,
968  RecWord* recword,
969  xmlNodePtr node);
970 
971 /* DOM filter style indexing */
972 static void process_xml_element_zebra_node(struct filter_info *tinfo,
973  struct recExtractCtrl *extctr,
974  RecWord* recword,
975  xmlNodePtr node)
976 {
977  if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href
978  && 0 == XML_STRCMP(node->ns->href, zebra_dom_ns))
979  {
980  if (0 == XML_STRCMP(node->name, "index"))
981  {
982  const char *index_p = 0;
983 
984  struct _xmlAttr *attr;
985  for (attr = node->properties; attr; attr = attr->next)
986  {
987  if (attr_content(attr, "name", &index_p))
988  {
989  index_value_of(tinfo, extctr, recword, node, index_p);
990  }
991  else
992  {
993  dom_log(YLOG_WARN, tinfo, node,
994  "bad attribute @%s, expected @name",
995  attr->name);
996  }
997  }
998  }
999  else if (0 == XML_STRCMP(node->name, "group"))
1000  {
1001  const char *unit_p = "element";
1002 
1003  struct _xmlAttr *attr;
1004  for (attr = node->properties; attr; attr = attr->next)
1005  {
1006  if (attr_content(attr, "unit", &unit_p))
1007  ;
1008  else
1009  {
1010  dom_log(YLOG_WARN, tinfo, node,
1011  "bad attribute @%s, expected @unit",
1012  attr->name);
1013  }
1014  }
1015  if (node->children)
1016  {
1017  WRBUF w = wrbuf_alloc();
1018  wrbuf_puts(w, ZEBRA_GROUP_INDEX_NAME);
1019  wrbuf_puts(w, unit_p);
1020  recword->term_buf = "begin";
1021  recword->term_len = 5;
1022  recword->index_name = wrbuf_cstr(w);
1023  recword->index_type = "0";
1024 
1025  if (extctr->flagShowRecords)
1026  dom_log(YLOG_LOG, tinfo, 0,
1027  "INDEX '%s:%s' '%s'",
1028  (const char *) recword->index_name,
1029  (const char *) recword->index_type,
1030  (const char *) recword->term_buf);
1031  (extctr->tokenAdd)(recword);
1032 
1033  for (node = node->children; node; node = node->next)
1034  process_xml_element_node(tinfo, extctr, recword,
1035  node);
1036  recword->term_buf = "end";
1037  recword->term_len = 3;
1038  recword->index_name = wrbuf_cstr(w);
1039  recword->index_type = "0";
1040  if (extctr->flagShowRecords)
1041  dom_log(YLOG_LOG, tinfo, 0,
1042  "INDEX '%s:%s' '%s'",
1043  (const char *) recword->index_name,
1044  (const char *) recword->index_type,
1045  (const char *) recword->term_buf);
1046  (extctr->tokenAdd)(recword);
1047  wrbuf_destroy(w);
1048  }
1049  }
1050  else if (0 == XML_STRCMP(node->name, "record"))
1051  {
1052  const char *id_p = 0;
1053  const char *rank_p = 0;
1054  const char *type_p = 0;
1055 
1056  struct _xmlAttr *attr;
1057  for (attr = node->properties; attr; attr = attr->next)
1058  {
1059  if (attr_content(attr, "id", &id_p))
1060  ;
1061  else if (attr_content(attr, "rank", &rank_p))
1062  ;
1063  else if (attr_content(attr, "type", &type_p))
1064  ;
1065  else
1066  {
1067  dom_log(YLOG_WARN, tinfo, node,
1068  "bad attribute @%s, expected @id|@rank|@type",
1069  attr->name);
1070  }
1071  }
1072  set_record_info(tinfo, extctr, node, id_p, rank_p, type_p);
1073  }
1074  else
1075  {
1076  dom_log(YLOG_WARN, tinfo, node,
1077  "bad element <%s>,"
1078  " expected <record>|<index> in namespace '%s'",
1079  node->name, zebra_dom_ns);
1080  }
1081  }
1082 }
1083 
/* Parses one "name=value" item at *c_ptr.

   When the text at *c_ptr starts with name immediately followed by '=',
   the characters up to the next blank or end of string are copied into
   value (at most value_max-1 of them, always NUL terminated), *c_ptr is
   advanced past the whole value and 1 is returned. Otherwise 0 is returned
   and neither *c_ptr nor value is changed. */
static int attr_content_pi(const char **c_ptr, const char *name,
                           char *value, size_t value_max)
{
    const size_t key_len = strlen(name);
    const char *cur = *c_ptr;
    size_t used = 0;

    /* need at least "name=" to be present */
    if (strlen(cur) <= key_len)
        return 0;
    if (cur[key_len] != '=' || memcmp(cur, name, key_len) != 0)
        return 0;

    /* consume the whole value, storing only what fits */
    for (cur += key_len + 1; *cur != '\0' && *cur != ' '; cur++)
    {
        if (used < value_max-1)
            value[used++] = *cur;
    }
    value[used] = '\0';

    *c_ptr = cur;
    return 1;
}
1110 
1111 /* DOM filter style indexing */
1112 static void process_xml_pi_node(struct filter_info *tinfo,
1113  struct recExtractCtrl *extctr,
1114  xmlNodePtr node,
1115  const char **index_pp)
1116 {
1117  /* if right PI name, continue parsing PI */
1118  if (0 == strcmp(zebra_pi_name, (const char *)node->name))
1119  {
1120  xmlChar *pi_p = node->content;
1121  const char *look = (const char *) node->content;
1122 
1123  /* parsing PI record instructions */
1124  if (0 == strncmp((const char *)look, "record", 6))
1125  {
1126  char id[256];
1127  char rank[256];
1128  char type[256];
1129 
1130  *id = '\0';
1131  *rank = '\0';
1132  *type = '\0';
1133  look += 6;
1134  for (;;)
1135  {
1136  /* eat whitespace */
1137  while (' ' == *look)
1138  look++;
1139  if (*look == '\0')
1140  break;
1141  if (attr_content_pi(&look, "id", id, sizeof(id)))
1142  ;
1143  else if (attr_content_pi(&look, "rank", rank, sizeof(rank)))
1144  ;
1145  else if (attr_content_pi(&look, "type", type, sizeof(type)))
1146  ;
1147  else
1148  {
1149  dom_log(YLOG_WARN, tinfo, node,
1150  "content '%s', can not parse '%s'",
1151  pi_p, look);
1152  break;
1153  }
1154  }
1155  set_record_info(tinfo, extctr, node, id, rank, type);
1156  }
1157  /* parsing index instruction */
1158  else if (0 == strncmp((const char *)look, "index", 5))
1159  {
1160  look += 5;
1161 
1162  /* eat whitespace */
1163  while (*look && ' ' == *look)
1164  look++;
1165 
1166  /* export index instructions to outside */
1167  *index_pp = look;
1168  }
1169  else
1170  {
1171  dom_log(YLOG_WARN, tinfo, node,
1172  "content '%s', can not parse '%s'",
1173  pi_p, look);
1174  }
1175  }
1176 }
1177 
1178 /* DOM filter style indexing */
1179 static void process_xml_element_node(struct filter_info *tinfo,
1180  struct recExtractCtrl *extctr,
1181  RecWord* recword,
1182  xmlNodePtr node)
1183 {
1184  /* remember indexing instruction from PI to next element node */
1185  const char *index_p = 0;
1186 
1187  /* check if we are an element node in the special zebra namespace
1188  and either set record data or index value-of node content*/
1189  process_xml_element_zebra_node(tinfo, extctr, recword, node);
1190 
1191  /* loop through kid nodes */
1192  for (node = node->children; node; node = node->next)
1193  {
1194  /* check and set PI record and index index instructions */
1195  if (node->type == XML_PI_NODE)
1196  {
1197  process_xml_pi_node(tinfo, extctr, node, &index_p);
1198  }
1199  else if (node->type == XML_ELEMENT_NODE)
1200  {
1201  /* if there was a PI index instruction before this element */
1202  if (index_p)
1203  {
1204  index_value_of(tinfo, extctr, recword, node, index_p);
1205  index_p = 0;
1206  }
1207  process_xml_element_node(tinfo, extctr, recword,node);
1208  }
1209  else
1210  continue;
1211  }
1212 }
1213 
1214 
1215 /* DOM filter style indexing */
1216 static void extract_dom_doc_node(struct filter_info *tinfo,
1217  struct recExtractCtrl *extctr,
1218  xmlDocPtr doc)
1219 {
1220  /* only need to do the initialization once, reuse recword for all terms */
1221  RecWord recword;
1222  (*extctr->init)(extctr, &recword);
1223 
1224  process_xml_element_node(tinfo, extctr, &recword, (xmlNodePtr)doc);
1225 }
1226 
1227 
1228 static int convert_extract_doc(struct filter_info *tinfo,
1229  struct filter_input *input,
1230  struct recExtractCtrl *p,
1231  xmlDocPtr doc)
1232 {
1233  xmlChar *buf_out;
1234  int len_out;
1235  const char *params[10];
1236  xsltStylesheetPtr last_xsp = 0;
1237 
1238  /* per default do not ingest record */
1239  tinfo->record_info_invoked = 0;
1240 
1241  /* exit if empty document given */
1242  if (!doc)
1243  return RECCTRL_EXTRACT_SKIP;
1244 
1245  /* we actuallu have a document which needs to be processed further */
1246  params[0] = 0;
1247  set_param_str(params, "schema", zebra_dom_ns, tinfo->nmem_record);
1248 
1249  if (p && p->flagShowRecords)
1250  {
1251  xmlChar *buf_out;
1252  int len_out;
1253  xmlDocDumpMemory(doc, &buf_out, &len_out);
1254 #if 0
1255  FILE *outf = fopen("extract.xml", "w");
1256  fwrite(buf_out, 1, len_out, outf);
1257  fclose(outf);
1258 #endif
1259  yaz_log(YLOG_LOG, "Extract Doc: %.*s", len_out, buf_out);
1260  }
1261 
1262  if (p->setStoreData)
1263  {
1264  xmlDocPtr store_doc = 0;
1265 
1266  /* input conversion */
1267  perform_convert(tinfo, p, 0, input->convert, params, &doc, 0);
1268 
1269  if (tinfo->store)
1270  {
1271  /* store conversion */
1272  store_doc = xmlCopyDoc(doc, 1);
1273  perform_convert(tinfo, p, 0, tinfo->store->convert,
1274  params, &store_doc, &last_xsp);
1275  }
1276 
1277  /* saving either store doc or original doc in case no store doc exists */
1278  if (last_xsp)
1279  xsltSaveResultToString(&buf_out, &len_out,
1280  store_doc ? store_doc : doc, last_xsp);
1281  else
1282  xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1283 
1284  if (p->setStoreData)
1285  (*p->setStoreData)(p, buf_out, len_out);
1286  xmlFree(buf_out);
1287  if (store_doc)
1288  xmlFreeDoc(store_doc);
1289  }
1290 
1291 
1292  /* extract conversion */
1293  perform_convert(tinfo, p, 0, tinfo->extract->convert, params, &doc, 0);
1294 
1295 
1296  /* finally, do the indexing */
1297  if (doc){
1298  extract_dom_doc_node(tinfo, p, doc);
1299  xmlFreeDoc(doc);
1300  }
1301 
1302  /* there was nothing to index, so there is no inserted/updated record */
1303  if (tinfo->record_info_invoked == 0)
1304  return RECCTRL_EXTRACT_SKIP;
1305 
1306  return RECCTRL_EXTRACT_OK;
1307 }
1308 
1309 static int extract_xml_split(struct filter_info *tinfo,
1310  struct filter_input *input,
1311  struct recExtractCtrl *p)
1312 {
1313  int ret;
1314 
1315  if (p->first_record)
1316  {
1317  if (input->u.xmlreader.reader)
1318  xmlFreeTextReader(input->u.xmlreader.reader);
1319  input->u.xmlreader.reader = xmlReaderForIO(ioread_ex, ioclose_ex,
1320  p /* I/O handler */,
1321  0 /* URL */,
1322  0 /* encoding */,
1323  XML_PARSE_XINCLUDE
1324  | XML_PARSE_NOENT
1325  | XML_PARSE_NONET);
1326  }
1327  if (!input->u.xmlreader.reader)
1329 
1330  ret = xmlTextReaderRead(input->u.xmlreader.reader);
1331  while (ret == 1)
1332  {
1333  int type = xmlTextReaderNodeType(input->u.xmlreader.reader);
1334  int depth = xmlTextReaderDepth(input->u.xmlreader.reader);
1335 
1336  if (type == XML_READER_TYPE_ELEMENT &&
1337  input->u.xmlreader.split_level == depth)
1338  {
1339  xmlNodePtr ptr;
1340 
1341  /* per default do not ingest record */
1342  tinfo->record_info_invoked = 0;
1343 
1344  ptr = xmlTextReaderExpand(input->u.xmlreader.reader);
1345  if (ptr)
1346  {
1347  /* we have a new document */
1348 
1349  xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1350  xmlDocPtr doc = xmlNewDoc((const xmlChar*) "1.0");
1351 
1352  xmlDocSetRootElement(doc, ptr2);
1353 
1354  /* writing debug info out */
1355  if (p->flagShowRecords)
1356  {
1357  xmlChar *buf_out = 0;
1358  int len_out = 0;
1359  xmlDocDumpMemory(doc, &buf_out, &len_out);
1360  yaz_log(YLOG_LOG, "%s: XMLREADER level: %i\n%.*s",
1361  tinfo->fname ? tinfo->fname : "(none)",
1362  depth, len_out, buf_out);
1363  xmlFree(buf_out);
1364  }
1365 
1366  return convert_extract_doc(tinfo, input, p, doc);
1367  }
1368  else
1369  {
1370  xmlFreeTextReader(input->u.xmlreader.reader);
1371  input->u.xmlreader.reader = 0;
1373  }
1374  }
1375  ret = xmlTextReaderRead(input->u.xmlreader.reader);
1376  }
1377  xmlFreeTextReader(input->u.xmlreader.reader);
1378  input->u.xmlreader.reader = 0;
1379  return RECCTRL_EXTRACT_EOF;
1380 }
1381 
1382 static int extract_xml_full(struct filter_info *tinfo,
1383  struct filter_input *input,
1384  struct recExtractCtrl *p)
1385 {
1386  if (p->first_record) /* only one record per stream */
1387  {
1388  xmlDocPtr doc = xmlReadIO(ioread_ex, ioclose_ex,
1389  p /* I/O handler */,
1390  0 /* URL */,
1391  0 /* encoding */,
1392  XML_PARSE_XINCLUDE
1393  | XML_PARSE_NOENT
1394  | XML_PARSE_NONET);
1395  if (!doc)
1396  {
1398  }
1399  return convert_extract_doc(tinfo, input, p, doc);
1400  }
1401  else
1402  return RECCTRL_EXTRACT_EOF;
1403 }
1404 
1405 static int extract_iso2709(struct filter_info *tinfo,
1406  struct filter_input *input,
1407  struct recExtractCtrl *p)
1408 {
1409  char buf[100000];
1410  int record_length;
1411  int read_bytes, r;
1412 
1413  if (p->stream->readf(p->stream, buf, 5) != 5)
1414  return RECCTRL_EXTRACT_EOF;
1415  while (*buf < '0' || *buf > '9')
1416  {
1417  int i;
1418 
1419  dom_log(YLOG_WARN, tinfo, 0,
1420  "MARC: Skipping bad byte %d (0x%02X)",
1421  *buf & 0xff, *buf & 0xff);
1422  for (i = 0; i < 4; i++)
1423  buf[i] = buf[i+1];
1424 
1425  if (p->stream->readf(p->stream, buf+4, 1) != 1)
1426  return RECCTRL_EXTRACT_EOF;
1427  }
1428  record_length = atoi_n (buf, 5);
1429  if (record_length < 25)
1430  {
1431  dom_log(YLOG_WARN, tinfo, 0,
1432  "MARC record length < 25, is %d", record_length);
1434  }
1435  read_bytes = p->stream->readf(p->stream, buf+5, record_length-5);
1436  if (read_bytes < record_length-5)
1437  {
1438  dom_log(YLOG_WARN, tinfo, 0,
1439  "couldn't read whole MARC record");
1441  }
1442  r = yaz_marc_read_iso2709(input->u.marc.handle, buf, record_length);
1443  if (r < record_length)
1444  {
1445  dom_log (YLOG_WARN, tinfo, 0,
1446  "parsing of MARC record failed r=%d length=%d",
1447  r, record_length);
1449  }
1450  else
1451  {
1452  xmlDocPtr rdoc;
1453  xmlNode *root_ptr;
1454  yaz_marc_write_xml(input->u.marc.handle, &root_ptr,
1455  "http://www.loc.gov/MARC21/slim", 0, 0);
1456  rdoc = xmlNewDoc((const xmlChar*) "1.0");
1457  xmlDocSetRootElement(rdoc, root_ptr);
1458  return convert_extract_doc(tinfo, input, p, rdoc);
1459  }
1460  return RECCTRL_EXTRACT_OK;
1461 }
1462 
1463 static int filter_extract(void *clientData, struct recExtractCtrl *p)
1464 {
1465  struct filter_info *tinfo = clientData;
1466  struct filter_input *input = tinfo->input_list;
1467 
1468  if (!input)
1470 
1471  nmem_reset(tinfo->nmem_record);
1472 
1473  if (p->setStoreData == 0)
1474  return extract_xml_full(tinfo, input, p);
1475  switch(input->type)
1476  {
1477  case DOM_INPUT_XMLREADER:
1478  if (input->u.xmlreader.split_level == 0)
1479  return extract_xml_full(tinfo, input, p);
1480  else
1481  return extract_xml_split(tinfo, input, p);
1482  break;
1483  case DOM_INPUT_MARC:
1484  return extract_iso2709(tinfo, input, p);
1485  }
1487 }
1488 
1489 static int ioread_ret(void *context, char *buffer, int len)
1490 {
1491  struct recRetrieveCtrl *p = context;
1492  int r = p->stream->readf(p->stream, buffer, len);
1493  return r;
1494 }
1495 
/* Close callback for xmlReadIO during retrieval: nothing is closed here,
   the call always reports success. */
static int ioclose_ret(void *context)
{
    (void) context;
    return 0;
}
1500 
1501 static int filter_retrieve(void *clientData, struct recRetrieveCtrl *p)
1502 {
1503  /* const char *esn = zebra_dom_ns; */
1504  const char *esn = 0;
1505  const char *params[32];
1506  struct filter_info *tinfo = clientData;
1507  xmlDocPtr doc;
1508  struct filter_retrieve *retrieve;
1509  xsltStylesheetPtr last_xsp = 0;
1510 
1511  if (p->comp)
1512  {
1513  if (p->comp->which == Z_RecordComp_simple
1514  && p->comp->u.simple->which == Z_ElementSetNames_generic)
1515  {
1516  esn = p->comp->u.simple->u.generic;
1517  }
1518  else if (p->comp->which == Z_RecordComp_complex
1519  && p->comp->u.complex->generic->elementSpec
1520  && p->comp->u.complex->generic->elementSpec->which ==
1521  Z_ElementSpec_elementSetName)
1522  {
1523  esn = p->comp->u.complex->generic->elementSpec->u.elementSetName;
1524  }
1525  }
1526  retrieve = lookup_retrieve(tinfo, esn);
1527  if (!retrieve)
1528  {
1529  p->diagnostic =
1530  YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1531  p->addinfo = odr_strdup_null(p->odr, esn);
1532  return 0;
1533  }
1534 
1535  params[0] = 0;
1536  set_param_int(params, "id", p->localno, p->odr->mem);
1537  if (p->fname)
1538  set_param_str(params, "filename", p->fname, p->odr->mem);
1539  if (p->staticrank >= 0)
1540  set_param_int(params, "rank", p->staticrank, p->odr->mem);
1541 
1542  if (esn)
1543  set_param_str(params, "schema", esn, p->odr->mem);
1544  else
1545  if (retrieve->name)
1546  set_param_str(params, "schema", retrieve->name, p->odr->mem);
1547  else if (retrieve->identifier)
1548  set_param_str(params, "schema", retrieve->identifier, p->odr->mem);
1549  else
1550  set_param_str(params, "schema", "", p->odr->mem);
1551 
1552  if (p->score >= 0)
1553  set_param_int(params, "score", p->score, p->odr->mem);
1554  set_param_int(params, "size", p->recordSize, p->odr->mem);
1555 
1556  doc = xmlReadIO(ioread_ret, ioclose_ret, p /* I/O handler */,
1557  0 /* URL */,
1558  0 /* encoding */,
1559  XML_PARSE_XINCLUDE | XML_PARSE_NOENT | XML_PARSE_NONET);
1560  if (!doc)
1561  {
1562  p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1563  return 0;
1564  }
1565 
1566  /* retrieve conversion */
1567  perform_convert(tinfo, 0, p, retrieve->convert, params, &doc, &last_xsp);
1568  if (!doc)
1569  {
1570  p->diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1571  }
1572  else if (!p->input_format
1573  || !oid_oidcmp(p->input_format, yaz_oid_recsyn_xml))
1574  {
1575  xmlChar *buf_out;
1576  int len_out;
1577 
1578  if (last_xsp)
1579  xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1580  else
1581  xmlDocDumpMemory(doc, &buf_out, &len_out);
1582 
1583  p->output_format = yaz_oid_recsyn_xml;
1584  p->rec_len = len_out;
1585  p->rec_buf = odr_malloc(p->odr, p->rec_len);
1586  memcpy(p->rec_buf, buf_out, p->rec_len);
1587  xmlFree(buf_out);
1588  }
1589  else if (!oid_oidcmp(p->output_format, yaz_oid_recsyn_sutrs))
1590  {
1591  xmlChar *buf_out;
1592  int len_out;
1593 
1594  if (last_xsp)
1595  xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1596  else
1597  xmlDocDumpMemory(doc, &buf_out, &len_out);
1598 
1599  p->output_format = yaz_oid_recsyn_sutrs;
1600  p->rec_len = len_out;
1601  p->rec_buf = odr_malloc(p->odr, p->rec_len);
1602  memcpy(p->rec_buf, buf_out, p->rec_len);
1603 
1604  xmlFree(buf_out);
1605  }
1606  else
1607  {
1608  p->diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
1609  }
1610  xmlFreeDoc(doc);
1611  return 0;
1612 }
1613 
1614 static struct recType filter_type = {
1615  0,
1616  "dom",
1617  filter_init,
1618  filter_config,
1622 };
1623 
1624 RecType
1625 #if IDZEBRA_STATIC_DOM
1626 idzebra_filter_dom
1627 #else
1629 #endif
1630 
1631 [] = {
1632  &filter_type,
1633  0,
1634 };
1635 /*
1636  * Local variables:
1637  * c-basic-offset: 4
1638  * c-file-style: "Stroustrup"
1639  * indent-tabs-mode: nil
1640  * End:
1641  * vim: shiftwidth=4 tabstop=8 expandtab
1642  */
1643 
static void filter_destroy(void *clientData)
Definition: mod_dom.c:786
#define DOM_INPUT_MARC
Definition: mod_dom.c:96
RecType idzebra_filter[]
Definition: mod_dom.c:1631
#define XML_STRCMP(a, b)
Definition: mod_dom.c:132
static void destroy_dom(struct filter_info *tinfo)
Definition: mod_dom.c:237
#define ZEBRA_DOM_NS
Definition: mod_dom.c:49
static int extract_xml_split(struct filter_info *tinfo, struct filter_input *input, struct recExtractCtrl *p)
Definition: mod_dom.c:1309
#define DOM_INPUT_XMLREADER
Definition: mod_dom.c:95
static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
Definition: mod_dom.c:578
static ZEBRA_RES perform_convert(struct filter_info *tinfo, struct recExtractCtrl *extctr, struct recRetrieveCtrl *retctr, struct convert_s *convert, const char **params, xmlDocPtr *doc, xsltStylesheetPtr *last_xsp)
Definition: mod_dom.c:423
static void process_xml_element_zebra_node(struct filter_info *tinfo, struct recExtractCtrl *extctr, RecWord *recword, xmlNodePtr node)
Definition: mod_dom.c:972
static int filter_retrieve(void *clientData, struct recRetrieveCtrl *p)
Definition: mod_dom.c:1501
static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr, const char *syntax, const char *name)
Definition: mod_dom.c:494
static int extract_iso2709(struct filter_info *tinfo, struct filter_input *input, struct recExtractCtrl *p)
Definition: mod_dom.c:1405
static int ioclose_ret(void *context)
Definition: mod_dom.c:1496
#define FOR_EACH_ELEMENT(ptr)
Definition: mod_dom.c:136
static void process_xml_pi_node(struct filter_info *tinfo, struct recExtractCtrl *extctr, xmlNodePtr node, const char **index_pp)
Definition: mod_dom.c:1112
static int filter_extract(void *clientData, struct recExtractCtrl *p)
Definition: mod_dom.c:1463
static const char * zebra_pi_name
Definition: mod_dom.c:54
static void process_xml_element_node(struct filter_info *tinfo, struct recExtractCtrl *extctr, RecWord *recword, xmlNodePtr node)
Definition: mod_dom.c:1179
static struct recType filter_type
Definition: mod_dom.c:1614
static int ioclose_ex(void *context)
Definition: mod_dom.c:801
static struct filter_retrieve * lookup_retrieve(struct filter_info *tinfo, const char *est)
Definition: mod_dom.c:746
static void extract_dom_doc_node(struct filter_info *tinfo, struct recExtractCtrl *extctr, xmlDocPtr doc)
Definition: mod_dom.c:1216
static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr, struct convert_s **l)
Definition: mod_dom.c:285
static void index_value_of(struct filter_info *tinfo, struct recExtractCtrl *extctr, RecWord *recword, xmlNodePtr node, const char *index_p)
Definition: mod_dom.c:809
static int process_meta(struct filter_info *tinfo, xmlDocPtr doc, xmlNodePtr node, struct recRetrieveCtrl *retctr)
Definition: mod_dom.c:368
static void set_record_info(struct filter_info *tinfo, struct recExtractCtrl *extctr, xmlNodePtr node, const char *id_p, const char *rank_p, const char *type_p)
Definition: mod_dom.c:910
static int convert_extract_doc(struct filter_info *tinfo, struct filter_input *input, struct recExtractCtrl *p, xmlDocPtr doc)
Definition: mod_dom.c:1228
static const char * zebra_dom_ns
Definition: mod_dom.c:50
static int ioread_ret(void *context, char *buffer, int len)
Definition: mod_dom.c:1489
static int attr_content(struct _xmlAttr *attr, const char *name, const char **dst_content)
Definition: mod_dom.c:212
static void * filter_init(Res res, RecType recType)
Definition: mod_dom.c:190
#define ZEBRA_PI_NAME
Definition: mod_dom.c:53
static int attr_content_pi(const char **c_ptr, const char *name, char *value, size_t value_max)
Definition: mod_dom.c:1084
static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr, const char *fmt,...)
Definition: mod_dom.c:145
static int ioread_ex(void *context, char *buffer, int len)
Definition: mod_dom.c:795
static struct filter_input * new_input(struct filter_info *tinfo, int type)
Definition: mod_dom.c:479
static void set_param_str(const char **params, const char *name, const char *value, NMEM nmem)
Definition: mod_dom.c:166
static void destroy_xsp(struct convert_s *c)
Definition: mod_dom.c:224
static void set_param_int(const char **params, const char *name, zint value, NMEM nmem)
Definition: mod_dom.c:178
static int extract_xml_full(struct filter_info *tinfo, struct filter_input *input, struct recExtractCtrl *p)
Definition: mod_dom.c:1382
static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
Definition: mod_dom.c:768
convert_type
Definition: mod_dom.c:56
@ convert_xslt_type
Definition: mod_dom.c:57
@ convert_meta_type
Definition: mod_dom.c:58
static FILE * outf
Definition: readfile.c:38
#define RECCTRL_EXTRACT_EOF
Definition: recctrl.h:164
#define ZEBRA_GROUP_INDEX_NAME
Definition: recctrl.h:47
zebra_recctrl_action_t
Definition: recctrl.h:87
@ action_a_delete
Definition: recctrl.h:97
@ action_delete
Definition: recctrl.h:93
@ action_insert
Definition: recctrl.h:89
@ action_update
Definition: recctrl.h:95
@ action_replace
Definition: recctrl.h:91
#define RECCTRL_EXTRACT_ERROR_GENERIC
Definition: recctrl.h:165
#define RECCTRL_EXTRACT_OK
Definition: recctrl.h:163
#define RECCTRL_EXTRACT_SKIP
Definition: recctrl.h:167
const char * res_get(Res r, const char *name)
Definition: res.c:294
const char * term_buf
Definition: recctrl.h:56
const char * index_type
Definition: recctrl.h:52
zint seqno
Definition: recctrl.h:60
int term_len
Definition: recctrl.h:58
const char * index_name
Definition: recctrl.h:54
int(* readf)(struct ZebraRecStream *s, char *buf, size_t count)
read function
Definition: recctrl.h:75
int dummy
Definition: mod_dom.c:67
struct convert_s * next
Definition: mod_dom.c:76
union convert_s::@18 u
struct convert_meta meta
Definition: mod_dom.c:74
enum convert_type which
Definition: mod_dom.c:71
struct convert_xslt xslt
Definition: mod_dom.c:73
xsltStylesheetPtr stylesheet_xsp
Definition: mod_dom.c:63
const char * stylesheet
Definition: mod_dom.c:62
const char * name
Definition: mod_dom.c:80
struct convert_s * convert
Definition: mod_dom.c:81
struct filter_retrieve * retrieve_list
Definition: mod_dom.c:124
NMEM nmem_config
Definition: mod_dom.c:121
char * fname
Definition: mod_alvis.c:59
struct filter_extract * extract
Definition: mod_dom.c:123
NMEM nmem_record
Definition: mod_dom.c:120
xmlDocPtr doc_config
Definition: mod_dom.c:122
char * full_name
Definition: mod_alvis.c:60
struct filter_store * store
Definition: mod_dom.c:126
int record_info_invoked
Definition: mod_dom.c:127
struct filter_input * input_list
Definition: mod_dom.c:125
const char * profile_path
Definition: mod_alvis.c:61
xmlDocPtr doc
Definition: mod_alvis.c:58
xmlTextReaderPtr reader
Definition: mod_dom.c:104
const char * input_charset
Definition: mod_dom.c:108
const char * syntax
Definition: mod_dom.c:98
struct filter_input::@19::@21 marc
struct filter_input::@19::@20 xmlreader
union filter_input::@19 u
const char * name
Definition: mod_dom.c:99
struct convert_s * convert
Definition: mod_dom.c:100
struct filter_input * next
Definition: mod_dom.c:113
yaz_iconv_t iconv
Definition: mod_dom.c:110
yaz_marc_t handle
Definition: mod_dom.c:109
int split_level
Definition: mod_dom.c:105
struct filter_retrieve * next
Definition: mod_dom.c:92
struct convert_s * convert
Definition: mod_dom.c:91
const char * identifier
Definition: mod_dom.c:90
const char * name
Definition: mod_dom.c:89
struct convert_s * convert
Definition: mod_dom.c:85
record extract for indexing
Definition: recctrl.h:101
int flagShowRecords
Definition: recctrl.h:108
void(* init)(struct recExtractCtrl *p, RecWord *w)
Definition: recctrl.h:103
enum zebra_recctrl_action_t action
Definition: recctrl.h:114
char match_criteria[256]
Definition: recctrl.h:109
void(* tokenAdd)(RecWord *w)
Definition: recctrl.h:105
zint staticrank
Definition: recctrl.h:110
void(* setStoreData)(struct recExtractCtrl *p, void *buf, size_t size)
Definition: recctrl.h:106
int first_record
Definition: recctrl.h:107
struct ZebraRecStream * stream
Definition: recctrl.h:102
const Odr_oid * input_format
Definition: recctrl.h:123
int(* special_fetch)(void *handle, const char *esn, const Odr_oid *input_format, const Odr_oid **output_format, WRBUF result, WRBUF addinfo)
Definition: recctrl.h:142
char * addinfo
Definition: recctrl.h:138
Z_RecordComposition * comp
Definition: recctrl.h:124
struct ZebraRecStream * stream
Definition: recctrl.h:119
const Odr_oid * output_format
Definition: recctrl.h:134
void * handle
Definition: recctrl.h:141
zint staticrank
Definition: recctrl.h:128
char * fname
Definition: recctrl.h:130
void * rec_buf
Definition: recctrl.h:135
Definition: res.c:46
long zint
Zebra integer.
Definition: util.h:66
#define ZEBRA_FAIL
Definition: util.h:81
#define ZINT_FORMAT
Definition: util.h:72
zint atozint(const char *src)
Definition: zint.c:55
#define ZEBRA_OK
Definition: util.h:82
short ZEBRA_RES
Common return type for Zebra API.
Definition: util.h:80