28 #include <yaz/diagbib1.h>
29 #include <yaz/tpath.h>
30 #include <yaz/snprintf.h>
32 #include <libxml/xmlversion.h>
33 #include <libxml/parser.h>
34 #include <libxml/tree.h>
35 #include <libxml/xmlIO.h>
36 #include <libxml/xmlreader.h>
37 #include <libxslt/transform.h>
38 #include <libxslt/xsltutils.h>
41 #include <libexslt/exslt.h>
46 #include <yaz/oid_db.h>
/* XML namespace URI used by this filter. NOTE(review): the diagnostics in this
 * file ("expected <record>|<index> in namespace '%s'") suggest this is the
 * namespace of the Zebra indexing elements -- confirm against its users. */
#define ZEBRA_DOM_NS "http://indexdata.com/zebra-2.0"
/* Name string "zebra-2.0". NOTE(review): presumably the processing-instruction
 * target recognised by the PI handling code -- TODO confirm. */
#define ZEBRA_PI_NAME "zebra-2.0"
/* Input kind selectors. NOTE(review): presumably these tag which member of the
 * input union (u.xmlreader vs. u.marc) is in use -- confirm against the
 * struct that stores the type. Kept as macros so any preprocessor tests on
 * them keep working. */
#define DOM_INPUT_XMLREADER 1
#define DOM_INPUT_MARC 2
/* Compare the byte string `a` (typically an xmlChar string) with the plain C
 * string `b`; evaluates to strcmp()'s result, i.e. 0 when they are equal.
 * Both arguments are parenthesized so that expression arguments (ternaries,
 * pointer arithmetic) expand as written, and the cast preserves const. */
#define XML_STRCMP(a,b) strcmp((const char *)(a), (b))
/* Length in bytes of the NUL-terminated byte string `a`, via strlen(). */
#define XML_STRLEN(a) strlen((const char *)(a))
/* Loop header: walks `ptr` along its ->next chain and executes the statement
 * that follows only for nodes whose type is XML_ELEMENT_NODE.
 *
 * `ptr` is advanced in place (it is not reset first) and is NULL once the
 * loop has run to completion. `break` and `continue` in the body act on this
 * loop.
 *
 * The filter is written as `if (skip) {} else` rather than a bare `if` so
 * that an `else` written after the loop body cannot silently bind to the
 * macro's hidden `if`. */
#define FOR_EACH_ELEMENT(ptr) \
    for (; (ptr); (ptr) = (ptr)->next) \
        if ((ptr)->type != XML_ELEMENT_NODE) {} else
139 const char *fmt, ...)
141 __attribute__ ((format (printf, 4, 5)))
146 const char *fmt, ...)
152 yaz_vsnprintf(buf,
sizeof(buf)-1, fmt, ap);
155 yaz_log(level,
"%s:%ld: %s", tinfo->
fname ? tinfo->
fname :
"none",
156 xmlGetLineNo(ptr), buf);
160 yaz_log(level,
"%s: %s", tinfo->
fname ? tinfo->
fname :
"none", buf);
167 const char *value, NMEM nmem)
169 char *quoted = nmem_malloc(nmem, 3 + strlen(value));
170 yaz_snprintf(quoted, 3 + strlen(value),
"'%s'", value);
179 zint value, NMEM nmem)
181 char *quoted = nmem_malloc(nmem, 30);
184 yaz_snprintf(quoted, 30,
"'" ZINT_FORMAT "'", value);
213 const char **dst_content)
215 if (!
XML_STRCMP(attr->name, name) && attr->children
216 && attr->children->type == XML_TEXT_NODE)
218 *dst_content = (
const char *)(attr->children->content);
261 yaz_iconv_close(i_ptr->
u.
marc.iconv);
262 yaz_marc_destroy(i_ptr->
u.
marc.handle);
292 struct _xmlAttr *attr;
300 for (attr = ptr->properties; attr; attr = attr->next)
306 "bad attribute @%s", attr->name);
310 char tmp_xslt_full_name[1024];
317 "stylesheet %s not found in "
325 = xsltParseStylesheetFile((
const xmlChar*)
330 "could not parse xslt stylesheet %s",
338 "missing attribute 'stylesheet'");
344 else if (!
XML_STRCMP(ptr->name,
"process-meta"))
346 struct _xmlAttr *attr;
352 for (attr = ptr->properties; attr; attr = attr->next)
354 "bad attribute @%s", attr->name);
361 "bad element '%s', expected <xslt>", ptr->name);
372 if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href &&
377 const char *element_set_name = 0;
379 struct _xmlAttr *attr;
380 for (attr = node->properties; attr; attr = attr->next)
386 dom_log(YLOG_WARN, tinfo, node,
387 "bad attribute @%s, expected @name", attr->name);
390 if (element_set_name)
392 WRBUF result = wrbuf_alloc();
393 WRBUF addinfo = wrbuf_alloc();
394 const Odr_oid *input_format = yaz_oid_recsyn_xml;
395 const Odr_oid *output_format = 0;
400 input_format, &output_format,
405 xmlParseMemory(wrbuf_buf(result), wrbuf_len(result));
408 xmlNodePtr t = xmlDocGetRootElement(sub_doc);
409 xmlReplaceNode(node, xmlCopyNode(t, 1));
413 wrbuf_destroy(result);
414 wrbuf_destroy(addinfo);
418 for (node = node->children; node; node = node->next)
429 xsltStylesheetPtr *last_xsp)
431 for (; convert; convert = convert->
next)
435 xmlChar *buf_out = 0;
447 xsltSaveResultToString(&buf_out, &len_out, res_doc,
453 *doc = xmlParseMemory((
const char *) buf_out, len_out);
457 yaz_log(YLOG_LOG,
"%s: XSLT %s\n %.*s",
468 process_meta(tinfo, *doc, xmlDocGetRootElement(*doc), retctr);
483 for (;*np; np = &(*np)->
next)
485 p = *np = nmem_malloc(tinfo->
nmem_config,
sizeof(*p));
500 yaz_iconv_t
iconv = 0;
502 struct _xmlAttr *attr;
504 for (attr = ptr->properties; attr; attr = attr->next)
511 "bad attribute @%s, expected @inputcharset",
515 iconv = yaz_iconv_open(
"utf-8", input_charset);
519 "unsupported @charset '%s'", input_charset);
526 p->
u.
marc.handle = yaz_marc_create();
529 yaz_marc_iconv(p->
u.
marc.handle, p->
u.
marc.iconv);
542 struct _xmlAttr *attr;
543 const char *level_str = 0;
548 for (attr = ptr->properties; attr; attr = attr->next)
555 "bad attribute @%s, expected @level",
570 "bad element <%s>, expected <marc>|<xmlreader>",
580 char tmp_full_name[1024];
587 NULL, tmp_full_name))
592 yaz_log(YLOG_LOG,
"%s dom filter: "
598 yaz_log(YLOG_WARN,
"%s: dom filter: "
599 "failed to parse config file %s",
606 ptr = xmlDocGetRootElement(doc);
607 if (!ptr || ptr->type != XML_ELEMENT_NODE
611 "bad root element <%s>, expected root element <dom>",
626 struct _xmlAttr *attr;
633 for (attr = ptr->properties; attr; attr = attr->next)
640 "bad attribute @%s, expected @name",
654 struct _xmlAttr *attr;
668 for (attr = ptr->properties; attr; attr = attr->next)
678 "bad attribute @%s, expected @identifier|@name",
709 struct _xmlAttr *attr;
710 const char *syntax = 0;
711 const char *name = 0;
712 for (attr = ptr->properties; attr; attr = attr->next)
721 "bad attribute @%s, expected @syntax|@name",
731 "expected <extract>|<input>|<retrieve>|<store>",
754 for (; f; f = f->
next)
761 if (f->
name && !strcmp(f->
name, est))
773 yaz_log(YLOG_WARN,
"dom filter: need config file");
777 if (tinfo->
fname && !strcmp(args, tinfo->
fname))
795 static int ioread_ex(
void *context,
char *buffer,
int len)
817 xmlChar *text = xmlNodeGetContent(node);
818 size_t text_len = strlen((
const char *)text);
831 const char *look = index_p;
839 recword->
term_buf = (
const char *)text;
844 while (*look &&
' ' != *look &&
':' != *look)
852 while (*look &&
':' != *look &&
' ' != *look)
857 strncpy((
char *)index, (
const char *)bval, eval - bval);
858 index[eval - bval] =
'\0';
867 while (*look &&
' ' != *look)
872 strncpy((
char *)type, (
const char *)bval, eval - bval);
873 type[eval - bval] =
'\0';
878 recword->
seqno = seqno_base;
886 "INDEX '%s:%s' '%s'",
887 (
const char *) index,
889 (
const char *) text);
893 if (seqno_max < recword->seqno)
894 seqno_max = recword->
seqno;
897 if (*look &&
' ' == *look)
902 recword->
seqno = seqno_max;
920 "RECORD id=%s rank=%s type=%s",
921 id_p ? (
const char *) id_p :
"(null)",
922 rank_p ? (
const char *) rank_p :
"(null)",
923 type_p ? (
const char *) type_p :
"(null)");
928 size_t l = strlen(id_p);
935 if (rank_p && *rank_p)
938 if (type_p && *type_p)
941 if (!strcmp(type_p,
"insert"))
943 else if (!strcmp(type_p,
"delete"))
945 else if (!strcmp(type_p,
"replace"))
947 else if (!strcmp(type_p,
"update"))
949 else if (!strcmp(type_p,
"adelete"))
952 dom_log(YLOG_WARN, tinfo, node,
"bad @type value: %s", type_p);
959 dom_log(YLOG_WARN, tinfo, node,
"multiple record elements");
977 if (node->type == XML_ELEMENT_NODE && node->ns && node->ns->href
982 const char *index_p = 0;
984 struct _xmlAttr *attr;
985 for (attr = node->properties; attr; attr = attr->next)
993 dom_log(YLOG_WARN, tinfo, node,
994 "bad attribute @%s, expected @name",
999 else if (0 ==
XML_STRCMP(node->name,
"group"))
1001 const char *unit_p =
"element";
1003 struct _xmlAttr *attr;
1004 for (attr = node->properties; attr; attr = attr->next)
1010 dom_log(YLOG_WARN, tinfo, node,
1011 "bad attribute @%s, expected @unit",
1017 WRBUF w = wrbuf_alloc();
1019 wrbuf_puts(w, unit_p);
1027 "INDEX '%s:%s' '%s'",
1033 for (node = node->children; node; node = node->next)
1042 "INDEX '%s:%s' '%s'",
1050 else if (0 ==
XML_STRCMP(node->name,
"record"))
1052 const char *id_p = 0;
1053 const char *rank_p = 0;
1054 const char *type_p = 0;
1056 struct _xmlAttr *attr;
1057 for (attr = node->properties; attr; attr = attr->next)
1067 dom_log(YLOG_WARN, tinfo, node,
1068 "bad attribute @%s, expected @id|@rank|@type",
1076 dom_log(YLOG_WARN, tinfo, node,
1078 " expected <record>|<index> in namespace '%s'",
1085 char *value,
size_t value_max)
1087 size_t name_len = strlen(name);
1088 const char *look = *c_ptr;
1091 if (strlen(look) > name_len)
1093 if (look[name_len] ==
'=' && !memcmp(look, name, name_len))
1097 while (*look &&
' ' != *look)
1099 if (i < value_max-1)
1115 const char **index_pp)
1120 xmlChar *pi_p = node->content;
1121 const char *look = (
const char *) node->content;
1124 if (0 == strncmp((
const char *)look,
"record", 6))
1137 while (
' ' == *look)
1149 dom_log(YLOG_WARN, tinfo, node,
1150 "content '%s', can not parse '%s'",
1158 else if (0 == strncmp((
const char *)look,
"index", 5))
1163 while (*look &&
' ' == *look)
1171 dom_log(YLOG_WARN, tinfo, node,
1172 "content '%s', can not parse '%s'",
1185 const char *index_p = 0;
1192 for (node = node->children; node; node = node->next)
1195 if (node->type == XML_PI_NODE)
1199 else if (node->type == XML_ELEMENT_NODE)
1222 (*extctr->
init)(extctr, &recword);
1235 const char *params[10];
1236 xsltStylesheetPtr last_xsp = 0;
1253 xmlDocDumpMemory(doc, &buf_out, &len_out);
1255 FILE *
outf = fopen(
"extract.xml",
"w");
1256 fwrite(buf_out, 1, len_out,
outf);
1259 yaz_log(YLOG_LOG,
"Extract Doc: %.*s", len_out, buf_out);
1264 xmlDocPtr store_doc = 0;
1272 store_doc = xmlCopyDoc(doc, 1);
1274 params, &store_doc, &last_xsp);
1279 xsltSaveResultToString(&buf_out, &len_out,
1280 store_doc ? store_doc : doc, last_xsp);
1282 xmlDocDumpMemory(store_doc ? store_doc : doc, &buf_out, &len_out);
1288 xmlFreeDoc(store_doc);
1318 xmlFreeTextReader(input->
u.
xmlreader.reader);
1330 ret = xmlTextReaderRead(input->
u.
xmlreader.reader);
1333 int type = xmlTextReaderNodeType(input->
u.
xmlreader.reader);
1334 int depth = xmlTextReaderDepth(input->
u.
xmlreader.reader);
1336 if (type == XML_READER_TYPE_ELEMENT &&
1344 ptr = xmlTextReaderExpand(input->
u.
xmlreader.reader);
1349 xmlNodePtr ptr2 = xmlCopyNode(ptr, 1);
1350 xmlDocPtr doc = xmlNewDoc((
const xmlChar*)
"1.0");
1352 xmlDocSetRootElement(doc, ptr2);
1357 xmlChar *buf_out = 0;
1359 xmlDocDumpMemory(doc, &buf_out, &len_out);
1360 yaz_log(YLOG_LOG,
"%s: XMLREADER level: %i\n%.*s",
1362 depth, len_out, buf_out);
1370 xmlFreeTextReader(input->
u.
xmlreader.reader);
1375 ret = xmlTextReaderRead(input->
u.
xmlreader.reader);
1377 xmlFreeTextReader(input->
u.
xmlreader.reader);
1415 while (*buf < '0' || *buf >
'9')
1420 "MARC: Skipping bad byte %d (0x%02X)",
1421 *buf & 0xff, *buf & 0xff);
1422 for (i = 0; i < 4; i++)
1428 record_length = atoi_n (buf, 5);
1429 if (record_length < 25)
1432 "MARC record length < 25, is %d", record_length);
1436 if (read_bytes < record_length-5)
1439 "couldn't read whole MARC record");
1442 r = yaz_marc_read_iso2709(input->
u.
marc.handle, buf, record_length);
1443 if (r < record_length)
1446 "parsing of MARC record failed r=%d length=%d",
1454 yaz_marc_write_xml(input->
u.
marc.handle, &root_ptr,
1455 "http://www.loc.gov/MARC21/slim", 0, 0);
1456 rdoc = xmlNewDoc((
const xmlChar*)
"1.0");
1457 xmlDocSetRootElement(rdoc, root_ptr);
1504 const char *esn = 0;
1505 const char *params[32];
1509 xsltStylesheetPtr last_xsp = 0;
1513 if (p->
comp->which == Z_RecordComp_simple
1514 && p->
comp->u.simple->which == Z_ElementSetNames_generic)
1516 esn = p->
comp->u.simple->u.generic;
1518 else if (p->
comp->which == Z_RecordComp_complex
1519 && p->
comp->u.complex->generic->elementSpec
1520 && p->
comp->u.complex->generic->elementSpec->which ==
1521 Z_ElementSpec_elementSetName)
1523 esn = p->
comp->u.complex->generic->elementSpec->u.elementSetName;
1530 YAZ_BIB1_SPECIFIED_ELEMENT_SET_NAME_NOT_VALID_FOR_SPECIFIED_;
1559 XML_PARSE_XINCLUDE | XML_PARSE_NOENT | XML_PARSE_NONET);
1562 p->
diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1570 p->
diagnostic = YAZ_BIB1_SYSTEM_ERROR_IN_PRESENTING_RECORDS;
1579 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1581 xmlDocDumpMemory(doc, &buf_out, &len_out);
1589 else if (!oid_oidcmp(p->
output_format, yaz_oid_recsyn_sutrs))
1595 xsltSaveResultToString(&buf_out, &len_out, doc, last_xsp);
1597 xmlDocDumpMemory(doc, &buf_out, &len_out);
1608 p->
diagnostic = YAZ_BIB1_RECORD_SYNTAX_UNSUPP;
1625 #if IDZEBRA_STATIC_DOM
static void filter_destroy(void *clientData)
static void destroy_dom(struct filter_info *tinfo)
static int extract_xml_split(struct filter_info *tinfo, struct filter_input *input, struct recExtractCtrl *p)
#define DOM_INPUT_XMLREADER
static ZEBRA_RES parse_dom(struct filter_info *tinfo, const char *fname)
static ZEBRA_RES perform_convert(struct filter_info *tinfo, struct recExtractCtrl *extctr, struct recRetrieveCtrl *retctr, struct convert_s *convert, const char **params, xmlDocPtr *doc, xsltStylesheetPtr *last_xsp)
static void process_xml_element_zebra_node(struct filter_info *tinfo, struct recExtractCtrl *extctr, RecWord *recword, xmlNodePtr node)
static int filter_retrieve(void *clientData, struct recRetrieveCtrl *p)
static ZEBRA_RES parse_input(struct filter_info *tinfo, xmlNodePtr ptr, const char *syntax, const char *name)
static int extract_iso2709(struct filter_info *tinfo, struct filter_input *input, struct recExtractCtrl *p)
static int ioclose_ret(void *context)
#define FOR_EACH_ELEMENT(ptr)
static void process_xml_pi_node(struct filter_info *tinfo, struct recExtractCtrl *extctr, xmlNodePtr node, const char **index_pp)
static int filter_extract(void *clientData, struct recExtractCtrl *p)
static const char * zebra_pi_name
static void process_xml_element_node(struct filter_info *tinfo, struct recExtractCtrl *extctr, RecWord *recword, xmlNodePtr node)
static struct recType filter_type
static int ioclose_ex(void *context)
static struct filter_retrieve * lookup_retrieve(struct filter_info *tinfo, const char *est)
static void extract_dom_doc_node(struct filter_info *tinfo, struct recExtractCtrl *extctr, xmlDocPtr doc)
static ZEBRA_RES parse_convert(struct filter_info *tinfo, xmlNodePtr ptr, struct convert_s **l)
static void index_value_of(struct filter_info *tinfo, struct recExtractCtrl *extctr, RecWord *recword, xmlNodePtr node, const char *index_p)
static int process_meta(struct filter_info *tinfo, xmlDocPtr doc, xmlNodePtr node, struct recRetrieveCtrl *retctr)
static void set_record_info(struct filter_info *tinfo, struct recExtractCtrl *extctr, xmlNodePtr node, const char *id_p, const char *rank_p, const char *type_p)
static int convert_extract_doc(struct filter_info *tinfo, struct filter_input *input, struct recExtractCtrl *p, xmlDocPtr doc)
static const char * zebra_dom_ns
static int ioread_ret(void *context, char *buffer, int len)
static int attr_content(struct _xmlAttr *attr, const char *name, const char **dst_content)
static void * filter_init(Res res, RecType recType)
static int attr_content_pi(const char **c_ptr, const char *name, char *value, size_t value_max)
static void dom_log(int level, struct filter_info *tinfo, xmlNodePtr ptr, const char *fmt,...)
static int ioread_ex(void *context, char *buffer, int len)
static struct filter_input * new_input(struct filter_info *tinfo, int type)
static void set_param_str(const char **params, const char *name, const char *value, NMEM nmem)
static void destroy_xsp(struct convert_s *c)
static void set_param_int(const char **params, const char *name, zint value, NMEM nmem)
static int extract_xml_full(struct filter_info *tinfo, struct filter_input *input, struct recExtractCtrl *p)
static ZEBRA_RES filter_config(void *clientData, Res res, const char *args)
#define RECCTRL_EXTRACT_EOF
#define ZEBRA_GROUP_INDEX_NAME
#define RECCTRL_EXTRACT_ERROR_GENERIC
#define RECCTRL_EXTRACT_OK
#define RECCTRL_EXTRACT_SKIP
const char * res_get(Res r, const char *name)
int(* readf)(struct ZebraRecStream *s, char *buf, size_t count)
read function
xsltStylesheetPtr stylesheet_xsp
struct filter_retrieve * retrieve_list
struct filter_extract * extract
struct filter_store * store
struct filter_input * input_list
const char * profile_path
struct filter_retrieve * next
struct convert_s * convert
struct convert_s * convert
const Odr_oid * input_format
int(* special_fetch)(void *handle, const char *esn, const Odr_oid *input_format, const Odr_oid **output_format, WRBUF result, WRBUF addinfo)
Z_RecordComposition * comp
struct ZebraRecStream * stream
const Odr_oid * output_format
zint atozint(const char *src)
short ZEBRA_RES
Common return type for Zebra API.