31 #include <unicode/ustring.h>
32 #include <unicode/uchar.h>
34 enum icu_chain_step_type {
35 ICU_chain_step_type_none,
36 ICU_chain_step_type_display,
37 ICU_chain_step_type_casemap,
38 ICU_chain_step_type_transform,
39 ICU_chain_step_type_tokenize,
40 ICU_chain_step_type_transliterate,
41 YAZ_chain_step_type_stemming,
42 ICU_chain_step_type_join
48 enum icu_chain_step_type
type;
50 struct icu_casemap *casemap;
51 struct icu_transform *transform;
52 struct icu_tokenizer *tokenizer;
56 struct icu_chain_step *previous;
68 struct icu_chain_step *csteps;
73 if (U_FAILURE(status))
81 static struct icu_chain_step *icu_chain_insert_step(
82 struct icu_chain *chain,
enum icu_chain_step_type
type,
83 const char *rule, UErrorCode *status)
85 struct icu_chain_step *step = 0;
90 step = (
struct icu_chain_step *)
xmalloc(
sizeof(*step));
95 case ICU_chain_step_type_display:
97 case ICU_chain_step_type_casemap:
101 case ICU_chain_step_type_transform:
106 case ICU_chain_step_type_tokenize:
110 case ICU_chain_step_type_transliterate:
115 case YAZ_chain_step_type_stemming:
119 case ICU_chain_step_type_join:
127 step->previous = chain->csteps;
128 chain->csteps = step;
134 static void icu_chain_step_destroy(
struct icu_chain_step *step)
139 icu_chain_step_destroy(step->previous);
143 case ICU_chain_step_type_display:
145 case ICU_chain_step_type_casemap:
148 case ICU_chain_step_type_transform:
149 case ICU_chain_step_type_transliterate:
152 case ICU_chain_step_type_tokenize:
155 case YAZ_chain_step_type_stemming:
158 case ICU_chain_step_type_join:
167 struct icu_chain_step *icu_chain_step_clone(
struct icu_chain_step *old)
169 struct icu_chain_step *step = 0;
170 struct icu_chain_step **sp = &step;
173 *sp = (
struct icu_chain_step *)
xmalloc(
sizeof(**sp));
174 (*sp)->type = old->type;
178 case ICU_chain_step_type_display:
180 case ICU_chain_step_type_casemap:
183 case ICU_chain_step_type_transform:
184 case ICU_chain_step_type_transliterate:
187 case ICU_chain_step_type_tokenize:
190 case YAZ_chain_step_type_stemming:
193 case ICU_chain_step_type_none:
195 case ICU_chain_step_type_join:
201 sp = &(*sp)->previous;
210 struct icu_chain *chain;
211 UCollator *coll = ucol_open(locale, status);
213 if (U_FAILURE(*status))
216 chain = (
struct icu_chain *)
xmalloc(
sizeof(*chain));
218 chain->locale =
xstrdup(locale);
231 ucol_close(chain->coll);
235 icu_chain_step_destroy(chain->csteps);
236 xfree(chain->locale);
247 struct icu_chain *chain = 0;
250 *status = U_ZERO_ERROR;
252 if (xml_node && xml_node->type == XML_ELEMENT_NODE)
267 struct icu_chain_step *step = 0;
268 const char *attr_str;
271 if (
node->type != XML_ELEMENT_NODE)
277 "element '%s'", attr_str,
node->name);
280 if (!rule &&
node->children)
283 if (!rule && strcmp((
const char *)
node->name,
"display"))
286 (
const char *)
node->name);
290 if (!strcmp((
const char *)
node->name,
"casemap"))
291 step = icu_chain_insert_step(chain,
292 ICU_chain_step_type_casemap,
294 else if (!strcmp((
const char *)
node->name,
"transform"))
295 step = icu_chain_insert_step(chain,
296 ICU_chain_step_type_transform,
298 else if (!strcmp((
const char *)
node->name,
"transliterate"))
299 step = icu_chain_insert_step(chain,
300 ICU_chain_step_type_transliterate,
302 else if (!strcmp((
const char *)
node->name,
"tokenize"))
303 step = icu_chain_insert_step(chain, ICU_chain_step_type_tokenize,
305 else if (!strcmp((
const char *)
node->name,
"display"))
306 step = icu_chain_insert_step(chain, ICU_chain_step_type_display,
308 else if (!strcmp((
const char *)
node->name,
"stemming"))
309 step = icu_chain_insert_step(chain, YAZ_chain_step_type_stemming,
311 else if (!strcmp((
const char *)
node->name,
"join"))
312 step = icu_chain_insert_step(chain, ICU_chain_step_type_join,
314 else if (!strcmp((
const char *)
node->name,
"normalize"))
317 "Use transform instead",
node->name);
318 step = icu_chain_insert_step(chain, ICU_chain_step_type_transform,
321 else if (!strcmp((
const char *)
node->name,
"index")
322 || !strcmp((
const char *)
node->name,
"sortkey"))
325 "Remove it from the configuration",
node->name);
338 if (step && U_FAILURE(*status))
341 *status, u_errorName(*status),
node->name, rule ?
357 struct icu_chain *chain;
370 struct icu_chain_step *steps;
375 UErrorCode status = U_ZERO_ERROR;
380 if (U_FAILURE(status))
393 struct icu_chain_step *step,
400 struct icu_buf_utf16 *dst = icu_iter_invoke(iter, step->previous, src);
404 case ICU_chain_step_type_casemap:
411 iter->chain->locale);
415 case ICU_chain_step_type_tokenize:
423 iter->utf8_base = iter->utf16_base = 0;
429 iter->status = U_ZERO_ERROR;
431 &iter->org_start, &iter->org_len))
437 case ICU_chain_step_type_transform:
438 case ICU_chain_step_type_transliterate:
447 case ICU_chain_step_type_display:
451 case YAZ_chain_step_type_stemming:
460 case ICU_chain_step_type_join:
466 icu_iter_invoke(iter, step->previous, 0);
487 iter->status = U_ZERO_ERROR;
494 iter->steps = icu_chain_step_clone(chain->csteps);
495 iter->token_count = 0;
505 iter->token_count = 0;
507 iter->utf8_base = iter->utf16_base = 0;
509 iter->last = icu_iter_invoke(iter, iter->steps, src);
521 icu_chain_step_destroy(iter->steps);
528 if (iter->token_count && iter->last)
529 iter->last = icu_iter_invoke(iter, iter->steps, 0);
535 if (iter->chain->sort)
538 iter->sort8, iter->last,
565 return iter->token_count;
572 int32_t len1 = 0, len2 = 0;
573 UErrorCode status = U_ZERO_ERROR;
575 if (iter->org_start < iter->utf16_base)
578 iter->utf16_base = 0;
580 u_strToUTF8(0, 0, &len1,
581 iter->org->utf16 + iter->utf16_base,
582 iter->org_start - iter->utf16_base,
585 status = U_ZERO_ERROR;
587 *start = len1 + iter->utf8_base;
589 u_strToUTF8(0, 0, &len2,
590 iter->org->utf16 + iter->utf16_base,
591 iter->org_start - iter->utf16_base + iter->org_len,
600 status = U_ZERO_ERROR;
604 iter->utf8_base = *start;
605 iter->utf16_base = iter->org_start;
625 *status = U_ZERO_ERROR;
631 if (chain && chain->iter)
632 return chain->iter->token_count;
664 size_t *len,
const char **cstr)
static int node(struct cql_node *cn, void(*pr)(const char *buf, void *client_data), void *client_data)
struct icu_iter * yaz_icu_iter_t
ICU tokenizer iterator type (opaque)
const char * icu_iter_get_display(yaz_icu_iter_t iter)
returns ICU display string
void icu_chain_destroy(yaz_icu_chain_t chain)
destroys ICU chain
int icu_iter_get_token_number(yaz_icu_iter_t iter)
returns ICU token count for iterator
void icu_iter_destroy(yaz_icu_iter_t iter)
destroy ICU tokenizer iterator
const char * icu_iter_get_sortkey(yaz_icu_iter_t iter)
returns ICU sortkey string
const char * icu_iter_get_norm(yaz_icu_iter_t iter)
returns ICU normalized token
void icu_chain_get_org_info(yaz_icu_chain_t chain, size_t *start, size_t *len)
returns token as it relates to original text (legacy)
void icu_chain_get_org_info2(yaz_icu_chain_t chain, size_t *start, size_t *len, const char **cstr)
returns token as it relates to original text (2nd version)
yaz_icu_chain_t icu_chain_xml_config(const xmlNode *xml_node, int sort, UErrorCode *status)
constructs ICU chain from XML specification
int icu_chain_assign_cstr(yaz_icu_chain_t chain, const char *src8cstr, UErrorCode *status)
pass string to ICU for parsing/tokenization/etc
void icu_iter_first(yaz_icu_iter_t iter, const char *src8cstr)
starts iteration over string
const char * icu_chain_token_norm(yaz_icu_chain_t chain)
returns normalized token of last token processed
int icu_chain_token_number(yaz_icu_chain_t chain)
returns token number of last token processed
yaz_icu_iter_t icu_iter_create(struct icu_chain *chain)
create ICU tokenizer iterator from chain
int icu_iter_next(yaz_icu_iter_t iter)
iterates over one token
void icu_iter_get_org_info(yaz_icu_iter_t iter, size_t *start, size_t *len)
returns ICU original token start (offset) and length (legacy)
int icu_chain_next_token(yaz_icu_chain_t chain, UErrorCode *status)
returns one token (if any)
void icu_iter_get_org_info2(yaz_icu_iter_t iter, size_t *start, size_t *len, const char **cstr)
returns ICU original token start (offset) and length
const char * icu_chain_token_display(yaz_icu_chain_t chain)
returns display token of last token processed
const char * icu_chain_token_sortkey(yaz_icu_chain_t chain)
returns sortkey token of last token processed
Internal header for ICU utilities.
struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 *dest16, const struct icu_buf_utf16 *src16)
struct icu_casemap * icu_casemap_create(char action, UErrorCode *status)
void icu_buf_utf8_destroy(struct icu_buf_utf8 *buf8)
struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action, UErrorCode *status)
struct icu_transform * icu_transform_create(const char *id, char action, const char *rules, UErrorCode *status)
struct icu_tokenizer * icu_tokenizer_clone(struct icu_tokenizer *old)
void icu_sortkey8_from_utf16(UCollator *coll, struct icu_buf_utf8 *dest8, struct icu_buf_utf16 *src16, UErrorCode *status)
UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 *dest16, const char *src8cstr, UErrorCode *status)
yaz_icu_chain_t icu_chain_create(const char *locale, int sort, UErrorCode *status)
int icu_tokenizer_attach(struct icu_tokenizer *tokenizer, struct icu_buf_utf16 *src16, UErrorCode *status)
const char * icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8)
int icu_transform_trans(struct icu_transform *transform, struct icu_buf_utf16 *dest16, const struct icu_buf_utf16 *src16, UErrorCode *status)
struct icu_casemap * icu_casemap_clone(struct icu_casemap *old)
struct icu_buf_utf16 * icu_buf_utf16_append(struct icu_buf_utf16 *dest16, const struct icu_buf_utf16 *src16)
void icu_buf_utf16_destroy(struct icu_buf_utf16 *buf16)
void icu_tokenizer_destroy(struct icu_tokenizer *tokenizer)
struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
struct icu_transform * icu_transform_clone(struct icu_transform *old)
int icu_casemap_casemap(struct icu_casemap *casemap, struct icu_buf_utf16 *dest16, struct icu_buf_utf16 *src16, UErrorCode *status, const char *locale)
struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
int icu_check_status(UErrorCode status)
void icu_transform_destroy(struct icu_transform *transform)
UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 *dest8, const struct icu_buf_utf16 *src16, UErrorCode *status)
int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer, struct icu_buf_utf16 *tkn16, UErrorCode *status, size_t *start, size_t *len)
void icu_casemap_destroy(struct icu_casemap *casemap)
void yaz_log(int level, const char *fmt,...)
Writes log message.
#define YLOG_WARN
log level: warning
void nmem_reset(NMEM n)
releases memory associated with an NMEM handle
NMEM nmem_create(void)
returns new NMEM handle
void nmem_destroy(NMEM n)
destroys NMEM handle and memory associated with it
Header for Nibble Memory functions.
Header for Nibble Memory functions + Libxml2 specific stuff.
char * nmem_text_node_cdata(const xmlNode *ptr_cdata, NMEM nmem)
copies TEXT Libxml2 node data to NMEM
Header for the stemming API.
yaz_stemmer_p yaz_stemmer_create(const char *locale, const char *rule, UErrorCode *status)
void yaz_stemmer_stem(yaz_stemmer_p stemmer, struct icu_buf_utf16 *dst, struct icu_buf_utf16 *src, UErrorCode *status)
void yaz_stemmer_destroy(yaz_stemmer_p stemmer)
yaz_stemmer_p yaz_stemmer_clone(yaz_stemmer_p stemmer)
struct yaz_stemmer_t * yaz_stemmer_p
Header for memory handling functions.
#define xstrdup(s)
utility macro which calls xstrdup_f
#define xfree(x)
utility macro which calls xfree_f
#define xmalloc(x)
utility macro which calls xmalloc_f
const char * yaz_xml_get_prop(const xmlNode *n, const char *fmt,...)
XML node getter/creation utilities.