YAZ  5.34.0
stemmer.c
Go to the documentation of this file.
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) Index Data
3  * See the file LICENSE for details.
4  */
5 
11 #if HAVE_CONFIG_H
12 #include <config.h>
13 #endif
14 
15 #if YAZ_HAVE_ICU
16 
17 #include <yaz/stemmer.h>
18 
19 #include <yaz/xmalloc.h>
20 
21 #include <libstemmer.h>
22 
23 #include <unicode/ustring.h> /* some more string fcns*/
24 #include <unicode/uchar.h> /* char names */
25 
/* Selects which backend a yaz_stemmer_t dispatches to. */
enum stemmer_implementation {
    yaz_no_operation = 0, /* pass-through: the input is copied unchanged */
    yaz_snowball = 1      /* stemming is delegated to libstemmer */
};
/* State behind the yaz_stemmer_p handle. */
struct yaz_stemmer_t
{
    /* One of enum stemmer_implementation; picks the backend in use. */
    int implementation;
    /* Heap-allocated copy of the locale supplied at creation time. */
    char *locale;
    /* Heap-allocated copy of the rule supplied at creation time. */
    char *rule;
    /* libstemmer handle used when implementation is yaz_snowball. */
    struct sb_stemmer *sb_stemmer;
};
37 
/*
 * Pick the character-encoding name handed to libstemmer.
 *
 * Both arguments are currently ignored: the answer is always "UTF_8".
 */
const char *yaz_stemmer_lookup_charenc(const char *charenc, const char *rule)
{
    const char *encoding_name = "UTF_8";

    (void) charenc;
    (void) rule;
    return encoding_name;
}
41 
/*
 * Map a locale/rule pair to a stemming algorithm name.
 *
 * The locale string itself is used as the algorithm name; the rule is
 * currently ignored. The returned pointer is the caller's own `locale`.
 */
const char *yaz_stemmer_lookup_algorithm(const char *locale, const char *rule)
{
    const char *algorithm_name = locale;

    (void) rule;
    return algorithm_name;
}
45 
46 yaz_stemmer_p yaz_stemmer_snowball_create(const char *locale, const char *rule, UErrorCode *status) {
47  const char *charenc = yaz_stemmer_lookup_charenc(locale, rule);
48  const char *algorithm = yaz_stemmer_lookup_algorithm(locale,rule);
49  struct sb_stemmer *stemmer = sb_stemmer_new(algorithm, charenc);
50  yaz_stemmer_p yaz_stemmer;
51  if (stemmer == 0) {
52  *status = U_ILLEGAL_ARGUMENT_ERROR;
53  yaz_log(YLOG_FATAL, "yaz_stemmer: Failed to create snowball stemmer from locale %srule %s. Showball: charenc %s algorithm %s ",
54  locale, rule, charenc, algorithm);
55  return 0;
56  }
57  yaz_log(YLOG_DEBUG, "created snowball stemmer: algorithm %s charenc %s ", algorithm, charenc);
58  yaz_stemmer = xmalloc(sizeof(*yaz_stemmer));
59  yaz_stemmer->implementation = yaz_snowball;
60 
61  yaz_stemmer->locale = xstrdup(locale);
62  yaz_stemmer->rule = xstrdup(rule);
63  yaz_stemmer->sb_stemmer = stemmer;
64  yaz_log(YLOG_DEBUG, "created snowball stemmer: algorithm %s charenc %s ", algorithm, charenc);
65  return yaz_stemmer;
66 }
67 
68 yaz_stemmer_p yaz_stemmer_create(const char *locale, const char *rule, UErrorCode *status) {
69  *status = U_ZERO_ERROR;
70  yaz_log(YLOG_DEBUG, "create stemmer: locale %s rule %s ", locale, rule);
71  return yaz_stemmer_snowball_create(locale, rule, status);
72 }
73 
75  UErrorCode error = U_ZERO_ERROR;
76  if (stemmer == 0)
77  return 0;
78  return yaz_stemmer_create(stemmer->locale, stemmer->rule, &error);
79 }
80 
81 void yaz_stemmer_stem(yaz_stemmer_p stemmer, struct icu_buf_utf16 *dst, struct icu_buf_utf16* src, UErrorCode *status)
82 {
83  switch(stemmer->implementation) {
84  case yaz_snowball: {
85  struct icu_buf_utf8 *utf8_buf = icu_buf_utf8_create(0);
86  icu_utf16_to_utf8(utf8_buf, src, status);
87  if (*status == U_ZERO_ERROR) {
88  const sb_symbol *cstr = (const sb_symbol*) icu_buf_utf8_to_cstr(utf8_buf);
89  const sb_symbol *sb_symbol = sb_stemmer_stem(stemmer->sb_stemmer, cstr, utf8_buf->utf8_len);
90  if (sb_symbol == 0) {
91  icu_buf_utf16_copy(dst, src);
92  }
93  else {
94 
95  const char *cstr2 = (const char *) sb_symbol;
96  icu_utf16_from_utf8_cstr(dst, cstr2 , status);
97 #if 0
98  yaz_log(YLOG_DEBUG, "stemming %s to %s ", cstr, cstr2);
99 #endif
100  }
101  }
102  icu_buf_utf8_destroy(utf8_buf);
103  return ;
104  break;
105  }
106  case yaz_no_operation:
107  yaz_log(YLOG_DEBUG, "Stemmer (No operation) called");
108  default: {
109  icu_buf_utf16_copy(dst, src);
110  }
111  }
112 }
113 
115 {
116  /* Handle no stemmer correctly */
117  if (stemmer == 0)
118  return ;
119 
120  switch (stemmer->implementation) {
121  case yaz_snowball:
122  sb_stemmer_delete(stemmer->sb_stemmer);
123  break;
124  }
125  xfree(stemmer->locale);
126  xfree(stemmer->rule);
127  xfree(stemmer);
128 }
129 
130 #endif /* YAZ_HAVE_ICU */
struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 *dest16, const struct icu_buf_utf16 *src16)
void icu_buf_utf8_destroy(struct icu_buf_utf8 *buf8)
UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 *dest16, const char *src8cstr, UErrorCode *status)
const char * icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8)
struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 *dest8, const struct icu_buf_utf16 *src16, UErrorCode *status)
unsigned char sb_symbol
Definition: libstemmer.h:7
struct sb_stemmer * sb_stemmer_new(const char *algorithm, const char *charenc)
Definition: libstemmer.c:35
const sb_symbol * sb_stemmer_stem(struct sb_stemmer *stemmer, const sb_symbol *word, int size)
Definition: libstemmer.c:77
void sb_stemmer_delete(struct sb_stemmer *stemmer)
Definition: libstemmer.c:67
void yaz_log(int level, const char *fmt,...)
Writes log message.
Definition: log.c:487
#define YLOG_FATAL
log level: fatal
Definition: log.h:42
#define YLOG_DEBUG
log level: debugging
Definition: log.h:44
Header for the stemming API.
yaz_stemmer_p yaz_stemmer_create(const char *locale, const char *rule, UErrorCode *status)
void yaz_stemmer_stem(yaz_stemmer_p stemmer, struct icu_buf_utf16 *dst, struct icu_buf_utf16 *src, UErrorCode *status)
void yaz_stemmer_destroy(yaz_stemmer_p stemmer)
yaz_stemmer_p yaz_stemmer_clone(yaz_stemmer_p stemmer)
struct yaz_stemmer_t * yaz_stemmer_p
Definition: stemmer.h:49
int32_t utf8_len
Definition: icu_I18N.h:81
Header for memory handling functions.
#define xstrdup(s)
utility macro which calls xstrdup_f
Definition: xmalloc.h:55
#define xfree(x)
utility macro which calls xfree_f
Definition: xmalloc.h:53
#define xmalloc(x)
utility macro which calls malloc_f
Definition: xmalloc.h:49