YAZ 5.35.1
stemmer.c
Go to the documentation of this file.
1/* This file is part of the YAZ toolkit.
2 * Copyright (C) Index Data
3 * See the file LICENSE for details.
4 */
5
11#if HAVE_CONFIG_H
12#include <config.h>
13#endif
14
15#if YAZ_HAVE_ICU
16
17#include <yaz/stemmer.h>
18
19#include <yaz/xmalloc.h>
20
21#include <libstemmer.h>
22
23#include <unicode/ustring.h> /* some more string fcns*/
24#include <unicode/uchar.h> /* char names */
25
/* Selects the backend a yaz_stemmer_t dispatches to. */
enum stemmer_implementation {
    /* pass-through: the input token is copied to the output unchanged */
    yaz_no_operation = 0,
    /* stemming is delegated to a Snowball (libstemmer) sb_stemmer */
    yaz_snowball = 1
};
/* State behind a yaz_stemmer_p handle. */
struct yaz_stemmer_t {
    /* one of enum stemmer_implementation */
    int implementation;

    /* private copy of the locale the stemmer was created with */
    char *locale;

    /* private copy of the rule the stemmer was created with */
    char *rule;

    /* Snowball stemmer handle; used when implementation is yaz_snowball */
    struct sb_stemmer *sb_stemmer;
};
37
/*
 * Return the character-encoding name handed to the Snowball stemmer.
 *
 * Both arguments are currently ignored; the result is always the
 * constant string "UTF_8".
 */
const char *yaz_stemmer_lookup_charenc(const char *charenc, const char *rule)
{
    static const char utf8_name[] = "UTF_8";

    (void) charenc;
    (void) rule;
    return utf8_name;
}
41
/*
 * Return the Snowball algorithm name to use for a locale/rule pair.
 *
 * The locale pointer is returned as-is (no copy is made); the rule
 * argument is currently ignored.
 */
const char *yaz_stemmer_lookup_algorithm(const char *locale, const char *rule)
{
    const char *algorithm_name = locale;

    (void) rule;
    return algorithm_name;
}
45
46yaz_stemmer_p yaz_stemmer_snowball_create(const char *locale, const char *rule, UErrorCode *status) {
47 const char *charenc = yaz_stemmer_lookup_charenc(locale, rule);
48 const char *algorithm = yaz_stemmer_lookup_algorithm(locale,rule);
49 struct sb_stemmer *stemmer = sb_stemmer_new(algorithm, charenc);
50 yaz_stemmer_p yaz_stemmer;
51 if (stemmer == 0) {
52 *status = U_ILLEGAL_ARGUMENT_ERROR;
53 yaz_log(YLOG_FATAL, "yaz_stemmer: Failed to create snowball stemmer from locale %srule %s. Showball: charenc %s algorithm %s ",
54 locale, rule, charenc, algorithm);
55 return 0;
56 }
57 yaz_log(YLOG_DEBUG, "created snowball stemmer: algorithm %s charenc %s ", algorithm, charenc);
58 yaz_stemmer = xmalloc(sizeof(*yaz_stemmer));
59 yaz_stemmer->implementation = yaz_snowball;
60
61 yaz_stemmer->locale = xstrdup(locale);
62 yaz_stemmer->rule = xstrdup(rule);
63 yaz_stemmer->sb_stemmer = stemmer;
64 yaz_log(YLOG_DEBUG, "created snowball stemmer: algorithm %s charenc %s ", algorithm, charenc);
65 return yaz_stemmer;
66}
67
68yaz_stemmer_p yaz_stemmer_create(const char *locale, const char *rule, UErrorCode *status) {
69 *status = U_ZERO_ERROR;
70 yaz_log(YLOG_DEBUG, "create stemmer: locale %s rule %s ", locale, rule);
71 return yaz_stemmer_snowball_create(locale, rule, status);
72}
73
75 UErrorCode error = U_ZERO_ERROR;
76 if (stemmer == 0)
77 return 0;
78 return yaz_stemmer_create(stemmer->locale, stemmer->rule, &error);
79}
80
81void yaz_stemmer_stem(yaz_stemmer_p stemmer, struct icu_buf_utf16 *dst, struct icu_buf_utf16* src, UErrorCode *status)
82{
83 switch(stemmer->implementation) {
84 case yaz_snowball: {
85 struct icu_buf_utf8 *utf8_buf = icu_buf_utf8_create(0);
86 icu_utf16_to_utf8(utf8_buf, src, status);
87 if (*status == U_ZERO_ERROR) {
88 const sb_symbol *cstr = (const sb_symbol*) icu_buf_utf8_to_cstr(utf8_buf);
89 const sb_symbol *sb_symbol = sb_stemmer_stem(stemmer->sb_stemmer, cstr, utf8_buf->utf8_len);
90 if (sb_symbol == 0) {
91 icu_buf_utf16_copy(dst, src);
92 }
93 else {
94
95 const char *cstr2 = (const char *) sb_symbol;
96 icu_utf16_from_utf8_cstr(dst, cstr2 , status);
97#if 0
98 yaz_log(YLOG_DEBUG, "stemming %s to %s ", cstr, cstr2);
99#endif
100 }
101 }
102 icu_buf_utf8_destroy(utf8_buf);
103 return ;
104 break;
105 }
106 case yaz_no_operation:
107 yaz_log(YLOG_DEBUG, "Stemmer (No operation) called");
108 default: {
109 icu_buf_utf16_copy(dst, src);
110 }
111 }
112}
113
115{
116 /* Handle no stemmer correctly */
117 if (stemmer == 0)
118 return ;
119
120 switch (stemmer->implementation) {
121 case yaz_snowball:
122 sb_stemmer_delete(stemmer->sb_stemmer);
123 break;
124 }
125 xfree(stemmer->locale);
126 xfree(stemmer->rule);
127 xfree(stemmer);
128}
129
130#endif /* YAZ_HAVE_ICU */
void icu_buf_utf8_destroy(struct icu_buf_utf8 *buf8)
UErrorCode icu_utf16_from_utf8_cstr(struct icu_buf_utf16 *dest16, const char *src8cstr, UErrorCode *status)
struct icu_buf_utf8 * icu_buf_utf8_create(size_t capacity)
const char * icu_buf_utf8_to_cstr(struct icu_buf_utf8 *src8)
UErrorCode icu_utf16_to_utf8(struct icu_buf_utf8 *dest8, const struct icu_buf_utf16 *src16, UErrorCode *status)
struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 *dest16, const struct icu_buf_utf16 *src16)
unsigned char sb_symbol
Definition libstemmer.h:8
struct sb_stemmer * sb_stemmer_new(const char *algorithm, const char *charenc)
Definition libstemmer.c:40
void sb_stemmer_delete(struct sb_stemmer *stemmer)
Definition libstemmer.c:77
const sb_symbol * sb_stemmer_stem(struct sb_stemmer *stemmer, const sb_symbol *word, int size)
Definition libstemmer.c:89
void yaz_log(int level, const char *fmt,...)
Writes log message.
Definition log.c:487
#define YLOG_FATAL
log level: fatal
Definition log.h:42
#define YLOG_DEBUG
log level: debugging
Definition log.h:44
Header for the stemming API.
yaz_stemmer_p yaz_stemmer_create(const char *locale, const char *rule, UErrorCode *status)
void yaz_stemmer_stem(yaz_stemmer_p stemmer, struct icu_buf_utf16 *dst, struct icu_buf_utf16 *src, UErrorCode *status)
void yaz_stemmer_destroy(yaz_stemmer_p stemmer)
yaz_stemmer_p yaz_stemmer_clone(yaz_stemmer_p stemmer)
struct yaz_stemmer_t * yaz_stemmer_p
Definition stemmer.h:49
int32_t utf8_len
Definition icu_I18N.h:81
Header for memory handling functions.
#define xstrdup(s)
utility macro which calls xstrdup_f
Definition xmalloc.h:55
#define xfree(x)
utility macro which calls xfree_f
Definition xmalloc.h:53
#define xmalloc(x)
utility macro which calls malloc_f
Definition xmalloc.h:49