YAZ 5.35.1
icu_tokenizer.c
Go to the documentation of this file.
1/* This file is part of the YAZ toolkit.
2 * Copyright (C) Index Data
3 * See the file LICENSE for details.
4 */
5
11#if HAVE_CONFIG_H
12#include "config.h"
13#endif
14
15#if YAZ_HAVE_ICU
16#include <yaz/xmalloc.h>
17
18#include <yaz/icu_I18N.h>
19
20#include <yaz/log.h>
21
22#include <assert.h>
23#include <string.h>
24#include <stdlib.h>
25#include <stdio.h>
26
27#include <unicode/ustring.h> /* some more string fcns*/
28#include <unicode/uchar.h> /* char names */
29
30struct icu_tokenizer
31{
32 char action;
33 UBreakIterator* bi;
34 struct icu_buf_utf16 * buf16;
35 int32_t token_count;
36 int32_t token_id;
37 int32_t token_start;
38 int32_t token_end;
39/*
40 keep always invariant
41 0 <= token_start
42 <= token_end
43 <= buf16->utf16_len
44 and invariant
45 0 <= token_id <= token_count
46*/
47};
48
49static void icu_tokenizer_reset(struct icu_tokenizer *tokenizer,
50 char action)
51{
52 tokenizer->action = action;
53 tokenizer->bi = 0;
54 tokenizer->buf16 = icu_buf_utf16_create(0);
55 tokenizer->token_count = 0;
56 tokenizer->token_id = 0;
57 tokenizer->token_start = 0;
58 tokenizer->token_end = 0;
59 tokenizer->bi = 0;
60}
61
62struct icu_tokenizer *icu_tokenizer_clone(struct icu_tokenizer *old)
63{
64 UErrorCode status = U_ZERO_ERROR;
65 struct icu_tokenizer * tokenizer
66 = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
67#if U_ICU_VERSION_MAJOR_NUM < 69
68 int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
69#endif
70
71 assert(old);
72 icu_tokenizer_reset(tokenizer, old->action);
73 assert(old->bi);
74#if U_ICU_VERSION_MAJOR_NUM < 69
75 tokenizer->bi = ubrk_safeClone(old->bi, NULL, &bufferSize, &status);
76#else
77 tokenizer->bi = ubrk_clone(old->bi, &status);
78#endif
79 if (U_SUCCESS(status))
80 return tokenizer;
81 return tokenizer;
82}
83
84struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
85 UErrorCode *status)
86{
87 struct icu_tokenizer *tokenizer
88 = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
89
90 icu_tokenizer_reset(tokenizer, action);
91 switch (tokenizer->action)
92 {
93 case 'l':
94 case 'L':
95 tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
96 break;
97 case 's':
98 case 'S':
99 tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status);
100 break;
101 case 'w':
102 case 'W':
103 tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status);
104 break;
105 case 'c':
106 case 'C':
107 tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status);
108 break;
109 case 't':
110 case 'T':
111 tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status);
112 break;
113 default:
114 *status = U_UNSUPPORTED_ERROR;
115 return 0;
116 break;
117 }
118
119 /* ICU error stuff is a very funny business */
120 if (U_SUCCESS(*status))
121 return tokenizer;
122
123 /* freeing if failed */
124 icu_tokenizer_destroy(tokenizer);
125 return 0;
126}
127
128void icu_tokenizer_destroy(struct icu_tokenizer *tokenizer)
129{
130 if (tokenizer)
131 {
132 icu_buf_utf16_destroy(tokenizer->buf16);
133 if (tokenizer->bi)
134 ubrk_close(tokenizer->bi);
135 xfree(tokenizer);
136 }
137}
138
139int icu_tokenizer_attach(struct icu_tokenizer *tokenizer,
140 struct icu_buf_utf16 *src16,
141 UErrorCode *status)
142{
143 if (!tokenizer || !tokenizer->bi || !src16)
144 return 0;
145
146 icu_buf_utf16_copy(tokenizer->buf16, src16);
147
148 tokenizer->token_count = 0;
149 tokenizer->token_id = 0;
150 tokenizer->token_start = 0;
151 tokenizer->token_end = 0;
152
153 ubrk_setText(tokenizer->bi,
154 tokenizer->buf16->utf16, tokenizer->buf16->utf16_len, status);
155
156 if (U_FAILURE(*status))
157 return 0;
158
159 return 1;
160}
161
162int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer,
163 struct icu_buf_utf16 *tkn16,
164 UErrorCode *status,
165 size_t *start, size_t *len)
166{
167 int32_t tkn_start = 0;
168 int32_t tkn_end = 0;
169 int32_t tkn_len = 0;
170
171 if (!tokenizer || !tokenizer->bi
172 || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
173 return 0;
174 /*
175 never change tokenizer->buf16 and keep always invariant
176 0 <= tokenizer->token_start
177 <= tokenizer->token_end
178 <= tokenizer->buf16->utf16_len
179 returns length of token
180 */
181
182 if (0 == tokenizer->token_end) /* first call */
183 tkn_start = ubrk_first(tokenizer->bi);
184 else /* successive calls */
185 tkn_start = tokenizer->token_end;
186
187 /* get next position */
188 tkn_end = ubrk_next(tokenizer->bi);
189
190 /* repairing invariant at end of ubrk, which is UBRK_DONE = -1 */
191 if (UBRK_DONE == tkn_end)
192 tkn_end = tokenizer->buf16->utf16_len;
193
194 /* copy out if everything is well */
195 if (U_FAILURE(*status))
196 return 0;
197
198 /* everything OK, now update internal state */
199 tkn_len = tkn_end - tkn_start;
200
201 if (0 < tkn_len)
202 {
203 tokenizer->token_count++;
204 tokenizer->token_id++;
205 }
206 else
207 tokenizer->token_id = 0;
208
209 tokenizer->token_start = tkn_start;
210 tokenizer->token_end = tkn_end;
211
212 *start = tkn_start;
213 *len = tkn_end - tkn_start;
214
215 /* copying into token buffer if it exists */
216 if (tkn16)
217 {
218 if (tkn16->utf16_cap < tkn_len)
219 icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
220
221 u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
222 tkn_len);
223
224 tkn16->utf16_len = tkn_len;
225 }
226
227 return tkn_len;
228}
229
230int32_t icu_tokenizer_token_count(struct icu_tokenizer *tokenizer)
231{
232 return tokenizer->token_count;
233}
234
235#endif /* YAZ_HAVE_ICU */
236
237/*
238 * Local variables:
239 * c-basic-offset: 4
240 * c-file-style: "Stroustrup"
241 * indent-tabs-mode: nil
242 * End:
243 * vim: shiftwidth=4 tabstop=8 expandtab
244 */
245
Internal header for ICU utilities.
struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 *buf16, size_t capacity)
struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action, UErrorCode *status)
int32_t icu_tokenizer_token_count(struct icu_tokenizer *tokenizer)
int icu_tokenizer_attach(struct icu_tokenizer *tokenizer, struct icu_buf_utf16 *src16, UErrorCode *status)
struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
void icu_buf_utf16_destroy(struct icu_buf_utf16 *buf16)
void icu_tokenizer_destroy(struct icu_tokenizer *tokenizer)
struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 *dest16, const struct icu_buf_utf16 *src16)
int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer, struct icu_buf_utf16 *tkn16, UErrorCode *status, size_t *start, size_t *len)
struct icu_tokenizer * icu_tokenizer_clone(struct icu_tokenizer *old)
Logging utility.
UChar * utf16
Definition icu_I18N.h:54
int32_t utf16_cap
Definition icu_I18N.h:56
int32_t utf16_len
Definition icu_I18N.h:55
Header for memory handling functions.
#define xfree(x)
utility macro which calls xfree_f
Definition xmalloc.h:53
#define xmalloc(x)
utility macro which calls malloc_f
Definition xmalloc.h:49