27#include <unicode/ustring.h>
28#include <unicode/uchar.h>
49static void icu_tokenizer_reset(
struct icu_tokenizer *tokenizer,
52 tokenizer->action = action;
55 tokenizer->token_count = 0;
56 tokenizer->token_id = 0;
57 tokenizer->token_start = 0;
58 tokenizer->token_end = 0;
64 UErrorCode status = U_ZERO_ERROR;
65 struct icu_tokenizer * tokenizer
66 = (
struct icu_tokenizer *)
xmalloc(
sizeof(
struct icu_tokenizer));
67#if U_ICU_VERSION_MAJOR_NUM < 69
68 int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
72 icu_tokenizer_reset(tokenizer, old->action);
74#if U_ICU_VERSION_MAJOR_NUM < 69
75 tokenizer->bi = ubrk_safeClone(old->bi, NULL, &bufferSize, &status);
77 tokenizer->bi = ubrk_clone(old->bi, &status);
79 if (U_SUCCESS(status))
87 struct icu_tokenizer *tokenizer
88 = (
struct icu_tokenizer *)
xmalloc(
sizeof(
struct icu_tokenizer));
90 icu_tokenizer_reset(tokenizer, action);
91 switch (tokenizer->action)
95 tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
99 tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status);
103 tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status);
107 tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status);
111 tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status);
114 *status = U_UNSUPPORTED_ERROR;
120 if (U_SUCCESS(*status))
134 ubrk_close(tokenizer->bi);
143 if (!tokenizer || !tokenizer->bi || !src16)
148 tokenizer->token_count = 0;
149 tokenizer->token_id = 0;
150 tokenizer->token_start = 0;
151 tokenizer->token_end = 0;
153 ubrk_setText(tokenizer->bi,
154 tokenizer->buf16->utf16, tokenizer->buf16->utf16_len, status);
156 if (U_FAILURE(*status))
165 size_t *start,
size_t *len)
167 int32_t tkn_start = 0;
171 if (!tokenizer || !tokenizer->bi
172 || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
182 if (0 == tokenizer->token_end)
183 tkn_start = ubrk_first(tokenizer->bi);
185 tkn_start = tokenizer->token_end;
188 tkn_end = ubrk_next(tokenizer->bi);
191 if (UBRK_DONE == tkn_end)
192 tkn_end = tokenizer->buf16->utf16_len;
195 if (U_FAILURE(*status))
199 tkn_len = tkn_end - tkn_start;
203 tokenizer->token_count++;
204 tokenizer->token_id++;
207 tokenizer->token_id = 0;
209 tokenizer->token_start = tkn_start;
210 tokenizer->token_end = tkn_end;
213 *len = tkn_end - tkn_start;
221 u_strncpy(tkn16->
utf16, &(tokenizer->buf16->utf16)[tkn_start],
232 return tokenizer->token_count;
Internal header for ICU utilities.
struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 *buf16, size_t capacity)
struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action, UErrorCode *status)
int32_t icu_tokenizer_token_count(struct icu_tokenizer *tokenizer)
int icu_tokenizer_attach(struct icu_tokenizer *tokenizer, struct icu_buf_utf16 *src16, UErrorCode *status)
struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
void icu_buf_utf16_destroy(struct icu_buf_utf16 *buf16)
void icu_tokenizer_destroy(struct icu_tokenizer *tokenizer)
struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 *dest16, const struct icu_buf_utf16 *src16)
int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer, struct icu_buf_utf16 *tkn16, UErrorCode *status, size_t *start, size_t *len)
struct icu_tokenizer * icu_tokenizer_clone(struct icu_tokenizer *old)
Header for memory handling functions.
#define xfree(x)
utility macro which calls xfree_f
#define xmalloc(x)
utility macro which calls xmalloc_f