YAZ  5.34.0
icu_tokenizer.c
Go to the documentation of this file.
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) Index Data
3  * See the file LICENSE for details.
4  */
5 
11 #if HAVE_CONFIG_H
12 #include "config.h"
13 #endif
14 
15 #if YAZ_HAVE_ICU
16 #include <yaz/xmalloc.h>
17 
18 #include <yaz/icu_I18N.h>
19 
20 #include <yaz/log.h>
21 
22 #include <assert.h>
23 #include <string.h>
24 #include <stdlib.h>
25 #include <stdio.h>
26 
27 #include <unicode/ustring.h> /* some more string fcns*/
28 #include <unicode/uchar.h> /* char names */
29 
30 struct icu_tokenizer
31 {
32  char action;
33  UBreakIterator* bi;
34  struct icu_buf_utf16 * buf16;
35  int32_t token_count;
36  int32_t token_id;
37  int32_t token_start;
38  int32_t token_end;
39 /*
40  keep always invariant
41  0 <= token_start
42  <= token_end
43  <= buf16->utf16_len
44  and invariant
45  0 <= token_id <= token_count
46 */
47 };
48 
49 static void icu_tokenizer_reset(struct icu_tokenizer *tokenizer,
50  char action)
51 {
52  tokenizer->action = action;
53  tokenizer->bi = 0;
54  tokenizer->buf16 = icu_buf_utf16_create(0);
55  tokenizer->token_count = 0;
56  tokenizer->token_id = 0;
57  tokenizer->token_start = 0;
58  tokenizer->token_end = 0;
59  tokenizer->bi = 0;
60 }
61 
62 struct icu_tokenizer *icu_tokenizer_clone(struct icu_tokenizer *old)
63 {
64  int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
65  UErrorCode status = U_ZERO_ERROR;
66  struct icu_tokenizer * tokenizer
67  = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
68 
69  assert(old);
70  icu_tokenizer_reset(tokenizer, old->action);
71  assert(old->bi);
72  tokenizer->bi = ubrk_safeClone(old->bi, NULL, &bufferSize, &status);
73  if (U_SUCCESS(status))
74  return tokenizer;
75  return tokenizer;
76 }
77 
78 struct icu_tokenizer *icu_tokenizer_create(const char *locale, char action,
79  UErrorCode *status)
80 {
81  struct icu_tokenizer *tokenizer
82  = (struct icu_tokenizer *) xmalloc(sizeof(struct icu_tokenizer));
83 
84  icu_tokenizer_reset(tokenizer, action);
85  switch (tokenizer->action)
86  {
87  case 'l':
88  case 'L':
89  tokenizer->bi = ubrk_open(UBRK_LINE, locale, 0, 0, status);
90  break;
91  case 's':
92  case 'S':
93  tokenizer->bi = ubrk_open(UBRK_SENTENCE, locale, 0, 0, status);
94  break;
95  case 'w':
96  case 'W':
97  tokenizer->bi = ubrk_open(UBRK_WORD, locale, 0, 0, status);
98  break;
99  case 'c':
100  case 'C':
101  tokenizer->bi = ubrk_open(UBRK_CHARACTER, locale, 0, 0, status);
102  break;
103  case 't':
104  case 'T':
105  tokenizer->bi = ubrk_open(UBRK_TITLE, locale, 0, 0, status);
106  break;
107  default:
108  *status = U_UNSUPPORTED_ERROR;
109  return 0;
110  break;
111  }
112 
113  /* ICU error stuff is a very funny business */
114  if (U_SUCCESS(*status))
115  return tokenizer;
116 
117  /* freeing if failed */
118  icu_tokenizer_destroy(tokenizer);
119  return 0;
120 }
121 
122 void icu_tokenizer_destroy(struct icu_tokenizer *tokenizer)
123 {
124  if (tokenizer)
125  {
126  icu_buf_utf16_destroy(tokenizer->buf16);
127  if (tokenizer->bi)
128  ubrk_close(tokenizer->bi);
129  xfree(tokenizer);
130  }
131 }
132 
133 int icu_tokenizer_attach(struct icu_tokenizer *tokenizer,
134  struct icu_buf_utf16 *src16,
135  UErrorCode *status)
136 {
137  if (!tokenizer || !tokenizer->bi || !src16)
138  return 0;
139 
140  icu_buf_utf16_copy(tokenizer->buf16, src16);
141 
142  tokenizer->token_count = 0;
143  tokenizer->token_id = 0;
144  tokenizer->token_start = 0;
145  tokenizer->token_end = 0;
146 
147  ubrk_setText(tokenizer->bi,
148  tokenizer->buf16->utf16, tokenizer->buf16->utf16_len, status);
149 
150  if (U_FAILURE(*status))
151  return 0;
152 
153  return 1;
154 }
155 
156 int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer,
157  struct icu_buf_utf16 *tkn16,
158  UErrorCode *status,
159  size_t *start, size_t *len)
160 {
161  int32_t tkn_start = 0;
162  int32_t tkn_end = 0;
163  int32_t tkn_len = 0;
164 
165  if (!tokenizer || !tokenizer->bi
166  || !tokenizer->buf16 || !tokenizer->buf16->utf16_len)
167  return 0;
168  /*
169  never change tokenizer->buf16 and keep always invariant
170  0 <= tokenizer->token_start
171  <= tokenizer->token_end
172  <= tokenizer->buf16->utf16_len
173  returns length of token
174  */
175 
176  if (0 == tokenizer->token_end) /* first call */
177  tkn_start = ubrk_first(tokenizer->bi);
178  else /* successive calls */
179  tkn_start = tokenizer->token_end;
180 
181  /* get next position */
182  tkn_end = ubrk_next(tokenizer->bi);
183 
184  /* repairing invariant at end of ubrk, which is UBRK_DONE = -1 */
185  if (UBRK_DONE == tkn_end)
186  tkn_end = tokenizer->buf16->utf16_len;
187 
188  /* copy out if everything is well */
189  if (U_FAILURE(*status))
190  return 0;
191 
192  /* everything OK, now update internal state */
193  tkn_len = tkn_end - tkn_start;
194 
195  if (0 < tkn_len)
196  {
197  tokenizer->token_count++;
198  tokenizer->token_id++;
199  }
200  else
201  tokenizer->token_id = 0;
202 
203  tokenizer->token_start = tkn_start;
204  tokenizer->token_end = tkn_end;
205 
206  *start = tkn_start;
207  *len = tkn_end - tkn_start;
208 
209  /* copying into token buffer if it exists */
210  if (tkn16)
211  {
212  if (tkn16->utf16_cap < tkn_len)
213  icu_buf_utf16_resize(tkn16, (size_t) tkn_len * 2);
214 
215  u_strncpy(tkn16->utf16, &(tokenizer->buf16->utf16)[tkn_start],
216  tkn_len);
217 
218  tkn16->utf16_len = tkn_len;
219  }
220 
221  return tkn_len;
222 }
223 
224 int32_t icu_tokenizer_token_count(struct icu_tokenizer *tokenizer)
225 {
226  return tokenizer->token_count;
227 }
228 
229 #endif /* YAZ_HAVE_ICU */
230 
231 /*
232  * Local variables:
233  * c-basic-offset: 4
234  * c-file-style: "Stroustrup"
235  * indent-tabs-mode: nil
236  * End:
237  * vim: shiftwidth=4 tabstop=8 expandtab
238  */
239 
Internal header for ICU utilities.
struct icu_buf_utf16 * icu_buf_utf16_copy(struct icu_buf_utf16 *dest16, const struct icu_buf_utf16 *src16)
struct icu_tokenizer * icu_tokenizer_create(const char *locale, char action, UErrorCode *status)
struct icu_tokenizer * icu_tokenizer_clone(struct icu_tokenizer *old)
int32_t icu_tokenizer_token_count(struct icu_tokenizer *tokenizer)
int icu_tokenizer_attach(struct icu_tokenizer *tokenizer, struct icu_buf_utf16 *src16, UErrorCode *status)
void icu_buf_utf16_destroy(struct icu_buf_utf16 *buf16)
void icu_tokenizer_destroy(struct icu_tokenizer *tokenizer)
struct icu_buf_utf16 * icu_buf_utf16_create(size_t capacity)
struct icu_buf_utf16 * icu_buf_utf16_resize(struct icu_buf_utf16 *buf16, size_t capacity)
int32_t icu_tokenizer_next_token(struct icu_tokenizer *tokenizer, struct icu_buf_utf16 *tkn16, UErrorCode *status, size_t *start, size_t *len)
Logging utility.
UChar * utf16
Definition: icu_I18N.h:54
int32_t utf16_cap
Definition: icu_I18N.h:56
int32_t utf16_len
Definition: icu_I18N.h:55
Header for memory handling functions.
#define xfree(x)
utility macro which calls xfree_f
Definition: xmalloc.h:53
#define xmalloc(x)
utility macro which calls malloc_f
Definition: xmalloc.h:49