YAZ 5.35.1
utf8.c
Go to the documentation of this file.
1/* This file is part of the YAZ toolkit.
2 * Copyright (C) Index Data
3 * See the file LICENSE for details.
4 */
10#if HAVE_CONFIG_H
11#include <config.h>
12#endif
13
14#include <assert.h>
15#include <errno.h>
16#include <string.h>
17
18#include "iconv-p.h"
19
21 unsigned char *inp,
22 size_t inbytesleft, size_t *no_read)
23{
24 if (!inp || inp[0] != 0xef)
25 {
26 *no_read = 0;
27 return 0;
28 }
29 if (inbytesleft < 3)
30 {
32 return (size_t) -1;
33 }
34 if (inp[1] != 0xbb && inp[2] == 0xbf)
35 *no_read = 3;
36 else
37 *no_read = 0;
38 return 0;
39}
40
41unsigned long yaz_read_UTF8_char(const unsigned char *inp,
42 size_t inbytesleft, size_t *no_read,
43 int *error)
44{
45 unsigned long x = 0;
46
47 *no_read = 0; /* by default */
48 if (inp[0] <= 0x7f)
49 {
50 x = inp[0];
51 *no_read = 1;
52 }
53 else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
54 {
55 *error = YAZ_ICONV_EILSEQ;
56 }
57 else if (inp[0] <= 0xdf && inbytesleft >= 2)
58 {
59 if ((inp[1] & 0xc0) == 0x80)
60 {
61 x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
62 if (x >= 0x80)
63 *no_read = 2;
64 else
65 *error = YAZ_ICONV_EILSEQ;
66 }
67 else
68 *error = YAZ_ICONV_EILSEQ;
69 }
70 else if (inp[0] <= 0xef && inbytesleft >= 3)
71 {
72 if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80)
73 {
74 x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
75 (inp[2] & 0x3f);
76 if (x >= 0x800)
77 *no_read = 3;
78 else
79 *error = YAZ_ICONV_EILSEQ;
80 }
81 else
82 *error = YAZ_ICONV_EILSEQ;
83 }
84 else if (inp[0] <= 0xf7 && inbytesleft >= 4)
85 {
86 if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80
87 && (inp[3] & 0xc0) == 0x80)
88 {
89 x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
90 ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
91 if (x >= 0x10000)
92 *no_read = 4;
93 else
94 *error = YAZ_ICONV_EILSEQ;
95 }
96 else
97 *error = YAZ_ICONV_EILSEQ;
98 }
99 else if (inp[0] <= 0xfb && inbytesleft >= 5)
100 {
101 if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80
102 && (inp[3] & 0xc0) == 0x80 && (inp[4] & 0xc0) == 0x80)
103 {
104 x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
105 ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
106 (inp[4] & 0x3f);
107 if (x >= 0x200000)
108 *no_read = 5;
109 else
110 *error = YAZ_ICONV_EILSEQ;
111 }
112 else
113 *error = YAZ_ICONV_EILSEQ;
114 }
115 else if (inp[0] <= 0xfd && inbytesleft >= 6)
116 {
117 if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80
118 && (inp[3] & 0xc0) == 0x80 && (inp[4] & 0xc0) == 0x80
119 && (inp[5] & 0xc0) == 0x80)
120 {
121 x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
122 ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
123 ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
124 if (x >= 0x4000000)
125 *no_read = 6;
126 else
127 *error = YAZ_ICONV_EILSEQ;
128 }
129 else
130 *error = YAZ_ICONV_EILSEQ;
131 }
132 else
133 *error = YAZ_ICONV_EINVAL; /* incomplete sentence */
134
135 return x;
136}
137
138static unsigned long read_utf8(yaz_iconv_t cd, yaz_iconv_decoder_t d,
139 unsigned char *inp,
140 size_t inbytesleft, size_t *no_read)
141{
142 int err = 0;
143 int r = yaz_read_UTF8_char(inp, inbytesleft, no_read, &err);
144 yaz_iconv_set_errno(cd, err);
145 return r;
146}
147
148
150 unsigned long x,
151 char **outbuf, size_t *outbytesleft)
152{
153 int err = 0;
154 int r = yaz_write_UTF8_char(x, outbuf, outbytesleft, &err);
155 yaz_iconv_set_errno(cd, err);
156 return r;
157}
158
159size_t yaz_write_UTF8_char(unsigned long x,
160 char **outbuf, size_t *outbytesleft,
161 int *error)
162{
163 unsigned char *outp = (unsigned char *) *outbuf;
164
165 if (x <= 0x7f && *outbytesleft >= 1)
166 {
167 *outp++ = (unsigned char) x;
168 (*outbytesleft)--;
169 }
170 else if (x <= 0x7ff && *outbytesleft >= 2)
171 {
172 *outp++ = (unsigned char) ((x >> 6) | 0xc0);
173 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
174 (*outbytesleft) -= 2;
175 }
176 else if (x <= 0xffff && *outbytesleft >= 3)
177 {
178 *outp++ = (unsigned char) ((x >> 12) | 0xe0);
179 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
180 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
181 (*outbytesleft) -= 3;
182 }
183 else if (x <= 0x1fffff && *outbytesleft >= 4)
184 {
185 *outp++ = (unsigned char) ((x >> 18) | 0xf0);
186 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
187 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
188 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
189 (*outbytesleft) -= 4;
190 }
191 else if (x <= 0x3ffffff && *outbytesleft >= 5)
192 {
193 *outp++ = (unsigned char) ((x >> 24) | 0xf8);
194 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
195 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
196 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
197 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
198 (*outbytesleft) -= 5;
199 }
200 else if (*outbytesleft >= 6)
201 {
202 *outp++ = (unsigned char) ((x >> 30) | 0xfc);
203 *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
204 *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
205 *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
206 *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
207 *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
208 (*outbytesleft) -= 6;
209 }
210 else
211 {
212 *error = YAZ_ICONV_E2BIG; /* not room for output */
213 return (size_t)(-1);
214 }
215 *outbuf = (char *) outp;
216 return 0;
217}
218
221
222{
223 if (!yaz_matchstr(tocode, "UTF8"))
224 {
226 return e;
227 }
228 return 0;
229}
230
233{
234 if (!yaz_matchstr(fromcode, "UTF8"))
235 {
238 return d;
239 }
240 return 0;
241}
242
/*
 * Checks whether a NUL-terminated string decodes cleanly as UTF-8.
 *
 * The string is walked one character at a time with yaz_read_UTF8_char;
 * the first decoding error ends the scan.
 *
 * Returns 1 when every character decoded without error (an empty string
 * also yields 1), 0 otherwise.
 */
int yaz_utf8_check(const char *str)
{
    const unsigned char *cursor = (const unsigned char *) str;
    size_t remaining;

    for (remaining = strlen(str); remaining > 0; )
    {
        size_t used;
        int status = 0;

        yaz_read_UTF8_char(cursor, remaining, &used, &status);
        if (status != 0)
            return 0;
        cursor += used;
        remaining -= used;
    }
    return 1;
}
261
262/*
263 * Local variables:
264 * c-basic-offset: 4
265 * c-file-style: "Stroustrup"
266 * indent-tabs-mode: nil
267 * End:
268 * vim: shiftwidth=4 tabstop=8 expandtab
269 */
270
Header for errno utilities.
Internal header for iconv.
void yaz_iconv_set_errno(yaz_iconv_t cd, int no)
Definition siconv.c:298
int yaz_matchstr(const char *s1, const char *s2)
match strings - independent of case and '-'
Definition matchstr.c:42
unsigned long(* read_handle)(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inbuf, size_t inbytesleft, size_t *no_read)
Definition iconv-p.h:86
size_t(* init_handle)(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inbuf, size_t inbytesleft, size_t *no_read)
Definition iconv-p.h:83
size_t(* write_handle)(yaz_iconv_t cd, yaz_iconv_encoder_t e, unsigned long x, char **outbuf, size_t *outbytesleft)
Definition iconv-p.h:45
static size_t write_UTF8(yaz_iconv_t cd, yaz_iconv_encoder_t en, unsigned long x, char **outbuf, size_t *outbytesleft)
Definition utf8.c:149
static unsigned long read_utf8(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inp, size_t inbytesleft, size_t *no_read)
Definition utf8.c:138
size_t yaz_write_UTF8_char(unsigned long x, char **outbuf, size_t *outbytesleft, int *error)
encodes UTF-8 sequence
Definition utf8.c:159
yaz_iconv_encoder_t yaz_utf8_encoder(const char *tocode, yaz_iconv_encoder_t e)
Definition utf8.c:219
int yaz_utf8_check(const char *str)
check whether string appears to be UTF-8 encoded
Definition utf8.c:243
yaz_iconv_decoder_t yaz_utf8_decoder(const char *fromcode, yaz_iconv_decoder_t d)
Definition utf8.c:231
static size_t init_utf8(yaz_iconv_t cd, yaz_iconv_decoder_t d, unsigned char *inp, size_t inbytesleft, size_t *no_read)
Definition utf8.c:20
unsigned long yaz_read_UTF8_char(const unsigned char *inp, size_t inbytesleft, size_t *no_read, int *error)
Definition utf8.c:41
#define YAZ_ICONV_EILSEQ
error code: Invalid sequence
Definition yaz-iconv.h:49
#define YAZ_ICONV_E2BIG
error code: Not sufficient room for output buffer
Definition yaz-iconv.h:47
#define YAZ_ICONV_EINVAL
error code: An incomplete multibyte sequence is in input buffer
Definition yaz-iconv.h:51