AmendHub

Download

jcs

/

wallops

/

utf8.c

 

(View History)

jcs   utf8: Import from Carl Latest amendment: 112 on 2024-09-17

1 /*
2 * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
3 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
4 *
5 * Copyright (c) 2022 joshua stein <jcs@jcs.org>
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sublicense, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be
16 * included in all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
22 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
23 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
24 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 * SOFTWARE.
26 */
27
28 #include "utf8.h"
29
30 #define ONLY_TO_MACROMAN 1
31 #define ONLY_TO_UTF8 2
32
33 static const struct utf8_macroman_pair {
34 utf8_codepoint utf8;
35 unsigned char macroman;
36 unsigned char type;
37 } utf8_macroman_pairs[] = {
38 { 0x000000a0, ' ', ONLY_TO_MACROMAN }, /* supposed to be non-breaking */
39 { 0x00002019, '\'', ONLY_TO_MACROMAN },
40 { 0x00002026, '…', ONLY_TO_MACROMAN },
41 { 0x0000c2a0, ' ', ONLY_TO_MACROMAN },
42 { 0x00efbfbd, UTF8_UNKNOWN_MACROMAN, ONLY_TO_UTF8 }, /* unknown utf-8 char */
43
44 { 0x0000c2a1, '¡' },
45 { 0x0000c2a2, '¢' },
46 { 0x0000c2a3, '£' },
47 { 0x0000c2a5, '¥' },
48 { 0x0000c2a7, '§' },
49 { 0x0000c2a8, '¨' },
50 { 0x0000c2a9, '©' },
51 { 0x0000c2aa, 'ª' },
52 { 0x0000c2ab, '«' },
53 { 0x0000c2ac, '¬' },
54 { 0x0000c2ae, '®' },
55 { 0x0000c2b0, '°' },
56 { 0x0000c2b1, '±' },
57 { 0x0000c2b4, '´' },
58 { 0x0000c2b5, 'µ' },
59 { 0x0000c2b6, '¶' },
60 { 0x0000c2ba, 'º' },
61 { 0x0000c2bb, '»' },
62 { 0x0000c2bf, '¿' },
63 { 0x0000c380, 'À' },
64 { 0x0000c383, 'Ã' },
65 { 0x0000c384, 'Ä' },
66 { 0x0000c385, 'Å' },
67 { 0x0000c386, 'Æ' },
68 { 0x0000c387, 'Ç' },
69 { 0x0000c389, 'É' },
70 { 0x0000c391, 'Ñ' },
71 { 0x0000c395, 'Õ' },
72 { 0x0000c396, 'Ö' },
73 { 0x0000c398, 'Ø' },
74 { 0x0000c39c, 'Ü' },
75 { 0x0000c39f, 'ß' },
76 { 0x0000c3a0, 'à' },
77 { 0x0000c3a1, 'á' },
78 { 0x0000c3a2, 'â' },
79 { 0x0000c3a3, 'ã' },
80 { 0x0000c3a4, 'ä' },
81 { 0x0000c3a5, 'å' },
82 { 0x0000c3a6, 'æ' },
83 { 0x0000c3a7, 'ç' },
84 { 0x0000c3a8, 'è' },
85 { 0x0000c3a9, 'é' },
86 { 0x0000c3aa, 'ê' },
87 { 0x0000c3ab, 'ë' },
88 { 0x0000c3ac, 'ì' },
89 { 0x0000c3ad, 'í' },
90 { 0x0000c3ae, 'î' },
91 { 0x0000c3af, 'ï' },
92 { 0x0000c3b1, 'ñ' },
93 { 0x0000c3b2, 'ò' },
94 { 0x0000c3b3, 'ó' },
95 { 0x0000c3b4, 'ô' },
96 { 0x0000c3b5, 'õ' },
97 { 0x0000c3b6, 'ö' },
98 { 0x0000c3b7, '÷' },
99 { 0x0000c3b8, 'ø' },
100 { 0x0000c3b9, 'ù' },
101 { 0x0000c3ba, 'ú' },
102 { 0x0000c3bb, 'û' },
103 { 0x0000c3bc, 'ü' },
104 { 0x0000c3bf, 'ÿ' },
105 { 0x0000c592, 'Œ' },
106 { 0x0000c593, 'œ' },
107 { 0x0000c5b8, 'Ÿ' },
108 { 0x0000c692, 'ƒ' },
109 { 0x0000cf80, 'π' },
110 { 0x00e28093, '–' },
111 { 0x00e28094, '—' },
112 { 0x00e28098, '‘' },
113 { 0x00e28099, '’' },
114 { 0x00e2809c, '“' },
115 { 0x00e2809d, '”' },
116 { 0x00e280a0, '†' },
117 { 0x00e280a2, '•' },
118 { 0x00e280a6, '…' },
119 { 0x00e284a2, '™' },
120 { 0x00e284a6, 'Ω' },
121 { 0x00e28882, '∂' },
122 { 0x00e28886, '∆' },
123 { 0x00e2888f, '∏' },
124 { 0x00e28891, '∑' },
125 { 0x00e2889a, '√' },
126 { 0x00e2889e, '∞' },
127 { 0x00e288ab, '∫' },
128 { 0x00e28988, '≈' },
129 { 0x00e289a0, '≠' },
130 { 0x00e289a4, '≤' },
131 { 0x00e289a5, '≥' },
132 };
133
/*
 * Lookup table driving utf8_decode.
 *
 * The first 256 entries map an input byte to a character class.  The
 * entries after that form the transition table, which utf8_decode
 * indexes as 256 + (state * 16) + class to obtain the next state.
 */
static const unsigned char utf8d[] = {
	/* byte -> character class */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/* 00..0f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/* 10..1f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/* 20..2f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/* 30..3f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/* 40..4f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/* 50..5f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/* 60..6f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/* 70..7f */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,		/* 80..8f */
	9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,		/* 90..9f */
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,		/* a0..af */
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,		/* b0..bf */
	8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,		/* c0..cf */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,		/* d0..df */
	10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,	/* e0..ef */
	11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,	/* f0..ff */

	/* (state, class) -> next state, one row of 16 per state */
	0, 1, 2, 3, 5, 8, 7, 1, 1, 1, 4, 6, 1, 1, 1, 1,		/* s0 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,		/* s1 */
	1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,		/* s2 */
	1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1,		/* s3 */
	1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,		/* s4 */
	1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,		/* s5 */
	1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,		/* s6 */
	1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,		/* s7 */
	1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,		/* s8 */
};
150
/*
 * ISO-8859-1 to MacRoman translation for the upper half of the byte
 * range.  Index with (byte - 128); a '?' entry marks an ISO-8859-1
 * character that is given no MacRoman equivalent here.
 */
static const unsigned char iso88591_macroman_pairs[] = {
	/* 80..8f */
	'?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?',
	/* 90..9f */
	'?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?', '?',
	/* a0..af */
	0xca, 0xc1, 0xa2, 0xa3, 0xdb, 0xb4, '?', 0xa4, 0xac, 0xa9, 0xbb, 0xc7, 0xc2, '?', 0xa8, 0xf8,
	/* b0..bf */
	0xa1, 0xb1, '?', '?', 0xab, 0xb5, 0xa6, 0xe1, 0xfc, '?', 0xbc, 0xc8, '?', '?', '?', 0xc0,
	/* c0..cf */
	0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, 0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec,
	/* d0..df */
	'?', 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, '?', 0xaf, 0xf4, 0xf2, 0xf3, 0x86, '?', '?', 0xa7,
	/* e0..ef */
	0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95,
	/* f0..ff */
	'?', 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, 0xbf, 0x9d, 0x9c, 0x9e, 0x9f, '?', '?', 0xd8,
};
170
171 utf8_codepoint
172 utf8_decode(utf8_codepoint *state, utf8_codepoint *codep,
173 utf8_codepoint byte)
174 {
175 utf8_codepoint type = utf8d[byte];
176
177 *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6) :
178 (0xff >> type) & (byte);
179
180 *state = utf8d[256 + (*state * 16) + type];
181 return *state;
182 }
183
184 size_t
185 utf8_to_macroman_string(char *str, size_t len, char *ret)
186 {
187 struct utf8_macroman_pair *pair;
188 utf8_codepoint codepoint, utf8_bytes;
189 utf8_codepoint state = UTF8_ACCEPT;
190 size_t retlen, n, j;
191 short bytes = 0;
192 unsigned char *ustr = (unsigned char *)str;
193 bool found_macroman;
194
195 retlen = 0;
196 for (n = 0; n < len; n++) {
197 bytes++;
198
199 if (utf8_decode(&state, &codepoint, ustr[n]) == 0) {
200 if (bytes > 1) {
201 utf8_bytes = 0;
202 if (bytes == 4)
203 utf8_bytes |= ((unsigned long)(ustr[n - 3]) << 24);
204 if (bytes >= 3)
205 utf8_bytes |= ((unsigned long)(ustr[n - 2]) << 16);
206 if (bytes >= 2)
207 utf8_bytes |= ((unsigned long)(ustr[n - 1]) << 8);
208 if (bytes >= 1)
209 utf8_bytes |= ustr[n];
210
211 found_macroman = false;
212 for (j = 0; j < nitems(utf8_macroman_pairs); j++) {
213 pair = (struct utf8_macroman_pair *)&utf8_macroman_pairs[j];
214 if (utf8_bytes == pair->utf8 &&
215 pair->type != ONLY_TO_UTF8) {
216 codepoint = pair->macroman;
217 found_macroman = true;
218 break;
219 }
220 }
221
222 if (!found_macroman)
223 codepoint = UTF8_UNKNOWN_MACROMAN;
224 }
225
226 ret[retlen++] = codepoint;
227 bytes = 0;
228 state = UTF8_ACCEPT;
229 }
230 }
231
232 ret[retlen] = '\0';
233 return retlen;
234 }
235
236 size_t
237 macroman_to_utf8_string(char *str, size_t len, char **retp)
238 {
239 struct utf8_macroman_pair *pair;
240 size_t retlen, n, j;
241 unsigned char *ustr = (unsigned char *)str;
242 unsigned char c;
243 bool found_utf8;
244
245 retlen = 0;
246 for (n = 0; n < len; n++) {
247 c = ustr[n];
248
249 if (c >= 128)
250 retlen += 4;
251 else
252 retlen++;
253 }
254
255 *retp = xmalloc(retlen + 1);
256 if (*retp == NULL)
257 return 0;
258
259 retlen = 0;
260 for (n = 0; n < len; n++) {
261 c = ustr[n];
262
263 if (c < 128) {
264 (*retp)[retlen++] = c;
265 continue;
266 }
267
268 found_utf8 = false;
269 for (j = 0; j < nitems(utf8_macroman_pairs); j++) {
270 pair = (struct utf8_macroman_pair *)&utf8_macroman_pairs[j];
271 if (pair->type == ONLY_TO_MACROMAN)
272 continue;
273 if (c == pair->macroman) {
274 found_utf8 = true;
275 if (pair->utf8 >= 0x01000000)
276 (*retp)[retlen++] = (pair->utf8 >> 24);
277 if (pair->utf8 >= 0x00010000)
278 (*retp)[retlen++] = (pair->utf8 >> 16) & 0xff;
279 if (pair->utf8 >= 0x00000100)
280 (*retp)[retlen++] = (pair->utf8 >> 8) & 0xff;
281 if (pair->utf8 >= 0x00000001)
282 (*retp)[retlen++] = pair->utf8 & 0xff;
283 break;
284 }
285 }
286
287 if (!found_utf8)
288 (*retp)[retlen++] = c;
289 }
290
291 (*retp)[retlen] = '\0';
292 return retlen;
293 }
294
295 size_t
296 iso88591_to_macroman_string(char *str, size_t len, char **retp)
297 {
298 size_t mlen, retlen, n, j;
299 unsigned char *ustr = (unsigned char *)str;
300 unsigned char c;
301
302 *retp = xmalloc(len + 1);
303 if (*retp == NULL)
304 return 0;
305
306 retlen = 0;
307 for (n = 0; n < len; n++) {
308 c = ustr[n];
309
310 if (c >= 128)
311 (*retp)[retlen++] = iso88591_macroman_pairs[c - 128];
312 else
313 (*retp)[retlen++] = c;
314 }
315
316 (*retp)[retlen] = '\0';
317 return retlen;
318 }