Download
jcs
/wallops
/utf8.c
(View History)
jcs utf8: Import from Carl | Latest amendment: 112 on 2024-09-17 |
1 | /* |
2 | * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> |
3 | * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. |
4 | * |
5 | * Copyright (c) 2022 joshua stein <jcs@jcs.org> |
6 | * |
7 | * Permission is hereby granted, free of charge, to any person obtaining |
8 | * a copy of this software and associated documentation files (the |
9 | * "Software"), to deal in the Software without restriction, including |
10 | * without limitation the rights to use, copy, modify, merge, publish, |
11 | * distribute, sublicense, and/or sell copies of the Software, and to |
12 | * permit persons to whom the Software is furnished to do so, subject to |
13 | * the following conditions: |
14 | * |
15 | * The above copyright notice and this permission notice shall be |
16 | * included in all copies or substantial portions of the Software. |
17 | * |
18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
19 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
20 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
21 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
22 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
23 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
24 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
25 | * SOFTWARE. |
26 | */ |
27 | |
28 | #include "utf8.h" |
29 | |
30 | #define ONLY_TO_MACROMAN 1 |
31 | #define ONLY_TO_UTF8 2 |
32 | |
33 | static const struct utf8_macroman_pair { |
34 | utf8_codepoint utf8; |
35 | unsigned char macroman; |
36 | unsigned char type; |
37 | } utf8_macroman_pairs[] = { |
38 | { 0x000000a0, ' ', ONLY_TO_MACROMAN }, /* supposed to be non-breaking */ |
39 | { 0x00002019, '\'', ONLY_TO_MACROMAN }, |
40 | { 0x00002026, '…', ONLY_TO_MACROMAN }, |
41 | { 0x0000c2a0, ' ', ONLY_TO_MACROMAN }, |
42 | { 0x00efbfbd, UTF8_UNKNOWN_MACROMAN, ONLY_TO_UTF8 }, /* unknown utf-8 char */ |
43 | |
44 | { 0x0000c2a1, '¡' }, |
45 | { 0x0000c2a2, '¢' }, |
46 | { 0x0000c2a3, '£' }, |
47 | { 0x0000c2a5, '¥' }, |
48 | { 0x0000c2a7, '§' }, |
49 | { 0x0000c2a8, '¨' }, |
50 | { 0x0000c2a9, '©' }, |
51 | { 0x0000c2aa, 'ª' }, |
52 | { 0x0000c2ab, '«' }, |
53 | { 0x0000c2ac, '¬' }, |
54 | { 0x0000c2ae, '®' }, |
55 | { 0x0000c2b0, '°' }, |
56 | { 0x0000c2b1, '±' }, |
57 | { 0x0000c2b4, '´' }, |
58 | { 0x0000c2b5, 'µ' }, |
59 | { 0x0000c2b6, '¶' }, |
60 | { 0x0000c2ba, 'º' }, |
61 | { 0x0000c2bb, '»' }, |
62 | { 0x0000c2bf, '¿' }, |
63 | { 0x0000c380, 'À' }, |
64 | { 0x0000c383, 'Ã' }, |
65 | { 0x0000c384, 'Ä' }, |
66 | { 0x0000c385, 'Å' }, |
67 | { 0x0000c386, 'Æ' }, |
68 | { 0x0000c387, 'Ç' }, |
69 | { 0x0000c389, 'É' }, |
70 | { 0x0000c391, 'Ñ' }, |
71 | { 0x0000c395, 'Õ' }, |
72 | { 0x0000c396, 'Ö' }, |
73 | { 0x0000c398, 'Ø' }, |
74 | { 0x0000c39c, 'Ü' }, |
75 | { 0x0000c39f, 'ß' }, |
76 | { 0x0000c3a0, 'à' }, |
77 | { 0x0000c3a1, 'á' }, |
78 | { 0x0000c3a2, 'â' }, |
79 | { 0x0000c3a3, 'ã' }, |
80 | { 0x0000c3a4, 'ä' }, |
81 | { 0x0000c3a5, 'å' }, |
82 | { 0x0000c3a6, 'æ' }, |
83 | { 0x0000c3a7, 'ç' }, |
84 | { 0x0000c3a8, 'è' }, |
85 | { 0x0000c3a9, 'é' }, |
86 | { 0x0000c3aa, 'ê' }, |
87 | { 0x0000c3ab, 'ë' }, |
88 | { 0x0000c3ac, 'ì' }, |
89 | { 0x0000c3ad, 'í' }, |
90 | { 0x0000c3ae, 'î' }, |
91 | { 0x0000c3af, 'ï' }, |
92 | { 0x0000c3b1, 'ñ' }, |
93 | { 0x0000c3b2, 'ò' }, |
94 | { 0x0000c3b3, 'ó' }, |
95 | { 0x0000c3b4, 'ô' }, |
96 | { 0x0000c3b5, 'õ' }, |
97 | { 0x0000c3b6, 'ö' }, |
98 | { 0x0000c3b7, '÷' }, |
99 | { 0x0000c3b8, 'ø' }, |
100 | { 0x0000c3b9, 'ù' }, |
101 | { 0x0000c3ba, 'ú' }, |
102 | { 0x0000c3bb, 'û' }, |
103 | { 0x0000c3bc, 'ü' }, |
104 | { 0x0000c3bf, 'ÿ' }, |
105 | { 0x0000c592, 'Œ' }, |
106 | { 0x0000c593, 'œ' }, |
107 | { 0x0000c5b8, 'Ÿ' }, |
108 | { 0x0000c692, 'ƒ' }, |
109 | { 0x0000cf80, 'π' }, |
110 | { 0x00e28093, '–' }, |
111 | { 0x00e28094, '—' }, |
112 | { 0x00e28098, '‘' }, |
113 | { 0x00e28099, '’' }, |
114 | { 0x00e2809c, '“' }, |
115 | { 0x00e2809d, '”' }, |
116 | { 0x00e280a0, '†' }, |
117 | { 0x00e280a2, '•' }, |
118 | { 0x00e280a6, '…' }, |
119 | { 0x00e284a2, '™' }, |
120 | { 0x00e284a6, 'Ω' }, |
121 | { 0x00e28882, '∂' }, |
122 | { 0x00e28886, '∆' }, |
123 | { 0x00e2888f, '∏' }, |
124 | { 0x00e28891, '∑' }, |
125 | { 0x00e2889a, '√' }, |
126 | { 0x00e2889e, '∞' }, |
127 | { 0x00e288ab, '∫' }, |
128 | { 0x00e28988, '≈' }, |
129 | { 0x00e289a0, '≠' }, |
130 | { 0x00e289a4, '≤' }, |
131 | { 0x00e289a5, '≥' }, |
132 | }; |
133 | |
/*
 * Lookup table driving utf8_decode().
 *
 * Entries 0..255 are indexed by an input byte and give that byte's
 * character class (0x0..0xb).  Entries from 256 onwards are the state
 * transition table: sixteen entries per state, indexed as
 * 256 + state * 16 + class, giving the next state.
 *
 * State 0 is the start state that utf8_decode() compares against
 * UTF8_ACCEPT.  Every transition out of state 1 leads back to state 1, so
 * once entered it is never left without the caller resetting the state.
 */
static const unsigned char utf8d[] = {
	/* byte -> character class */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
	/* (state, class) -> next state, 16 entries per state */
	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
150 | |
/*
 * ISO 8859-1 to MacRoman translation for the upper half of the byte range.
 * Index with (iso8859-1 byte - 128); the value is the MacRoman byte to emit.
 * Entries holding '?' mark bytes for which the table provides no MacRoman
 * equivalent (this includes the whole 0x80..0x9f range).
 */
static const unsigned char iso88591_macroman_pairs[] = {
	/* 0x80..0x87 */
	'?', '?', '?', '?', '?', '?', '?', '?',
	/* 0x88..0x8f */
	'?', '?', '?', '?', '?', '?', '?', '?',
	/* 0x90..0x97 */
	'?', '?', '?', '?', '?', '?', '?', '?',
	/* 0x98..0x9f */
	'?', '?', '?', '?', '?', '?', '?', '?',
	/* 0xa0..0xa7 */
	0xca, 0xc1, 0xa2, 0xa3, 0xdb, 0xb4, '?', 0xa4,
	/* 0xa8..0xaf */
	0xac, 0xa9, 0xbb, 0xc7, 0xc2, '?', 0xa8, 0xf8,
	/* 0xb0..0xb7 */
	0xa1, 0xb1, '?', '?', 0xab, 0xb5, 0xa6, 0xe1,
	/* 0xb8..0xbf */
	0xfc, '?', 0xbc, 0xc8, '?', '?', '?', 0xc0,
	/* 0xc0..0xc7 */
	0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82,
	/* 0xc8..0xcf */
	0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec,
	/* 0xd0..0xd7 */
	'?', 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, '?',
	/* 0xd8..0xdf */
	0xaf, 0xf4, 0xf2, 0xf3, 0x86, '?', '?', 0xa7,
	/* 0xe0..0xe7 */
	0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d,
	/* 0xe8..0xef */
	0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95,
	/* 0xf0..0xf7 */
	'?', 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6,
	/* 0xf8..0xff */
	0xbf, 0x9d, 0x9c, 0x9e, 0x9f, '?', '?', 0xd8,
};
170 | |
171 | utf8_codepoint |
172 | utf8_decode(utf8_codepoint *state, utf8_codepoint *codep, |
173 | utf8_codepoint byte) |
174 | { |
175 | utf8_codepoint type = utf8d[byte]; |
176 | |
177 | *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6) : |
178 | (0xff >> type) & (byte); |
179 | |
180 | *state = utf8d[256 + (*state * 16) + type]; |
181 | return *state; |
182 | } |
183 | |
184 | size_t |
185 | utf8_to_macroman_string(char *str, size_t len, char *ret) |
186 | { |
187 | struct utf8_macroman_pair *pair; |
188 | utf8_codepoint codepoint, utf8_bytes; |
189 | utf8_codepoint state = UTF8_ACCEPT; |
190 | size_t retlen, n, j; |
191 | short bytes = 0; |
192 | unsigned char *ustr = (unsigned char *)str; |
193 | bool found_macroman; |
194 | |
195 | retlen = 0; |
196 | for (n = 0; n < len; n++) { |
197 | bytes++; |
198 | |
199 | if (utf8_decode(&state, &codepoint, ustr[n]) == 0) { |
200 | if (bytes > 1) { |
201 | utf8_bytes = 0; |
202 | if (bytes == 4) |
203 | utf8_bytes |= ((unsigned long)(ustr[n - 3]) << 24); |
204 | if (bytes >= 3) |
205 | utf8_bytes |= ((unsigned long)(ustr[n - 2]) << 16); |
206 | if (bytes >= 2) |
207 | utf8_bytes |= ((unsigned long)(ustr[n - 1]) << 8); |
208 | if (bytes >= 1) |
209 | utf8_bytes |= ustr[n]; |
210 | |
211 | found_macroman = false; |
212 | for (j = 0; j < nitems(utf8_macroman_pairs); j++) { |
213 | pair = (struct utf8_macroman_pair *)&utf8_macroman_pairs[j]; |
214 | if (utf8_bytes == pair->utf8 && |
215 | pair->type != ONLY_TO_UTF8) { |
216 | codepoint = pair->macroman; |
217 | found_macroman = true; |
218 | break; |
219 | } |
220 | } |
221 | |
222 | if (!found_macroman) |
223 | codepoint = UTF8_UNKNOWN_MACROMAN; |
224 | } |
225 | |
226 | ret[retlen++] = codepoint; |
227 | bytes = 0; |
228 | state = UTF8_ACCEPT; |
229 | } |
230 | } |
231 | |
232 | ret[retlen] = '\0'; |
233 | return retlen; |
234 | } |
235 | |
236 | size_t |
237 | macroman_to_utf8_string(char *str, size_t len, char **retp) |
238 | { |
239 | struct utf8_macroman_pair *pair; |
240 | size_t retlen, n, j; |
241 | unsigned char *ustr = (unsigned char *)str; |
242 | unsigned char c; |
243 | bool found_utf8; |
244 | |
245 | retlen = 0; |
246 | for (n = 0; n < len; n++) { |
247 | c = ustr[n]; |
248 | |
249 | if (c >= 128) |
250 | retlen += 4; |
251 | else |
252 | retlen++; |
253 | } |
254 | |
255 | *retp = xmalloc(retlen + 1); |
256 | if (*retp == NULL) |
257 | return 0; |
258 | |
259 | retlen = 0; |
260 | for (n = 0; n < len; n++) { |
261 | c = ustr[n]; |
262 | |
263 | if (c < 128) { |
264 | (*retp)[retlen++] = c; |
265 | continue; |
266 | } |
267 | |
268 | found_utf8 = false; |
269 | for (j = 0; j < nitems(utf8_macroman_pairs); j++) { |
270 | pair = (struct utf8_macroman_pair *)&utf8_macroman_pairs[j]; |
271 | if (pair->type == ONLY_TO_MACROMAN) |
272 | continue; |
273 | if (c == pair->macroman) { |
274 | found_utf8 = true; |
275 | if (pair->utf8 >= 0x01000000) |
276 | (*retp)[retlen++] = (pair->utf8 >> 24); |
277 | if (pair->utf8 >= 0x00010000) |
278 | (*retp)[retlen++] = (pair->utf8 >> 16) & 0xff; |
279 | if (pair->utf8 >= 0x00000100) |
280 | (*retp)[retlen++] = (pair->utf8 >> 8) & 0xff; |
281 | if (pair->utf8 >= 0x00000001) |
282 | (*retp)[retlen++] = pair->utf8 & 0xff; |
283 | break; |
284 | } |
285 | } |
286 | |
287 | if (!found_utf8) |
288 | (*retp)[retlen++] = c; |
289 | } |
290 | |
291 | (*retp)[retlen] = '\0'; |
292 | return retlen; |
293 | } |
294 | |
295 | size_t |
296 | iso88591_to_macroman_string(char *str, size_t len, char **retp) |
297 | { |
298 | size_t mlen, retlen, n, j; |
299 | unsigned char *ustr = (unsigned char *)str; |
300 | unsigned char c; |
301 | |
302 | *retp = xmalloc(len + 1); |
303 | if (*retp == NULL) |
304 | return 0; |
305 | |
306 | retlen = 0; |
307 | for (n = 0; n < len; n++) { |
308 | c = ustr[n]; |
309 | |
310 | if (c >= 128) |
311 | (*retp)[retlen++] = iso88591_macroman_pairs[c - 128]; |
312 | else |
313 | (*retp)[retlen++] = c; |
314 | } |
315 | |
316 | (*retp)[retlen] = '\0'; |
317 | return retlen; |
318 | } |