/*
 * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 *
 * Copyright (c) 2022 joshua stein <jcs@jcs.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
| 27 |
|
| 28 |
#include "utf8.h" |
| 29 |
|
| 30 |
#define ONLY_TO_MACROMAN 1 |
| 31 |
#define ONLY_TO_UTF8 2 |
| 32 |
|
| 33 |
static const struct utf8_macroman_pair { |
| 34 |
utf8_codepoint utf8; |
| 35 |
unsigned char macroman; |
| 36 |
unsigned char type; |
| 37 |
} utf8_macroman_pairs[] = { |
| 38 |
{ 0x000000a0, ' ', ONLY_TO_MACROMAN }, /* supposed to be non-breaking */ |
| 39 |
{ 0x00002019, '\'', ONLY_TO_MACROMAN }, |
| 40 |
{ 0x00002026, '…', ONLY_TO_MACROMAN }, |
| 41 |
{ 0x0000c2a0, ' ', ONLY_TO_MACROMAN }, |
| 42 |
{ 0x00efbfbd, UTF8_UNKNOWN_MACROMAN, ONLY_TO_UTF8 }, /* unknown utf-8 char */ |
| 43 |
|
| 44 |
{ 0x0000c2a1, '¡' }, |
| 45 |
{ 0x0000c2a2, '¢' }, |
| 46 |
{ 0x0000c2a3, '£' }, |
| 47 |
{ 0x0000c2a5, '¥' }, |
| 48 |
{ 0x0000c2a7, '§' }, |
| 49 |
{ 0x0000c2a8, '¨' }, |
| 50 |
{ 0x0000c2a9, '©' }, |
| 51 |
{ 0x0000c2aa, 'ª' }, |
| 52 |
{ 0x0000c2ab, '«' }, |
| 53 |
{ 0x0000c2ac, '¬' }, |
| 54 |
{ 0x0000c2ae, '®' }, |
| 55 |
{ 0x0000c2b0, '°' }, |
| 56 |
{ 0x0000c2b1, '±' }, |
| 57 |
{ 0x0000c2b4, '´' }, |
| 58 |
{ 0x0000c2b5, 'µ' }, |
| 59 |
{ 0x0000c2b6, '¶' }, |
| 60 |
{ 0x0000c2ba, 'º' }, |
| 61 |
{ 0x0000c2bb, '»' }, |
| 62 |
{ 0x0000c2bf, '¿' }, |
| 63 |
{ 0x0000c380, 'À' }, |
| 64 |
{ 0x0000c383, 'Ã' }, |
| 65 |
{ 0x0000c384, 'Ä' }, |
| 66 |
{ 0x0000c385, 'Å' }, |
| 67 |
{ 0x0000c386, 'Æ' }, |
| 68 |
{ 0x0000c387, 'Ç' }, |
| 69 |
{ 0x0000c389, 'É' }, |
| 70 |
{ 0x0000c391, 'Ñ' }, |
| 71 |
{ 0x0000c395, 'Õ' }, |
| 72 |
{ 0x0000c396, 'Ö' }, |
| 73 |
{ 0x0000c398, 'Ø' }, |
| 74 |
{ 0x0000c39c, 'Ü' }, |
| 75 |
{ 0x0000c39f, 'ß' }, |
| 76 |
{ 0x0000c3a0, 'à' }, |
| 77 |
{ 0x0000c3a1, 'á' }, |
| 78 |
{ 0x0000c3a2, 'â' }, |
| 79 |
{ 0x0000c3a3, 'ã' }, |
| 80 |
{ 0x0000c3a4, 'ä' }, |
| 81 |
{ 0x0000c3a5, 'å' }, |
| 82 |
{ 0x0000c3a6, 'æ' }, |
| 83 |
{ 0x0000c3a7, 'ç' }, |
| 84 |
{ 0x0000c3a8, 'è' }, |
| 85 |
{ 0x0000c3a9, 'é' }, |
| 86 |
{ 0x0000c3aa, 'ê' }, |
| 87 |
{ 0x0000c3ab, 'ë' }, |
| 88 |
{ 0x0000c3ac, 'ì' }, |
| 89 |
{ 0x0000c3ad, 'í' }, |
| 90 |
{ 0x0000c3ae, 'î' }, |
| 91 |
{ 0x0000c3af, 'ï' }, |
| 92 |
{ 0x0000c3b1, 'ñ' }, |
| 93 |
{ 0x0000c3b2, 'ò' }, |
| 94 |
{ 0x0000c3b3, 'ó' }, |
| 95 |
{ 0x0000c3b4, 'ô' }, |
| 96 |
{ 0x0000c3b5, 'õ' }, |
| 97 |
{ 0x0000c3b6, 'ö' }, |
| 98 |
{ 0x0000c3b7, '÷' }, |
| 99 |
{ 0x0000c3b8, 'ø' }, |
| 100 |
{ 0x0000c3b9, 'ù' }, |
| 101 |
{ 0x0000c3ba, 'ú' }, |
| 102 |
{ 0x0000c3bb, 'û' }, |
| 103 |
{ 0x0000c3bc, 'ü' }, |
| 104 |
{ 0x0000c3bf, 'ÿ' }, |
| 105 |
{ 0x0000c592, 'Œ' }, |
| 106 |
{ 0x0000c593, 'œ' }, |
| 107 |
{ 0x0000c5b8, 'Ÿ' }, |
| 108 |
{ 0x0000c692, 'ƒ' }, |
| 109 |
{ 0x0000cf80, 'π' }, |
| 110 |
{ 0x00e28093, '–' }, |
| 111 |
{ 0x00e28094, '—' }, |
| 112 |
{ 0x00e28098, '‘' }, |
| 113 |
{ 0x00e28099, '’' }, |
| 114 |
{ 0x00e2809c, '“' }, |
| 115 |
{ 0x00e2809d, '”' }, |
| 116 |
{ 0x00e280a0, '†' }, |
| 117 |
{ 0x00e280a2, '•' }, |
| 118 |
{ 0x00e280a6, '…' }, |
| 119 |
{ 0x00e284a2, '™' }, |
| 120 |
{ 0x00e284a6, 'Ω' }, |
| 121 |
{ 0x00e28882, '∂' }, |
| 122 |
{ 0x00e28886, '∆' }, |
| 123 |
{ 0x00e2888f, '∏' }, |
| 124 |
{ 0x00e28891, '∑' }, |
| 125 |
{ 0x00e2889a, '√' }, |
| 126 |
{ 0x00e2889e, '∞' }, |
| 127 |
{ 0x00e288ab, '∫' }, |
| 128 |
{ 0x00e28988, '≈' }, |
| 129 |
{ 0x00e289a0, '≠' }, |
| 130 |
{ 0x00e289a4, '≤' }, |
| 131 |
{ 0x00e289a5, '≥' }, |
| 132 |
}; |
| 133 |
|
| 134 |
/*
 * Lookup data for the table-driven UTF-8 decoder (utf8_decode).
 *
 * Entries 0..255 give the character class of each possible input byte.
 * Entries from 256 on are the transition rows, 16 entries per state,
 * indexed as 256 + state * 16 + class; the value is the next state.
 */
static const unsigned char utf8d[] = {
	/* byte -> character class */
	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,	/* 00..0f */
	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,	/* 10..1f */
	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,	/* 20..2f */
	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,	/* 30..3f */
	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,	/* 40..4f */
	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,	/* 50..5f */
	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,	/* 60..6f */
	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,	/* 70..7f */
	1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,	/* 80..8f */
	9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9,	/* 90..9f */
	7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7,	/* a0..af */
	7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7,	/* b0..bf */
	8,8,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,	/* c0..cf */
	2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,	/* d0..df */
	10,3,3,3, 3,3,3,3, 3,3,3,3, 3,4,3,3,	/* e0..ef */
	11,6,6,6, 5,8,8,8, 8,8,8,8, 8,8,8,8,	/* f0..ff */

	/* (state, class) -> next state */
	0,1,2,3, 5,8,7,1, 1,1,4,6, 1,1,1,1,	/* s0 */
	1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,	/* s1 */
	1,0,1,1, 1,1,1,0, 1,0,1,1, 1,1,1,1,	/* s2 */
	1,2,1,1, 1,1,1,2, 1,2,1,1, 1,1,1,1,	/* s3 */
	1,1,1,1, 1,1,1,2, 1,1,1,1, 1,1,1,1,	/* s4 */
	1,2,1,1, 1,1,1,1, 1,2,1,1, 1,1,1,1,	/* s5 */
	1,1,1,1, 1,1,1,3, 1,3,1,1, 1,1,1,1,	/* s6 */
	1,3,1,1, 1,1,1,3, 1,3,1,1, 1,1,1,1,	/* s7 */
	1,3,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,	/* s8 */
};
| 150 |
|
| 151 |
/*
 * ISO-8859-1 to MacRoman translation for input bytes 0x80..0xff,
 * indexed by (byte - 128).  Slots holding '?' are inputs this table
 * does not translate.
 */
static const unsigned char iso88591_macroman_pairs[] = {
	/* 0x80 */ '?',  '?',  '?',  '?',  '?',  '?',  '?',  '?',
	/* 0x88 */ '?',  '?',  '?',  '?',  '?',  '?',  '?',  '?',
	/* 0x90 */ '?',  '?',  '?',  '?',  '?',  '?',  '?',  '?',
	/* 0x98 */ '?',  '?',  '?',  '?',  '?',  '?',  '?',  '?',
	/* 0xa0 */ 0xCA, 0xC1, 0xA2, 0xA3, 0xDB, 0xB4, '?',  0xA4,
	/* 0xa8 */ 0xAC, 0xA9, 0xBB, 0xC7, 0xC2, '?',  0xA8, 0xF8,
	/* 0xb0 */ 0xA1, 0xB1, '?',  '?',  0xAB, 0xB5, 0xA6, 0xE1,
	/* 0xb8 */ 0xFC, '?',  0xBC, 0xC8, '?',  '?',  '?',  0xC0,
	/* 0xc0 */ 0xCB, 0xE7, 0xE5, 0xCC, 0x80, 0x81, 0xAE, 0x82,
	/* 0xc8 */ 0xE9, 0x83, 0xE6, 0xE8, 0xED, 0xEA, 0xEB, 0xEC,
	/* 0xd0 */ '?',  0x84, 0xF1, 0xEE, 0xEF, 0xCD, 0x85, '?',
	/* 0xd8 */ 0xAF, 0xF4, 0xF2, 0xF3, 0x86, '?',  '?',  0xA7,
	/* 0xe0 */ 0x88, 0x87, 0x89, 0x8B, 0x8A, 0x8C, 0xBE, 0x8D,
	/* 0xe8 */ 0x8F, 0x8E, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95,
	/* 0xf0 */ '?',  0x96, 0x98, 0x97, 0x99, 0x9B, 0x9A, 0xD6,
	/* 0xf8 */ 0xBF, 0x9D, 0x9C, 0x9E, 0x9F, '?',  '?',  0xD8,
};
| 170 |
|
| 171 |
utf8_codepoint |
| 172 |
utf8_decode(utf8_codepoint *state, utf8_codepoint *codep, |
| 173 |
utf8_codepoint byte) |
| 174 |
{ |
| 175 |
utf8_codepoint type = utf8d[byte]; |
| 176 |
|
| 177 |
*codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6) : |
| 178 |
(0xff >> type) & (byte); |
| 179 |
|
| 180 |
*state = utf8d[256 + (*state * 16) + type]; |
| 181 |
return *state; |
| 182 |
} |
| 183 |
|
| 184 |
size_t |
| 185 |
utf8_to_macroman_string(char *str, size_t len, char *ret) |
| 186 |
{ |
| 187 |
struct utf8_macroman_pair *pair; |
| 188 |
utf8_codepoint codepoint, utf8_bytes; |
| 189 |
utf8_codepoint state = UTF8_ACCEPT; |
| 190 |
size_t retlen, n, j; |
| 191 |
short bytes = 0; |
| 192 |
unsigned char *ustr = (unsigned char *)str; |
| 193 |
bool found_macroman; |
| 194 |
|
| 195 |
retlen = 0; |
| 196 |
for (n = 0; n < len; n++) { |
| 197 |
bytes++; |
| 198 |
|
| 199 |
if (utf8_decode(&state, &codepoint, ustr[n]) == 0) { |
| 200 |
if (bytes > 1) { |
| 201 |
utf8_bytes = 0; |
| 202 |
if (bytes == 4) |
| 203 |
utf8_bytes |= ((unsigned long)(ustr[n - 3]) << 24); |
| 204 |
if (bytes >= 3) |
| 205 |
utf8_bytes |= ((unsigned long)(ustr[n - 2]) << 16); |
| 206 |
if (bytes >= 2) |
| 207 |
utf8_bytes |= ((unsigned long)(ustr[n - 1]) << 8); |
| 208 |
if (bytes >= 1) |
| 209 |
utf8_bytes |= ustr[n]; |
| 210 |
|
| 211 |
found_macroman = false; |
| 212 |
for (j = 0; j < nitems(utf8_macroman_pairs); j++) { |
| 213 |
pair = (struct utf8_macroman_pair *)&utf8_macroman_pairs[j]; |
| 214 |
if (utf8_bytes == pair->utf8 && |
| 215 |
pair->type != ONLY_TO_UTF8) { |
| 216 |
codepoint = pair->macroman; |
| 217 |
found_macroman = true; |
| 218 |
break; |
| 219 |
} |
| 220 |
} |
| 221 |
|
| 222 |
if (!found_macroman) |
| 223 |
codepoint = UTF8_UNKNOWN_MACROMAN; |
| 224 |
} |
| 225 |
|
| 226 |
ret[retlen++] = codepoint; |
| 227 |
bytes = 0; |
| 228 |
state = UTF8_ACCEPT; |
| 229 |
} |
| 230 |
} |
| 231 |
|
| 232 |
ret[retlen] = '\0'; |
| 233 |
return retlen; |
| 234 |
} |
| 235 |
|
| 236 |
size_t |
| 237 |
macroman_to_utf8_string(char *str, size_t len, char **retp) |
| 238 |
{ |
| 239 |
struct utf8_macroman_pair *pair; |
| 240 |
size_t retlen, n, j; |
| 241 |
unsigned char *ustr = (unsigned char *)str; |
| 242 |
unsigned char c; |
| 243 |
bool found_utf8; |
| 244 |
|
| 245 |
retlen = 0; |
| 246 |
for (n = 0; n < len; n++) { |
| 247 |
c = ustr[n]; |
| 248 |
|
| 249 |
if (c >= 128) |
| 250 |
retlen += 4; |
| 251 |
else |
| 252 |
retlen++; |
| 253 |
} |
| 254 |
|
| 255 |
*retp = xmalloc(retlen + 1); |
| 256 |
if (*retp == NULL) |
| 257 |
return 0; |
| 258 |
|
| 259 |
retlen = 0; |
| 260 |
for (n = 0; n < len; n++) { |
| 261 |
c = ustr[n]; |
| 262 |
|
| 263 |
if (c < 128) { |
| 264 |
(*retp)[retlen++] = c; |
| 265 |
continue; |
| 266 |
} |
| 267 |
|
| 268 |
found_utf8 = false; |
| 269 |
for (j = 0; j < nitems(utf8_macroman_pairs); j++) { |
| 270 |
pair = (struct utf8_macroman_pair *)&utf8_macroman_pairs[j]; |
| 271 |
if (pair->type == ONLY_TO_MACROMAN) |
| 272 |
continue; |
| 273 |
if (c == pair->macroman) { |
| 274 |
found_utf8 = true; |
| 275 |
if (pair->utf8 >= 0x01000000) |
| 276 |
(*retp)[retlen++] = (pair->utf8 >> 24); |
| 277 |
if (pair->utf8 >= 0x00010000) |
| 278 |
(*retp)[retlen++] = (pair->utf8 >> 16) & 0xff; |
| 279 |
if (pair->utf8 >= 0x00000100) |
| 280 |
(*retp)[retlen++] = (pair->utf8 >> 8) & 0xff; |
| 281 |
if (pair->utf8 >= 0x00000001) |
| 282 |
(*retp)[retlen++] = pair->utf8 & 0xff; |
| 283 |
break; |
| 284 |
} |
| 285 |
} |
| 286 |
|
| 287 |
if (!found_utf8) |
| 288 |
(*retp)[retlen++] = c; |
| 289 |
} |
| 290 |
|
| 291 |
(*retp)[retlen] = '\0'; |
| 292 |
return retlen; |
| 293 |
} |
| 294 |
|
| 295 |
size_t |
| 296 |
iso88591_to_macroman_string(char *str, size_t len, char **retp) |
| 297 |
{ |
| 298 |
size_t mlen, retlen, n, j; |
| 299 |
unsigned char *ustr = (unsigned char *)str; |
| 300 |
unsigned char c; |
| 301 |
|
| 302 |
*retp = xmalloc(len + 1); |
| 303 |
if (*retp == NULL) |
| 304 |
return 0; |
| 305 |
|
| 306 |
retlen = 0; |
| 307 |
for (n = 0; n < len; n++) { |
| 308 |
c = ustr[n]; |
| 309 |
|
| 310 |
if (c >= 128) |
| 311 |
(*retp)[retlen++] = iso88591_macroman_pairs[c - 128]; |
| 312 |
else |
| 313 |
(*retp)[retlen++] = c; |
| 314 |
} |
| 315 |
|
| 316 |
(*retp)[retlen] = '\0'; |
| 317 |
return retlen; |
| 318 |
} |