/*
 * Copyright (c) 2008-2009 Bjoern Hoehrmann
 * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 *
 * Copyright (c) 2022 joshua stein
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "utf8.h"

/*
 * Error state of the decoder DFA.  Row s1 of utf8d[] maps every
 * character class back to 1, so once entered the state never leaves on
 * its own.  Only defined here when utf8.h does not already provide it.
 */
#ifndef UTF8_REJECT
#define UTF8_REJECT		1
#endif

/* Direction restrictions for entries of utf8_macroman_pairs[]. */
#define ONLY_TO_MACROMAN	1	/* skipped by macroman_to_utf8_string */
#define ONLY_TO_UTF8		2	/* skipped by utf8_to_macroman_string */

/*
 * Mapping between a character's UTF-8 encoding, packed big-endian into
 * one integer (e.g. the bytes C3 A9 become 0x0000c3a9), and its MacRoman
 * byte.  Entries without an explicit type (type == 0) are used in both
 * directions.
 *
 * The MacRoman bytes are spelled as hex constants so this file does not
 * depend on the encoding it is stored in.
 *
 * NOTE(review): the hex values follow the standard Mac OS Roman mapping
 * and agree with iso88591_macroman_pairs[] below for the Latin-1 range;
 * confirm them against the original raw-byte literals.
 */
static const struct utf8_macroman_pair {
	utf8_codepoint utf8;
	unsigned char macroman;
	unsigned char type;
} utf8_macroman_pairs[] = {
	{ 0x000000a0, ' ', ONLY_TO_MACROMAN }, /* supposed to be non-breaking */
	{ 0x00002019, '\'', ONLY_TO_MACROMAN },
	{ 0x00002026, 0xc9, ONLY_TO_MACROMAN },	/* horizontal ellipsis */
	{ 0x0000c2a0, 0xca, ONLY_TO_MACROMAN },	/* U+00A0 no-break space */
	{ 0x00efbfbd, UTF8_UNKNOWN_MACROMAN, ONLY_TO_UTF8 }, /* unknown utf-8 char */

	{ 0x0000c2a1, 0xc1 },	/* U+00A1 inverted exclamation mark */
	{ 0x0000c2a2, 0xa2 },	/* U+00A2 cent sign */
	{ 0x0000c2a3, 0xa3 },	/* U+00A3 pound sign */
	{ 0x0000c2a5, 0xb4 },	/* U+00A5 yen sign */
	{ 0x0000c2a7, 0xa4 },	/* U+00A7 section sign */
	{ 0x0000c2a8, 0xac },	/* U+00A8 diaeresis */
	{ 0x0000c2a9, 0xa9 },	/* U+00A9 copyright sign */
	{ 0x0000c2aa, 0xbb },	/* U+00AA feminine ordinal */
	{ 0x0000c2ab, 0xc7 },	/* U+00AB left guillemet */
	{ 0x0000c2ac, 0xc2 },	/* U+00AC not sign */
	{ 0x0000c2ae, 0xa8 },	/* U+00AE registered sign */
	{ 0x0000c2b0, 0xa1 },	/* U+00B0 degree sign */
	{ 0x0000c2b1, 0xb1 },	/* U+00B1 plus-minus sign */
	{ 0x0000c2b4, 0xab },	/* U+00B4 acute accent */
	{ 0x0000c2b5, 0xb5 },	/* U+00B5 micro sign */
	{ 0x0000c2b6, 0xa6 },	/* U+00B6 pilcrow */
	{ 0x0000c2ba, 0xbc },	/* U+00BA masculine ordinal */
	{ 0x0000c2bb, 0xc8 },	/* U+00BB right guillemet */
	{ 0x0000c2bf, 0xc0 },	/* U+00BF inverted question mark */
	{ 0x0000c380, 0xcb },	/* U+00C0 A grave */
	{ 0x0000c383, 0xcc },	/* U+00C3 A tilde */
	{ 0x0000c384, 0x80 },	/* U+00C4 A diaeresis */
	{ 0x0000c385, 0x81 },	/* U+00C5 A ring */
	{ 0x0000c386, 0xae },	/* U+00C6 AE */
	{ 0x0000c387, 0x82 },	/* U+00C7 C cedilla */
	{ 0x0000c389, 0x83 },	/* U+00C9 E acute */
	{ 0x0000c391, 0x84 },	/* U+00D1 N tilde */
	{ 0x0000c395, 0xcd },	/* U+00D5 O tilde */
	{ 0x0000c396, 0x85 },	/* U+00D6 O diaeresis */
	{ 0x0000c398, 0xaf },	/* U+00D8 O stroke */
	{ 0x0000c39c, 0x86 },	/* U+00DC U diaeresis */
	{ 0x0000c39f, 0xa7 },	/* U+00DF sharp s */
	{ 0x0000c3a0, 0x88 },	/* U+00E0 a grave */
	{ 0x0000c3a1, 0x87 },	/* U+00E1 a acute */
	{ 0x0000c3a2, 0x89 },	/* U+00E2 a circumflex */
	{ 0x0000c3a3, 0x8b },	/* U+00E3 a tilde */
	{ 0x0000c3a4, 0x8a },	/* U+00E4 a diaeresis */
	{ 0x0000c3a5, 0x8c },	/* U+00E5 a ring */
	{ 0x0000c3a6, 0xbe },	/* U+00E6 ae */
	{ 0x0000c3a7, 0x8d },	/* U+00E7 c cedilla */
	{ 0x0000c3a8, 0x8f },	/* U+00E8 e grave */
	{ 0x0000c3a9, 0x8e },	/* U+00E9 e acute */
	{ 0x0000c3aa, 0x90 },	/* U+00EA e circumflex */
	{ 0x0000c3ab, 0x91 },	/* U+00EB e diaeresis */
	{ 0x0000c3ac, 0x93 },	/* U+00EC i grave */
	{ 0x0000c3ad, 0x92 },	/* U+00ED i acute */
	{ 0x0000c3ae, 0x94 },	/* U+00EE i circumflex */
	{ 0x0000c3af, 0x95 },	/* U+00EF i diaeresis */
	{ 0x0000c3b1, 0x96 },	/* U+00F1 n tilde */
	{ 0x0000c3b2, 0x98 },	/* U+00F2 o grave */
	{ 0x0000c3b3, 0x97 },	/* U+00F3 o acute */
	{ 0x0000c3b4, 0x99 },	/* U+00F4 o circumflex */
	{ 0x0000c3b5, 0x9b },	/* U+00F5 o tilde */
	{ 0x0000c3b6, 0x9a },	/* U+00F6 o diaeresis */
	{ 0x0000c3b7, 0xd6 },	/* U+00F7 division sign */
	{ 0x0000c3b8, 0xbf },	/* U+00F8 o stroke */
	{ 0x0000c3b9, 0x9d },	/* U+00F9 u grave */
	{ 0x0000c3ba, 0x9c },	/* U+00FA u acute */
	{ 0x0000c3bb, 0x9e },	/* U+00FB u circumflex */
	{ 0x0000c3bc, 0x9f },	/* U+00FC u diaeresis */
	{ 0x0000c3bf, 0xd8 },	/* U+00FF y diaeresis */
	{ 0x0000c592, 0xce },	/* U+0152 OE ligature */
	{ 0x0000c593, 0xcf },	/* U+0153 oe ligature */
	{ 0x0000c5b8, 0xd9 },	/* U+0178 Y diaeresis */
	{ 0x0000c692, 0xc4 },	/* U+0192 f with hook */
	{ 0x0000cf80, 0xb9 },	/* U+03C0 greek small pi */
	{ 0x00e28093, 0xd0 },	/* U+2013 en dash */
	{ 0x00e28094, 0xd1 },	/* U+2014 em dash */
	{ 0x00e28098, 0xd4 },	/* U+2018 left single quote */
	{ 0x00e28099, 0xd5 },	/* U+2019 right single quote */
	{ 0x00e2809c, 0xd2 },	/* U+201C left double quote */
	{ 0x00e2809d, 0xd3 },	/* U+201D right double quote */
	{ 0x00e280a0, 0xa0 },	/* U+2020 dagger */
	{ 0x00e280a2, 0xa5 },	/* U+2022 bullet */
	{ 0x00e280a6, 0xc9 },	/* U+2026 horizontal ellipsis */
	{ 0x00e284a2, 0xaa },	/* U+2122 trade mark sign */
	{ 0x00e284a6, 0xbd },	/* U+2126 ohm sign */
	{ 0x00e28882, 0xb6 },	/* U+2202 partial differential */
	{ 0x00e28886, 0xc6 },	/* U+2206 increment */
	{ 0x00e2888f, 0xb8 },	/* U+220F n-ary product */
	{ 0x00e28891, 0xb7 },	/* U+2211 n-ary summation */
	{ 0x00e2889a, 0xc3 },	/* U+221A square root */
	{ 0x00e2889e, 0xb0 },	/* U+221E infinity */
	{ 0x00e288ab, 0xba },	/* U+222B integral */
	{ 0x00e28988, 0xc5 },	/* U+2248 almost equal to */
	{ 0x00e289a0, 0xad },	/* U+2260 not equal to */
	{ 0x00e289a4, 0xb2 },	/* U+2264 less-than or equal to */
	{ 0x00e289a5, 0xb3 },	/* U+2265 greater-than or equal to */
};

/*
 * Decoder tables.  The first 256 entries map an input byte to a
 * character class; the remaining entries are the transition table,
 * indexed by (state * 16 + class).
 */
static const unsigned char utf8d[] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 00..1f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 20..3f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 40..5f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 60..7f */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /* 80..9f */
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* a0..bf */
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* c0..df */
	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, /* e0..ef */
	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, /* f0..ff */
	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, /* s0..s0 */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, /* s1..s2 */
	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, /* s3..s4 */
	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, /* s5..s6 */
	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* s7..s8 */
};

/* iso8859-1 character, minus 128, to macroman */
static const unsigned char iso88591_macroman_pairs[] = {
	/* 0x80..0x9f */
	'?', '?', '?', '?', '?', '?', '?', '?',
	'?', '?', '?', '?', '?', '?', '?', '?',
	'?', '?', '?', '?', '?', '?', '?', '?',
	'?', '?', '?', '?', '?', '?', '?', '?',
	/* 0xa0..0xaf */
	0xca, 0xc1, 0xa2, 0xa3, 0xdb, 0xb4, '?', 0xa4,
	0xac, 0xa9, 0xbb, 0xc7, 0xc2, '?', 0xa8, 0xf8,
	/* 0xb0..0xbf */
	0xa1, 0xb1, '?', '?', 0xab, 0xb5, 0xa6, 0xe1,
	0xfc, '?', 0xbc, 0xc8, '?', '?', '?', 0xc0,
	/* 0xc0..0xcf */
	0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82,
	0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec,
	/* 0xd0..0xdf */
	'?', 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, '?',
	0xaf, 0xf4, 0xf2, 0xf3, 0x86, '?', '?', 0xa7,
	/* 0xe0..0xef */
	0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d,
	0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95,
	/* 0xf0..0xff */
	'?', 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6,
	0xbf, 0x9d, 0x9c, 0x9e, 0x9f, '?', '?', 0xd8,
};

/*
 * Feed one byte to the UTF-8 decoder DFA.
 *
 * state: in/out DFA state; start a sequence with UTF8_ACCEPT.
 * codep: in/out code point accumulated so far; it is overwritten (not
 *        read) when *state is UTF8_ACCEPT on entry.
 * byte:  next input byte; only the low 8 bits are used.
 *
 * Returns the new *state: UTF8_ACCEPT when a complete code point is in
 * *codep, UTF8_REJECT on malformed input, anything else when more bytes
 * are required.
 */
utf8_codepoint
utf8_decode(utf8_codepoint *state, utf8_codepoint *codep,
    utf8_codepoint byte)
{
	utf8_codepoint type;

	/* keep the class lookup inside the first 256 table entries */
	byte &= 0xff;
	type = utf8d[byte];

	*codep = (*state != UTF8_ACCEPT) ?
	    (byte & 0x3fu) | (*codep << 6) :
	    (0xff >> type) & (byte);

	*state = utf8d[256 + (*state * 16) + type];
	return *state;
}

/*
 * Convert len bytes of UTF-8 in str to MacRoman, writing the result and
 * a terminating NUL to ret.
 *
 * Single-byte (ASCII) characters are copied through.  Multi-byte
 * characters are looked up in utf8_macroman_pairs[]; those without an
 * entry become UTF8_UNKNOWN_MACROMAN.  Malformed input and a sequence
 * truncated by the end of str also produce UTF8_UNKNOWN_MACROMAN rather
 * than silently discarding the remainder of the string.
 *
 * Every output byte consumes at least one input byte, so ret needs room
 * for at most len + 1 bytes.
 *
 * Returns the number of bytes written, not counting the NUL.
 */
size_t
utf8_to_macroman_string(char *str, size_t len, char *ret)
{
	const struct utf8_macroman_pair *pair;
	utf8_codepoint codepoint = 0, utf8_bytes;
	utf8_codepoint state = UTF8_ACCEPT;
	size_t retlen = 0, n, j;
	short bytes = 0, k;
	unsigned char *ustr = (unsigned char *)str;
	bool found_macroman;

	for (n = 0; n < len; n++) {
		bytes++;
		utf8_decode(&state, &codepoint, ustr[n]);

		if (state == UTF8_REJECT) {
			/*
			 * The reject state is absorbing, so restart the
			 * decoder by hand.  When the bad byte arrived in the
			 * middle of a sequence it may itself begin a valid
			 * character, so give it a second look.
			 */
			ret[retlen++] = UTF8_UNKNOWN_MACROMAN;
			if (bytes > 1)
				n--;
			bytes = 0;
			state = UTF8_ACCEPT;
			continue;
		}

		if (state != UTF8_ACCEPT)
			continue;	/* need more bytes */

		if (bytes > 1) {
			/* pack the raw sequence the way the table stores it */
			utf8_bytes = 0;
			for (k = bytes - 1; k >= 0; k--)
				utf8_bytes = (utf8_bytes << 8) | ustr[n - k];

			found_macroman = false;
			for (j = 0; j < nitems(utf8_macroman_pairs); j++) {
				pair = &utf8_macroman_pairs[j];
				if (pair->type == ONLY_TO_UTF8)
					continue;
				if (pair->utf8 == utf8_bytes) {
					codepoint = pair->macroman;
					found_macroman = true;
					break;
				}
			}

			if (!found_macroman)
				codepoint = UTF8_UNKNOWN_MACROMAN;
		}

		ret[retlen++] = codepoint;
		bytes = 0;
	}

	/* input ended in the middle of a multi-byte sequence */
	if (bytes > 0)
		ret[retlen++] = UTF8_UNKNOWN_MACROMAN;

	ret[retlen] = '\0';
	return retlen;
}

/*
 * Convert len bytes of MacRoman in str to UTF-8 in a buffer obtained
 * from xmalloc and stored in *retp (presumably the caller's to release;
 * confirm against callers).
 *
 * Bytes below 128 are copied through.  Other bytes are replaced by the
 * UTF-8 sequence of the first utf8_macroman_pairs[] entry that matches
 * and is not ONLY_TO_MACROMAN.
 *
 * NOTE(review): a high byte with no table entry is copied through
 * unchanged, which is not valid UTF-8 on its own.
 *
 * Returns the number of bytes written, not counting the terminating
 * NUL, or 0 if xmalloc returned NULL.
 */
size_t
macroman_to_utf8_string(char *str, size_t len, char **retp)
{
	const struct utf8_macroman_pair *pair;
	size_t retlen, n, j;
	unsigned char *ustr = (unsigned char *)str;
	unsigned char c;
	bool found_utf8;
	int shift;

	/* worst case: every high byte is budgeted four output bytes */
	retlen = 0;
	for (n = 0; n < len; n++)
		retlen += (ustr[n] >= 128) ? 4 : 1;

	*retp = xmalloc(retlen + 1);
	if (*retp == NULL)
		return 0;

	retlen = 0;
	for (n = 0; n < len; n++) {
		c = ustr[n];

		if (c < 128) {
			(*retp)[retlen++] = c;
			continue;
		}

		found_utf8 = false;
		for (j = 0; j < nitems(utf8_macroman_pairs); j++) {
			pair = &utf8_macroman_pairs[j];
			if (pair->type == ONLY_TO_MACROMAN)
				continue;
			if (pair->macroman != c)
				continue;

			/*
			 * Unpack the big-endian packed sequence, most
			 * significant byte first, skipping leading zero
			 * bytes.
			 */
			for (shift = 24; shift >= 0; shift -= 8) {
				if ((pair->utf8 >> shift) != 0)
					(*retp)[retlen++] =
					    (pair->utf8 >> shift) & 0xff;
			}
			found_utf8 = true;
			break;
		}

		if (!found_utf8)
			(*retp)[retlen++] = c;
	}

	(*retp)[retlen] = '\0';
	return retlen;
}

/*
 * Convert len bytes of ISO-8859-1 in str to MacRoman in a buffer of
 * len + 1 bytes obtained from xmalloc and stored in *retp.
 *
 * Bytes below 128 are copied through; the rest are translated through
 * iso88591_macroman_pairs[], where '?' stands in for characters with no
 * MacRoman equivalent.
 *
 * Returns the number of bytes written, not counting the terminating
 * NUL, or 0 if xmalloc returned NULL.
 */
size_t
iso88591_to_macroman_string(char *str, size_t len, char **retp)
{
	size_t retlen, n;
	unsigned char *ustr = (unsigned char *)str;
	unsigned char c;

	*retp = xmalloc(len + 1);
	if (*retp == NULL)
		return 0;

	retlen = 0;
	for (n = 0; n < len; n++) {
		c = ustr[n];
		if (c >= 128)
			(*retp)[retlen++] = iso88591_macroman_pairs[c - 128];
		else
			(*retp)[retlen++] = c;
	}

	(*retp)[retlen] = '\0';
	return retlen;
}