AmendHub

Download:

jcs / wallops / amendments / 112

utf8: Import from Carl


jcs made amendment 112 3 months ago
--- utf8.c Mon Sep 16 17:17:29 2024
+++ utf8.c Mon Sep 16 17:17:29 2024
@@ -0,0 +1,382 @@
+/*
+ * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+ * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+ *
+ * Copyright (c) 2022 joshua stein <jcs@jcs.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utf8.h"
+
+#define ONLY_TO_MACROMAN 1
+#define ONLY_TO_UTF8 2
+
+/* UTF-8 byte sequence of U+FFFD REPLACEMENT CHARACTER */
+#define UTF8_REPLACEMENT_SEQ 0x00efbfbdUL
+
+/*
+ * UTF-8 <-> MacRoman lookup table.
+ *
+ * `utf8' is the UTF-8 byte sequence packed into an integer with the
+ * first byte in the most significant position (it is not the Unicode
+ * code point).  `macroman' is the corresponding MacRoman byte, written
+ * in hex so the table does not depend on the encoding this file is
+ * stored in.  `type' limits an entry to one direction; 0 means both.
+ *
+ * NOTE(review): the keys 0x000000a0, 0x00002019 and 0x00002026 look like
+ * code points rather than UTF-8 byte sequences, so the multi-byte lookup
+ * in utf8_to_macroman_string() can never match them -- confirm intent.
+ */
+static const struct utf8_macroman_pair {
+	utf8_codepoint utf8;
+	unsigned char macroman;
+	unsigned char type;
+} utf8_macroman_pairs[] = {
+	{ 0x000000a0, ' ', ONLY_TO_MACROMAN }, /* supposed to be non-breaking */
+	{ 0x00002019, '\'', ONLY_TO_MACROMAN },
+	{ 0x00002026, 0xc9, ONLY_TO_MACROMAN }, /* … */
+	{ 0x0000c2a0, ' ', ONLY_TO_MACROMAN },
+	{ UTF8_REPLACEMENT_SEQ, UTF8_UNKNOWN_MACROMAN, ONLY_TO_UTF8 }, /* unknown utf-8 char */
+
+	{ 0x0000c2a1, 0xc1 }, /* ¡ */
+	{ 0x0000c2a2, 0xa2 }, /* ¢ */
+	{ 0x0000c2a3, 0xa3 }, /* £ */
+	{ 0x0000c2a5, 0xb4 }, /* ¥ */
+	{ 0x0000c2a7, 0xa4 }, /* § */
+	{ 0x0000c2a8, 0xac }, /* ¨ */
+	{ 0x0000c2a9, 0xa9 }, /* © */
+	{ 0x0000c2aa, 0xbb }, /* ª */
+	{ 0x0000c2ab, 0xc7 }, /* « */
+	{ 0x0000c2ac, 0xc2 }, /* ¬ */
+	{ 0x0000c2ae, 0xa8 }, /* ® */
+	{ 0x0000c2b0, 0xa1 }, /* ° */
+	{ 0x0000c2b1, 0xb1 }, /* ± */
+	{ 0x0000c2b4, 0xab }, /* ´ */
+	{ 0x0000c2b5, 0xb5 }, /* µ */
+	{ 0x0000c2b6, 0xa6 }, /* ¶ */
+	{ 0x0000c2ba, 0xbc }, /* º */
+	{ 0x0000c2bb, 0xc8 }, /* » */
+	{ 0x0000c2bf, 0xc0 }, /* ¿ */
+	{ 0x0000c380, 0xcb }, /* À */
+	{ 0x0000c383, 0xcc }, /* Ã */
+	{ 0x0000c384, 0x80 }, /* Ä */
+	{ 0x0000c385, 0x81 }, /* Å */
+	{ 0x0000c386, 0xae }, /* Æ */
+	{ 0x0000c387, 0x82 }, /* Ç */
+	{ 0x0000c389, 0x83 }, /* É */
+	{ 0x0000c391, 0x84 }, /* Ñ */
+	{ 0x0000c395, 0xcd }, /* Õ */
+	{ 0x0000c396, 0x85 }, /* Ö */
+	{ 0x0000c398, 0xaf }, /* Ø */
+	{ 0x0000c39c, 0x86 }, /* Ü */
+	{ 0x0000c39f, 0xa7 }, /* ß */
+	{ 0x0000c3a0, 0x88 }, /* à */
+	{ 0x0000c3a1, 0x87 }, /* á */
+	{ 0x0000c3a2, 0x89 }, /* â */
+	{ 0x0000c3a3, 0x8b }, /* ã */
+	{ 0x0000c3a4, 0x8a }, /* ä */
+	{ 0x0000c3a5, 0x8c }, /* å */
+	{ 0x0000c3a6, 0xbe }, /* æ */
+	{ 0x0000c3a7, 0x8d }, /* ç */
+	{ 0x0000c3a8, 0x8f }, /* è */
+	{ 0x0000c3a9, 0x8e }, /* é */
+	{ 0x0000c3aa, 0x90 }, /* ê */
+	{ 0x0000c3ab, 0x91 }, /* ë */
+	{ 0x0000c3ac, 0x93 }, /* ì */
+	{ 0x0000c3ad, 0x92 }, /* í */
+	{ 0x0000c3ae, 0x94 }, /* î */
+	{ 0x0000c3af, 0x95 }, /* ï */
+	{ 0x0000c3b1, 0x96 }, /* ñ */
+	{ 0x0000c3b2, 0x98 }, /* ò */
+	{ 0x0000c3b3, 0x97 }, /* ó */
+	{ 0x0000c3b4, 0x99 }, /* ô */
+	{ 0x0000c3b5, 0x9b }, /* õ */
+	{ 0x0000c3b6, 0x9a }, /* ö */
+	{ 0x0000c3b7, 0xd6 }, /* ÷ */
+	{ 0x0000c3b8, 0xbf }, /* ø */
+	{ 0x0000c3b9, 0x9d }, /* ù */
+	{ 0x0000c3ba, 0x9c }, /* ú */
+	{ 0x0000c3bb, 0x9e }, /* û */
+	{ 0x0000c3bc, 0x9f }, /* ü */
+	{ 0x0000c3bf, 0xd8 }, /* ÿ */
+	{ 0x0000c592, 0xce }, /* Œ */
+	{ 0x0000c593, 0xcf }, /* œ */
+	{ 0x0000c5b8, 0xd9 }, /* Ÿ */
+	{ 0x0000c692, 0xc4 }, /* ƒ */
+	{ 0x0000cf80, 0xb9 }, /* π */
+	{ 0x00e28093, 0xd0 }, /* – */
+	{ 0x00e28094, 0xd1 }, /* — */
+	{ 0x00e28098, 0xd4 }, /* ‘ */
+	{ 0x00e28099, 0xd5 }, /* ’ */
+	{ 0x00e2809c, 0xd2 }, /* “ */
+	{ 0x00e2809d, 0xd3 }, /* ” */
+	{ 0x00e280a0, 0xa0 }, /* † */
+	{ 0x00e280a2, 0xa5 }, /* • */
+	{ 0x00e280a6, 0xc9 }, /* … */
+	{ 0x00e284a2, 0xaa }, /* ™ */
+	{ 0x00e284a6, 0xbd }, /* Ω */
+	{ 0x00e28882, 0xb6 }, /* ∂ */
+	{ 0x00e28886, 0xc6 }, /* ∆ */
+	{ 0x00e2888f, 0xb8 }, /* ∏ */
+	{ 0x00e28891, 0xb7 }, /* ∑ */
+	{ 0x00e2889a, 0xc3 }, /* √ */
+	{ 0x00e2889e, 0xb0 }, /* ∞ */
+	{ 0x00e288ab, 0xba }, /* ∫ */
+	{ 0x00e28988, 0xc5 }, /* ≈ */
+	{ 0x00e289a0, 0xad }, /* ≠ */
+	{ 0x00e289a4, 0xb2 }, /* ≤ */
+	{ 0x00e289a5, 0xb3 }, /* ≥ */
+};
+
+/*
+ * Decoder tables: the first 256 entries map an input byte to its
+ * character class, the rest map (state * 16 + class) to the next
+ * state, as indexed by utf8_decode().
+ */
+static const unsigned char utf8d[] = {
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+/* iso8859-1 character, minus 128, to macroman */
+static const unsigned char iso88591_macroman_pairs[] = {
+	'?', '?', '?', '?', '?', '?', '?', '?',
+	'?', '?', '?', '?', '?', '?', '?', '?',
+	'?', '?', '?', '?', '?', '?', '?', '?',
+	'?', '?', '?', '?', '?', '?', '?', '?',
+	0xca, 0xc1, 0xa2, 0xa3, 0xdb, 0xb4, '?', 0xa4,
+	0xac, 0xa9, 0xbb, 0xc7, 0xc2, '?', 0xa8, 0xf8,
+	0xa1, 0xb1, '?', '?', 0xab, 0xb5, 0xa6, 0xe1,
+	0xfc, '?', 0xbc, 0xc8, '?', '?', '?', 0xc0,
+	0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82,
+	0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec,
+	'?', 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, '?',
+	0xaf, 0xf4, 0xf2, 0xf3, 0x86, '?', '?', 0xa7,
+	0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d,
+	0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95,
+	'?', 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6,
+	0xbf, 0x9d, 0x9c, 0x9e, 0x9f, '?', '?', 0xd8,
+};
+
+/*
+ * Feed one byte to the UTF-8 decoder.
+ *
+ * `state' must be UTF8_ACCEPT before the first byte of a sequence and is
+ * updated on every call; `codep' accumulates the decoded code point and
+ * is complete once the state is UTF8_ACCEPT again.  `byte' is masked to
+ * 8 bits so an out-of-range value cannot index past the class table.
+ * Returns the new state: UTF8_ACCEPT, UTF8_REJECT, or another value
+ * while more bytes are needed.
+ */
+utf8_codepoint
+utf8_decode(utf8_codepoint *state, utf8_codepoint *codep,
+    utf8_codepoint byte)
+{
+	utf8_codepoint type;
+
+	byte &= 0xff;
+	type = utf8d[byte];
+
+	if (*state != UTF8_ACCEPT)
+		*codep = (byte & 0x3fu) | (*codep << 6);
+	else
+		*codep = (0xff >> type) & byte;
+
+	*state = utf8d[256 + (*state * 16) + type];
+	return *state;
+}
+
+/*
+ * Convert `len' bytes of UTF-8 in `str' to MacRoman, writing the
+ * NUL-terminated result to `ret' and returning its length.
+ *
+ * The result never has more characters than the input has bytes, so
+ * `ret' must have room for len + 1 bytes.  Characters with no MacRoman
+ * equivalent and malformed or truncated sequences each become a single
+ * UTF8_UNKNOWN_MACROMAN.
+ */
+size_t
+utf8_to_macroman_string(char *str, size_t len, char *ret)
+{
+	const struct utf8_macroman_pair *pair;
+	const unsigned char *ustr = (const unsigned char *)str;
+	utf8_codepoint codepoint = 0, utf8_bytes = 0;
+	utf8_codepoint state = UTF8_ACCEPT, prev_state;
+	size_t retlen = 0, n, j;
+	short bytes = 0;
+
+	for (n = 0; n < len; n++) {
+		prev_state = state;
+		bytes++;
+		/* keep the raw sequence; the table is keyed on it */
+		utf8_bytes = (utf8_bytes << 8) | ustr[n];
+
+		utf8_decode(&state, &codepoint, ustr[n]);
+
+		if (state == UTF8_REJECT) {
+			/*
+			 * The reject state is sticky, so without a reset
+			 * every remaining byte would be dropped.
+			 */
+			ret[retlen++] = (char)UTF8_UNKNOWN_MACROMAN;
+			state = UTF8_ACCEPT;
+			bytes = 0;
+			utf8_bytes = 0;
+			/*
+			 * A byte that broke a sequence in progress may
+			 * itself start a valid one; look at it again.
+			 */
+			if (prev_state != UTF8_ACCEPT)
+				n--;
+			continue;
+		}
+
+		if (state != UTF8_ACCEPT)
+			continue;	/* need more bytes */
+
+		if (bytes > 1) {
+			/* multi-byte: anything not in the table is unknown */
+			codepoint = UTF8_UNKNOWN_MACROMAN;
+			for (j = 0; j < nitems(utf8_macroman_pairs); j++) {
+				pair = &utf8_macroman_pairs[j];
+				if (utf8_bytes == pair->utf8 &&
+				    pair->type != ONLY_TO_UTF8) {
+					codepoint = pair->macroman;
+					break;
+				}
+			}
+		}
+
+		ret[retlen++] = (char)codepoint;
+		bytes = 0;
+		utf8_bytes = 0;
+	}
+
+	/* input ended in the middle of a sequence */
+	if (state != UTF8_ACCEPT)
+		ret[retlen++] = (char)UTF8_UNKNOWN_MACROMAN;
+
+	ret[retlen] = '\0';
+	return retlen;
+}
+
+/*
+ * Convert `len' bytes of MacRoman in `str' to UTF-8.
+ *
+ * The result is allocated with xmalloc(), NUL-terminated and stored in
+ * `*retp'; its length is returned.  Returns 0 with `*retp' NULL if
+ * xmalloc() returns NULL.  High bytes with no table entry become U+FFFD
+ * rather than being copied through as invalid UTF-8.
+ */
+size_t
+macroman_to_utf8_string(char *str, size_t len, char **retp)
+{
+	const struct utf8_macroman_pair *pair;
+	const unsigned char *ustr = (const unsigned char *)str;
+	utf8_codepoint seq;
+	size_t retlen = 0, n, j;
+	unsigned char c;
+	char *out;
+
+	/* size for the worst case: up to 4 bytes per non-ASCII character */
+	for (n = 0; n < len; n++)
+		retlen += (ustr[n] >= 128) ? 4 : 1;
+
+	out = *retp = xmalloc(retlen + 1);
+	if (out == NULL)
+		return 0;
+
+	retlen = 0;
+	for (n = 0; n < len; n++) {
+		c = ustr[n];
+
+		if (c < 128) {
+			out[retlen++] = (char)c;
+			continue;
+		}
+
+		seq = UTF8_REPLACEMENT_SEQ;
+		for (j = 0; j < nitems(utf8_macroman_pairs); j++) {
+			pair = &utf8_macroman_pairs[j];
+			if (pair->type != ONLY_TO_MACROMAN &&
+			    pair->macroman == c) {
+				seq = pair->utf8;
+				break;
+			}
+		}
+
+		/* emit only the significant bytes, first byte first */
+		if (seq >= 0x01000000)
+			out[retlen++] = (char)((seq >> 24) & 0xff);
+		if (seq >= 0x00010000)
+			out[retlen++] = (char)((seq >> 16) & 0xff);
+		if (seq >= 0x00000100)
+			out[retlen++] = (char)((seq >> 8) & 0xff);
+		out[retlen++] = (char)(seq & 0xff);
+	}
+
+	out[retlen] = '\0';
+	return retlen;
+}
+
+/*
+ * Convert `len' bytes of ISO-8859-1 in `str' to MacRoman.
+ *
+ * The result is allocated with xmalloc(), NUL-terminated and stored in
+ * `*retp'; its length (always `len' on success) is returned.  Returns 0
+ * with `*retp' NULL if xmalloc() returns NULL.  High bytes the table
+ * has no mapping for become '?'.
+ */
+size_t
+iso88591_to_macroman_string(char *str, size_t len, char **retp)
+{
+	const unsigned char *ustr = (const unsigned char *)str;
+	size_t n;
+	unsigned char c;
+	char *out;
+
+	out = *retp = xmalloc(len + 1);
+	if (out == NULL)
+		return 0;
+
+	for (n = 0; n < len; n++) {
+		c = ustr[n];
+		if (c >= 128)
+			c = iso88591_macroman_pairs[c - 128];
+		out[n] = (char)c;
+	}
+
+	out[len] = '\0';
+	return len;
+}
--- utf8.h Mon Sep 16 17:21:35 2024
+++ utf8.h Mon Sep 16 17:21:35 2024
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022 joshua stein <jcs@jcs.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __UTF8_H__
+#define __UTF8_H__
+
+#include "util.h"
+
+/* holds a decoder state, a code point, or packed UTF-8 sequence bytes */
+typedef unsigned long utf8_codepoint;
+
+/* utf8_decode() states; other values mean more input is needed */
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 1
+
+/* MacRoman byte for '◊', substituted for anything unconvertible */
+#define UTF8_UNKNOWN_MACROMAN ((char)0xd7)
+
+utf8_codepoint utf8_decode(utf8_codepoint *state, utf8_codepoint *codep,
+    utf8_codepoint byte);
+size_t utf8_to_macroman_string(char *str, size_t len, char *ret);
+size_t macroman_to_utf8_string(char *str, size_t len, char **retp);
+size_t iso88591_to_macroman_string(char *str, size_t len, char **retp);
+
+#endif