jcs
/wallops
/amendments
/112
utf8: Import from Carl
jcs made amendment 112 23 days ago
--- utf8.c Mon Sep 16 17:17:29 2024
+++ utf8.c Mon Sep 16 17:17:29 2024
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+ * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+ *
+ * Copyright (c) 2022 joshua stein <jcs@jcs.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utf8.h"
+
+#define ONLY_TO_MACROMAN 1 /* entry is used only when converting UTF-8 to MacRoman */
+#define ONLY_TO_UTF8 2 /* entry is used only when converting MacRoman to UTF-8 */
+/* UTF-8 <-> MacRoman pairs; both converters scan in order and stop at the first usable match */
+static const struct utf8_macroman_pair {
+ utf8_codepoint utf8; /* bytes of the UTF-8 sequence packed into one integer, first byte highest */
+ unsigned char macroman; /* the corresponding MacRoman character */
+ unsigned char type; /* 0 (both directions), ONLY_TO_MACROMAN or ONLY_TO_UTF8 */
+} utf8_macroman_pairs[] = {
+ { 0x000000a0, ' ', ONLY_TO_MACROMAN }, /* supposed to be non-breaking; NOTE(review): code point rather than UTF-8 bytes? */
+ { 0x00002019, '\'', ONLY_TO_MACROMAN }, /* NOTE(review): U+2019 as a code point; lookups use packed UTF-8 bytes -- confirm this can match */
+ { 0x00002026, '…', ONLY_TO_MACROMAN }, /* NOTE(review): likewise U+2026 */
+ { 0x0000c2a0, ' ', ONLY_TO_MACROMAN },
+ { 0x00efbfbd, UTF8_UNKNOWN_MACROMAN, ONLY_TO_UTF8 }, /* unknown utf-8 char */
+ /* the remaining entries leave type as 0 and apply in both directions */
+ { 0x0000c2a1, '¡' },
+ { 0x0000c2a2, '¢' },
+ { 0x0000c2a3, '£' },
+ { 0x0000c2a5, '¥' },
+ { 0x0000c2a7, '§' },
+ { 0x0000c2a8, '¨' },
+ { 0x0000c2a9, '©' },
+ { 0x0000c2aa, 'ª' },
+ { 0x0000c2ab, '«' },
+ { 0x0000c2ac, '¬' },
+ { 0x0000c2ae, '®' },
+ { 0x0000c2b0, '°' },
+ { 0x0000c2b1, '±' },
+ { 0x0000c2b4, '´' },
+ { 0x0000c2b5, 'µ' },
+ { 0x0000c2b6, '¶' },
+ { 0x0000c2ba, 'º' },
+ { 0x0000c2bb, '»' },
+ { 0x0000c2bf, '¿' },
+ { 0x0000c380, 'À' },
+ { 0x0000c383, 'Ã' },
+ { 0x0000c384, 'Ä' },
+ { 0x0000c385, 'Å' },
+ { 0x0000c386, 'Æ' },
+ { 0x0000c387, 'Ç' },
+ { 0x0000c389, 'É' },
+ { 0x0000c391, 'Ñ' },
+ { 0x0000c395, 'Õ' },
+ { 0x0000c396, 'Ö' },
+ { 0x0000c398, 'Ø' },
+ { 0x0000c39c, 'Ü' },
+ { 0x0000c39f, 'ß' },
+ { 0x0000c3a0, 'à' },
+ { 0x0000c3a1, 'á' },
+ { 0x0000c3a2, 'â' },
+ { 0x0000c3a3, 'ã' },
+ { 0x0000c3a4, 'ä' },
+ { 0x0000c3a5, 'å' },
+ { 0x0000c3a6, 'æ' },
+ { 0x0000c3a7, 'ç' },
+ { 0x0000c3a8, 'è' },
+ { 0x0000c3a9, 'é' },
+ { 0x0000c3aa, 'ê' },
+ { 0x0000c3ab, 'ë' },
+ { 0x0000c3ac, 'ì' },
+ { 0x0000c3ad, 'í' },
+ { 0x0000c3ae, 'î' },
+ { 0x0000c3af, 'ï' },
+ { 0x0000c3b1, 'ñ' },
+ { 0x0000c3b2, 'ò' },
+ { 0x0000c3b3, 'ó' },
+ { 0x0000c3b4, 'ô' },
+ { 0x0000c3b5, 'õ' },
+ { 0x0000c3b6, 'ö' },
+ { 0x0000c3b7, '÷' },
+ { 0x0000c3b8, 'ø' },
+ { 0x0000c3b9, 'ù' },
+ { 0x0000c3ba, 'ú' },
+ { 0x0000c3bb, 'û' },
+ { 0x0000c3bc, 'ü' },
+ { 0x0000c3bf, 'ÿ' },
+ { 0x0000c592, 'Œ' },
+ { 0x0000c593, 'œ' },
+ { 0x0000c5b8, 'Ÿ' },
+ { 0x0000c692, 'ƒ' },
+ { 0x0000cf80, 'π' },
+ { 0x00e28093, '–' },
+ { 0x00e28094, '—' },
+ { 0x00e28098, '‘' },
+ { 0x00e28099, '’' },
+ { 0x00e2809c, '“' },
+ { 0x00e2809d, '”' },
+ { 0x00e280a0, '†' },
+ { 0x00e280a2, '•' },
+ { 0x00e280a6, '…' },
+ { 0x00e284a2, '™' },
+ { 0x00e284a6, 'Ω' },
+ { 0x00e28882, '∂' },
+ { 0x00e28886, '∆' },
+ { 0x00e2888f, '∏' },
+ { 0x00e28891, '∑' },
+ { 0x00e2889a, '√' },
+ { 0x00e2889e, '∞' },
+ { 0x00e288ab, '∫' },
+ { 0x00e28988, '≈' },
+ { 0x00e289a0, '≠' },
+ { 0x00e289a4, '≤' },
+ { 0x00e289a5, '≥' },
+};
+
+static const unsigned char utf8d[] = { /* byte classes, then state transitions */
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f: class of each byte value, read as utf8d[byte]
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+ 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+ 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+ 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0: next state per class, read as utf8d[256 + state * 16 + class]
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 (every s1 entry is 1, so state 1 is never left)
+ 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+ 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+ 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+/* ISO 8859-1 byte minus 128 -> MacRoman byte; '?' is stored where no mapping is given */
+static const unsigned char iso88591_macroman_pairs[] = {
+ '?', '?', '?', '?', '?', '?', '?', '?', /* 0x80..0x87 */
+ '?', '?', '?', '?', '?', '?', '?', '?', /* 0x88..0x8f */
+ '?', '?', '?', '?', '?', '?', '?', '?', /* 0x90..0x97 */
+ '?', '?', '?', '?', '?', '?', '?', '?', /* 0x98..0x9f */
+ 0xca, 0xc1, 0xa2, 0xa3, 0xdb, 0xb4, '?', 0xa4, /* 0xa0..0xa7 */
+ 0xac, 0xa9, 0xbb, 0xc7, 0xc2, '?', 0xa8, 0xf8, /* 0xa8..0xaf */
+ 0xa1, 0xb1, '?', '?', 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0..0xb7 */
+ 0xfc, '?', 0xbc, 0xc8, '?', '?', '?', 0xc0, /* 0xb8..0xbf */
+ 0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0..0xc7 */
+ 0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8..0xcf */
+ '?', 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, '?', /* 0xd0..0xd7 */
+ 0xaf, 0xf4, 0xf2, 0xf3, 0x86, '?', '?', 0xa7, /* 0xd8..0xdf */
+ 0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0..0xe7 */
+ 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8..0xef */
+ '?', 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0..0xf7 */
+ 0xbf, 0x9d, 0x9c, 0x9e, 0x9f, '?', '?', 0xd8, /* 0xf8..0xff */
+};
+
+/* Feed one byte to the UTF-8 DFA; updates *state and *codep, returns *state. */
+utf8_codepoint
+utf8_decode(utf8_codepoint *state, utf8_codepoint *codep,
+    utf8_codepoint byte)
+{
+	utf8_codepoint cls = utf8d[byte];	/* byte is not range-checked */
+
+	/* from UTF8_ACCEPT keep the lead byte's payload; else append six bits */
+	*codep = (*state == UTF8_ACCEPT) ? (0xff >> cls) & byte :
+	    (byte & 0x3fu) | (*codep << 6);
+	return (*state = utf8d[256 + (*state * 16) + cls]);
+}
+
+/*
+ * Convert len bytes of UTF-8 in str to MacRoman in ret (at most len bytes
+ * plus a NUL) and return the output length.  ASCII is copied; multi-byte
+ * sequences are looked up by raw bytes in utf8_macroman_pairs; unlisted or
+ * malformed ones become UTF8_UNKNOWN_MACROMAN; a truncated tail is dropped.
+ */
+size_t
+utf8_to_macroman_string(char *str, size_t len, char *ret)
+{
+	const struct utf8_macroman_pair *pair;
+	const unsigned char *ustr = (const unsigned char *)str;
+	utf8_codepoint codepoint, utf8_bytes;
+	utf8_codepoint state = UTF8_ACCEPT;
+	size_t retlen = 0, seqlen = 0, n, j, k;
+
+	for (n = 0; n < len; n++) {
+		seqlen++;
+		if (utf8_decode(&state, &codepoint, ustr[n]) == UTF8_REJECT) {
+			/* reject is absorbing; restart, don't drop the rest */
+			ret[retlen++] = UTF8_UNKNOWN_MACROMAN;
+			if (seqlen > 1)
+				n--;	/* this byte may start a new sequence */
+			seqlen = 0;
+			state = UTF8_ACCEPT;
+			continue;
+		}
+		if (state != UTF8_ACCEPT)
+			continue;
+
+		if (seqlen > 1) {
+			/* pack the raw bytes the way pair->utf8 stores them */
+			utf8_bytes = 0;
+			for (k = n + 1 - seqlen; k <= n; k++)
+				utf8_bytes = (utf8_bytes << 8) | ustr[k];
+			codepoint = UTF8_UNKNOWN_MACROMAN;
+			for (j = 0; j < nitems(utf8_macroman_pairs); j++) {
+				pair = &utf8_macroman_pairs[j];
+				if (pair->utf8 == utf8_bytes &&
+				    pair->type != ONLY_TO_UTF8) {
+					codepoint = pair->macroman;
+					break;
+				}
+			}
+		}
+		ret[retlen++] = codepoint;
+		seqlen = 0;
+	}
+
+	ret[retlen] = '\0';
+	return retlen;
+}
+
+/*
+ * Convert len bytes of MacRoman in str to UTF-8.  The NUL-terminated result
+ * is allocated with xmalloc and stored in *retp; its length is returned, or
+ * 0 if the allocation failed.
+ *
+ * Bytes below 128 are copied as-is.  Any other byte becomes the UTF-8
+ * sequence of its first utf8_macroman_pairs entry usable in this direction,
+ * or U+FFFD when it has none, so that no bare high byte (which would not be
+ * valid UTF-8) reaches the output.
+ */
+size_t
+macroman_to_utf8_string(char *str, size_t len, char **retp)
+{
+	const struct utf8_macroman_pair *pair;
+	const unsigned char *ustr = (const unsigned char *)str;
+	utf8_codepoint seq;
+	size_t retlen, n, j;
+	char *out;
+	int shift;
+
+	/* size for the worst case of four output bytes per high byte */
+	retlen = len;
+	for (n = 0; n < len; n++) {
+		if (ustr[n] >= 128)
+			retlen += 3;
+	}
+
+	*retp = out = xmalloc(retlen + 1);
+	if (out == NULL)
+		return 0;
+
+	retlen = 0;
+	for (n = 0; n < len; n++) {
+		if (ustr[n] < 128) {
+			out[retlen++] = ustr[n];
+			continue;
+		}
+
+		seq = 0xefbfbd;	/* U+FFFD, unless the table has an entry */
+		for (j = 0; j < nitems(utf8_macroman_pairs); j++) {
+			pair = &utf8_macroman_pairs[j];
+			if (pair->type != ONLY_TO_MACROMAN &&
+			    pair->macroman == ustr[n]) {
+				seq = pair->utf8;
+				break;
+			}
+		}
+
+		/* emit the packed bytes, skipping leading zero bytes */
+		for (shift = 24; shift >= 0; shift -= 8) {
+			if ((seq >> shift) != 0)
+				out[retlen++] = (seq >> shift) & 0xff;
+		}
+	}
+
+	out[retlen] = '\0';
+	return retlen;
+}
+
+/*
+ * Convert len bytes of ISO 8859-1 in str to MacRoman, one output byte per
+ * input byte.  The NUL-terminated result is allocated with xmalloc and
+ * stored in *retp; its length (len) is returned, or 0 if allocation failed.
+ */
+size_t
+iso88591_to_macroman_string(char *str, size_t len, char **retp)
+{
+	const unsigned char *ustr = (const unsigned char *)str;
+	char *out;
+	size_t n;
+
+	*retp = out = xmalloc(len + 1);
+	if (out == NULL)
+		return 0;
+
+	/* bytes below 128 are kept; the rest are translated by table */
+	for (n = 0; n < len; n++)
+		out[n] = (ustr[n] < 128) ? ustr[n] :
+		    iso88591_macroman_pairs[ustr[n] - 128];
+
+	out[len] = '\0';
+	return len;
+}
--- utf8.h Mon Sep 16 17:21:35 2024
+++ utf8.h Mon Sep 16 17:21:35 2024
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2022 joshua stein <jcs@jcs.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __UTF8_H__
+#define __UTF8_H__
+
+#include "util.h"
+
+typedef unsigned long utf8_codepoint; /* also carries decoder states and packed UTF-8 byte sequences */
+
+#define UTF8_ACCEPT 0 /* decoder state: initial state, and the state after a complete sequence */
+#define UTF8_REJECT 1 /* decoder state: reject state of the DFA; its table maps it only to itself */
+
+#define UTF8_UNKNOWN_MACROMAN '◊' /* MacRoman stand-in for UTF-8 input with no table entry */
+/* The string converters return the output length; the char ** variants allocate *retp with xmalloc. */
+utf8_codepoint utf8_decode(utf8_codepoint *state, utf8_codepoint *codep,
+ utf8_codepoint byte);
+size_t utf8_to_macroman_string(char *str, size_t len, char *ret);
+size_t macroman_to_utf8_string(char *str, size_t len, char **retp);
+size_t iso88591_to_macroman_string(char *str, size_t len, char **retp);
+
+#endif /* __UTF8_H__ */
\ No newline at end of file