jcs
/wikipedia
/amendments
/31
utf8: Add UTF8-to-MacRoman conversion utilities
jcs made amendment 31 over 2 years ago
--- utf8.c Wed Sep 7 15:41:18 2022
+++ utf8.c Wed Sep 7 15:41:18 2022
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2022 joshua stein <jcs@jcs.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "utf8.h"
+
+/*
+ * Lookup table pairing one MacRoman byte with the UTF-8 byte sequence it
+ * stands in for.  The utf8 member is NUL-padded, so the sequence ends at
+ * the first zero byte.  The table is terminated by an entry whose
+ * macroman member is 0.
+ *
+ * Non-ASCII MacRoman characters are written as numeric byte values rather
+ * than literal characters so that the table does not depend on the
+ * encoding this source file happens to be stored in.
+ *
+ * Several UTF-8 sequences may share one MacRoman byte (both curly single
+ * quotes collapse to an ASCII apostrophe); lookups by MacRoman byte
+ * return the first matching entry.
+ */
+const struct utf8_macroman_pair {
+	unsigned char macroman;
+	utf8_char utf8;
+} utf8_macroman_pairs[] = {
+	{ 'x',  { 0xc3, 0x97 } },	/* U+00D7 multiplication sign */
+	{ 0x8f, { 0xc3, 0xa8 } },	/* U+00E8 e with grave */
+	{ 0x8e, { 0xc3, 0xa9 } },	/* U+00E9 e with acute */
+	{ 0x9a, { 0xc3, 0xb6 } },	/* U+00F6 o with diaeresis */
+	{ 0x9f, { 0xc3, 0xbc } },	/* U+00FC u with diaeresis */
+	{ '"',  { 0xe2, 0x80, 0x9c } },	/* U+201C left double quote */
+	{ 0xd0, { 0xe2, 0x80, 0x93 } },	/* U+2013 en dash */
+	{ 0xd1, { 0xe2, 0x80, 0x94 } },	/* U+2014 em dash */
+	{ '\'', { 0xe2, 0x80, 0x98 } },	/* U+2018 left single quote */
+	{ '\'', { 0xe2, 0x80, 0x99 } },	/* U+2019 right single quote */
+	{ 0, { 0 } }
+};
+
+/*
+ * Translate the UTF-8 sequence held in *utf8 to a single MacRoman byte.
+ *
+ * *utf8 is a NUL-padded buffer of up to four sequence bytes.  Returns:
+ *   - 0 when the buffer does not yet look like a complete sequence (the
+ *     lead byte calls for more bytes than are present, or the lead byte
+ *     is not a recognized one and fewer than four bytes are set),
+ *   - '?' when the sequence is complete, or four bytes are set, but has
+ *     no entry in utf8_macroman_pairs,
+ *   - the MacRoman byte of the first matching table entry otherwise.
+ */
+unsigned char
+utf8_to_macroman(utf8_char *utf8)
+{
+	const struct utf8_macroman_pair *p;
+	const unsigned char *u = *utf8;
+	short n;
+	short bytes = 0;
+
+	/*
+	 * The lead byte determines the expected length; the sequence is
+	 * treated as complete once its last expected byte is non-zero.
+	 */
+	if (u[0] >= 0xc2 && u[0] <= 0xdf && u[1] != 0)
+		bytes = 2;
+	else if (u[0] >= 0xe0 && u[0] <= 0xef && u[2] != 0)
+		bytes = 3;
+	else if (u[0] >= 0xf0 && u[0] <= 0xf4 && u[3] != 0)
+		bytes = 4;
+	else if (u[0] != 0 && u[1] != 0 && u[2] != 0 && u[3] != 0)
+		/* four bytes set and still not a sequence we recognize */
+		return '?';
+
+	if (bytes == 0)
+		return 0;
+
+	for (n = 0; utf8_macroman_pairs[n].macroman != 0; n++) {
+		p = &utf8_macroman_pairs[n];
+
+		if (p->utf8[0] != u[0] || p->utf8[1] != u[1])
+			continue;
+
+		/* both end after two bytes */
+		if (p->utf8[2] == 0 && u[2] == 0)
+			return p->macroman;
+		if (p->utf8[2] != u[2])
+			continue;
+
+		/* both end after three bytes */
+		if (p->utf8[3] == 0 && u[3] == 0)
+			return p->macroman;
+		if (p->utf8[3] != u[3])
+			continue;
+
+		if (p->utf8[4] == u[4])
+			return p->macroman;
+	}
+
+	return '?';
+}
+
+/*
+ * Find the UTF-8 sequence for the MacRoman byte c.
+ *
+ * Returns a pointer to the NUL-padded sequence stored in
+ * utf8_macroman_pairs (not to be modified or freed), or NULL when c has
+ * no entry and should be used unchanged.
+ *
+ * Bytes below 0x80 are plain ASCII, which is identical in MacRoman and
+ * UTF-8, so they are never translated.  The table does contain ASCII
+ * characters ('x', '"', '\''), but only as lossy targets for the
+ * UTF-8 -> MacRoman direction; mapping them back would turn every
+ * ordinary 'x' into a multiplication sign and every quote into a curly
+ * quote.
+ */
+const utf8_char *
+macroman_to_utf8(unsigned char c)
+{
+	const struct utf8_macroman_pair *p;
+
+	if (c < 0x80)
+		return NULL;
+
+	for (p = utf8_macroman_pairs; p->macroman != 0; p++) {
+		if (p->macroman == c)
+			return (const utf8_char *)&p->utf8;
+	}
+
+	return NULL;
+}
+
+/*
+ * Convert len bytes of MacRoman text at str into a newly allocated,
+ * NUL-terminated UTF-8 string.
+ *
+ * Each input byte is looked up with macroman_to_utf8(); bytes without a
+ * mapping are copied through unchanged.  The result is produced by
+ * xstrdup(), so presumably the caller is responsible for releasing it
+ * with xfree() -- confirm against util.h.  Because the final copy goes
+ * through xstrdup(), it will presumably stop at the first NUL byte if
+ * the input contains one.
+ */
+unsigned char *
+macroman_to_utf8_string(unsigned char *str, size_t len)
+{
+	const utf8_char *u;
+	unsigned char *tmp, *ret, c;
+	size_t ulen, n;
+	short i;
+
+	/* worst case: every input byte expands to a 4-byte sequence */
+	tmp = xmalloc((len * 4) + 1, "macroman_to_utf8 tmp");
+
+	ulen = 0;
+	for (n = 0; n < len; n++) {
+		c = str[n];
+		u = macroman_to_utf8(c);
+
+		if (u == NULL) {
+			tmp[ulen++] = c;
+			continue;
+		}
+
+		/*
+		 * Copy the sequence up to its NUL padding, but never more
+		 * than the 4 bytes per character that tmp was sized for.
+		 */
+		tmp[ulen++] = (*u)[0];
+		for (i = 1; i < 4 && (*u)[i] != 0; i++)
+			tmp[ulen++] = (*u)[i];
+	}
+	tmp[ulen] = '\0';
+
+	/* hand back a copy and drop the oversized scratch buffer */
+	ret = (unsigned char *)xstrdup((char *)tmp,
+	    "macroman_to_utf8_string");
+	xfree(&tmp);
+
+	return ret;
+}
--- utf8.h Wed Sep 7 15:40:54 2022
+++ utf8.h Wed Sep 7 15:40:54 2022
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2022 joshua stein <jcs@jcs.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* Declarations for the UTF-8 <-> MacRoman conversion helpers in utf8.c. */
+#ifndef __UTF8_H__
+#define __UTF8_H__
+
+#include "util.h"
+
+/*
+ * Lowest and highest byte values of interest for UTF-8 handling.  0xf4 is
+ * also the largest lead byte utf8_to_macroman() accepts; presumably
+ * callers use this range to spot bytes belonging to a multi-byte
+ * sequence -- confirm at the call sites.
+ */
+#define UTF8_RANGE_START ((unsigned char)0x80)
+#define UTF8_RANGE_END ((unsigned char)0xf4)
+
+/*
+ * Buffer for one UTF-8 sequence.  utf8.c treats it as NUL-padded: unused
+ * trailing bytes are zero and the first zero byte ends the sequence.
+ */
+typedef unsigned char utf8_char[5];
+
+/*
+ * Map the sequence in *utf8 to a MacRoman byte; returns 0 while the
+ * sequence looks incomplete and '?' when it has no mapping.
+ */
+unsigned char utf8_to_macroman(utf8_char *utf8);
+/*
+ * Return the table's UTF-8 sequence for MacRoman byte c, or NULL when
+ * there is none.
+ */
+const utf8_char * macroman_to_utf8(unsigned char c);
+/*
+ * Convert len bytes of MacRoman text at str into a newly allocated,
+ * NUL-terminated UTF-8 string.
+ */
+unsigned char * macroman_to_utf8_string(unsigned char *str, size_t len);
+
+#endif