AmendHub

jcs

/

wikipedia

/

amendments

/

31

utf8: Add UTF8-to-MacRoman conversion utilities


jcs made amendment 31 4 months ago
--- utf8.c Wed Sep 7 15:41:18 2022
+++ utf8.c Wed Sep 7 15:41:18 2022
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2022 joshua stein <jcs@jcs.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "utf8.h"
+
+/*
+ * Lossy mapping between MacRoman bytes and UTF-8 sequences.
+ *
+ * utf8_to_macroman() searches by UTF-8 sequence, so several sequences may
+ * share one MacRoman byte.  macroman_to_utf8() searches by MacRoman byte
+ * and returns the first entry that matches.
+ *
+ * MacRoman bytes above 0x7f are written as hex constants so their value
+ * does not depend on the encoding this file is stored in.  The table ends
+ * at the first entry whose macroman member is 0.
+ */
+const struct utf8_macroman_pair {
+	unsigned char macroman;
+	utf8_char utf8;
+} utf8_macroman_pairs[] = {
+	{ 'x', { 0xc3, 0x97 } }, /* U+00D7 multiplication sign */
+	{ 0x8f, { 0xc3, 0xa8 } }, /* U+00E8 e with grave */
+	{ 0x8e, { 0xc3, 0xa9 } }, /* U+00E9 e with acute */
+	{ 0x9a, { 0xc3, 0xb6 } }, /* U+00F6 o with diaeresis */
+	{ 0x9f, { 0xc3, 0xbc } }, /* U+00FC u with diaeresis */
+	{ '"', { 0xe2, 0x80, 0x9c } }, /* U+201C left double quote */
+	{ 0xd0, { 0xe2, 0x80, 0x93 } }, /* U+2013 en dash */
+	{ 0xd1, { 0xe2, 0x80, 0x94 } }, /* U+2014 em dash */
+	{ '\'', { 0xe2, 0x80, 0x98 } }, /* U+2018 left single quote */
+	{ '\'', { 0xe2, 0x80, 0x99 } }, /* U+2019 right single quote */
+	{ 0, { 0 } }
+};
+
+/*
+ * Convert the UTF-8 sequence collected in *utf8 to a MacRoman byte.
+ *
+ * A table entry only matches when the byte following its last byte is
+ * also 0 in *utf8, so unused trailing bytes must be zero-filled.
+ *
+ * Returns 0 when *utf8 does not yet hold every byte its lead byte calls
+ * for (a lone ASCII byte or an empty buffer also lands here), '?' when
+ * the sequence is complete but not in the table or when four bytes are
+ * present without a valid lead byte, and the MacRoman byte otherwise.
+ */
+unsigned char
+utf8_to_macroman(utf8_char *utf8)
+{
+	const unsigned char *in = *utf8;
+	const struct utf8_macroman_pair *p;
+	short complete = 0;
+
+	if (in[0] >= 0xc2 && in[0] <= 0xdf && in[1] != 0)
+		complete = 1;		/* 2-byte sequence */
+	else if (in[0] >= 0xe0 && in[0] <= 0xef && in[2] != 0)
+		complete = 1;		/* 3-byte sequence */
+	else if (in[0] >= 0xf0 && in[0] <= 0xf4 && in[3] != 0)
+		complete = 1;		/* 4-byte sequence */
+	else if (in[0] != 0 && in[1] != 0 && in[2] != 0 && in[3] != 0)
+		return '?';
+
+	if (!complete)
+		return 0;
+
+	for (p = utf8_macroman_pairs; p->macroman != 0; p++) {
+		if (p->utf8[0] != in[0] || p->utf8[1] != in[1])
+			continue;
+
+		/* 2-byte entry */
+		if (p->utf8[2] == 0 && in[2] == 0)
+			return p->macroman;
+		if (p->utf8[2] != in[2])
+			continue;
+
+		/* 3-byte entry */
+		if (p->utf8[3] == 0 && in[3] == 0)
+			return p->macroman;
+		if (p->utf8[3] != in[3])
+			continue;
+
+		/* 4-byte entry */
+		if (p->utf8[4] == in[4])
+			return p->macroman;
+	}
+
+	return '?';
+}
+
+/*
+ * Look up the UTF-8 sequence for MacRoman byte c.
+ *
+ * Returns a pointer into utf8_macroman_pairs, or NULL when c is ASCII or
+ * has no table entry; macroman_to_utf8_string() copies such bytes as-is.
+ */
+const utf8_char *
+macroman_to_utf8(unsigned char c)
+{
+	const struct utf8_macroman_pair *p;
+
+	/*
+	 * ASCII is the same in both encodings.  The table lists ASCII bytes
+	 * only as lossy stand-ins for utf8_to_macroman(); matching them here
+	 * would turn every 'x' into a multiplication sign.
+	 */
+	if (c < UTF8_RANGE_START)
+		return NULL;
+
+	for (p = utf8_macroman_pairs; p->macroman != 0; p++) {
+		if (p->macroman == c)
+			return (const utf8_char *)&p->utf8;
+	}
+
+	return NULL;
+}
+
+/*
+ * Convert len bytes of MacRoman text at str to a NUL-terminated UTF-8
+ * string.  Bytes without a mapping are copied through unchanged.
+ *
+ * Returns a copy made with xstrdup(); the scratch buffer is released
+ * before returning.
+ *
+ * NOTE(review): xstrdup() presumably stops at the first NUL, so a 0 byte
+ * inside str would cut the result short -- confirm against util.h.
+ */
+unsigned char *
+macroman_to_utf8_string(unsigned char *str, size_t len)
+{
+	const utf8_char *u;
+	unsigned char *tmp, *ret;
+	size_t ulen, n;
+	short i;
+
+	/* worst case: 4 UTF-8 bytes per input byte, plus the terminator */
+	tmp = xmalloc((len * 4) + 1, "macroman_to_utf8 tmp");
+
+	ulen = 0;
+	for (n = 0; n < len; n++) {
+		u = macroman_to_utf8(str[n]);
+		if (u == NULL) {
+			tmp[ulen++] = str[n];
+			continue;
+		}
+
+		/* never copy more than the 4 bytes budgeted above */
+		for (i = 0; i < 4 && (*u)[i] != 0; i++)
+			tmp[ulen++] = (*u)[i];
+	}
+	tmp[ulen] = '\0';
+
+	ret = (unsigned char *)xstrdup((char *)tmp,
+	    "macroman_to_utf8_string");
+	xfree(&tmp);
+
+	return ret;
+}
--- utf8.h Wed Sep 7 15:40:54 2022
+++ utf8.h Wed Sep 7 15:40:54 2022
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 joshua stein <jcs@jcs.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __UTF8_H__
+#define __UTF8_H__
+
+#include "util.h"
+
+/* Lowest and highest byte values UTF-8 uses outside 7-bit ASCII */
+#define UTF8_RANGE_START	((unsigned char)0x80)
+#define UTF8_RANGE_END		((unsigned char)0xf4)
+
+/* One UTF-8 sequence: up to 4 bytes, zero-filled through the 5th byte */
+typedef unsigned char utf8_char[5];
+
+/* Returns 0 if *utf8 is incomplete, '?' if unmapped, else the MacRoman byte */
+unsigned char utf8_to_macroman(utf8_char *utf8);
+/* Returns the UTF-8 sequence for MacRoman byte c, or NULL if there is none */
+const utf8_char * macroman_to_utf8(unsigned char c);
+/* Returns a newly copied, NUL-terminated UTF-8 version of str[0..len-1] */
+unsigned char * macroman_to_utf8_string(unsigned char *str, size_t len);
+
+#endif