AmendHub

Download

jcs

/

wikipedia

/

utf8.c

 

(View History)

jcs   *: Remove xmalloc comments, handle malloc failure Latest amendment: 42 on 2023-08-28

1 /*
2 * Copyright (c) 2022 joshua stein <jcs@jcs.org>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #include "utf8.h"
18
19 const struct utf8_macroman_pair {
20 unsigned char macroman;
21 utf8_char utf8;
22 } utf8_macroman_pairs[] = {
23 { 'x', { 0xc3, 0x97 } },
24 { 'è', { 0xc3, 0xa8 } },
25 { 'é', { 0xc3, 0xa9 } },
26 { 'ö', { 0xc3, 0xb6 } },
27 { 'ü', { 0xc3, 0xbc } },
28 { '"', { 0xe2, 0x80, 0x9c } },
29 { '–', { 0xe2, 0x80, 0x93 } },
30 { '—', { 0xe2, 0x80, 0x94 } },
31 { '\'', { 0xe2, 0x80, 0x98 } },
32 { '\'', { 0xe2, 0x80, 0x99 } },
33 { 0, 0 }
34 };
35
36 unsigned char
37 utf8_to_macroman(utf8_char *utf8)
38 {
39 short n;
40 short bytes = 0;
41 struct utf8_macroman_pair *p = NULL;
42
43 if ((*utf8)[0] >= 0xc2 && (*utf8)[0] <= 0xdf && (*utf8)[1] != 0)
44 bytes = 2;
45 else if ((*utf8)[0] >= 0xe0 && (*utf8)[0] <= 0xef && (*utf8)[2] != 0)
46 bytes = 3;
47 else if ((*utf8)[0] >= 0xf0 && (*utf8)[0] <= 0xf4 && (*utf8)[3] != 0)
48 bytes = 4;
49 else if ((*utf8)[0] != 0 && (*utf8)[1] != 0 && (*utf8)[2] != 0 &&
50 (*utf8)[3] != 0)
51 return '?';
52
53 if (bytes == 0)
54 return 0;
55
56 for (n = 0; ; n++) {
57 p = (struct utf8_macroman_pair *)&utf8_macroman_pairs[n];
58
59 if (p->macroman == 0)
60 break;
61
62 if (p->utf8[0] != (*utf8)[0])
63 continue;
64
65 if (p->utf8[1] != (*utf8)[1])
66 continue;
67
68 if (p->utf8[2] == 0 && (*utf8)[2] == 0)
69 return p->macroman;
70
71 if (p->utf8[2] != (*utf8)[2])
72 continue;
73
74 if (p->utf8[3] == 0 && (*utf8)[3] == 0)
75 return p->macroman;
76
77 if (p->utf8[3] != (*utf8)[3])
78 continue;
79
80 if (p->utf8[4] == (*utf8)[4])
81 return p->macroman;
82 }
83
84 return '?';
85 }
86
87 const utf8_char *
88 macroman_to_utf8(unsigned char c)
89 {
90 short n;
91 struct utf8_macroman_pair *p = NULL;
92
93 for (n = 0; ; n++) {
94 p = (struct utf8_macroman_pair *)&utf8_macroman_pairs[n];
95
96 if (p->macroman == 0)
97 break;
98
99 if (p->macroman == c)
100 return (const utf8_char *)&p->utf8;
101 }
102
103 return NULL;
104 }
105
106 unsigned char *
107 macroman_to_utf8_string(unsigned char *str, size_t len)
108 {
109 const utf8_char *utf8;
110 unsigned char *tmp, *ret, c;
111 size_t ulen, n;
112 const utf8_char *u;
113
114 tmp = xmalloc((len * 4) + 1);
115 if (tmp == NULL) {
116 warn("utf8: Failed allocating (%ld * 4) + 1", len);
117 return NULL;
118 }
119
120 ulen = 0;
121 for (n = 0; n < len; n++) {
122 c = str[n];
123 u = macroman_to_utf8(c);
124
125 if (u == NULL) {
126 tmp[ulen++] = c;
127 continue;
128 }
129
130 tmp[ulen++] = (*u)[0];
131
132 if ((*u)[1] == 0)
133 continue;
134 tmp[ulen++] = (*u)[1];
135
136 if ((*u)[2] == 0)
137 continue;
138 tmp[ulen++] = (*u)[2];
139
140 if ((*u)[3] == 0)
141 continue;
142 tmp[ulen++] = (*u)[3];
143
144 if ((*u)[4] == 0)
145 continue;
146 tmp[ulen++] = (*u)[4];
147 }
148 tmp[ulen] = '\0';
149
150 ret = (unsigned char *)xstrdup((char *)tmp);
151 xfree(&tmp);
152
153 if (ret == NULL)
154 warn("utf8: Out of memory!");
155
156 return ret;
157 }