utf8.c in jcs/wallops

utf8.c

(View History)

jcs utf8: Import from Carl

Latest amendment: 112 on 2024-09-17

1	/*
2	* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
3	* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
4	*
5	* Copyright (c) 2022 joshua stein <jcs@jcs.org>
6	*
7	* Permission is hereby granted, free of charge, to any person obtaining
8	* a copy of this software and associated documentation files (the
9	* "Software"), to deal in the Software without restriction, including
10	* without limitation the rights to use, copy, modify, merge, publish,
11	* distribute, sublicense, and/or sell copies of the Software, and to
12	* permit persons to whom the Software is furnished to do so, subject to
13	* the following conditions:
14	*
15	* The above copyright notice and this permission notice shall be
16	* included in all copies or substantial portions of the Software.
17	*
18	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21	* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
22	* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
23	* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
24	* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25	* SOFTWARE.
26	*/
27
28	#include "utf8.h"
29
30	#define ONLY_TO_MACROMAN 1
31	#define ONLY_TO_UTF8 2
32
33	static const struct utf8_macroman_pair {
34	utf8_codepoint utf8;
35	unsigned char macroman;
36	unsigned char type;
37	} utf8_macroman_pairs[] = {
38	{ 0x000000a0, ' ', ONLY_TO_MACROMAN }, /* supposed to be non-breaking */
39	{ 0x00002019, '\'', ONLY_TO_MACROMAN },
40	{ 0x00002026, '…', ONLY_TO_MACROMAN },
41	{ 0x0000c2a0, ' ', ONLY_TO_MACROMAN },
42	{ 0x00efbfbd, UTF8_UNKNOWN_MACROMAN, ONLY_TO_UTF8 }, /* unknown utf-8 char */
43
44	{ 0x0000c2a1, '¡' },
45	{ 0x0000c2a2, '¢' },
46	{ 0x0000c2a3, '£' },
47	{ 0x0000c2a5, '¥' },
48	{ 0x0000c2a7, '§' },
49	{ 0x0000c2a8, '¨' },
50	{ 0x0000c2a9, '©' },
51	{ 0x0000c2aa, 'ª' },
52	{ 0x0000c2ab, '«' },
53	{ 0x0000c2ac, '¬' },
54	{ 0x0000c2ae, '®' },
55	{ 0x0000c2b0, '°' },
56	{ 0x0000c2b1, '±' },
57	{ 0x0000c2b4, '´' },
58	{ 0x0000c2b5, 'µ' },
59	{ 0x0000c2b6, '¶' },
60	{ 0x0000c2ba, 'º' },
61	{ 0x0000c2bb, '»' },
62	{ 0x0000c2bf, '¿' },
63	{ 0x0000c380, 'À' },
64	{ 0x0000c383, 'Ã' },
65	{ 0x0000c384, 'Ä' },
66	{ 0x0000c385, 'Å' },
67	{ 0x0000c386, 'Æ' },
68	{ 0x0000c387, 'Ç' },
69	{ 0x0000c389, 'É' },
70	{ 0x0000c391, 'Ñ' },
71	{ 0x0000c395, 'Õ' },
72	{ 0x0000c396, 'Ö' },
73	{ 0x0000c398, 'Ø' },
74	{ 0x0000c39c, 'Ü' },
75	{ 0x0000c39f, 'ß' },
76	{ 0x0000c3a0, 'à' },
77	{ 0x0000c3a1, 'á' },
78	{ 0x0000c3a2, 'â' },
79	{ 0x0000c3a3, 'ã' },
80	{ 0x0000c3a4, 'ä' },
81	{ 0x0000c3a5, 'å' },
82	{ 0x0000c3a6, 'æ' },
83	{ 0x0000c3a7, 'ç' },
84	{ 0x0000c3a8, 'è' },
85	{ 0x0000c3a9, 'é' },
86	{ 0x0000c3aa, 'ê' },
87	{ 0x0000c3ab, 'ë' },
88	{ 0x0000c3ac, 'ì' },
89	{ 0x0000c3ad, 'í' },
90	{ 0x0000c3ae, 'î' },
91	{ 0x0000c3af, 'ï' },
92	{ 0x0000c3b1, 'ñ' },
93	{ 0x0000c3b2, 'ò' },
94	{ 0x0000c3b3, 'ó' },
95	{ 0x0000c3b4, 'ô' },
96	{ 0x0000c3b5, 'õ' },
97	{ 0x0000c3b6, 'ö' },
98	{ 0x0000c3b7, '÷' },
99	{ 0x0000c3b8, 'ø' },
100	{ 0x0000c3b9, 'ù' },
101	{ 0x0000c3ba, 'ú' },
102	{ 0x0000c3bb, 'û' },
103	{ 0x0000c3bc, 'ü' },
104	{ 0x0000c3bf, 'ÿ' },
105	{ 0x0000c592, 'Œ' },
106	{ 0x0000c593, 'œ' },
107	{ 0x0000c5b8, 'Ÿ' },
108	{ 0x0000c692, 'ƒ' },
109	{ 0x0000cf80, 'π' },
110	{ 0x00e28093, '–' },
111	{ 0x00e28094, '—' },
112	{ 0x00e28098, '‘' },
113	{ 0x00e28099, '’' },
114	{ 0x00e2809c, '“' },
115	{ 0x00e2809d, '”' },
116	{ 0x00e280a0, '†' },
117	{ 0x00e280a2, '•' },
118	{ 0x00e280a6, '…' },
119	{ 0x00e284a2, '™' },
120	{ 0x00e284a6, 'Ω' },
121	{ 0x00e28882, '∂' },
122	{ 0x00e28886, '∆' },
123	{ 0x00e2888f, '∏' },
124	{ 0x00e28891, '∑' },
125	{ 0x00e2889a, '√' },
126	{ 0x00e2889e, '∞' },
127	{ 0x00e288ab, '∫' },
128	{ 0x00e28988, '≈' },
129	{ 0x00e289a0, '≠' },
130	{ 0x00e289a4, '≤' },
131	{ 0x00e289a5, '≥' },
132	};
133
/*
 * Lookup data for the UTF-8 decoding state machine used by utf8_decode().
 *
 * Entries 0..255 give the character class of each possible input byte.
 * Entries 256..399 hold the transitions, sixteen per state: the next state
 * for (state, class) is utf8d[256 + state * 16 + class].
 */
static const unsigned char utf8d[] = {
	/* byte classes, 00..0f through 70..7f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 80..8f */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 90..9f */
	9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
	/* a0..bf */
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	/* c0..df */
	8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* e0..ef */
	10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
	/* f0..ff */
	11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

	/* transitions out of state 0 */
	0, 1, 2, 3, 5, 8, 7, 1, 1, 1, 4, 6, 1, 1, 1, 1,
	/* state 1 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* state 2 */
	1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
	/* state 3 */
	1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1,
	/* state 4 */
	1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
	/* state 5 */
	1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
	/* state 6 */
	1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,
	/* state 7 */
	1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,
	/* state 8 */
	1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
150
/*
 * ISO 8859-1 to MacRoman, for the upper half of the byte range only: the
 * entry for ISO 8859-1 byte c (c >= 128) is at index c - 128.  Slots
 * holding '?' are presumably characters with no MacRoman counterpart.
 */
static const unsigned char iso88591_macroman_pairs[] = {
	/* 0x80 - 0x8f */
	'?', '?', '?', '?', '?', '?', '?', '?',
	'?', '?', '?', '?', '?', '?', '?', '?',
	/* 0x90 - 0x9f */
	'?', '?', '?', '?', '?', '?', '?', '?',
	'?', '?', '?', '?', '?', '?', '?', '?',
	/* 0xa0 - 0xaf */
	0xCA, 0xC1, 0xA2, 0xA3, 0xDB, 0xB4, '?', 0xA4,
	0xAC, 0xA9, 0xBB, 0xC7, 0xC2, '?', 0xA8, 0xF8,
	/* 0xb0 - 0xbf */
	0xA1, 0xB1, '?', '?', 0xAB, 0xB5, 0xA6, 0xE1,
	0xFC, '?', 0xBC, 0xC8, '?', '?', '?', 0xC0,
	/* 0xc0 - 0xcf */
	0xCB, 0xE7, 0xE5, 0xCC, 0x80, 0x81, 0xAE, 0x82,
	0xE9, 0x83, 0xE6, 0xE8, 0xED, 0xEA, 0xEB, 0xEC,
	/* 0xd0 - 0xdf */
	'?', 0x84, 0xF1, 0xEE, 0xEF, 0xCD, 0x85, '?',
	0xAF, 0xF4, 0xF2, 0xF3, 0x86, '?', '?', 0xA7,
	/* 0xe0 - 0xef */
	0x88, 0x87, 0x89, 0x8B, 0x8A, 0x8C, 0xBE, 0x8D,
	0x8F, 0x8E, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95,
	/* 0xf0 - 0xff */
	'?', 0x96, 0x98, 0x97, 0x99, 0x9B, 0x9A, 0xD6,
	0xBF, 0x9D, 0x9C, 0x9E, 0x9F, '?', '?', 0xD8,
};
170
171	utf8_codepoint
172	utf8_decode(utf8_codepoint state, utf8_codepoint codep,
173	utf8_codepoint byte)
174	{
175	utf8_codepoint type = utf8d[byte];
176
177	codep = (state != UTF8_ACCEPT) ? (byte & 0x3fu) \| (*codep << 6) :
178	(0xff >> type) & (byte);
179
180	state = utf8d[256 + (state * 16) + type];
181	return *state;
182	}
183
184	size_t
185	utf8_to_macroman_string(char str, size_t len, char ret)
186	{
187	struct utf8_macroman_pair *pair;
188	utf8_codepoint codepoint, utf8_bytes;
189	utf8_codepoint state = UTF8_ACCEPT;
190	size_t retlen, n, j;
191	short bytes = 0;
192	unsigned char ustr = (unsigned char )str;
193	bool found_macroman;
194
195	retlen = 0;
196	for (n = 0; n < len; n++) {
197	bytes++;
198
199	if (utf8_decode(&state, &codepoint, ustr[n]) == 0) {
200	if (bytes > 1) {
201	utf8_bytes = 0;
202	if (bytes == 4)
203	utf8_bytes \|= ((unsigned long)(ustr[n - 3]) << 24);
204	if (bytes >= 3)
205	utf8_bytes \|= ((unsigned long)(ustr[n - 2]) << 16);
206	if (bytes >= 2)
207	utf8_bytes \|= ((unsigned long)(ustr[n - 1]) << 8);
208	if (bytes >= 1)
209	utf8_bytes \|= ustr[n];
210
211	found_macroman = false;
212	for (j = 0; j < nitems(utf8_macroman_pairs); j++) {
213	pair = (struct utf8_macroman_pair *)&utf8_macroman_pairs[j];
214	if (utf8_bytes == pair->utf8 &&
215	pair->type != ONLY_TO_UTF8) {
216	codepoint = pair->macroman;
217	found_macroman = true;
218	break;
219	}
220	}
221
222	if (!found_macroman)
223	codepoint = UTF8_UNKNOWN_MACROMAN;
224	}
225
226	ret[retlen++] = codepoint;
227	bytes = 0;
228	state = UTF8_ACCEPT;
229	}
230	}
231
232	ret[retlen] = '\0';
233	return retlen;
234	}
235
236	size_t
237	macroman_to_utf8_string(char str, size_t len, char *retp)
238	{
239	struct utf8_macroman_pair *pair;
240	size_t retlen, n, j;
241	unsigned char ustr = (unsigned char )str;
242	unsigned char c;
243	bool found_utf8;
244
245	retlen = 0;
246	for (n = 0; n < len; n++) {
247	c = ustr[n];
248
249	if (c >= 128)
250	retlen += 4;
251	else
252	retlen++;
253	}
254
255	*retp = xmalloc(retlen + 1);
256	if (*retp == NULL)
257	return 0;
258
259	retlen = 0;
260	for (n = 0; n < len; n++) {
261	c = ustr[n];
262
263	if (c < 128) {
264	(*retp)[retlen++] = c;
265	continue;
266	}
267
268	found_utf8 = false;
269	for (j = 0; j < nitems(utf8_macroman_pairs); j++) {
270	pair = (struct utf8_macroman_pair *)&utf8_macroman_pairs[j];
271	if (pair->type == ONLY_TO_MACROMAN)
272	continue;
273	if (c == pair->macroman) {
274	found_utf8 = true;
275	if (pair->utf8 >= 0x01000000)
276	(*retp)[retlen++] = (pair->utf8 >> 24);
277	if (pair->utf8 >= 0x00010000)
278	(*retp)[retlen++] = (pair->utf8 >> 16) & 0xff;
279	if (pair->utf8 >= 0x00000100)
280	(*retp)[retlen++] = (pair->utf8 >> 8) & 0xff;
281	if (pair->utf8 >= 0x00000001)
282	(*retp)[retlen++] = pair->utf8 & 0xff;
283	break;
284	}
285	}
286
287	if (!found_utf8)
288	(*retp)[retlen++] = c;
289	}
290
291	(*retp)[retlen] = '\0';
292	return retlen;
293	}
294
295	size_t
296	iso88591_to_macroman_string(char str, size_t len, char *retp)
297	{
298	size_t mlen, retlen, n, j;
299	unsigned char ustr = (unsigned char )str;
300	unsigned char c;
301
302	*retp = xmalloc(len + 1);
303	if (*retp == NULL)
304	return 0;
305
306	retlen = 0;
307	for (n = 0; n < len; n++) {
308	c = ustr[n];
309
310	if (c >= 128)
311	(*retp)[retlen++] = iso88591_macroman_pairs[c - 128];
312	else
313	(*retp)[retlen++] = c;
314	}
315
316	(*retp)[retlen] = '\0';
317	return retlen;
318	}

AmendHub

Download

jcs

wallops

utf8.c

(View History)