utf8.c in jcs/wikipedia

utf8.c

(View History)

jcs *: Remove xmalloc comments, handle malloc failure

Latest amendment: 42 on 2023-08-28

1	/*
2	* Copyright (c) 2022 joshua stein <jcs@jcs.org>
3	*
4	* Permission to use, copy, modify, and distribute this software for any
5	* purpose with or without fee is hereby granted, provided that the above
6	* copyright notice and this permission notice appear in all copies.
7	*
8	* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9	* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10	* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11	* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14	* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15	*/
16
17	#include "utf8.h"
18
19	const struct utf8_macroman_pair {
20	unsigned char macroman;
21	utf8_char utf8;
22	} utf8_macroman_pairs[] = {
23	{ 'x', { 0xc3, 0x97 } },
24	{ 'è', { 0xc3, 0xa8 } },
25	{ 'é', { 0xc3, 0xa9 } },
26	{ 'ö', { 0xc3, 0xb6 } },
27	{ 'ü', { 0xc3, 0xbc } },
28	{ '"', { 0xe2, 0x80, 0x9c } },
29	{ '–', { 0xe2, 0x80, 0x93 } },
30	{ '—', { 0xe2, 0x80, 0x94 } },
31	{ '\'', { 0xe2, 0x80, 0x98 } },
32	{ '\'', { 0xe2, 0x80, 0x99 } },
33	{ 0, 0 }
34	};
35
36	unsigned char
37	utf8_to_macroman(utf8_char *utf8)
38	{
39	short n;
40	short bytes = 0;
41	struct utf8_macroman_pair *p = NULL;
42
43	if ((utf8)[0] >= 0xc2 && (utf8)[0] <= 0xdf && (*utf8)[1] != 0)
44	bytes = 2;
45	else if ((utf8)[0] >= 0xe0 && (utf8)[0] <= 0xef && (*utf8)[2] != 0)
46	bytes = 3;
47	else if ((utf8)[0] >= 0xf0 && (utf8)[0] <= 0xf4 && (*utf8)[3] != 0)
48	bytes = 4;
49	else if ((utf8)[0] != 0 && (utf8)[1] != 0 && (*utf8)[2] != 0 &&
50	(*utf8)[3] != 0)
51	return '?';
52
53	if (bytes == 0)
54	return 0;
55
56	for (n = 0; ; n++) {
57	p = (struct utf8_macroman_pair *)&utf8_macroman_pairs[n];
58
59	if (p->macroman == 0)
60	break;
61
62	if (p->utf8[0] != (*utf8)[0])
63	continue;
64
65	if (p->utf8[1] != (*utf8)[1])
66	continue;
67
68	if (p->utf8[2] == 0 && (*utf8)[2] == 0)
69	return p->macroman;
70
71	if (p->utf8[2] != (*utf8)[2])
72	continue;
73
74	if (p->utf8[3] == 0 && (*utf8)[3] == 0)
75	return p->macroman;
76
77	if (p->utf8[3] != (*utf8)[3])
78	continue;
79
80	if (p->utf8[4] == (*utf8)[4])
81	return p->macroman;
82	}
83
84	return '?';
85	}
86
87	const utf8_char *
88	macroman_to_utf8(unsigned char c)
89	{
90	short n;
91	struct utf8_macroman_pair *p = NULL;
92
93	for (n = 0; ; n++) {
94	p = (struct utf8_macroman_pair *)&utf8_macroman_pairs[n];
95
96	if (p->macroman == 0)
97	break;
98
99	if (p->macroman == c)
100	return (const utf8_char *)&p->utf8;
101	}
102
103	return NULL;
104	}
105
106	unsigned char *
107	macroman_to_utf8_string(unsigned char *str, size_t len)
108	{
109	const utf8_char *utf8;
110	unsigned char tmp, ret, c;
111	size_t ulen, n;
112	const utf8_char *u;
113
114	tmp = xmalloc((len * 4) + 1);
115	if (tmp == NULL) {
116	warn("utf8: Failed allocating (%ld * 4) + 1", len);
117	return NULL;
118	}
119
120	ulen = 0;
121	for (n = 0; n < len; n++) {
122	c = str[n];
123	u = macroman_to_utf8(c);
124
125	if (u == NULL) {
126	tmp[ulen++] = c;
127	continue;
128	}
129
130	tmp[ulen++] = (*u)[0];
131
132	if ((*u)[1] == 0)
133	continue;
134	tmp[ulen++] = (*u)[1];
135
136	if ((*u)[2] == 0)
137	continue;
138	tmp[ulen++] = (*u)[2];
139
140	if ((*u)[3] == 0)
141	continue;
142	tmp[ulen++] = (*u)[3];
143
144	if ((*u)[4] == 0)
145	continue;
146	tmp[ulen++] = (*u)[4];
147	}
148	tmp[ulen] = '\0';
149
150	ret = (unsigned char )xstrdup((char )tmp);
151	xfree(&tmp);
152
153	if (ret == NULL)
154	warn("utf8: Out of memory!");
155
156	return ret;
157	}

AmendHub

Download

jcs

wikipedia

utf8.c

(View History)