Download
jcs
/wikipedia
/utf8.c
(View History)
jcs *: Remove xmalloc comments, handle malloc failure | Latest amendment: 42 on 2023-08-28 |
1 | /* |
2 | * Copyright (c) 2022 joshua stein <jcs@jcs.org> |
3 | * |
4 | * Permission to use, copy, modify, and distribute this software for any |
5 | * purpose with or without fee is hereby granted, provided that the above |
6 | * copyright notice and this permission notice appear in all copies. |
7 | * |
8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
15 | */ |
16 | |
17 | #include "utf8.h" |
18 | |
19 | const struct utf8_macroman_pair { |
20 | unsigned char macroman; |
21 | utf8_char utf8; |
22 | } utf8_macroman_pairs[] = { |
23 | { 'x', { 0xc3, 0x97 } }, |
24 | { 'è', { 0xc3, 0xa8 } }, |
25 | { 'é', { 0xc3, 0xa9 } }, |
26 | { 'ö', { 0xc3, 0xb6 } }, |
27 | { 'ü', { 0xc3, 0xbc } }, |
28 | { '"', { 0xe2, 0x80, 0x9c } }, |
29 | { '–', { 0xe2, 0x80, 0x93 } }, |
30 | { '—', { 0xe2, 0x80, 0x94 } }, |
31 | { '\'', { 0xe2, 0x80, 0x98 } }, |
32 | { '\'', { 0xe2, 0x80, 0x99 } }, |
33 | { 0, 0 } |
34 | }; |
35 | |
36 | unsigned char |
37 | utf8_to_macroman(utf8_char *utf8) |
38 | { |
39 | short n; |
40 | short bytes = 0; |
41 | struct utf8_macroman_pair *p = NULL; |
42 | |
43 | if ((*utf8)[0] >= 0xc2 && (*utf8)[0] <= 0xdf && (*utf8)[1] != 0) |
44 | bytes = 2; |
45 | else if ((*utf8)[0] >= 0xe0 && (*utf8)[0] <= 0xef && (*utf8)[2] != 0) |
46 | bytes = 3; |
47 | else if ((*utf8)[0] >= 0xf0 && (*utf8)[0] <= 0xf4 && (*utf8)[3] != 0) |
48 | bytes = 4; |
49 | else if ((*utf8)[0] != 0 && (*utf8)[1] != 0 && (*utf8)[2] != 0 && |
50 | (*utf8)[3] != 0) |
51 | return '?'; |
52 | |
53 | if (bytes == 0) |
54 | return 0; |
55 | |
56 | for (n = 0; ; n++) { |
57 | p = (struct utf8_macroman_pair *)&utf8_macroman_pairs[n]; |
58 | |
59 | if (p->macroman == 0) |
60 | break; |
61 | |
62 | if (p->utf8[0] != (*utf8)[0]) |
63 | continue; |
64 | |
65 | if (p->utf8[1] != (*utf8)[1]) |
66 | continue; |
67 | |
68 | if (p->utf8[2] == 0 && (*utf8)[2] == 0) |
69 | return p->macroman; |
70 | |
71 | if (p->utf8[2] != (*utf8)[2]) |
72 | continue; |
73 | |
74 | if (p->utf8[3] == 0 && (*utf8)[3] == 0) |
75 | return p->macroman; |
76 | |
77 | if (p->utf8[3] != (*utf8)[3]) |
78 | continue; |
79 | |
80 | if (p->utf8[4] == (*utf8)[4]) |
81 | return p->macroman; |
82 | } |
83 | |
84 | return '?'; |
85 | } |
86 | |
87 | const utf8_char * |
88 | macroman_to_utf8(unsigned char c) |
89 | { |
90 | short n; |
91 | struct utf8_macroman_pair *p = NULL; |
92 | |
93 | for (n = 0; ; n++) { |
94 | p = (struct utf8_macroman_pair *)&utf8_macroman_pairs[n]; |
95 | |
96 | if (p->macroman == 0) |
97 | break; |
98 | |
99 | if (p->macroman == c) |
100 | return (const utf8_char *)&p->utf8; |
101 | } |
102 | |
103 | return NULL; |
104 | } |
105 | |
106 | unsigned char * |
107 | macroman_to_utf8_string(unsigned char *str, size_t len) |
108 | { |
109 | const utf8_char *utf8; |
110 | unsigned char *tmp, *ret, c; |
111 | size_t ulen, n; |
112 | const utf8_char *u; |
113 | |
114 | tmp = xmalloc((len * 4) + 1); |
115 | if (tmp == NULL) { |
116 | warn("utf8: Failed allocating (%ld * 4) + 1", len); |
117 | return NULL; |
118 | } |
119 | |
120 | ulen = 0; |
121 | for (n = 0; n < len; n++) { |
122 | c = str[n]; |
123 | u = macroman_to_utf8(c); |
124 | |
125 | if (u == NULL) { |
126 | tmp[ulen++] = c; |
127 | continue; |
128 | } |
129 | |
130 | tmp[ulen++] = (*u)[0]; |
131 | |
132 | if ((*u)[1] == 0) |
133 | continue; |
134 | tmp[ulen++] = (*u)[1]; |
135 | |
136 | if ((*u)[2] == 0) |
137 | continue; |
138 | tmp[ulen++] = (*u)[2]; |
139 | |
140 | if ((*u)[3] == 0) |
141 | continue; |
142 | tmp[ulen++] = (*u)[3]; |
143 | |
144 | if ((*u)[4] == 0) |
145 | continue; |
146 | tmp[ulen++] = (*u)[4]; |
147 | } |
148 | tmp[ulen] = '\0'; |
149 | |
150 | ret = (unsigned char *)xstrdup((char *)tmp); |
151 | xfree(&tmp); |
152 | |
153 | if (ret == NULL) |
154 | warn("utf8: Out of memory!"); |
155 | |
156 | return ret; |
157 | } |