jcs
/wikipedia
/amendments
/22
wikipedia: Add UTF-8 support, convert some characters to Mac OS Roman
This will take some trial and error: load articles containing characters
we don't yet handle, then write Mac OS Roman equivalents for them.
jcs made amendment 22 over 2 years ago
--- wikipedia.c Sun Sep 4 22:56:42 2022
+++ wikipedia.c Mon Sep 5 00:26:21 2022
@@ -123,6 +123,7 @@ wikipedia_request_process(struct wikipedia_request *wp
XML_IN_NORMALIZED
} xstate = 0;
bool dump = false;
+ unsigned char utf8[4] = { 0 };
get_char:
if (req->chunk_len == 0 || (req->chunk_off + 1 > req->chunk_len)) {
@@ -227,10 +228,11 @@ parse_char:
wpr->in_ref = true;
dump = true;
wpr->buf_len -= 4;
- } else if (last[-5] == '<' && last[-4] == '/' &&
+ } else if ((last[-5] == '<' && last[-4] == '/' &&
last[-3] == 'r' && last[-2] == 'e' && last[-1] == 'f' &&
- last[0] == '>') {
- /* </ref> */
+ last[0] == '>') ||
+ (last[-1] == '/' && last[0] == '>')) {
+ /* </ref> or <ref /> */
wpr->in_ref = false;
wpr->apostrophes = wpr->equals = wpr->brackets = wpr->curlys = 0;
wpr->buf_len = 0;
@@ -293,6 +295,59 @@ parse_char:
c = 0;
else
c = '\r';
+ } else if ((unsigned char)c >= 0x80 && (unsigned char)c < 0xf5) {
+ /* utf-8 */
+ if (utf8[0] == 0)
+ utf8[0] = c;
+ else if (utf8[1] == 0)
+ utf8[1] = c;
+ else if (utf8[2] == 0)
+ utf8[2] = c;
+ else if (utf8[3] == 0)
+ utf8[3] = c;
+ else {
+ /* bogus */
+ utf8[0] = 0;
+ c = 0;
+ }
+
+ if (c)
+ c = 0;
+ else
+ c = '?';
+
+ if (utf8[0] >= 0xc2 && utf8[0] <= 0xdf && utf8[1] != 0) {
+ /* 2 byte */
+ if (utf8[0] == 0xc3 && utf8[1] == 0xa9)
+ /* accent-e */
+ c = 'é'; // 0x8e;
+ else
+ c = '?';
+ utf8[0] = utf8[1] = utf8[2] = 0;
+ } else if (utf8[0] >= 0xe0 && utf8[0] <= 0xef && utf8[2] != 0) {
+ /* 3-byte */
+ if ((utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x9c) ||
+ (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x9d))
+ /* smart quote */
+ c = '"';
+ else if (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x93)
+ /* n-dash */
+ c = '–'; // 0xd0
+ else if (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x94)
+ /* m-dash */
+ c = '—'; // 0xd1
+ else if ((utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x98) ||
+ (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x99))
+ /* apos */
+ c = '\'';
+ else
+ c = '?';
+ utf8[0] = utf8[1] = utf8[2] = 0;
+ } else if (utf8[0] >= 0xf0 && utf8[0] <= 0xf4 && utf8[3] != 0) {
+ /* 4-byte */
+ c = '?';
+ utf8[0] = utf8[1] = utf8[2] = 0;
+ }
}
if (wpr->style != wpr->last_style || dump) {