AmendHub

Download:

jcs / wikipedia / amendments / 22

wikipedia: Add UTF-8 support, convert some to Mac OS Roman

This will take some trial and error: load articles containing characters
we don't yet understand, then write Mac OS Roman equivalents for them.

jcs made amendment 22 about 1 year ago
--- wikipedia.c Sun Sep 4 22:56:42 2022 +++ wikipedia.c Mon Sep 5 00:26:21 2022 @@ -123,6 +123,7 @@ wikipedia_request_process(struct wikipedia_request *wp XML_IN_NORMALIZED } xstate = 0; bool dump = false; + unsigned char utf8[4] = { 0 }; get_char: if (req->chunk_len == 0 || (req->chunk_off + 1 > req->chunk_len)) { @@ -227,10 +228,11 @@ parse_char: wpr->in_ref = true; dump = true; wpr->buf_len -= 4; - } else if (last[-5] == '<' && last[-4] == '/' && + } else if ((last[-5] == '<' && last[-4] == '/' && last[-3] == 'r' && last[-2] == 'e' && last[-1] == 'f' && - last[0] == '>') { - /* </ref> */ + last[0] == '>') || + (last[-1] == '/' && last[0] == '>')) { + /* </ref> or <ref /> */ wpr->in_ref = false; wpr->apostrophes = wpr->equals = wpr->brackets = wpr->curlys = 0; wpr->buf_len = 0; @@ -293,6 +295,59 @@ parse_char: c = 0; else c = '\r'; + } else if ((unsigned char)c >= 0x80 && (unsigned char)c < 0xf5) { + /* utf-8 */ + if (utf8[0] == 0) + utf8[0] = c; + else if (utf8[1] == 0) + utf8[1] = c; + else if (utf8[2] == 0) + utf8[2] = c; + else if (utf8[3] == 0) + utf8[3] = c; + else { + /* bogus */ + utf8[0] = 0; + c = 0; + } + + if (c) + c = 0; + else + c = '?'; + + if (utf8[0] >= 0xc2 && utf8[0] <= 0xdf && utf8[1] != 0) { + /* 2 byte */ + if (utf8[0] == 0xc3 && utf8[1] == 0xa9) + /* accent-e */ + c = 'é'; // 0x8e; + else + c = '?'; + utf8[0] = utf8[1] = utf8[2] = 0; + } else if (utf8[0] >= 0xe0 && utf8[0] <= 0xef && utf8[2] != 0) { + /* 3-byte */ + if ((utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x9c) || + (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x9d)) + /* smart quote */ + c = '"'; + else if (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x93) + /* n-dash */ + c = '–'; // 0xd0 + else if (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x94) + /* m-dash */ + c = '—'; // 0xd1 + else if ((utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x98) || + (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x99)) + /* apos */ + c = '\''; + else + c = '?'; + utf8[0] 
= utf8[1] = utf8[2] = 0; + } else if (utf8[0] >= 0xf0 && utf8[0] <= 0xf4 && utf8[3] != 0) { + /* 4-byte */ + c = '?'; + utf8[0] = utf8[1] = utf8[2] = 0; + } } if (wpr->style != wpr->last_style || dump) {