AmendHub

Download:

jcs / wikipedia / amendments / 23

wikipedia: Restructure to handle styles touching each other

''[[...]]'' was not being parsed properly when the input moved directly
from the two apostrophes (italics) into a link.

jcs made amendment 23 about 1 year ago
--- wikipedia.c Mon Sep 5 00:26:21 2022 +++ wikipedia.c Mon Sep 5 14:32:30 2022 @@ -207,190 +207,209 @@ get_char: parse_char: if (c == '{') { wpr->curlys++; + goto advance; } else if (c == '}') { wpr->curlys--; + goto advance; } else if (wpr->curlys > 0) { /* consume, obey */ + goto advance; } else if (c == '[') { wpr->brackets++; + goto advance; } else if (c == ']') { wpr->brackets--; + goto advance; } else if (c == '\'') { wpr->apostrophes++; + goto advance; } else if (c == '=') { wpr->equals++; - } else { - last = wpr->buf + wpr->buf_len - 1; - - if (last[-3] == '<' && last[-2] == 'r' && last[-1] == 'e' && - last[0] == 'f') { - /* <ref */ - wpr->in_ref = true; - dump = true; - wpr->buf_len -= 4; - } else if ((last[-5] == '<' && last[-4] == '/' && - last[-3] == 'r' && last[-2] == 'e' && last[-1] == 'f' && - last[0] == '>') || - (last[-1] == '/' && last[0] == '>')) { - /* </ref> or <ref /> */ - wpr->in_ref = false; - wpr->apostrophes = wpr->equals = wpr->brackets = wpr->curlys = 0; - wpr->buf_len = 0; - } else if (wpr->in_ref) { - /* consume, obey */ - } else if (wpr->apostrophes == 3) { + goto advance; + } + + last = wpr->buf + wpr->buf_len - 1; + + if (last[-3] == '<' && last[-2] == 'r' && last[-1] == 'e' && + last[0] == 'f') { + /* <ref */ + wpr->in_ref = true; + dump = true; + wpr->buf_len -= 4; + } else if ((last[-5] == '<' && last[-4] == '/' && last[-3] == 'r' && + last[-2] == 'e' && last[-1] == 'f' && last[0] == '>') || + (last[-1] == '/' && last[0] == '>')) { + /* </ref> or <ref /> */ + wpr->in_ref = false; + wpr->apostrophes = wpr->equals = wpr->brackets = wpr->curlys = 0; + wpr->buf_len = 0; + } else if (wpr->in_ref) { + goto consume; + } + + if (wpr->apostrophes) { + if (wpr->apostrophes == 3) { if (wpr->style & STYLE_BOLD) wpr->style &= ~(STYLE_BOLD); else wpr->style |= STYLE_BOLD; - wpr->apostrophes = 0; } else if (wpr->apostrophes == 2) { if (wpr->style & STYLE_ITALIC) wpr->style &= ~(STYLE_ITALIC); else wpr->style |= STYLE_ITALIC; - wpr->apostrophes 
= 0; } else if (wpr->apostrophes == 1) { /* literal apostrophe, add and go back to handle c */ wpr->apostrophes = 0; wpr->buf[wpr->buf_len++] = '\''; goto parse_char; - } else if (wpr->equals) { - if (wpr->equals == 5) { - if (wpr->style & STYLE_H5) - wpr->style &= ~(STYLE_H5); - else - wpr->style |= STYLE_H5; - } else if (wpr->equals == 4) { - if (wpr->style & STYLE_H4) - wpr->style &= ~(STYLE_H4); - else - wpr->style |= STYLE_H4; - } else if (wpr->equals == 3) { - if (wpr->style & STYLE_H3) - wpr->style &= ~(STYLE_H3); - else - wpr->style |= STYLE_H3; - } else if (wpr->equals == 2) { - if (wpr->style & STYLE_H2) - wpr->style &= ~(STYLE_H2); - else - wpr->style |= STYLE_H2; - } else { - /* literal equals, add and go back to handle c */ - wpr->equals = 0; - wpr->buf[wpr->buf_len++] = '='; - goto parse_char; - } + } + wpr->apostrophes = 0; + } + + if (wpr->equals) { + if (wpr->equals == 5) { + if (wpr->style & STYLE_H5) + wpr->style &= ~(STYLE_H5); + else + wpr->style |= STYLE_H5; + } else if (wpr->equals == 4) { + if (wpr->style & STYLE_H4) + wpr->style &= ~(STYLE_H4); + else + wpr->style |= STYLE_H4; + } else if (wpr->equals == 3) { + if (wpr->style & STYLE_H3) + wpr->style &= ~(STYLE_H3); + else + wpr->style |= STYLE_H3; + } else if (wpr->equals == 2) { + if (wpr->style & STYLE_H2) + wpr->style &= ~(STYLE_H2); + else + wpr->style |= STYLE_H2; + } else { + /* literal equals, add and go back to handle c */ wpr->equals = 0; - } else if (wpr->brackets == 2) { - wpr->style |= STYLE_LINK; - } else if (wpr->brackets == 0 && (wpr->style & STYLE_LINK)) { - wpr->style &= ~(STYLE_LINK); + wpr->buf[wpr->buf_len++] = '='; + goto parse_char; } + wpr->equals = 0; + } + + if (wpr->brackets == 2) + wpr->style |= STYLE_LINK; + else if (wpr->brackets == 0 && (wpr->style & STYLE_LINK)) + wpr->style &= ~(STYLE_LINK); - if (c == '\n') { - /* skip leading newlines and only allow 2 in a row */ - if (wpr->article_len == 0 || (last[0] == '\r' && last[-1] == '\r')) - c = 0; + if (c == 
'\n') { + /* skip leading newlines and only allow 2 in a row */ + if (wpr->article_len == 0 || (last[0] == '\r' && last[-1] == '\r')) + c = 0; + else + c = '\r'; + } else if ((unsigned char)c >= 0x80 && (unsigned char)c < 0xf5) { + /* utf-8 */ + if (utf8[0] == 0) + utf8[0] = c; + else if (utf8[1] == 0) + utf8[1] = c; + else if (utf8[2] == 0) + utf8[2] = c; + else if (utf8[3] == 0) + utf8[3] = c; + else { + /* bogus */ + utf8[0] = 0; + c = 0; + } + + if (c) + c = 0; + else + c = '?'; + + if (utf8[0] >= 0xc2 && utf8[0] <= 0xdf && utf8[1] != 0) { + /* 2 byte */ + if (utf8[0] == 0xc3 && utf8[1] == 0x97) + c = 'x'; + else if (utf8[0] == 0xc3 && utf8[1] == 0xa9) + /* e accent */ + c = 'é'; // 0x8e + else if (utf8[0] == 0xc3 && utf8[1] == 0xb6) + /* o umlat */ + c = 'ö'; // 0x9a else - c = '\r'; - } else if ((unsigned char)c >= 0x80 && (unsigned char)c < 0xf5) { - /* utf-8 */ - if (utf8[0] == 0) - utf8[0] = c; - else if (utf8[1] == 0) - utf8[1] = c; - else if (utf8[2] == 0) - utf8[2] = c; - else if (utf8[3] == 0) - utf8[3] = c; - else { - /* bogus */ - utf8[0] = 0; - c = 0; - } - - if (c) - c = 0; + c = '?'; + utf8[0] = utf8[1] = utf8[2] = 0; + } else if (utf8[0] >= 0xe0 && utf8[0] <= 0xef && utf8[2] != 0) { + /* 3-byte */ + if ((utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x9c) || + (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x9d)) + /* smart quote */ + c = '"'; + else if (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x93) + /* n-dash */ + c = '–'; // 0xd0 + else if (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x94) + /* m-dash */ + c = '—'; // 0xd1 + else if ((utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x98) || + (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x99)) + /* apos */ + c = '\''; else c = '?'; - - if (utf8[0] >= 0xc2 && utf8[0] <= 0xdf && utf8[1] != 0) { - /* 2 byte */ - if (utf8[0] == 0xc3 && utf8[1] == 0xa9) - /* accent-e */ - c = 'é'; // 0x8e; - else - c = '?'; - utf8[0] = utf8[1] = utf8[2] = 0; - } else if (utf8[0] >= 0xe0 && 
utf8[0] <= 0xef && utf8[2] != 0) { - /* 3-byte */ - if ((utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x9c) || - (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x9d)) - /* smart quote */ - c = '"'; - else if (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x93) - /* n-dash */ - c = '–'; // 0xd0 - else if (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x94) - /* m-dash */ - c = '—'; // 0xd1 - else if ((utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x98) || - (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x99)) - /* apos */ - c = '\''; - else - c = '?'; - utf8[0] = utf8[1] = utf8[2] = 0; - } else if (utf8[0] >= 0xf0 && utf8[0] <= 0xf4 && utf8[3] != 0) { - /* 4-byte */ - c = '?'; - utf8[0] = utf8[1] = utf8[2] = 0; - } + utf8[0] = utf8[1] = utf8[2] = 0; + } else if (utf8[0] >= 0xf0 && utf8[0] <= 0xf4 && utf8[3] != 0) { + /* 4-byte */ + c = '?'; + utf8[0] = utf8[1] = utf8[2] = 0; } + } - if (wpr->style != wpr->last_style || dump) { - if (wpr->buf_len) { - browser_print(wpr->browser, wpr->buf, wpr->buf_len, - wpr->last_style); - wpr->article_len += wpr->buf_len; - wpr->buf_len = 0; - } - wpr->last_style = wpr->style; - dump = false; + if (wpr->style != wpr->last_style || dump) { + if (wpr->buf_len) { + browser_print(wpr->browser, wpr->buf, wpr->buf_len, + wpr->last_style); + wpr->article_len += wpr->buf_len; + wpr->buf_len = 0; } + wpr->last_style = wpr->style; + dump = false; + } + + if (c == ' ' && wpr->buf_len == 0 && + (wpr->style & (STYLE_H2|STYLE_H3|STYLE_H4|STYLE_H5))) + /* skip leading spaces around headers */ + c = 0; + +consume: + if (c != 0) { + wpr->buf[wpr->buf_len++] = c; - if (c == ' ' && wpr->buf_len == 0 && - (wpr->style & (STYLE_H2|STYLE_H3|STYLE_H4|STYLE_H5))) - /* skip leading spaces around headers */ - c = 0; - - if (c != 0) { - wpr->buf[wpr->buf_len++] = c; - - /* XML entity decode */ - if (c == ';') { - last = wpr->buf + wpr->buf_len - 1; - if (last[-4] == '&' && last[-3] == 'a' && last[-2] == 'm' && - last[-1] == 'p') { - 
last[-4] = '&'; - wpr->buf_len -= 4; - } else if (last[-5] == '&' && last[-3] == 'n' && - last[-3] == 'b' && last[-2] == 's' && last[-1] == 'p') { - last[-5] = ' '; - wpr->buf_len -= 5; - } else if (last[-3] == '&' && last[-2] == 'l' && last[-1] == 't') { - last[-3] = '<'; - wpr->buf_len -= 3; - } else if (last[-3] == '&' && last[-2] == 'g' && last[-1] == 't') { - last[-3] = '>'; - wpr->buf_len -= 3; - } + /* XML entity decode */ + if (c == ';') { + last = wpr->buf + wpr->buf_len - 1; + if (last[-4] == '&' && last[-3] == 'a' && last[-2] == 'm' && + last[-1] == 'p') { + last[-4] = '&'; + wpr->buf_len -= 4; + } else if (last[-5] == '&' && last[-4] == 'n' && + last[-3] == 'b' && last[-2] == 's' && last[-1] == 'p') { + last[-5] = ' '; + wpr->buf_len -= 5; + } else if (last[-3] == '&' && last[-2] == 'l' && last[-1] == 't') { + last[-3] = '<'; + wpr->buf_len -= 3; + } else if (last[-3] == '&' && last[-2] == 'g' && last[-1] == 't') { + last[-3] = '>'; + wpr->buf_len -= 3; } } } +advance: req->chunk_off++; goto get_char; }