AmendHub

Download:

jcs

/

wikipedia

/

amendments

/

24

wikipedia: Rewrite Wikitext parser again


jcs made amendment 24 over 2 years ago
--- browser.c Mon Sep 5 00:38:47 2022 +++ browser.c Mon Sep 5 14:52:16 2022 @@ -509,7 +509,7 @@ static Handle scrp_rec_h = NULL; size_t browser_print(struct browser *browser, const char *str, size_t len, - unsigned short style) + unsigned long style) { StScrpRec *scrp_rec; ScrpSTElement *scrp_ele; @@ -517,7 +517,7 @@ browser_print(struct browser *browser, const char *str Rect zerorect = { 0, 0, 0, 0 }; size_t n; short line_height = 0, was_len = 0; - static unsigned short last_style = 0; + static unsigned long last_style = 0; if (scrp_rec_h == NULL) { scrp_rec_h = xNewHandle(sizeof(short) + @@ -560,8 +560,6 @@ browser_print(struct browser *browser, const char *str scrp_ele->scrpHeight += 4; scrp_ele->scrpAscent += 2; } - - if (style & STYLE_LINK) { /* remove link destinations for now */ --- browser.h Mon Sep 5 00:19:59 2022 +++ browser.h Mon Sep 5 20:53:18 2022 @@ -27,14 +27,16 @@ enum { BROWSER_STATE_ARTICLE_DONE }; -#define STYLE_BOLD (1 << 0) -#define STYLE_ITALIC (1 << 1) -#define STYLE_H1 (1 << 2) -#define STYLE_H2 (1 << 3) -#define STYLE_H3 (1 << 4) -#define STYLE_H4 (1 << 5) -#define STYLE_H5 (1 << 6) -#define STYLE_LINK (1 << 7) +#define STYLE_BOLD (1UL << 0) +#define STYLE_ITALIC (1UL << 1) +#define STYLE_H1 (1UL << 2) +#define STYLE_H2 (1UL << 3) +#define STYLE_H3 (1UL << 4) +#define STYLE_H4 (1UL << 5) +#define STYLE_H5 (1UL << 6) +#define STYLE_LINK (1UL << 7) +#define STYLE_REF (1UL << 8) +#define STYLE_TEMPLATE (1UL << 9) struct browser { short state; @@ -49,7 +51,7 @@ struct browser { struct browser *browser_init(void); size_t browser_print(struct browser *browser, const char *str, size_t len, - unsigned short style); + unsigned long style); void browser_clear(struct browser *browser); void browser_draw_line(struct browser *browser); --- wikipedia.c Mon Sep 5 14:32:30 2022 +++ wikipedia.c Mon Sep 5 22:53:18 2022 @@ -193,121 +193,40 @@ get_char: wpr->curlys = 0; wpr->brackets = 0; - wpr->apostrophes = 0; - wpr->equals = 0; + wpr->refs = 0; wpr->style = 0; wpr->last_style = 0; + wpr->trim_whitespace = true; wpr->state = WP_STATE_WIKITEXT_PARSE; /* FALLTHROUGH */ case WP_STATE_WIKITEXT_PARSE: { c = req->chunk[req->chunk_off]; - -parse_char: - if (c == '{') { - wpr->curlys++; - goto advance; - } else if (c == '}') { - wpr->curlys--; - goto advance; - } else if (wpr->curlys > 0) { - /* consume, obey */ - goto advance; - } else if (c == '[') { - wpr->brackets++; - goto advance; - } else if (c == ']') { - wpr->brackets--; - goto advance; - } else if (c == '\'') { - wpr->apostrophes++; - goto advance; - } else if (c == '=') { - wpr->equals++; - goto advance; - } - last = wpr->buf + wpr->buf_len - 1; - if (last[-3] == '<' && last[-2] == 'r' && last[-1] == 'e' && - last[0] == 'f') { - /* <ref */ - wpr->in_ref = true; - dump = true; - wpr->buf_len -= 4; - } else if ((last[-5] == '<' && last[-4] == '/' && last[-3] == 'r' && - last[-2] == 'e' && last[-1] == 'f' && last[0] == '>') || - (last[-1] == '/' && last[0] == '>')) { - /* </ref> or <ref /> */ - wpr->in_ref = false; - wpr->apostrophes = wpr->equals = wpr->brackets = wpr->curlys = 0; - wpr->buf_len = 0; - } else if (wpr->in_ref) { - goto consume; - } - - if (wpr->apostrophes) { - if (wpr->apostrophes == 3) { - if (wpr->style & STYLE_BOLD) - wpr->style &= ~(STYLE_BOLD); - else - wpr->style |= STYLE_BOLD; - } else if (wpr->apostrophes == 2) { - if (wpr->style & STYLE_ITALIC) - wpr->style &= ~(STYLE_ITALIC); - else - wpr->style |= STYLE_ITALIC; - } else if (wpr->apostrophes == 1) { - /* literal apostrophe, add and go back to handle c */ - wpr->apostrophes = 0; - wpr->buf[wpr->buf_len++] = '\''; - goto parse_char; - } - wpr->apostrophes = 0; - } + /* character conversions */ - if (wpr->equals) { - if (wpr->equals == 5) { - if (wpr->style & STYLE_H5) - wpr->style &= ~(STYLE_H5); - else - wpr->style |= STYLE_H5; - } else if (wpr->equals == 4) { - if (wpr->style & STYLE_H4) - wpr->style &= ~(STYLE_H4); - else - wpr->style |= STYLE_H4; - } else if (wpr->equals == 3) { - if (wpr->style & STYLE_H3) - wpr->style &= ~(STYLE_H3); - else - wpr->style |= STYLE_H3; - } else if (wpr->equals == 2) { - if (wpr->style & STYLE_H2) - wpr->style &= ~(STYLE_H2); - else - wpr->style |= STYLE_H2; - } else { - /* literal equals, add and go back to handle c */ - wpr->equals = 0; - wpr->buf[wpr->buf_len++] = '='; - goto parse_char; + if (c == ';') { + /* XML entity decode */ + if (last[-3] == '&' && last[-2] == 'a' && last[-1] == 'm' && + last[0] == 'p') { + c = '&'; + wpr->buf_len -= 4; + } else if (last[-4] == '&' && last[-3] == 'n' && + last[-2] == 'b' && last[-1] == 's' && last[0] == 'p') { + c = ' '; + wpr->buf_len -= 5; + } else if (last[-2] == '&' && last[-1] == 'l' && last[0] == 't') { + c = '<'; + wpr->buf_len -= 3; + } else if (last[-2] == '&' && last[-1] == 'g' && last[0] == 't') { + c = '>'; + wpr->buf_len -= 3; } - wpr->equals = 0; - } - - if (wpr->brackets == 2) - wpr->style |= STYLE_LINK; - else if (wpr->brackets == 0 && (wpr->style & STYLE_LINK)) - wpr->style &= ~(STYLE_LINK); - - if (c == '\n') { - /* skip leading newlines and only allow 2 in a row */ - if (wpr->article_len == 0 || (last[0] == '\r' && last[-1] == '\r')) - c = 0; - else - c = '\r'; + last = wpr->buf + wpr->buf_len - 1; + } else if (c == '\n') { + c = '\r'; } else if ((unsigned char)c >= 0x80 && (unsigned char)c < 0xf5) { /* utf-8 */ if (utf8[0] == 0) @@ -367,8 +286,111 @@ parse_char: utf8[0] = utf8[1] = utf8[2] = 0; } } + + /* check for style changes */ + + if (last[0] == '{' && c == '{') { + wpr->curlys++; + wpr->buf_len--; + wpr->style |= STYLE_TEMPLATE; + c = 0; + } else if (last[0] == '}' && c == '}') { + if (wpr->curlys) + wpr->curlys--; + wpr->buf_len--; + if (wpr->curlys == 0) + wpr->style &= ~(STYLE_TEMPLATE); + c = 0; + } else if (last[0] == '[' && c == '[') { + if (wpr->brackets) + wpr->brackets++; + wpr->buf_len--; + wpr->style |= STYLE_LINK; + c = 0; + } else if (last[0] == ']' && c == ']') { + if (wpr->brackets) + wpr->brackets--; + wpr->buf_len--; + if (wpr->brackets == 0) + wpr->style &= ~(STYLE_LINK); + c = 0; + } else if (last[-1] == '\'' && last[0] == '\'' && c == '\'') { + if (wpr->style & STYLE_BOLD) + wpr->style &= ~(STYLE_BOLD); + else + wpr->style |= STYLE_BOLD; + wpr->buf_len -= 2; + c = 0; + } else if (last[-1] == '\'' && last[0] == '\'' && c != '\'') { + if (wpr->style & STYLE_ITALIC) + wpr->style &= ~(STYLE_ITALIC); + else + wpr->style |= STYLE_ITALIC; + wpr->buf_len -= 2; + /* keep c */ + } else if (last[-3] == '=' && last[-2] == '=' && last[-1] == '=' && + last[0] == '=' && c == '=') { + if (wpr->style & STYLE_H5) + wpr->style &= ~(STYLE_H5); + else + wpr->style |= STYLE_H5; + wpr->buf_len -= 4; + c = 0; + } else if (last[-3] == '=' && last[-2] == '=' && last[-1] == '=' && + last[0] == '=' && c != '=') { + if (wpr->style & STYLE_H4) + wpr->style &= ~(STYLE_H4); + else + wpr->style |= STYLE_H4; + wpr->buf_len -= 4; + /* keep c */ + } else if (last[-2] == '=' && last[-1] == '=' && last[0] == '=' && + c != '=') { + if (wpr->style & STYLE_H3) + wpr->style &= ~(STYLE_H3); + else + wpr->style |= STYLE_H3; + wpr->buf_len -= 3; + /* keep c */ + } else if (last[-1] == '=' && last[0] == '=' && c != '=') { + if (wpr->style & STYLE_H2) + wpr->style &= ~(STYLE_H2); + else + wpr->style |= STYLE_H2; + wpr->buf_len -= 2; + /* keep c */ + } else if (last[-2] == '<' && last[-1] == 'r' && last[0] == 'e' && + c == 'f') { + /* <ref */ + wpr->refs++; + wpr->style |= STYLE_REF; + wpr->buf_len -= 3; + c = 0; + } else if ((wpr->style & STYLE_REF) && + ((last[-4] == '<' && last[-3] == '/' && last[-2] == 'r' && + last[-1] == 'e' && last[0] == 'f' && c == '>') || + (last[0] == '/' && c == '>'))) { + /* </ref> or <ref /> */ + if (wpr->refs) + wpr->refs--; + if (wpr->refs == 0) + wpr->style &= ~(STYLE_REF); + c = 0; + } + + /* + * If our style changed as of this character, dump the buffer in + * the previous style and clear the buffer. + */ + + if (wpr->style != wpr->last_style) { + if (wpr->last_style & (STYLE_REF | STYLE_TEMPLATE)) + wpr->buf_len = 0; - if (wpr->style != wpr->last_style || dump) { + if (wpr->last_style & (STYLE_TEMPLATE | + STYLE_H1 | STYLE_H2 | STYLE_H3 | STYLE_H4 | STYLE_H5)) + wpr->trim_whitespace = true; + if (wpr->buf_len) { browser_print(wpr->browser, wpr->buf, wpr->buf_len, wpr->last_style); @@ -376,40 +398,20 @@ parse_char: wpr->buf_len = 0; } wpr->last_style = wpr->style; - dump = false; } - if (c == ' ' && wpr->buf_len == 0 && - (wpr->style & (STYLE_H2|STYLE_H3|STYLE_H4|STYLE_H5))) - /* skip leading spaces around headers */ - c = 0; - -consume: - if (c != 0) { - wpr->buf[wpr->buf_len++] = c; - - /* XML entity decode */ - if (c == ';') { - last = wpr->buf + wpr->buf_len - 1; - if (last[-4] == '&' && last[-3] == 'a' && last[-2] == 'm' && - last[-1] == 'p') { - last[-4] = '&'; - wpr->buf_len -= 4; - } else if (last[-5] == '&' && last[-4] == 'n' && - last[-3] == 'b' && last[-2] == 's' && last[-1] == 'p') { - last[-5] = ' '; - wpr->buf_len -= 5; - } else if (last[-3] == '&' && last[-2] == 'l' && last[-1] == 't') { - last[-3] = '<'; - wpr->buf_len -= 3; - } else if (last[-3] == '&' && last[-2] == 'g' && last[-1] == 't') { - last[-3] = '>'; - wpr->buf_len -= 3; - } - } + /* and finally, add the new character */ + if (c != 0 && wpr->trim_whitespace) { + if (c == '\r' || c == '\t' || c == ' ') + /* trim whitespace after these */ + c = 0; + else + wpr->trim_whitespace = false; } -advance: + if (c != 0) + wpr->buf[wpr->buf_len++] = c; + req->chunk_off++; goto get_char; } --- wikipedia.h Fri Sep 2 13:33:44 2022 +++ wikipedia.h Mon Sep 5 22:35:12 2022 @@ -59,9 +59,9 @@ struct wikipedia_request { char *buf; size_t buf_size; size_t buf_len; - short curlys, brackets, apostrophes, equals; - bool in_ref; - unsigned short style, last_style; + short refs, curlys, brackets; + unsigned long style, last_style; + bool trim_whitespace; }; struct wikipedia_request * wikipedia_fetch_article(struct browser *,