jcs
/wikipedia
/amendments
/24
wikipedia: Rewrite Wikitext parser again
jcs made amendment 24 over 2 years ago
--- browser.c Mon Sep 5 00:38:47 2022
+++ browser.c Mon Sep 5 14:52:16 2022
@@ -509,7 +509,7 @@ static Handle scrp_rec_h = NULL;
size_t
browser_print(struct browser *browser, const char *str, size_t len,
- unsigned short style)
+ unsigned long style)
{
StScrpRec *scrp_rec;
ScrpSTElement *scrp_ele;
@@ -517,7 +517,7 @@ browser_print(struct browser *browser, const char *str
Rect zerorect = { 0, 0, 0, 0 };
size_t n;
short line_height = 0, was_len = 0;
- static unsigned short last_style = 0;
+ static unsigned long last_style = 0;
if (scrp_rec_h == NULL) {
scrp_rec_h = xNewHandle(sizeof(short) +
@@ -560,8 +560,6 @@ browser_print(struct browser *browser, const char *str
scrp_ele->scrpHeight += 4;
scrp_ele->scrpAscent += 2;
}
-
-
if (style & STYLE_LINK) {
/* remove link destinations for now */
--- browser.h Mon Sep 5 00:19:59 2022
+++ browser.h Mon Sep 5 20:53:18 2022
@@ -27,14 +27,16 @@ enum {
BROWSER_STATE_ARTICLE_DONE
};
-#define STYLE_BOLD (1 << 0)
-#define STYLE_ITALIC (1 << 1)
-#define STYLE_H1 (1 << 2)
-#define STYLE_H2 (1 << 3)
-#define STYLE_H3 (1 << 4)
-#define STYLE_H4 (1 << 5)
-#define STYLE_H5 (1 << 6)
-#define STYLE_LINK (1 << 7)
+#define STYLE_BOLD (1UL << 0)
+#define STYLE_ITALIC (1UL << 1)
+#define STYLE_H1 (1UL << 2)
+#define STYLE_H2 (1UL << 3)
+#define STYLE_H3 (1UL << 4)
+#define STYLE_H4 (1UL << 5)
+#define STYLE_H5 (1UL << 6)
+#define STYLE_LINK (1UL << 7)
+#define STYLE_REF (1UL << 8)
+#define STYLE_TEMPLATE (1UL << 9)
struct browser {
short state;
@@ -49,7 +51,7 @@ struct browser {
struct browser *browser_init(void);
size_t browser_print(struct browser *browser, const char *str, size_t len,
- unsigned short style);
+ unsigned long style);
void browser_clear(struct browser *browser);
void browser_draw_line(struct browser *browser);
--- wikipedia.c Mon Sep 5 14:32:30 2022
+++ wikipedia.c Mon Sep 5 22:53:18 2022
@@ -193,121 +193,40 @@ get_char:
wpr->curlys = 0;
wpr->brackets = 0;
- wpr->apostrophes = 0;
- wpr->equals = 0;
+ wpr->refs = 0;
wpr->style = 0;
wpr->last_style = 0;
+ wpr->trim_whitespace = true;
wpr->state = WP_STATE_WIKITEXT_PARSE;
/* FALLTHROUGH */
case WP_STATE_WIKITEXT_PARSE: {
c = req->chunk[req->chunk_off];
-
-parse_char:
- if (c == '{') {
- wpr->curlys++;
- goto advance;
- } else if (c == '}') {
- wpr->curlys--;
- goto advance;
- } else if (wpr->curlys > 0) {
- /* consume, obey */
- goto advance;
- } else if (c == '[') {
- wpr->brackets++;
- goto advance;
- } else if (c == ']') {
- wpr->brackets--;
- goto advance;
- } else if (c == '\'') {
- wpr->apostrophes++;
- goto advance;
- } else if (c == '=') {
- wpr->equals++;
- goto advance;
- }
-
last = wpr->buf + wpr->buf_len - 1;
- if (last[-3] == '<' && last[-2] == 'r' && last[-1] == 'e' &&
- last[0] == 'f') {
- /* <ref */
- wpr->in_ref = true;
- dump = true;
- wpr->buf_len -= 4;
- } else if ((last[-5] == '<' && last[-4] == '/' && last[-3] == 'r' &&
- last[-2] == 'e' && last[-1] == 'f' && last[0] == '>') ||
- (last[-1] == '/' && last[0] == '>')) {
- /* </ref> or <ref /> */
- wpr->in_ref = false;
- wpr->apostrophes = wpr->equals = wpr->brackets = wpr->curlys = 0;
- wpr->buf_len = 0;
- } else if (wpr->in_ref) {
- goto consume;
- }
-
- if (wpr->apostrophes) {
- if (wpr->apostrophes == 3) {
- if (wpr->style & STYLE_BOLD)
- wpr->style &= ~(STYLE_BOLD);
- else
- wpr->style |= STYLE_BOLD;
- } else if (wpr->apostrophes == 2) {
- if (wpr->style & STYLE_ITALIC)
- wpr->style &= ~(STYLE_ITALIC);
- else
- wpr->style |= STYLE_ITALIC;
- } else if (wpr->apostrophes == 1) {
- /* literal apostrophe, add and go back to handle c */
- wpr->apostrophes = 0;
- wpr->buf[wpr->buf_len++] = '\'';
- goto parse_char;
- }
- wpr->apostrophes = 0;
- }
+ /* character conversions */
- if (wpr->equals) {
- if (wpr->equals == 5) {
- if (wpr->style & STYLE_H5)
- wpr->style &= ~(STYLE_H5);
- else
- wpr->style |= STYLE_H5;
- } else if (wpr->equals == 4) {
- if (wpr->style & STYLE_H4)
- wpr->style &= ~(STYLE_H4);
- else
- wpr->style |= STYLE_H4;
- } else if (wpr->equals == 3) {
- if (wpr->style & STYLE_H3)
- wpr->style &= ~(STYLE_H3);
- else
- wpr->style |= STYLE_H3;
- } else if (wpr->equals == 2) {
- if (wpr->style & STYLE_H2)
- wpr->style &= ~(STYLE_H2);
- else
- wpr->style |= STYLE_H2;
- } else {
- /* literal equals, add and go back to handle c */
- wpr->equals = 0;
- wpr->buf[wpr->buf_len++] = '=';
- goto parse_char;
+ if (c == ';') {
+ /* XML entity decode */
+ if (last[-3] == '&' && last[-2] == 'a' && last[-1] == 'm' &&
+ last[0] == 'p') {
+ c = '&';
+ wpr->buf_len -= 4;
+ } else if (last[-4] == '&' && last[-3] == 'n' &&
+ last[-2] == 'b' && last[-1] == 's' && last[0] == 'p') {
+ c = ' ';
+ wpr->buf_len -= 5;
+ } else if (last[-2] == '&' && last[-1] == 'l' && last[0] == 't') {
+ c = '<';
+ wpr->buf_len -= 3;
+ } else if (last[-2] == '&' && last[-1] == 'g' && last[0] == 't') {
+ c = '>';
+ wpr->buf_len -= 3;
}
- wpr->equals = 0;
- }
-
- if (wpr->brackets == 2)
- wpr->style |= STYLE_LINK;
- else if (wpr->brackets == 0 && (wpr->style & STYLE_LINK))
- wpr->style &= ~(STYLE_LINK);
-
- if (c == '\n') {
- /* skip leading newlines and only allow 2 in a row */
- if (wpr->article_len == 0 || (last[0] == '\r' && last[-1] == '\r'))
- c = 0;
- else
- c = '\r';
+ last = wpr->buf + wpr->buf_len - 1;
+ } else if (c == '\n') {
+ c = '\r';
} else if ((unsigned char)c >= 0x80 && (unsigned char)c < 0xf5) {
/* utf-8 */
if (utf8[0] == 0)
@@ -367,8 +286,111 @@ parse_char:
utf8[0] = utf8[1] = utf8[2] = 0;
}
}
+
+ /* check for style changes */
+
+ if (last[0] == '{' && c == '{') {
+ wpr->curlys++;
+ wpr->buf_len--;
+ wpr->style |= STYLE_TEMPLATE;
+ c = 0;
+ } else if (last[0] == '}' && c == '}') {
+ if (wpr->curlys)
+ wpr->curlys--;
+ wpr->buf_len--;
+ if (wpr->curlys == 0)
+ wpr->style &= ~(STYLE_TEMPLATE);
+ c = 0;
+ } else if (last[0] == '[' && c == '[') {
+ if (wpr->brackets)
+ wpr->brackets++;
+ wpr->buf_len--;
+ wpr->style |= STYLE_LINK;
+ c = 0;
+ } else if (last[0] == ']' && c == ']') {
+ if (wpr->brackets)
+ wpr->brackets--;
+ wpr->buf_len--;
+ if (wpr->brackets == 0)
+ wpr->style &= ~(STYLE_LINK);
+ c = 0;
+ } else if (last[-1] == '\'' && last[0] == '\'' && c == '\'') {
+ if (wpr->style & STYLE_BOLD)
+ wpr->style &= ~(STYLE_BOLD);
+ else
+ wpr->style |= STYLE_BOLD;
+ wpr->buf_len -= 2;
+ c = 0;
+ } else if (last[-1] == '\'' && last[0] == '\'' && c != '\'') {
+ if (wpr->style & STYLE_ITALIC)
+ wpr->style &= ~(STYLE_ITALIC);
+ else
+ wpr->style |= STYLE_ITALIC;
+ wpr->buf_len -= 2;
+ /* keep c */
+ } else if (last[-3] == '=' && last[-2] == '=' && last[-1] == '=' &&
+ last[0] == '=' && c == '=') {
+ if (wpr->style & STYLE_H5)
+ wpr->style &= ~(STYLE_H5);
+ else
+ wpr->style |= STYLE_H5;
+ wpr->buf_len -= 4;
+ c = 0;
+ } else if (last[-3] == '=' && last[-2] == '=' && last[-1] == '=' &&
+ last[0] == '=' && c != '=') {
+ if (wpr->style & STYLE_H4)
+ wpr->style &= ~(STYLE_H4);
+ else
+ wpr->style |= STYLE_H4;
+ wpr->buf_len -= 4;
+ /* keep c */
+ } else if (last[-2] == '=' && last[-1] == '=' && last[0] == '=' &&
+ c != '=') {
+ if (wpr->style & STYLE_H3)
+ wpr->style &= ~(STYLE_H3);
+ else
+ wpr->style |= STYLE_H3;
+ wpr->buf_len -= 3;
+ /* keep c */
+ } else if (last[-1] == '=' && last[0] == '=' && c != '=') {
+ if (wpr->style & STYLE_H2)
+ wpr->style &= ~(STYLE_H2);
+ else
+ wpr->style |= STYLE_H2;
+ wpr->buf_len -= 2;
+ /* keep c */
+ } else if (last[-2] == '<' && last[-1] == 'r' && last[0] == 'e' &&
+ c == 'f') {
+ /* <ref */
+ wpr->refs++;
+ wpr->style |= STYLE_REF;
+ wpr->buf_len -= 3;
+ c = 0;
+ } else if ((wpr->style & STYLE_REF) &&
+ ((last[-4] == '<' && last[-3] == '/' && last[-2] == 'r' &&
+ last[-1] == 'e' && last[0] == 'f' && c == '>') ||
+ (last[0] == '/' && c == '>'))) {
+ /* </ref> or <ref /> */
+ if (wpr->refs)
+ wpr->refs--;
+ if (wpr->refs == 0)
+ wpr->style &= ~(STYLE_REF);
+ c = 0;
+ }
+
+ /*
+ * If our style changed as of this character, dump the buffer in
+ * the previous style and clear the buffer.
+ */
+
+ if (wpr->style != wpr->last_style) {
+ if (wpr->last_style & (STYLE_REF | STYLE_TEMPLATE))
+ wpr->buf_len = 0;
- if (wpr->style != wpr->last_style || dump) {
+ if (wpr->last_style & (STYLE_TEMPLATE |
+ STYLE_H1 | STYLE_H2 | STYLE_H3 | STYLE_H4 | STYLE_H5))
+ wpr->trim_whitespace = true;
+
if (wpr->buf_len) {
browser_print(wpr->browser, wpr->buf, wpr->buf_len,
wpr->last_style);
@@ -376,40 +398,20 @@ parse_char:
wpr->buf_len = 0;
}
wpr->last_style = wpr->style;
- dump = false;
}
- if (c == ' ' && wpr->buf_len == 0 &&
- (wpr->style & (STYLE_H2|STYLE_H3|STYLE_H4|STYLE_H5)))
- /* skip leading spaces around headers */
- c = 0;
-
-consume:
- if (c != 0) {
- wpr->buf[wpr->buf_len++] = c;
-
- /* XML entity decode */
- if (c == ';') {
- last = wpr->buf + wpr->buf_len - 1;
- if (last[-4] == '&' && last[-3] == 'a' && last[-2] == 'm' &&
- last[-1] == 'p') {
- last[-4] = '&';
- wpr->buf_len -= 4;
- } else if (last[-5] == '&' && last[-4] == 'n' &&
- last[-3] == 'b' && last[-2] == 's' && last[-1] == 'p') {
- last[-5] = ' ';
- wpr->buf_len -= 5;
- } else if (last[-3] == '&' && last[-2] == 'l' && last[-1] == 't') {
- last[-3] = '<';
- wpr->buf_len -= 3;
- } else if (last[-3] == '&' && last[-2] == 'g' && last[-1] == 't') {
- last[-3] = '>';
- wpr->buf_len -= 3;
- }
- }
+ /* and finally, add the new character */
+ if (c != 0 && wpr->trim_whitespace) {
+ if (c == '\r' || c == '\t' || c == ' ')
+ /* drop leading whitespace (start of parsing, or after a template/header) */
+ c = 0;
+ else
+ wpr->trim_whitespace = false;
}
-advance:
+ if (c != 0)
+ wpr->buf[wpr->buf_len++] = c;
+
req->chunk_off++;
goto get_char;
}
--- wikipedia.h Fri Sep 2 13:33:44 2022
+++ wikipedia.h Mon Sep 5 22:35:12 2022
@@ -59,9 +59,9 @@ struct wikipedia_request {
char *buf;
size_t buf_size;
size_t buf_len;
- short curlys, brackets, apostrophes, equals;
- bool in_ref;
- unsigned short style, last_style;
+ short refs, curlys, brackets;
+ unsigned long style, last_style;
+ bool trim_whitespace;
};
struct wikipedia_request * wikipedia_fetch_article(struct browser *,