jcs
/wikipedia
/amendments
/23
wikipedia: Restructure to handle styles touching each other
''[[...]]'' was not parsed properly when the 2 apostrophes (italics)
were immediately followed by a link.
jcs made amendment 23 over 2 years ago
--- wikipedia.c Mon Sep 5 00:26:21 2022
+++ wikipedia.c Mon Sep 5 14:32:30 2022
@@ -207,190 +207,209 @@ get_char:
parse_char:
if (c == '{') {
wpr->curlys++;
+ goto advance;
} else if (c == '}') {
wpr->curlys--;
+ goto advance;
} else if (wpr->curlys > 0) {
/* consume, obey */
+ goto advance;
} else if (c == '[') {
wpr->brackets++;
+ goto advance;
} else if (c == ']') {
wpr->brackets--;
+ goto advance;
} else if (c == '\'') {
wpr->apostrophes++;
+ goto advance;
} else if (c == '=') {
wpr->equals++;
- } else {
- last = wpr->buf + wpr->buf_len - 1;
-
- if (last[-3] == '<' && last[-2] == 'r' && last[-1] == 'e' &&
- last[0] == 'f') {
- /* <ref */
- wpr->in_ref = true;
- dump = true;
- wpr->buf_len -= 4;
- } else if ((last[-5] == '<' && last[-4] == '/' &&
- last[-3] == 'r' && last[-2] == 'e' && last[-1] == 'f' &&
- last[0] == '>') ||
- (last[-1] == '/' && last[0] == '>')) {
- /* </ref> or <ref /> */
- wpr->in_ref = false;
- wpr->apostrophes = wpr->equals = wpr->brackets = wpr->curlys = 0;
- wpr->buf_len = 0;
- } else if (wpr->in_ref) {
- /* consume, obey */
- } else if (wpr->apostrophes == 3) {
+ goto advance;
+ }
+
+ last = wpr->buf + wpr->buf_len - 1;
+
+ if (last[-3] == '<' && last[-2] == 'r' && last[-1] == 'e' &&
+ last[0] == 'f') {
+ /* <ref */
+ wpr->in_ref = true;
+ dump = true;
+ wpr->buf_len -= 4;
+ } else if ((last[-5] == '<' && last[-4] == '/' && last[-3] == 'r' &&
+ last[-2] == 'e' && last[-1] == 'f' && last[0] == '>') ||
+ (last[-1] == '/' && last[0] == '>')) {
+ /* </ref> or <ref /> */
+ wpr->in_ref = false;
+ wpr->apostrophes = wpr->equals = wpr->brackets = wpr->curlys = 0;
+ wpr->buf_len = 0;
+ } else if (wpr->in_ref) {
+ goto consume;
+ }
+
+ if (wpr->apostrophes) {
+ if (wpr->apostrophes == 3) {
if (wpr->style & STYLE_BOLD)
wpr->style &= ~(STYLE_BOLD);
else
wpr->style |= STYLE_BOLD;
- wpr->apostrophes = 0;
} else if (wpr->apostrophes == 2) {
if (wpr->style & STYLE_ITALIC)
wpr->style &= ~(STYLE_ITALIC);
else
wpr->style |= STYLE_ITALIC;
- wpr->apostrophes = 0;
} else if (wpr->apostrophes == 1) {
/* literal apostrophe, add and go back to handle c */
wpr->apostrophes = 0;
wpr->buf[wpr->buf_len++] = '\'';
goto parse_char;
- } else if (wpr->equals) {
- if (wpr->equals == 5) {
- if (wpr->style & STYLE_H5)
- wpr->style &= ~(STYLE_H5);
- else
- wpr->style |= STYLE_H5;
- } else if (wpr->equals == 4) {
- if (wpr->style & STYLE_H4)
- wpr->style &= ~(STYLE_H4);
- else
- wpr->style |= STYLE_H4;
- } else if (wpr->equals == 3) {
- if (wpr->style & STYLE_H3)
- wpr->style &= ~(STYLE_H3);
- else
- wpr->style |= STYLE_H3;
- } else if (wpr->equals == 2) {
- if (wpr->style & STYLE_H2)
- wpr->style &= ~(STYLE_H2);
- else
- wpr->style |= STYLE_H2;
- } else {
- /* literal equals, add and go back to handle c */
- wpr->equals = 0;
- wpr->buf[wpr->buf_len++] = '=';
- goto parse_char;
- }
+ }
+ wpr->apostrophes = 0;
+ }
+
+ if (wpr->equals) {
+ if (wpr->equals == 5) {
+ if (wpr->style & STYLE_H5)
+ wpr->style &= ~(STYLE_H5);
+ else
+ wpr->style |= STYLE_H5;
+ } else if (wpr->equals == 4) {
+ if (wpr->style & STYLE_H4)
+ wpr->style &= ~(STYLE_H4);
+ else
+ wpr->style |= STYLE_H4;
+ } else if (wpr->equals == 3) {
+ if (wpr->style & STYLE_H3)
+ wpr->style &= ~(STYLE_H3);
+ else
+ wpr->style |= STYLE_H3;
+ } else if (wpr->equals == 2) {
+ if (wpr->style & STYLE_H2)
+ wpr->style &= ~(STYLE_H2);
+ else
+ wpr->style |= STYLE_H2;
+ } else {
+ /* literal equals, add and go back to handle c */
wpr->equals = 0;
- } else if (wpr->brackets == 2) {
- wpr->style |= STYLE_LINK;
- } else if (wpr->brackets == 0 && (wpr->style & STYLE_LINK)) {
- wpr->style &= ~(STYLE_LINK);
+ wpr->buf[wpr->buf_len++] = '=';
+ goto parse_char;
}
+ wpr->equals = 0;
+ }
+
+ if (wpr->brackets == 2)
+ wpr->style |= STYLE_LINK;
+ else if (wpr->brackets == 0 && (wpr->style & STYLE_LINK))
+ wpr->style &= ~(STYLE_LINK);
- if (c == '\n') {
- /* skip leading newlines and only allow 2 in a row */
- if (wpr->article_len == 0 || (last[0] == '\r' && last[-1] == '\r'))
- c = 0;
+ if (c == '\n') {
+ /* skip leading newlines and only allow 2 in a row */
+ if (wpr->article_len == 0 || (last[0] == '\r' && last[-1] == '\r'))
+ c = 0;
+ else
+ c = '\r';
+ } else if ((unsigned char)c >= 0x80 && (unsigned char)c < 0xf5) {
+ /* utf-8 */
+ if (utf8[0] == 0)
+ utf8[0] = c;
+ else if (utf8[1] == 0)
+ utf8[1] = c;
+ else if (utf8[2] == 0)
+ utf8[2] = c;
+ else if (utf8[3] == 0)
+ utf8[3] = c;
+ else {
+ /* bogus */
+ utf8[0] = 0;
+ c = 0;
+ }
+
+ if (c)
+ c = 0;
+ else
+ c = '?';
+
+ if (utf8[0] >= 0xc2 && utf8[0] <= 0xdf && utf8[1] != 0) {
+ /* 2 byte */
+ if (utf8[0] == 0xc3 && utf8[1] == 0x97)
+ c = 'x';
+ else if (utf8[0] == 0xc3 && utf8[1] == 0xa9)
+ /* e accent */
+ c = 'é'; // 0x8e
+ else if (utf8[0] == 0xc3 && utf8[1] == 0xb6)
+ /* o umlat */
+ c = 'ö'; // 0x9a
else
- c = '\r';
- } else if ((unsigned char)c >= 0x80 && (unsigned char)c < 0xf5) {
- /* utf-8 */
- if (utf8[0] == 0)
- utf8[0] = c;
- else if (utf8[1] == 0)
- utf8[1] = c;
- else if (utf8[2] == 0)
- utf8[2] = c;
- else if (utf8[3] == 0)
- utf8[3] = c;
- else {
- /* bogus */
- utf8[0] = 0;
- c = 0;
- }
-
- if (c)
- c = 0;
+ c = '?';
+ utf8[0] = utf8[1] = utf8[2] = 0;
+ } else if (utf8[0] >= 0xe0 && utf8[0] <= 0xef && utf8[2] != 0) {
+ /* 3-byte */
+ if ((utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x9c) ||
+ (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x9d))
+ /* smart quote */
+ c = '"';
+ else if (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x93)
+ /* n-dash */
+ c = '–'; // 0xd0
+ else if (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x94)
+ /* m-dash */
+ c = '—'; // 0xd1
+ else if ((utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x98) ||
+ (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x99))
+ /* apos */
+ c = '\'';
else
c = '?';
-
- if (utf8[0] >= 0xc2 && utf8[0] <= 0xdf && utf8[1] != 0) {
- /* 2 byte */
- if (utf8[0] == 0xc3 && utf8[1] == 0xa9)
- /* accent-e */
- c = 'é'; // 0x8e;
- else
- c = '?';
- utf8[0] = utf8[1] = utf8[2] = 0;
- } else if (utf8[0] >= 0xe0 && utf8[0] <= 0xef && utf8[2] != 0) {
- /* 3-byte */
- if ((utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x9c) ||
- (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x9d))
- /* smart quote */
- c = '"';
- else if (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x93)
- /* n-dash */
- c = '–'; // 0xd0
- else if (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x94)
- /* m-dash */
- c = '—'; // 0xd1
- else if ((utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x98) ||
- (utf8[0] == 0xe2 && utf8[1] == 0x80 && utf8[2] == 0x99))
- /* apos */
- c = '\'';
- else
- c = '?';
- utf8[0] = utf8[1] = utf8[2] = 0;
- } else if (utf8[0] >= 0xf0 && utf8[0] <= 0xf4 && utf8[3] != 0) {
- /* 4-byte */
- c = '?';
- utf8[0] = utf8[1] = utf8[2] = 0;
- }
+ utf8[0] = utf8[1] = utf8[2] = 0;
+ } else if (utf8[0] >= 0xf0 && utf8[0] <= 0xf4 && utf8[3] != 0) {
+ /* 4-byte */
+ c = '?';
+ utf8[0] = utf8[1] = utf8[2] = 0;
}
+ }
- if (wpr->style != wpr->last_style || dump) {
- if (wpr->buf_len) {
- browser_print(wpr->browser, wpr->buf, wpr->buf_len,
- wpr->last_style);
- wpr->article_len += wpr->buf_len;
- wpr->buf_len = 0;
- }
- wpr->last_style = wpr->style;
- dump = false;
+ if (wpr->style != wpr->last_style || dump) {
+ if (wpr->buf_len) {
+ browser_print(wpr->browser, wpr->buf, wpr->buf_len,
+ wpr->last_style);
+ wpr->article_len += wpr->buf_len;
+ wpr->buf_len = 0;
}
+ wpr->last_style = wpr->style;
+ dump = false;
+ }
+
+ if (c == ' ' && wpr->buf_len == 0 &&
+ (wpr->style & (STYLE_H2|STYLE_H3|STYLE_H4|STYLE_H5)))
+ /* skip leading spaces around headers */
+ c = 0;
+
+consume:
+ if (c != 0) {
+ wpr->buf[wpr->buf_len++] = c;
- if (c == ' ' && wpr->buf_len == 0 &&
- (wpr->style & (STYLE_H2|STYLE_H3|STYLE_H4|STYLE_H5)))
- /* skip leading spaces around headers */
- c = 0;
-
- if (c != 0) {
- wpr->buf[wpr->buf_len++] = c;
-
- /* XML entity decode */
- if (c == ';') {
- last = wpr->buf + wpr->buf_len - 1;
- if (last[-4] == '&' && last[-3] == 'a' && last[-2] == 'm' &&
- last[-1] == 'p') {
- last[-4] = '&';
- wpr->buf_len -= 4;
- } else if (last[-5] == '&' && last[-3] == 'n' &&
- last[-3] == 'b' && last[-2] == 's' && last[-1] == 'p') {
- last[-5] = ' ';
- wpr->buf_len -= 5;
- } else if (last[-3] == '&' && last[-2] == 'l' && last[-1] == 't') {
- last[-3] = '<';
- wpr->buf_len -= 3;
- } else if (last[-3] == '&' && last[-2] == 'g' && last[-1] == 't') {
- last[-3] = '>';
- wpr->buf_len -= 3;
- }
+ /* XML entity decode */
+ if (c == ';') {
+ last = wpr->buf + wpr->buf_len - 1;
+ if (last[-4] == '&' && last[-3] == 'a' && last[-2] == 'm' &&
+ last[-1] == 'p') {
+ last[-4] = '&';
+ wpr->buf_len -= 4;
+ } else if (last[-5] == '&' && last[-4] == 'n' &&
+ last[-3] == 'b' && last[-2] == 's' && last[-1] == 'p') {
+ last[-5] = ' ';
+ wpr->buf_len -= 5;
+ } else if (last[-3] == '&' && last[-2] == 'l' && last[-1] == 't') {
+ last[-3] = '<';
+ wpr->buf_len -= 3;
+ } else if (last[-3] == '&' && last[-2] == 'g' && last[-1] == 't') {
+ last[-3] = '>';
+ wpr->buf_len -= 3;
}
}
}
+advance:
req->chunk_off++;
goto get_char;
}