jcs
/wikipedia
/amendments
/19
wikipedia: Parse XML instead of JSON, do all parsing at once
jcs made amendment 19 over 2 years ago
--- browser.c Thu Aug 25 15:58:37 2022
+++ browser.c Fri Sep 2 14:03:49 2022
@@ -56,7 +56,7 @@ browser_idle(struct focusable *focusable, EventRecord
browser_live_search(browser);
}
break;
- case BROWSER_STATE_DO_SEARCH: {
+ case BROWSER_STATE_ARTICLE_GET: {
TERec *te;
char *input;
@@ -76,21 +76,26 @@ browser_idle(struct focusable *focusable, EventRecord
}
HUnlock(browser->te);
+ SetCursor(*(GetCursor(watchCursor)));
browser->wpr = wikipedia_fetch_article(browser, input);
xfree(&input);
- browser->state = BROWSER_STATE_PROCESS_SEARCH;
+ browser->state = BROWSER_STATE_ARTICLE_PROCESS;
break;
}
- case BROWSER_STATE_PROCESS_SEARCH:
+ case BROWSER_STATE_ARTICLE_PROCESS:
if (browser->wpr == NULL) {
browser->state = BROWSER_STATE_IDLE;
break;
}
- SetCursor(*(GetCursor(watchCursor)));
wikipedia_request_process(browser->wpr);
+
+ if (browser->wpr->state == WP_STATE_DONE) {
+ browser->state = BROWSER_STATE_ARTICLE_DONE;
+ break;
+ }
break;
- case BROWSER_STATE_SEARCH_DONE:
+ case BROWSER_STATE_ARTICLE_DONE:
UpdateScrollbarForTE(browser->te_scroller, browser->te, false);
UpdtControl(browser->win, browser->win->visRgn);
progress(NULL);
@@ -319,7 +324,7 @@ browser_mouse_down(struct focusable *focusable, EventR
TESetText(str, len, browser->input_te);
InvalRect(&(*(browser->input_te))->viewRect);
browser_hide_search_results(browser);
- browser->state = BROWSER_STATE_DO_SEARCH;
+ browser->state = BROWSER_STATE_ARTICLE_GET;
}
return;
@@ -370,7 +375,7 @@ browser_key_down(struct focusable *focusable, EventRec
k = (event->message & charCodeMask);
if (k == '\r') {
- browser->state = BROWSER_STATE_DO_SEARCH;
+ browser->state = BROWSER_STATE_ARTICLE_GET;
} else {
TEKey(k, browser->input_te);
TESelView(browser->input_te);
@@ -613,4 +618,10 @@ no_overflow:
}
return len;
+}
+
+void
+browser_clear(struct browser *browser)
+{
+ /* TODO: stub -- the body is empty, so nothing is cleared and browser is unused */
}
--- browser.h Thu Aug 25 15:57:24 2022
+++ browser.h Fri Sep 2 13:57:10 2022
@@ -22,9 +22,9 @@
enum {
BROWSER_STATE_IDLE,
- BROWSER_STATE_DO_SEARCH,
- BROWSER_STATE_PROCESS_SEARCH,
- BROWSER_STATE_SEARCH_DONE
+ BROWSER_STATE_ARTICLE_GET,
+ BROWSER_STATE_ARTICLE_PROCESS,
+ BROWSER_STATE_ARTICLE_DONE
};
#define STYLE_BOLD (1 << 0)
@@ -50,5 +50,6 @@ struct browser {
struct browser *browser_init(void);
size_t browser_print(struct browser *browser, const char *str, size_t len,
unsigned short style);
+void browser_clear(struct browser *browser);
#endif
--- dnr.c Fri Aug 19 13:15:23 2022
+++ dnr.c Wed Aug 31 17:42:22 2022
@@ -6,6 +6,14 @@
* Modifications by Jim Matthews, Dartmouth College, 5/91
*/
+/*
+ * TODO: update to avoid having to include MacTraps2 for:
+ * CloseWD
+ * HOpenResFile
+ * FindFolder
+ * GetWDInfo
+ */
+
#include <OSUtils.h>
#include <Files.h>
#include <Folders.h>
--- wikipedia.c Thu Aug 25 15:59:08 2022
+++ wikipedia.c Sat Sep 3 22:26:38 2022
@@ -26,15 +26,6 @@
/* en.wikipedia.org doesn't support non-TLS :( */
#define WIKIPEDIA_HOST "wikipedia.jcs.org"
-/* {"query":{"normalized":[{"to": */
-#define NORMALIZED_TITLE_CONTEXT "{\"query\":{\"normalized\":[{\"to\":"
-
-/* {"query":{"pages":[{"revisions":[{"slots":{"main":{"content": */
-#define ARTICLE_TEXT_CONTEXT "{\"query\":{\"pages\":[{\"revisions\":[{\"slots\":{\"main\":{\"content\":"
-
-short wikipedia_json_peek(void *cookie);
-short wikipedia_json_get(void *cookie);
-
struct wikipedia_request *
wikipedia_fetch_article(struct browser *browser, char *name)
{
@@ -57,10 +48,10 @@ wikipedia_fetch_article(struct browser *browser, char
snprintf(url, sizeof(url), "http://%s/w/api.php?action=query&"
"prop=revisions&rvslots=*&rvprop=content&"
- "format=json&formatversion=2&titles=%s", WIKIPEDIA_HOST, name);
+ "format=xml&titles=%s", WIKIPEDIA_HOST, name);
wpr->http_request = http_get(url);
http_req_skip_header(wpr->http_request);
- wpr->state = WP_STATE_PARSE_JSON;
+ wpr->state = WP_STATE_XML_INIT;
wpr->normalized_title = xstrdup(name, "normalized_title");
return wpr;
@@ -121,309 +112,243 @@ wikipedia_fetch_search_results(struct browser *browser
return nrets;
}
-struct wikipedia_request *
-wikipedia_read_cached_article(struct browser *browser, char *name)
-{
- struct wikipedia_request *wpr;
- short state;
- short error, frefnum;
- size_t len, size;
-
- wpr = xmalloczero(sizeof(struct wikipedia_request),
- "fetch_article wpr");
- wpr->browser = browser;
-
- /* XXX TODO */
- error = FSOpen("\pMacintosh:wp.txt", 0, &frefnum);
- if (error)
- panic("FSOpen failed: %d", error);
-
- error = SetFPos(frefnum, fsFromLEOF, 0);
- if (error)
- panic("SetFPos failed: %d", error);
- GetFPos(frefnum, &len);
- SetFPos(frefnum, fsFromStart, 0);
-
- wpr->article_len = len;
- wpr->article = xmalloc(wpr->article_len, "article");
-
- for (len = 0; len < wpr->article_len; ) {
- size = 1024;
- error = FSRead(frefnum, &size, wpr->article + len);
- if (error && error != eofErr)
- panic("FSRead failed: %d", error);
-
- len += size;
-
- if (size < 1024)
- break;
- }
-
- FSClose(frefnum);
-
- wpr->state = WP_STATE_PARSE_JSON;
- wpr->normalized_title = xstrdup(name, "normalized_title");
-
- return wpr;
-}
-
-short
-wikipedia_json_peek(void *cookie)
-{
- struct wikipedia_request *wpr = (struct wikipedia_request *)cookie;
- struct http_request *req = wpr->http_request;
-
- return http_req_chunk_peek(req);
-}
-
-short
-wikipedia_json_get(void *cookie)
-{
- struct wikipedia_request *wpr = (struct wikipedia_request *)cookie;
- struct http_request *req = wpr->http_request;
-
- return http_req_chunk_read(req);
-}
-
void
wikipedia_request_process(struct wikipedia_request *wpr)
{
struct http_request *req = wpr->http_request;
size_t len, n;
+ char c, *last;
+ enum xml_state {
+ XML_DEFAULT,
+ XML_IN_NORMALIZED
+ } xstate = 0;
+ bool dump = false;
+get_char:
+ if (req->chunk_len == 0 || (req->chunk_off + 1 > req->chunk_len)) {
+ req->chunk_len = http_req_read(req, req->chunk,
+ sizeof(req->chunk));
+ req->chunk_off = 0;
+
+ if (req->chunk_len == 0 || (req->chunk_off + 1 > req->chunk_len))
+ goto done_parsing;
+ }
+
switch (wpr->state) {
- case WP_STATE_PARSE_JSON: {
- char context_str[PDJSON_STACK_MAX * 32];
- const char *str;
- enum json_type type, context_type;
- size_t tmp_depth;
+ case WP_STATE_XML_INIT:
+ progress("Fetching and parsing response...");
+ wpr->buf_size = 1024;
+ wpr->buf_len = 0;
+ wpr->buf = xmalloc(wpr->buf_size, "wpr buf");
+ wpr->state = WP_STATE_XML_PARSE;
+ goto get_char;
- wpr->json_context_depth = 0;
-
- progress("Parsing JSON response...");
- json_open_user(&wpr->json, wikipedia_json_get, wikipedia_json_peek,
- wpr);
-
- for (;;) {
- type = json_next(&wpr->json);
+ case WP_STATE_XML_PARSE:
+ c = req->chunk[req->chunk_off++];
- if (type == JSON_ERROR || type == JSON_DONE) {
- if (type == JSON_ERROR) {
- char err[100];
- size_t len;
- len = snprintf(err, sizeof(err),
- "%s at line %ld pos %ld", json_get_error(&wpr->json),
- json_get_lineno(&wpr->json),
- json_get_position(&wpr->json));
- browser_print(wpr->browser, err, len, 0);
+ if (c == '<') {
+ wpr->buf[0] = '\0';
+ wpr->buf_len = 0;
+ } else if (c == '>') {
+ wpr->buf[wpr->buf_len] = '\0';
+ if (xstate == XML_DEFAULT) {
+ if (strcmp(wpr->buf, "normalized") == 0) {
+ xstate = XML_IN_NORMALIZED;
+ } else if (strncmp(wpr->buf, "slot ", 5) == 0) {
+ wpr->state = WP_STATE_WIKITEXT_INIT;
}
- json_close(&wpr->json);
- if (wpr->http_request != NULL)
- http_req_free(&wpr->http_request);
- if (type == JSON_ERROR)
- wpr->state = WP_STATE_DONE;
- else {
- char title[100];
- snprintf(title, sizeof(title), "%s: %s", PROGRAM_NAME,
- wpr->normalized_title);
- SetWTitle(wpr->browser->win, CtoPstr(title));
- wpr->state = WP_STATE_PARSE_WIKITEXT;
- progress("Formatting article...");
- }
- break;
- }
-
- context_type = json_get_context(&wpr->json, &tmp_depth);
-
-#define wprjcd wpr->json_context[wpr->json_context_depth]
-
- switch (type) {
- case JSON_OBJECT:
- snprintf(wprjcd, sizeof(wprjcd), "{");
- wpr->json_context_depth++;
- break;
- case JSON_OBJECT_END:
- snprintf(wprjcd, sizeof(wprjcd), "}");
- wpr->json_context_depth--;
- break;
- case JSON_ARRAY:
- snprintf(wprjcd, sizeof(wprjcd), "[");
- wpr->json_context_depth++;
- break;
- case JSON_ARRAY_END:
- snprintf(wprjcd, sizeof(wprjcd), "]");
- wpr->json_context_depth--;
- break;
- case JSON_STRING:
- snprintf(wprjcd, sizeof(wprjcd), "\"%s\"",
- json_get_string(&wpr->json, NULL));
- break;
- case JSON_NUMBER:
- snprintf(wprjcd, sizeof(wprjcd), "%s",
- json_get_string(&wpr->json, NULL));
- break;
- case JSON_TRUE:
- snprintf(wprjcd, sizeof(wprjcd), "true");
- break;
- case JSON_FALSE:
- snprintf(wprjcd, sizeof(wprjcd), "false");
- break;
- case JSON_NULL:
- snprintf(wprjcd, sizeof(wprjcd), "null");
- break;
- }
-
- if (tmp_depth > 0 && (tmp_depth % 2) != 0)
- strlcat(wprjcd, ":", sizeof(wprjcd));
-
- if (type != JSON_STRING)
- goto next_context;
+ } else if (xstate == XML_IN_NORMALIZED) {
+ char from_normalized[255], to_normalized[255];
+ int count = 0; /* %n stores an int; stays 0 if the scan stops before reaching it */
- context_str[0] = '\0';
- for (n = 0; n < wpr->json_context_depth; n++)
- strlcat(context_str, wpr->json_context[n],
- sizeof(context_str));
-
- if (strcmp(context_str, NORMALIZED_TITLE_CONTEXT) == 0) {
- xfree(&wpr->normalized_title);
- wpr->normalized_title =
- xstrdup(json_get_string(&wpr->json, NULL),
- "normalized_title");
- } else if (strcmp(context_str, ARTICLE_TEXT_CONTEXT) == 0) {
- str = json_get_string(&wpr->json, &wpr->article_len);
- wpr->article = xmalloc(wpr->article_len, "article");
- for (n = 0; n < wpr->article_len; n++) {
- if (str[n] == '\n')
- wpr->article[n] = '\r';
- else
- wpr->article[n] = str[n];
- }
+ if (sscanf(wpr->buf, "n from=\"%254[^\"]\" to=\"%254[^\"]\"%n",
+ from_normalized, to_normalized, &count) == 2 &&
+ count > 10) {
+ if (wpr->normalized_title != NULL)
+ xfree(&wpr->normalized_title);
+ wpr->normalized_title = xstrdup(to_normalized,
+ "to_normalized");
+ } else
+ xstate = XML_DEFAULT;
}
-
-next_context:
- if (context_type == JSON_OBJECT && tmp_depth > 0) {
- if (tmp_depth % 2 == 0)
- wpr->json_context_depth--;
- else
- wpr->json_context_depth++;
- }
+ } else if (wpr->buf_len < wpr->buf_size) {
+ wpr->buf[wpr->buf_len++] = c;
}
- break;
- }
- case WP_STATE_PARSE_WIKITEXT: {
- short bracket_level = 0, apos_level = 0;
- char *buf = xmalloczero(wpr->article_len, "article tmp buf");
- size_t buflen = 0;
- char *c = wpr->article;
- unsigned short style = 0, last_style = 0;
- bool printed = false, in_ref = false;
+ goto get_char;
+
+ case WP_STATE_WIKITEXT_INIT:
+ snprintf(wpr->buf, wpr->buf_size, "%s: %s", PROGRAM_NAME,
+ wpr->normalized_title);
+ SetWTitle(wpr->browser->win, CtoPstr(wpr->buf));
+
+ browser_clear(wpr->browser);
browser_print(wpr->browser, wpr->normalized_title,
strlen(wpr->normalized_title), STYLE_H1);
- while (*c != '\0') {
- if (*c == '{') {
- bracket_level++;
- c++;
- } else if (*c == '}') {
- bracket_level--;
- c++;
- } else if (bracket_level > 0) {
- c++;
- } else if (c[0] == '<' && c[1] == 'r' && c[2] == 'e' &&
- c[3] == 'f') {
+ wpr->article_len = 0;
+ wpr->buf_len = 0;
+ wpr->buf[0] = '\0';
+
+ wpr->curlys = 0;
+ wpr->brackets = 0;
+ wpr->apostrophes = 0;
+ wpr->equals = 0;
+ wpr->style = 0;
+ wpr->last_style = 0;
+
+ wpr->state = WP_STATE_WIKITEXT_PARSE;
+ /* FALLTHROUGH */
+
+ case WP_STATE_WIKITEXT_PARSE: {
+ c = req->chunk[req->chunk_off];
+
+parse_char:
+ if (c == '{') {
+ wpr->curlys++;
+ } else if (c == '}') {
+ wpr->curlys--;
+ } else if (wpr->curlys > 0) {
+ /* consume, obey */
+ } else if (c == '[') {
+ wpr->brackets++;
+ } else if (c == ']') {
+ wpr->brackets--;
+ } else if (c == '\'') {
+ wpr->apostrophes++;
+ } else if (c == '=') {
+ wpr->equals++;
+ } else {
+ last = wpr->buf + wpr->buf_len - 1;
+
+ if (last[-3] == '<' && last[-2] == 'r' && last[-1] == 'e' &&
+ last[0] == 'f') {
/* <ref */
- in_ref = true;
- c += 4;
- } else if (c[0] == '<' && c[1] == '/' && c[2] == 'r' &&
- c[3] == 'e' && c[4] == 'f' && c[5] == '>') {
+ wpr->in_ref = true;
+ wpr->apostrophes = wpr->equals = wpr->brackets = wpr->curlys = 0;
+ dump = true;
+ wpr->buf_len -= 4;
+ } else if (last[-5] == '<' && last[-4] == '/' &&
+ last[-3] == 'r' && last[-2] == 'e' && last[-1] == 'f' &&
+ last[0] == '>') {
/* </ref> */
- in_ref = false;
- c += 6;
- } else if (in_ref) {
- c++;
- } else if (c[0] == '\'' && c[1] == '\'') {
- /* ''' or '' */
- if (c[2] == '\'') {
- if (style & STYLE_BOLD)
- style &= ~(STYLE_BOLD);
+ wpr->in_ref = false;
+ wpr->buf_len = 0;
+ } else if (wpr->in_ref) {
+ /* consume, obey */
+ } else if (wpr->apostrophes == 3) {
+ if (wpr->style & STYLE_BOLD)
+ wpr->style &= ~(STYLE_BOLD);
+ else
+ wpr->style |= STYLE_BOLD;
+ wpr->apostrophes = 0;
+ } else if (wpr->apostrophes == 2) {
+ if (wpr->style & STYLE_ITALIC)
+ wpr->style &= ~(STYLE_ITALIC);
+ else
+ wpr->style |= STYLE_ITALIC;
+ wpr->apostrophes = 0;
+ } else if (wpr->apostrophes == 1) {
+ /* literal apostrophe, add and go back to handle c */
+ wpr->apostrophes = 0;
+ wpr->buf[wpr->buf_len++] = '\'';
+ goto parse_char;
+ } else if (wpr->equals) {
+ if (wpr->equals == 5) {
+ if (wpr->style & STYLE_H5)
+ wpr->style &= ~(STYLE_H5);
else
- style |= STYLE_BOLD;
- c += 3;
- } else {
- if (style & STYLE_ITALIC)
- style &= ~(STYLE_ITALIC);
+ wpr->style |= STYLE_H5;
+ } else if (wpr->equals == 4) {
+ if (wpr->style & STYLE_H4)
+ wpr->style &= ~(STYLE_H4);
else
- style |= STYLE_ITALIC;
- c += 2;
- }
- } else if (c[0] == '=' && c[1] == '=') {
- /* == or === or ==== or ===== */
- if (c[2] == '=') {
- if (c[3] == '=') {
- if (c[4] == '=') {
- if (style & STYLE_H5)
- style &= ~(STYLE_H5);
- else
- style |= STYLE_H5;
- c += 5;
- } else {
- if (style & STYLE_H4)
- style &= ~(STYLE_H4);
- else
- style |= STYLE_H4;
- c += 4;
- }
- } else {
- if (style & STYLE_H3)
- style &= ~(STYLE_H3);
- else
- style |= STYLE_H3;
- c += 3;
- }
- } else {
- if (style & STYLE_H2)
- style &= ~(STYLE_H2);
+ wpr->style |= STYLE_H4;
+ } else if (wpr->equals == 3) {
+ if (wpr->style & STYLE_H3)
+ wpr->style &= ~(STYLE_H3);
else
- style |= STYLE_H2;
- c += 2;
+ wpr->style |= STYLE_H3;
+ } else if (wpr->equals == 2) {
+ if (wpr->style & STYLE_H2)
+ wpr->style &= ~(STYLE_H2);
+ else
+ wpr->style |= STYLE_H2;
+ } else {
+ /* literal equals, add and go back to handle c */
+ wpr->equals = 0;
+ wpr->buf[wpr->buf_len++] = '=';
+ goto parse_char;
}
-
- while (*c == ' ')
- c++;
- } else if (c[0] == '[' && c[1] == '[') {
- style |= STYLE_LINK;
- c += 2;
- } else if (c[0] == ']' && c[1] == ']') {
- style &= ~(STYLE_LINK);
- c += 2;
- } else if (c[0] == '\r' && !printed) {
+ wpr->equals = 0;
+ } else if (wpr->brackets == 2) {
+ wpr->style |= STYLE_LINK;
+ } else if (wpr->brackets == 0 && (wpr->style & STYLE_LINK)) {
+ wpr->style &= ~(STYLE_LINK);
+ }
+
+ if (c == '\n') {
/* skip leading newlines */
- c++;
- } else if (c[0] == '&' && strncmp(c, " ", 6) == 0) {
- buf[buflen++] = ' ';
- c += 6;
- } else {
- buf[buflen++] = *c;
- c++;
+ if (wpr->article_len == 0)
+ c = 0;
+ else
+ c = '\r';
}
- if (style != last_style) {
- if (buflen) {
- browser_print(wpr->browser, buf, buflen, last_style);
- buflen = 0;
- printed = true;
+ if (wpr->style != wpr->last_style || dump) {
+ if (wpr->buf_len) {
+ browser_print(wpr->browser, wpr->buf, wpr->buf_len,
+ wpr->last_style);
+ wpr->article_len += wpr->buf_len;
+ wpr->buf_len = 0;
}
- last_style = style;
+ wpr->last_style = wpr->style;
+ dump = false;
}
+
+ if (c == ' ' && wpr->buf_len == 0 &&
+ (wpr->style & (STYLE_H2|STYLE_H3|STYLE_H4|STYLE_H5)))
+ /* skip leading spaces around headers */
+ c = 0;
+
+ if (c != 0) {
+ wpr->buf[wpr->buf_len++] = c;
+
+ /* XML entity decode: collapse a just-completed &amp; &nbsp; &lt; &gt; in place; never look back past buf[0] */
+ if (c == ';') {
+ last = wpr->buf + wpr->buf_len - 1;
+ if (wpr->buf_len >= 5 && last[-4] == '&' && last[-3] == 'a' &&
+ last[-2] == 'm' && last[-1] == 'p') {
+ last[-4] = '&';
+ wpr->buf_len -= 4;
+ } else if (wpr->buf_len >= 6 && last[-5] == '&' && last[-4] == 'n' &&
+ last[-3] == 'b' && last[-2] == 's' && last[-1] == 'p') {
+ last[-5] = ' ';
+ wpr->buf_len -= 5;
+ } else if (wpr->buf_len >= 4 && last[-3] == '&' && last[-2] == 'l' && last[-1] == 't') {
+ last[-3] = '<';
+ wpr->buf_len -= 3;
+ } else if (wpr->buf_len >= 4 && last[-3] == '&' && last[-2] == 'g' && last[-1] == 't') {
+ last[-3] = '>';
+ wpr->buf_len -= 3;
+ }
+ }
+ }
}
- wpr->state = WP_STATE_DONE;
- wpr->browser->state = BROWSER_STATE_SEARCH_DONE;
-
- break;
+ req->chunk_off++;
+ goto get_char;
}
}
+
+done_parsing:
+ wpr->state = WP_STATE_DONE;
+
+ if (wpr->buf != NULL)
+ xfree(&wpr->buf);
+
+ if (wpr->http_request != NULL)
+ http_req_free(&wpr->http_request);
}
void
--- wikipedia.h Thu Aug 25 08:50:29 2022
+++ wikipedia.h Fri Sep 2 13:33:44 2022
@@ -42,8 +42,10 @@ extern MenuHandle file_menu, edit_menu;
void menu_defaults(void);
enum {
- WP_STATE_PARSE_JSON,
- WP_STATE_PARSE_WIKITEXT,
+ WP_STATE_XML_INIT,
+ WP_STATE_XML_PARSE,
+ WP_STATE_WIKITEXT_INIT,
+ WP_STATE_WIKITEXT_PARSE,
WP_STATE_DONE
};
@@ -52,11 +54,14 @@ struct wikipedia_request {
struct browser *browser;
struct http_request *http_request;
char *normalized_title;
- json_stream json;
- char *article;
size_t article_len;
- char json_context[PDJSON_STACK_MAX][32];
- short json_context_depth;
+
+ char *buf;
+ size_t buf_size;
+ size_t buf_len;
+ short curlys, brackets, apostrophes, equals;
+ bool in_ref;
+ unsigned short style, last_style;
};
struct wikipedia_request * wikipedia_fetch_article(struct browser *,