AmendHub

jcs

/

wikipedia

/

amendments

/

19

wikipedia: Parse XML instead of JSON, do all parsing at once


jcs made amendment 19 5 months ago
--- browser.c Thu Aug 25 15:58:37 2022 +++ browser.c Fri Sep 2 14:03:49 2022 @@ -56,7 +56,7 @@ browser_idle(struct focusable *focusable, EventRecord browser_live_search(browser); } break; - case BROWSER_STATE_DO_SEARCH: { + case BROWSER_STATE_ARTICLE_GET: { TERec *te; char *input; @@ -76,21 +76,26 @@ browser_idle(struct focusable *focusable, EventRecord } HUnlock(browser->te); + SetCursor(*(GetCursor(watchCursor))); browser->wpr = wikipedia_fetch_article(browser, input); xfree(&input); - browser->state = BROWSER_STATE_PROCESS_SEARCH; + browser->state = BROWSER_STATE_ARTICLE_PROCESS; break; } - case BROWSER_STATE_PROCESS_SEARCH: + case BROWSER_STATE_ARTICLE_PROCESS: if (browser->wpr == NULL) { browser->state = BROWSER_STATE_IDLE; break; } - SetCursor(*(GetCursor(watchCursor))); wikipedia_request_process(browser->wpr); + + if (browser->wpr->state == WP_STATE_DONE) { + browser->state = BROWSER_STATE_ARTICLE_DONE; + break; + } break; - case BROWSER_STATE_SEARCH_DONE: + case BROWSER_STATE_ARTICLE_DONE: UpdateScrollbarForTE(browser->te_scroller, browser->te, false); UpdtControl(browser->win, browser->win->visRgn); progress(NULL); @@ -319,7 +324,7 @@ browser_mouse_down(struct focusable *focusable, EventR TESetText(str, len, browser->input_te); InvalRect(&(*(browser->input_te))->viewRect); browser_hide_search_results(browser); - browser->state = BROWSER_STATE_DO_SEARCH; + browser->state = BROWSER_STATE_ARTICLE_GET; } return; @@ -370,7 +375,7 @@ browser_key_down(struct focusable *focusable, EventRec k = (event->message & charCodeMask); if (k == '\r') { - browser->state = BROWSER_STATE_DO_SEARCH; + browser->state = BROWSER_STATE_ARTICLE_GET; } else { TEKey(k, browser->input_te); TESelView(browser->input_te); @@ -613,4 +618,10 @@ no_overflow: } return len; +} + +void +browser_clear(struct browser *browser) +{ + } --- browser.h Thu Aug 25 15:57:24 2022 +++ browser.h Fri Sep 2 13:57:10 2022 @@ -22,9 +22,9 @@ enum { BROWSER_STATE_IDLE, - BROWSER_STATE_DO_SEARCH, - BROWSER_STATE_PROCESS_SEARCH, - BROWSER_STATE_SEARCH_DONE + BROWSER_STATE_ARTICLE_GET, + BROWSER_STATE_ARTICLE_PROCESS, + BROWSER_STATE_ARTICLE_DONE }; #define STYLE_BOLD (1 << 0) @@ -50,5 +50,6 @@ struct browser { struct browser *browser_init(void); size_t browser_print(struct browser *browser, const char *str, size_t len, unsigned short style); +void browser_clear(struct browser *browser); #endif --- dnr.c Fri Aug 19 13:15:23 2022 +++ dnr.c Wed Aug 31 17:42:22 2022 @@ -6,6 +6,14 @@ * Modifications by Jim Matthews, Dartmouth College, 5/91 */ +/* + * TODO: update to avoid having to include MacTraps2 for: + * CloseWD + * HOpenResFile + * FindFolder + * GetWDInfo + */ + #include <OSUtils.h> #include <Files.h> #include <Folders.h> --- wikipedia.c Thu Aug 25 15:59:08 2022 +++ wikipedia.c Sat Sep 3 22:26:38 2022 @@ -26,15 +26,6 @@ /* en.wikipedia.org doesn't support non-TLS :( */ #define WIKIPEDIA_HOST "wikipedia.jcs.org" -/* {"query":{"normalized":[{"to": */ -#define NORMALIZED_TITLE_CONTEXT "{\"query\":{\"normalized\":[{\"to\":" - -/* {"query":{"pages":[{"revisions":[{"slots":{"main":{"content": */ -#define ARTICLE_TEXT_CONTEXT "{\"query\":{\"pages\":[{\"revisions\":[{\"slots\":{\"main\":{\"content\":" - -short wikipedia_json_peek(void *cookie); -short wikipedia_json_get(void *cookie); - struct wikipedia_request * wikipedia_fetch_article(struct browser *browser, char *name) { @@ -57,10 +48,10 @@ wikipedia_fetch_article(struct browser *browser, char snprintf(url, sizeof(url), "http://%s/w/api.php?action=query&" "prop=revisions&rvslots=*&rvprop=content&" - "format=json&formatversion=2&titles=%s", WIKIPEDIA_HOST, name); + "format=xml&titles=%s", WIKIPEDIA_HOST, name); wpr->http_request = http_get(url); http_req_skip_header(wpr->http_request); - wpr->state = WP_STATE_PARSE_JSON; + wpr->state = WP_STATE_XML_INIT; wpr->normalized_title = xstrdup(name, "normalized_title"); return wpr; @@ -121,309 +112,243 @@ wikipedia_fetch_search_results(struct browser *browser return nrets; } -struct wikipedia_request * -wikipedia_read_cached_article(struct browser *browser, char *name) -{ - struct wikipedia_request *wpr; - short state; - short error, frefnum; - size_t len, size; - - wpr = xmalloczero(sizeof(struct wikipedia_request), - "fetch_article wpr"); - wpr->browser = browser; - - /* XXX TODO */ - error = FSOpen("\pMacintosh:wp.txt", 0, &frefnum); - if (error) - panic("FSOpen failed: %d", error); - - error = SetFPos(frefnum, fsFromLEOF, 0); - if (error) - panic("SetFPos failed: %d", error); - GetFPos(frefnum, &len); - SetFPos(frefnum, fsFromStart, 0); - - wpr->article_len = len; - wpr->article = xmalloc(wpr->article_len, "article"); - - for (len = 0; len < wpr->article_len; ) { - size = 1024; - error = FSRead(frefnum, &size, wpr->article + len); - if (error && error != eofErr) - panic("FSRead failed: %d", error); - - len += size; - - if (size < 1024) - break; - } - - FSClose(frefnum); - - wpr->state = WP_STATE_PARSE_JSON; - wpr->normalized_title = xstrdup(name, "normalized_title"); - - return wpr; -} - -short -wikipedia_json_peek(void *cookie) -{ - struct wikipedia_request *wpr = (struct wikipedia_request *)cookie; - struct http_request *req = wpr->http_request; - - return http_req_chunk_peek(req); -} - -short -wikipedia_json_get(void *cookie) -{ - struct wikipedia_request *wpr = (struct wikipedia_request *)cookie; - struct http_request *req = wpr->http_request; - - return http_req_chunk_read(req); -} - void wikipedia_request_process(struct wikipedia_request *wpr) { struct http_request *req = wpr->http_request; size_t len, n; + char c, *last; + enum xml_state { + XML_DEFAULT, + XML_IN_NORMALIZED + } xstate = 0; + bool dump = false; +get_char: + if (req->chunk_len == 0 || (req->chunk_off + 1 > req->chunk_len)) { + req->chunk_len = http_req_read(req, req->chunk, + sizeof(req->chunk)); + req->chunk_off = 0; + + if (req->chunk_len == 0 || (req->chunk_off + 1 > req->chunk_len)) + goto done_parsing; + } + switch (wpr->state) { - case WP_STATE_PARSE_JSON: { - char context_str[PDJSON_STACK_MAX * 32]; - const char *str; - enum json_type type, context_type; - size_t tmp_depth; + case WP_STATE_XML_INIT: + progress("Fetching and parsing response..."); + wpr->buf_size = 1024; + wpr->buf_len = 0; + wpr->buf = xmalloc(wpr->buf_size, "wpr buf"); + wpr->state = WP_STATE_XML_PARSE; + goto get_char; - wpr->json_context_depth = 0; - - progress("Parsing JSON response..."); - json_open_user(&wpr->json, wikipedia_json_get, wikipedia_json_peek, - wpr); - - for (;;) { - type = json_next(&wpr->json); + case WP_STATE_XML_PARSE: + c = req->chunk[req->chunk_off++]; - if (type == JSON_ERROR || type == JSON_DONE) { - if (type == JSON_ERROR) { - char err[100]; - size_t len; - len = snprintf(err, sizeof(err), - "%s at line %ld pos %ld", json_get_error(&wpr->json), - json_get_lineno(&wpr->json), - json_get_position(&wpr->json)); - browser_print(wpr->browser, err, len, 0); + if (c == '<') { + wpr->buf[0] = '\0'; + wpr->buf_len = 0; + } else if (c == '>') { + wpr->buf[wpr->buf_len] = '\0'; + if (xstate == XML_DEFAULT) { + if (strcmp(wpr->buf, "normalized") == 0) { + xstate = XML_IN_NORMALIZED; + } else if (strncmp(wpr->buf, "slot ", 5) == 0) { + wpr->state = WP_STATE_WIKITEXT_INIT; } - json_close(&wpr->json); - if (wpr->http_request != NULL) - http_req_free(&wpr->http_request); - if (type == JSON_ERROR) - wpr->state = WP_STATE_DONE; - else { - char title[100]; - snprintf(title, sizeof(title), "%s: %s", PROGRAM_NAME, - wpr->normalized_title); - SetWTitle(wpr->browser->win, CtoPstr(title)); - wpr->state = WP_STATE_PARSE_WIKITEXT; - progress("Formatting article..."); - } - break; - } - - context_type = json_get_context(&wpr->json, &tmp_depth); - -#define wprjcd wpr->json_context[wpr->json_context_depth] - - switch (type) { - case JSON_OBJECT: - snprintf(wprjcd, sizeof(wprjcd), "{"); - wpr->json_context_depth++; - break; - case JSON_OBJECT_END: - snprintf(wprjcd, sizeof(wprjcd), "}"); - wpr->json_context_depth--; - break; - case JSON_ARRAY: - snprintf(wprjcd, sizeof(wprjcd), "["); - wpr->json_context_depth++; - break; - case JSON_ARRAY_END: - snprintf(wprjcd, sizeof(wprjcd), "]"); - wpr->json_context_depth--; - break; - case JSON_STRING: - snprintf(wprjcd, sizeof(wprjcd), "\"%s\"", - json_get_string(&wpr->json, NULL)); - break; - case JSON_NUMBER: - snprintf(wprjcd, sizeof(wprjcd), "%s", - json_get_string(&wpr->json, NULL)); - break; - case JSON_TRUE: - snprintf(wprjcd, sizeof(wprjcd), "true"); - break; - case JSON_FALSE: - snprintf(wprjcd, sizeof(wprjcd), "false"); - break; - case JSON_NULL: - snprintf(wprjcd, sizeof(wprjcd), "null"); - break; - } - - if (tmp_depth > 0 && (tmp_depth % 2) != 0) - strlcat(wprjcd, ":", sizeof(wprjcd)); - - if (type != JSON_STRING) - goto next_context; + } else if (xstate == XML_IN_NORMALIZED) { + char from_normalized[255], to_normalized[255]; + size_t count; - context_str[0] = '\0'; - for (n = 0; n < wpr->json_context_depth; n++) - strlcat(context_str, wpr->json_context[n], - sizeof(context_str)); - - if (strcmp(context_str, NORMALIZED_TITLE_CONTEXT) == 0) { - xfree(&wpr->normalized_title); - wpr->normalized_title = - xstrdup(json_get_string(&wpr->json, NULL), - "normalized_title"); - } else if (strcmp(context_str, ARTICLE_TEXT_CONTEXT) == 0) { - str = json_get_string(&wpr->json, &wpr->article_len); - wpr->article = xmalloc(wpr->article_len, "article"); - for (n = 0; n < wpr->article_len; n++) { - if (str[n] == '\n') - wpr->article[n] = '\r'; - else - wpr->article[n] = str[n]; - } + if (sscanf(wpr->buf, "n from=\"%[^\"]\" to=\"%[^\"]\"%n", + &from_normalized, &to_normalized, &count) == 2 && + count > 10) { + if (wpr->normalized_title != NULL) + xfree(&wpr->normalized_title); + wpr->normalized_title = xstrdup(to_normalized, + "to_normalized"); + } else + xstate = XML_DEFAULT; } - -next_context: - if (context_type == JSON_OBJECT && tmp_depth > 0) { - if (tmp_depth % 2 == 0) - wpr->json_context_depth--; - else - wpr->json_context_depth++; - } + } else if (wpr->buf_len < wpr->buf_size) { + wpr->buf[wpr->buf_len++] = c; } - break; - } - case WP_STATE_PARSE_WIKITEXT: { - short bracket_level = 0, apos_level = 0; - char *buf = xmalloczero(wpr->article_len, "article tmp buf"); - size_t buflen = 0; - char *c = wpr->article; - unsigned short style = 0, last_style = 0; - bool printed = false, in_ref = false; + goto get_char; + + case WP_STATE_WIKITEXT_INIT: + snprintf(wpr->buf, wpr->buf_size, "%s: %s", PROGRAM_NAME, + wpr->normalized_title); + SetWTitle(wpr->browser->win, CtoPstr(wpr->buf)); + + browser_clear(wpr->browser); browser_print(wpr->browser, wpr->normalized_title, strlen(wpr->normalized_title), STYLE_H1); - while (*c != '\0') { - if (*c == '{') { - bracket_level++; - c++; - } else if (*c == '}') { - bracket_level--; - c++; - } else if (bracket_level > 0) { - c++; - } else if (c[0] == '<' && c[1] == 'r' && c[2] == 'e' && - c[3] == 'f') { + wpr->article_len = 0; + wpr->buf_len = 0; + wpr->buf[0] = '\0'; + + wpr->curlys = 0; + wpr->brackets = 0; + wpr->apostrophes = 0; + wpr->equals = 0; + wpr->style = 0; + wpr->last_style = 0; + + wpr->state = WP_STATE_WIKITEXT_PARSE; + /* FALLTHROUGH */ + + case WP_STATE_WIKITEXT_PARSE: { + c = req->chunk[req->chunk_off]; + +parse_char: + if (c == '{') { + wpr->curlys++; + } else if (c == '}') { + wpr->curlys--; + } else if (wpr->curlys > 0) { + /* consume, obey */ + } else if (c == '[') { + wpr->brackets++; + } else if (c == ']') { + wpr->brackets--; + } else if (c == '\'') { + wpr->apostrophes++; + } else if (c == '=') { + wpr->equals++; + } else { + last = wpr->buf + wpr->buf_len - 1; + + if (last[-3] == '<' && last[-2] == 'r' && last[-1] == 'e' && + last[0] == 'f') { /* <ref */ - in_ref = true; - c += 4; - } else if (c[0] == '<' && c[1] == '/' && c[2] == 'r' && - c[3] == 'e' && c[4] == 'f' && c[5] == '>') { + wpr->in_ref = true; + wpr->apostrophes = wpr->equals = wpr->brackets = wpr->curlys = 0; + dump = true; + wpr->buf_len -= 4; + } else if (last[-5] == '<' && last[-4] == '/' && + last[-3] == 'r' && last[-2] == 'e' && last[-1] == 'f' && + last[0] == '>') { /* </ref> */ - in_ref = false; - c += 6; - } else if (in_ref) { - c++; - } else if (c[0] == '\'' && c[1] == '\'') { - /* ''' or '' */ - if (c[2] == '\'') { - if (style & STYLE_BOLD) - style &= ~(STYLE_BOLD); + wpr->in_ref = false; + wpr->buf_len = 0; + } else if (wpr->in_ref) { + /* consume, obey */ + } else if (wpr->apostrophes == 3) { + if (wpr->style & STYLE_BOLD) + wpr->style &= ~(STYLE_BOLD); + else + wpr->style |= STYLE_BOLD; + wpr->apostrophes = 0; + } else if (wpr->apostrophes == 2) { + if (wpr->style & STYLE_ITALIC) + wpr->style &= ~(STYLE_ITALIC); + else + wpr->style |= STYLE_ITALIC; + wpr->apostrophes = 0; + } else if (wpr->apostrophes == 1) { + /* literal apostrophe, add and go back to handle c */ + wpr->apostrophes = 0; + wpr->buf[wpr->buf_len++] = '\''; + goto parse_char; + } else if (wpr->equals) { + if (wpr->equals == 5) { + if (wpr->style & STYLE_H5) + wpr->style &= ~(STYLE_H5); else - style |= STYLE_BOLD; - c += 3; - } else { - if (style & STYLE_ITALIC) - style &= ~(STYLE_ITALIC); + wpr->style |= STYLE_H5; + } else if (wpr->equals == 4) { + if (wpr->style & STYLE_H4) + wpr->style &= ~(STYLE_H4); else - style |= STYLE_ITALIC; - c += 2; - } - } else if (c[0] == '=' && c[1] == '=') { - /* == or === or ==== or ===== */ - if (c[2] == '=') { - if (c[3] == '=') { - if (c[4] == '=') { - if (style & STYLE_H5) - style &= ~(STYLE_H5); - else - style |= STYLE_H5; - c += 5; - } else { - if (style & STYLE_H4) - style &= ~(STYLE_H4); - else - style |= STYLE_H4; - c += 4; - } - } else { - if (style & STYLE_H3) - style &= ~(STYLE_H3); - else - style |= STYLE_H3; - c += 3; - } - } else { - if (style & STYLE_H2) - style &= ~(STYLE_H2); + wpr->style |= STYLE_H4; + } else if (wpr->equals == 3) { + if (wpr->style & STYLE_H3) + wpr->style &= ~(STYLE_H3); else - style |= STYLE_H2; - c += 2; + wpr->style |= STYLE_H3; + } else if (wpr->equals == 2) { + if (wpr->style & STYLE_H2) + wpr->style &= ~(STYLE_H2); + else + wpr->style |= STYLE_H2; + } else { + /* literal equals, add and go back to handle c */ + wpr->equals = 0; + wpr->buf[wpr->buf_len++] = '='; + goto parse_char; } - - while (*c == ' ') - c++; - } else if (c[0] == '[' && c[1] == '[') { - style |= STYLE_LINK; - c += 2; - } else if (c[0] == ']' && c[1] == ']') { - style &= ~(STYLE_LINK); - c += 2; - } else if (c[0] == '\r' && !printed) { + wpr->equals = 0; + } else if (wpr->brackets == 2) { + wpr->style |= STYLE_LINK; + } else if (wpr->brackets == 0 && (wpr->style & STYLE_LINK)) { + wpr->style &= ~(STYLE_LINK); + } + + if (c == '\n') { /* skip leading newlines */ - c++; - } else if (c[0] == '&' && strncmp(c, "&nbsp;", 6) == 0) { - buf[buflen++] = ' '; - c += 6; - } else { - buf[buflen++] = *c; - c++; + if (wpr->article_len == 0) + c = 0; + else + c = '\r'; } - if (style != last_style) { - if (buflen) { - browser_print(wpr->browser, buf, buflen, last_style); - buflen = 0; - printed = true; + if (wpr->style != wpr->last_style || dump) { + if (wpr->buf_len) { + browser_print(wpr->browser, wpr->buf, wpr->buf_len, + wpr->last_style); + wpr->article_len += wpr->buf_len; + wpr->buf_len = 0; } - last_style = style; + wpr->last_style = wpr->style; + dump = false; } + + if (c == ' ' && wpr->buf_len == 0 && + (wpr->style & (STYLE_H2|STYLE_H3|STYLE_H4|STYLE_H5))) + /* skip leading spaces around headers */ + c = 0; + + if (c != 0) { + wpr->buf[wpr->buf_len++] = c; + + /* XML entity decode */ + if (c == ';') { + last = wpr->buf + wpr->buf_len - 1; + if (last[-4] == '&' && last[-3] == 'a' && last[-2] == 'm' && + last[-1] == 'p') { + last[-4] = '&'; + wpr->buf_len -= 4; + } else if (last[-5] == '&' && last[-3] == 'n' && + last[-3] == 'b' && last[-2] == 's' && last[-1] == 'p') { + last[-5] = ' '; + wpr->buf_len -= 5; + } else if (last[-3] == '&' && last[-2] == 'l' && last[-1] == 't') { + last[-3] = '<'; + wpr->buf_len -= 3; + } else if (last[-3] == '&' && last[-2] == 'g' && last[-1] == 't') { + last[-3] = '>'; + wpr->buf_len -= 3; + } + } + } } - wpr->state = WP_STATE_DONE; - wpr->browser->state = BROWSER_STATE_SEARCH_DONE; - - break; + req->chunk_off++; + goto get_char; } } + +done_parsing: + wpr->state = WP_STATE_DONE; + + if (wpr->buf != NULL) + xfree(&wpr->buf); + + if (wpr->http_request != NULL) + http_req_free(&wpr->http_request); } void --- wikipedia.h Thu Aug 25 08:50:29 2022 +++ wikipedia.h Fri Sep 2 13:33:44 2022 @@ -42,8 +42,10 @@ extern MenuHandle file_menu, edit_menu; void menu_defaults(void); enum { - WP_STATE_PARSE_JSON, - WP_STATE_PARSE_WIKITEXT, + WP_STATE_XML_INIT, + WP_STATE_XML_PARSE, + WP_STATE_WIKITEXT_INIT, + WP_STATE_WIKITEXT_PARSE, WP_STATE_DONE }; @@ -52,11 +54,14 @@ struct wikipedia_request { struct browser *browser; struct http_request *http_request; char *normalized_title; - json_stream json; - char *article; size_t article_len; - char json_context[PDJSON_STACK_MAX][32]; - short json_context_depth; + + char *buf; + size_t buf_size; + size_t buf_len; + short curlys, brackets, apostrophes, equals; + bool in_ref; + unsigned short style, last_style; }; struct wikipedia_request * wikipedia_fetch_article(struct browser *,