AmendHub

Download:

jcs

/

wikipedia

/

amendments

/

11

wikipedia: Start on WP API support, Wikitext parsing


jcs made amendment 11 about 1 year ago
--- wikipedia.c Sun Aug 21 23:05:29 2022 +++ wikipedia.c Wed Aug 24 17:56:32 2022 @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2021-2022 joshua stein <jcs@jcs.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <stdarg.h> +#include <stdio.h> +#include <string.h> + +#include "wikipedia.h" +#include "http.h" +#include "pdjson.h" +#include "util.h" + +/* en.wikipedia.org doesn't support non-TLS :( */ +#define WIKIPEDIA_HOST "wikipedia.jcs.org" + +#define NORMALIZED_TITLE_CONTEXT "{\"query\":{\"normalized\":[{\"to\":" +#define ARTICLE_TEXT_CONTEXT "{\"query\":{\"pages\":[{\"revisions\":[{\"slots\":{\"main\":{\"content\":" + +short wikipedia_json_peek(void *cookie); +short wikipedia_json_get(void *cookie); + +struct wikipedia_request * +wikipedia_fetch_article(struct browser *browser, char *name) +{ + static char url[256]; + struct wikipedia_request *wpr; + struct http_request *req; + short state; + char *c; + + /* XXX */ + for (c = name; *c != '\0'; c++) { + if (*c == ' ') + *c = '_'; + } + + wpr = xmalloczero(sizeof(struct wikipedia_request), + "fetch_article wpr"); + wpr->browser = browser; + + progress("Contacting Wikipedia..."); + + snprintf(url, sizeof(url), "http://%s/w/api.php?action=query&" + "prop=revisions&rvslots=*&rvprop=content&formatversion=2&" + "format=json&titles=%s", WIKIPEDIA_HOST, name); + wpr->http_request = http_get(url); + wpr->state = WP_STATE_FIND_BODY; + wpr->normalized_title = xstrdup(name, "normalized_title"); + + return wpr; +} + +struct wikipedia_request * +wikipedia_read_cached_article(struct browser *browser, char *name) +{ + struct wikipedia_request *wpr; + short state; + short error, frefnum; + size_t len, size; + + wpr = xmalloczero(sizeof(struct wikipedia_request), + "fetch_article wpr"); + wpr->browser = browser; + + /* XXX TODO */ + error = FSOpen("\pMacintosh:wp.txt", 0, &frefnum); + if (error) + panic("FSOpen failed: %d", error); + + error = SetFPos(frefnum, fsFromLEOF, 0); + if (error) + panic("SetFPos failed: %d", error); + GetFPos(frefnum, &len); + SetFPos(frefnum, fsFromStart, 0); + + wpr->article_len = len; + wpr->article = xmalloc(wpr->article_len, "article"); + + for (len = 0; len < wpr->article_len; ) { + size = 1024; + error = FSRead(frefnum, &size, wpr->article + len); + if (error && error != eofErr) + panic("FSRead failed: %d", error); + + len += size; + + if (size < 1024) + break; + } + + FSClose(frefnum); + + wpr->state = WP_STATE_FIND_BODY; + wpr->normalized_title = xstrdup(name, "normalized_title"); + + return wpr; +} + +short +wikipedia_json_peek(void *cookie) +{ + struct wikipedia_request *wpr = (struct wikipedia_request *)cookie; + + if (wpr->buf_len == 0 || (wpr->buf_off + 1 > wpr->buf_len)) { + wpr->buf_len = http_req_read(wpr->http_request, wpr->buf, + sizeof(wpr->buf)); + wpr->buf_off = 0; + } + + if (wpr->buf_len == 0 || (wpr->buf_off + 1 > wpr->buf_len)) + return EOF; + + return wpr->buf[wpr->buf_off]; +} + +short +wikipedia_json_get(void *cookie) +{ + struct wikipedia_request *wpr = (struct wikipedia_request *)cookie; + short c; + + c = wikipedia_json_peek(cookie); + if (c == EOF) + return c; + + wpr->buf_off++; + + return c; +} + +void +wikipedia_request_process(struct wikipedia_request *wpr) +{ + size_t len, n; + + switch (wpr->state) { + case WP_STATE_FIND_BODY: + if (wpr->buf_len > 3) { + /* + * Leave last 3 bytes of previous read in case \r\n\r\n happens + * across reads. + */ + memmove(wpr->buf, wpr->buf + wpr->buf_len - 3, + wpr->buf_len - 3); + wpr->buf_len = 3; + } + len = http_req_read(wpr->http_request, wpr->buf + wpr->buf_len, + sizeof(wpr->buf) - wpr->buf_len); + wpr->buf_len += len; + if (!len) + break; + + for (n = 3; n < wpr->buf_len; n++) { + if (wpr->buf[n - 3] != '\r' || wpr->buf[n - 2] != '\n' || + wpr->buf[n - 1] != '\r' || wpr->buf[n] != '\n') + continue; + + wpr->buf_off = n + 1; + wpr->state = WP_STATE_PARSE_JSON; + wpr->json_context_depth = 0; + + progress("Parsing JSON response..."); + json_open_user(&wpr->json, wikipedia_json_get, + wikipedia_json_peek, wpr); + break; + } + + break; + case WP_STATE_PARSE_JSON: { + static char context_str[PDJSON_STACK_MAX * 32]; + const char *str; + enum json_type type, context_type; + size_t tmp_depth; + + type = json_next(&wpr->json); + + if (type == JSON_ERROR || type == JSON_DONE) { + if (type == JSON_ERROR) { + char err[100]; + size_t len; + len = snprintf(err, sizeof(err), "%s at line %ld pos %ld", + json_get_error(&wpr->json), json_get_lineno(&wpr->json), + json_get_position(&wpr->json)); + browser_print(wpr->browser, err, len, 0); + } + json_close(&wpr->json); + if (wpr->http_request != NULL) + http_req_free(&wpr->http_request); + if (type == JSON_ERROR) + wpr->state = WP_STATE_DONE; + else { + wpr->state = WP_STATE_PARSE_WIKITEXT; + progress("Formatting article..."); + } + break; + } + + context_type = json_get_context(&wpr->json, &tmp_depth); + +#define wprjcd wpr->json_context[wpr->json_context_depth] + + switch (type) { + case JSON_OBJECT: + snprintf(wprjcd, sizeof(wprjcd), "{"); + wpr->json_context_depth++; + break; + case JSON_OBJECT_END: + snprintf(wprjcd, sizeof(wprjcd), "}"); + wpr->json_context_depth--; + break; + case JSON_ARRAY: + snprintf(wprjcd, sizeof(wprjcd), "["); + wpr->json_context_depth++; + break; + case JSON_ARRAY_END: + snprintf(wprjcd, sizeof(wprjcd), "]"); + wpr->json_context_depth--; + break; + case JSON_STRING: + snprintf(wprjcd, sizeof(wprjcd), "\"%s\"", + json_get_string(&wpr->json, NULL)); + break; + case JSON_NUMBER: + snprintf(wprjcd, sizeof(wprjcd), "%s", + json_get_string(&wpr->json, NULL)); + break; + case JSON_TRUE: + snprintf(wprjcd, sizeof(wprjcd), "true"); + break; + case JSON_FALSE: + snprintf(wprjcd, sizeof(wprjcd), "false"); + break; + case JSON_NULL: + snprintf(wprjcd, sizeof(wprjcd), "null"); + break; + } + + if (tmp_depth > 0 && (tmp_depth % 2) != 0) + strlcat(wprjcd, ":", sizeof(wprjcd)); + + if (type != JSON_STRING) + goto next_context; + + context_str[0] = '\0'; + for (n = 0; n < wpr->json_context_depth; n++) + strlcat(context_str, wpr->json_context[n], sizeof(context_str)); + + if (strcmp(context_str, NORMALIZED_TITLE_CONTEXT) == 0) { + xfree(&wpr->normalized_title); + wpr->normalized_title = + xstrdup(json_get_string(&wpr->json, NULL), "normalized_title"); + } else if (strcmp(context_str, ARTICLE_TEXT_CONTEXT) == 0) { + str = json_get_string(&wpr->json, &wpr->article_len); + wpr->article = xmalloc(wpr->article_len, "article"); + for (n = 0; n < wpr->article_len; n++) { + if (str[n] == '\n') + wpr->article[n] = '\r'; + else + wpr->article[n] = str[n]; + } + } + +next_context: + if (context_type == JSON_OBJECT && tmp_depth > 0) { + if (tmp_depth % 2 == 0) + wpr->json_context_depth--; + else + wpr->json_context_depth++; + } + break; + } + case WP_STATE_PARSE_WIKITEXT: { + short bracket_level = 0, apos_level = 0; + char *buf = xmalloczero(wpr->article_len, "article tmp buf"); + size_t buflen = 0; + char *c = wpr->article; + unsigned short style = 0, last_style = 0; + bool printed = false, in_ref = false; + + browser_print(wpr->browser, wpr->normalized_title, + strlen(wpr->normalized_title), STYLE_H1); + + while (*c != '\0') { + if (*c == '{') { + bracket_level++; + c++; + } else if (*c == '}') { + bracket_level--; + c++; + } else if (bracket_level > 0) { + c++; + } else if (strncmp(c, "<ref", 4) == 0) { + in_ref = true; + c += 4; + } else if (strncmp(c, "</ref>", 6) == 0) { + in_ref = false; + c += 6; + } else if (in_ref) { + c++; + } else if (strncmp(c, "'''", 3) == 0) { + if (style & STYLE_BOLD) + style &= ~(STYLE_BOLD); + else + style |= STYLE_BOLD; + c += 3; + } else if (strncmp(c, "''", 2) == 0) { + if (style & STYLE_ITALIC) + style &= ~(STYLE_ITALIC); + else + style |= STYLE_ITALIC; + c += 2; + } else if (strncmp(c, "=====", 5) == 0) { + if (style & STYLE_H5) + style &= ~(STYLE_H5); + else + style |= STYLE_H5; + c += 5; + } else if (strncmp(c, "====", 4) == 0) { + if (style & STYLE_H4) + style &= ~(STYLE_H4); + else + style |= STYLE_H4; + c += 4; + } else if (strncmp(c, "===", 3) == 0) { + if (style & STYLE_H3) + style &= ~(STYLE_H3); + else + style |= STYLE_H3; + c += 3; + } else if (strncmp(c, "==", 2) == 0) { + if (style & STYLE_H2) + style &= ~(STYLE_H2); + else + style |= STYLE_H2; + c += 2; + } else if (*c == '=') { + if (style & STYLE_H1) + style &= ~(STYLE_H1); + else + style |= STYLE_H1; + c += 1; + } else if (strncmp(c, "[[", 2) == 0) { + style |= STYLE_LINK; + c += 2; + } else if (strncmp(c, "]]", 2) == 0) { + style &= ~(STYLE_LINK); + c += 2; + } else if (*c == '\r' && !printed) { + /* skip leading newlines */ + c++; + } else if (strncmp(c, "&nbsp;", 6) == 0) { + buf[buflen++] = ' '; + c += 6; + } else { + buf[buflen++] = *c; + c++; + } + + if (style != last_style) { + if (buflen) { + browser_print(wpr->browser, buf, buflen, last_style); + buflen = 0; + printed = true; + } + last_style = style; + } + } + + UpdateScrollbarForTE(wpr->browser->te_scroller, + wpr->browser->te, true); + SetWTitle(wpr->browser->win, CtoPstr(wpr->normalized_title)); + wpr->state = WP_STATE_DONE; + progress(NULL); + break; + } + } +} + +void +wikipedia_request_abort(struct wikipedia_request *wpr) +{ + if (wpr->http_request != NULL) + http_req_free(&wpr->http_request); +} --- wikipedia.h Thu Aug 18 21:06:48 2022 +++ wikipedia.h Wed Aug 24 17:20:19 2022 @@ -14,6 +14,13 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#ifndef __WIKIPEDIA_H__ +#define __WIKIPEDIA_H__ + +#include "browser.h" +#include "http.h" +#include "pdjson.h" + #define PROGRAM_NAME "Wikipedia" #define MBAR_ID 128 @@ -32,4 +39,35 @@ extern MenuHandle file_menu, edit_menu; -void menu_defaults(void); +void menu_defaults(void); + +enum { + WP_STATE_FIND_BODY, + WP_STATE_PARSE_JSON, + WP_STATE_PARSE_WIKITEXT, + WP_STATE_DONE +}; + +struct wikipedia_request { + short state; + struct browser *browser; + struct http_request *http_request; + char *normalized_title; + char buf[1024]; + size_t buf_len; + size_t buf_off; + json_stream json; + char *article; + size_t article_len; + char json_context[PDJSON_STACK_MAX][32]; + short json_context_depth; +}; + +struct wikipedia_request * wikipedia_fetch_article(struct browser *, + char *); +struct wikipedia_request * wikipedia_read_cached_article(struct browser *browser, + char *name); +void wikipedia_request_process(struct wikipedia_request *wpr); +void wikipedia_request_abort(struct wikipedia_request *wpr); + +#endif