jcs
/wikipedia
/amendments
/11
wikipedia: Start on WP API support, Wikitext parsing
jcs made amendment 11 over 2 years ago
--- wikipedia.c Sun Aug 21 23:05:29 2022
+++ wikipedia.c Wed Aug 24 17:56:32 2022
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2021-2022 joshua stein <jcs@jcs.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "wikipedia.h"
+#include "http.h"
+#include "pdjson.h"
+#include "util.h"
+
+/*
+ * en.wikipedia.org doesn't support non-TLS :(
+ * Requests are built as plain http:// URLs against this host instead
+ * (presumably a proxy in front of the Wikipedia API -- confirm).
+ */
+#define WIKIPEDIA_HOST "wikipedia.jcs.org"
+
+/*
+ * Flattened JSON paths, in the form wikipedia_request_process() builds by
+ * concatenating its per-depth context strings.  A string value seen while
+ * the accumulated context equals one of these is taken as the normalized
+ * article title or the article's wikitext, respectively.
+ */
+#define NORMALIZED_TITLE_CONTEXT "{\"query\":{\"normalized\":[{\"to\":"
+#define ARTICLE_TEXT_CONTEXT "{\"query\":{\"pages\":[{\"revisions\":[{\"slots\":{\"main\":{\"content\":"
+
+/* Byte-source callbacks handed to json_open_user(); cookie is the request */
+short wikipedia_json_peek(void *cookie);
+short wikipedia_json_get(void *cookie);
+
+/*
+ * Start an HTTP request for the wikitext of the article called `name`.
+ *
+ * `name` is modified in place: every space is replaced with an underscore.
+ * No other escaping is applied to the title, and a title too long for the
+ * 256-byte URL buffer is truncated by snprintf().
+ * NOTE(review): titles containing '&', '#', '%' or '+' probably need
+ * percent-encoding, unless http_get() already does that -- confirm.
+ *
+ * Returns a newly allocated, zero-filled request in WP_STATE_FIND_BODY
+ * whose normalized_title starts out as a copy of the adjusted name.
+ * The request is advanced by calling wikipedia_request_process().
+ */
+struct wikipedia_request *
+wikipedia_fetch_article(struct browser *browser, char *name)
+{
+	static char url[256];
+	struct wikipedia_request *wpr;
+	char *c;
+
+	/* XXX */
+	for (c = name; *c != '\0'; c++) {
+		if (*c == ' ')
+			*c = '_';
+	}
+
+	wpr = xmalloczero(sizeof(struct wikipedia_request),
+	    "fetch_article wpr");
+	wpr->browser = browser;
+
+	progress("Contacting Wikipedia...");
+
+	snprintf(url, sizeof(url), "http://%s/w/api.php?action=query&"
+	    "prop=revisions&rvslots=*&rvprop=content&formatversion=2&"
+	    "format=json&titles=%s", WIKIPEDIA_HOST, name);
+	wpr->http_request = http_get(url);
+	wpr->state = WP_STATE_FIND_BODY;
+	wpr->normalized_title = xstrdup(name, "normalized_title");
+
+	return wpr;
+}
+
+/*
+ * Build a request whose article text comes from a local file rather than
+ * the network (development aid; the path is hard-coded).
+ *
+ * The whole file is read into wpr->article, which is allocated one byte
+ * larger than the file and NUL-terminated, because the wikitext parser
+ * scans the article until it finds '\0'.  wpr->article_len is the number
+ * of bytes actually read, not counting the terminator.
+ *
+ * Any File Manager failure other than reaching end-of-file is fatal.
+ * Returns a newly allocated request in WP_STATE_FIND_BODY with
+ * normalized_title set to a copy of `name`.
+ */
+struct wikipedia_request *
+wikipedia_read_cached_article(struct browser *browser, char *name)
+{
+	struct wikipedia_request *wpr;
+	short error, frefnum;
+	size_t len, size, want;
+
+	wpr = xmalloczero(sizeof(struct wikipedia_request),
+	    "fetch_article wpr");
+	wpr->browser = browser;
+
+	/* XXX TODO */
+	error = FSOpen("\pMacintosh:wp.txt", 0, &frefnum);
+	if (error)
+		panic("FSOpen failed: %d", error);
+
+	/* seek to the logical end of file to learn its size, then rewind */
+	error = SetFPos(frefnum, fsFromLEOF, 0);
+	if (error)
+		panic("SetFPos failed: %d", error);
+	GetFPos(frefnum, &len);
+	SetFPos(frefnum, fsFromStart, 0);
+
+	wpr->article_len = len;
+	/* +1 leaves room for the terminating NUL added below */
+	wpr->article = xmalloc(wpr->article_len + 1, "article");
+
+	for (len = 0; len < wpr->article_len; ) {
+		/* never ask for more than what is left in the buffer */
+		want = wpr->article_len - len;
+		if (want > 1024)
+			want = 1024;
+
+		size = want;
+		error = FSRead(frefnum, &size, wpr->article + len);
+		if (error && error != eofErr)
+			panic("FSRead failed: %d", error);
+
+		len += size;
+
+		/* a short read means the end of the file was reached */
+		if (size < want)
+			break;
+	}
+
+	FSClose(frefnum);
+
+	/* record what was really read and terminate it for the parser */
+	wpr->article_len = len;
+	wpr->article[len] = '\0';
+
+	wpr->state = WP_STATE_FIND_BODY;
+	wpr->normalized_title = xstrdup(name, "normalized_title");
+
+	return wpr;
+}
+
+/*
+ * JSON parser callback: return the next byte of the HTTP response body
+ * without consuming it, or EOF when no more data can be read.
+ *
+ * When the buffered data has been used up, the buffer is refilled from
+ * the HTTP request and the read offset reset to its start.  The byte is
+ * returned as an unsigned char value so that bytes of 0x80 and above are
+ * never negative and 0xFF cannot be mistaken for EOF.
+ */
+short
+wikipedia_json_peek(void *cookie)
+{
+	struct wikipedia_request *wpr = (struct wikipedia_request *)cookie;
+
+	if (wpr->buf_off >= wpr->buf_len) {
+		/* buffer exhausted (or never filled); pull in more data */
+		wpr->buf_len = http_req_read(wpr->http_request, wpr->buf,
+		    sizeof(wpr->buf));
+		wpr->buf_off = 0;
+	}
+
+	if (wpr->buf_off >= wpr->buf_len)
+		return EOF;
+
+	return (unsigned char)wpr->buf[wpr->buf_off];
+}
+
+/*
+ * JSON parser callback: consume and return the next byte of the response
+ * body.  Returns EOF, without advancing, when wikipedia_json_peek() has
+ * nothing more to offer.
+ */
+short
+wikipedia_json_get(void *cookie)
+{
+	struct wikipedia_request *wpr = (struct wikipedia_request *)cookie;
+	short next = wikipedia_json_peek(cookie);
+
+	/* only step past a byte that was really there */
+	if (next != EOF)
+		wpr->buf_off++;
+
+	return next;
+}
+
+/*
+ * Advance a request by one step of its state machine.  Intended to be
+ * called repeatedly until wpr->state becomes WP_STATE_DONE.
+ *
+ * WP_STATE_FIND_BODY: read response data until the blank line ending the
+ *   HTTP headers ("\r\n\r\n") is seen, then hand the rest of the stream
+ *   to the JSON parser.
+ * WP_STATE_PARSE_JSON: consume one JSON token per call, tracking the
+ *   path to the current token as a stack of short strings.  String
+ *   values found at NORMALIZED_TITLE_CONTEXT or ARTICLE_TEXT_CONTEXT are
+ *   copied into the request.  The HTTP request is released once the
+ *   document ends or a parse error is reported.
+ * WP_STATE_PARSE_WIKITEXT: make a single pass over the article, dropping
+ *   {{...}} templates and <ref> bodies, mapping quote, heading and link
+ *   markup to text styles, and printing each run of same-styled text to
+ *   the browser.
+ */
+void
+wikipedia_request_process(struct wikipedia_request *wpr)
+{
+	size_t len, n;
+
+	switch (wpr->state) {
+	case WP_STATE_FIND_BODY:
+		if (wpr->buf_len > 3) {
+			/*
+			 * Leave last 3 bytes of previous read in case \r\n\r\n happens
+			 * across reads.
+			 */
+			memmove(wpr->buf, wpr->buf + wpr->buf_len - 3, 3);
+			wpr->buf_len = 3;
+		}
+		len = http_req_read(wpr->http_request, wpr->buf + wpr->buf_len,
+		    sizeof(wpr->buf) - wpr->buf_len);
+		wpr->buf_len += len;
+		if (!len)
+			break;
+
+		for (n = 3; n < wpr->buf_len; n++) {
+			if (wpr->buf[n - 3] != '\r' || wpr->buf[n - 2] != '\n' ||
+			    wpr->buf[n - 1] != '\r' || wpr->buf[n] != '\n')
+				continue;
+
+			/* body starts right after the blank line */
+			wpr->buf_off = n + 1;
+			wpr->state = WP_STATE_PARSE_JSON;
+			wpr->json_context_depth = 0;
+
+			progress("Parsing JSON response...");
+			json_open_user(&wpr->json, wikipedia_json_get,
+			    wikipedia_json_peek, wpr);
+			break;
+		}
+
+		break;
+	case WP_STATE_PARSE_JSON: {
+		static char context_str[PDJSON_STACK_MAX * 32];
+		const char *str;
+		enum json_type type, context_type;
+		size_t tmp_depth;
+
+		type = json_next(&wpr->json);
+
+		if (type == JSON_ERROR || type == JSON_DONE) {
+			if (type == JSON_ERROR) {
+				char err[100];
+				int err_len;
+
+				err_len = snprintf(err, sizeof(err),
+				    "%s at line %ld pos %ld",
+				    json_get_error(&wpr->json),
+				    json_get_lineno(&wpr->json),
+				    json_get_position(&wpr->json));
+				/*
+				 * snprintf reports the untruncated length; never
+				 * print more than what actually fit in err.
+				 */
+				if (err_len < 0)
+					err_len = 0;
+				else if ((size_t)err_len >= sizeof(err))
+					err_len = sizeof(err) - 1;
+				browser_print(wpr->browser, err, err_len, 0);
+			}
+			json_close(&wpr->json);
+			if (wpr->http_request != NULL)
+				http_req_free(&wpr->http_request);
+			if (type == JSON_ERROR)
+				wpr->state = WP_STATE_DONE;
+			else {
+				wpr->state = WP_STATE_PARSE_WIKITEXT;
+				progress("Formatting article...");
+			}
+			break;
+		}
+
+		if (wpr->json_context_depth < 0 ||
+		    wpr->json_context_depth >= PDJSON_STACK_MAX) {
+			static char too_deep[] = "JSON nesting too deep";
+
+			/*
+			 * Pending object keys occupy context slots too, so the
+			 * path can outgrow json_context before the parser's own
+			 * nesting limit is reached.  Give up rather than index
+			 * outside the array.
+			 */
+			browser_print(wpr->browser, too_deep, strlen(too_deep), 0);
+			json_close(&wpr->json);
+			if (wpr->http_request != NULL)
+				http_req_free(&wpr->http_request);
+			wpr->state = WP_STATE_DONE;
+			break;
+		}
+
+		context_type = json_get_context(&wpr->json, &tmp_depth);
+
+#define wprjcd wpr->json_context[wpr->json_context_depth]
+
+		/* record this token's text in the slot for the current depth */
+		switch (type) {
+		case JSON_OBJECT:
+			snprintf(wprjcd, sizeof(wprjcd), "{");
+			wpr->json_context_depth++;
+			break;
+		case JSON_OBJECT_END:
+			snprintf(wprjcd, sizeof(wprjcd), "}");
+			wpr->json_context_depth--;
+			break;
+		case JSON_ARRAY:
+			snprintf(wprjcd, sizeof(wprjcd), "[");
+			wpr->json_context_depth++;
+			break;
+		case JSON_ARRAY_END:
+			snprintf(wprjcd, sizeof(wprjcd), "]");
+			wpr->json_context_depth--;
+			break;
+		case JSON_STRING:
+			snprintf(wprjcd, sizeof(wprjcd), "\"%s\"",
+			    json_get_string(&wpr->json, NULL));
+			break;
+		case JSON_NUMBER:
+			snprintf(wprjcd, sizeof(wprjcd), "%s",
+			    json_get_string(&wpr->json, NULL));
+			break;
+		case JSON_TRUE:
+			snprintf(wprjcd, sizeof(wprjcd), "true");
+			break;
+		case JSON_FALSE:
+			snprintf(wprjcd, sizeof(wprjcd), "false");
+			break;
+		case JSON_NULL:
+			snprintf(wprjcd, sizeof(wprjcd), "null");
+			break;
+		default:
+			break;
+		}
+
+		/* an odd element count marks the token as a key */
+		if (tmp_depth > 0 && (tmp_depth % 2) != 0)
+			strlcat(wprjcd, ":", sizeof(wprjcd));
+
+		if (type != JSON_STRING)
+			goto next_context;
+
+		/* join every enclosing slot into the path leading to this string */
+		context_str[0] = '\0';
+		for (n = 0; n < wpr->json_context_depth; n++)
+			strlcat(context_str, wpr->json_context[n], sizeof(context_str));
+
+		if (strcmp(context_str, NORMALIZED_TITLE_CONTEXT) == 0) {
+			xfree(&wpr->normalized_title);
+			wpr->normalized_title =
+			    xstrdup(json_get_string(&wpr->json, NULL), "normalized_title");
+		} else if (strcmp(context_str, ARTICLE_TEXT_CONTEXT) == 0) {
+			str = json_get_string(&wpr->json, &wpr->article_len);
+			/* drop any earlier article instead of leaking it */
+			if (wpr->article != NULL)
+				xfree(&wpr->article);
+			wpr->article = xmalloc(wpr->article_len, "article");
+			/* copy, turning \n line endings into \r */
+			for (n = 0; n < wpr->article_len; n++) {
+				if (str[n] == '\n')
+					wpr->article[n] = '\r';
+				else
+					wpr->article[n] = str[n];
+			}
+		}
+
+next_context:
+		/* inside an object: a key pushes a level, its value pops it */
+		if (context_type == JSON_OBJECT && tmp_depth > 0) {
+			if (tmp_depth % 2 == 0)
+				wpr->json_context_depth--;
+			else
+				wpr->json_context_depth++;
+		}
+		break;
+	}
+	case WP_STATE_PARSE_WIKITEXT: {
+		short bracket_level = 0;
+		char *buf = NULL;
+		size_t buflen = 0;
+		const char *c = wpr->article;
+		const char *end;
+		unsigned short style = 0, last_style = 0;
+		bool printed = false, in_ref = false;
+
+		if (c == NULL || wpr->article_len == 0) {
+			/* no article text was found; only the title is shown */
+			c = "";
+			end = c;
+		} else {
+			end = c + wpr->article_len;
+			/* output is never longer than the input */
+			buf = xmalloczero(wpr->article_len, "article tmp buf");
+		}
+
+		browser_print(wpr->browser, wpr->normalized_title,
+		    strlen(wpr->normalized_title), STYLE_H1);
+
+		while (c < end && *c != '\0') {
+			if (*c == '{') {
+				bracket_level++;
+				c++;
+			} else if (*c == '}') {
+				bracket_level--;
+				c++;
+			} else if (bracket_level > 0) {
+				/* inside a {{template}}; discard */
+				c++;
+			} else if (strncmp(c, "<ref", 4) == 0) {
+				in_ref = true;
+				c += 4;
+			} else if (strncmp(c, "</ref>", 6) == 0) {
+				in_ref = false;
+				c += 6;
+			} else if (in_ref) {
+				/* inside a reference; discard */
+				c++;
+			} else if (strncmp(c, "'''", 3) == 0) {
+				if (style & STYLE_BOLD)
+					style &= ~(STYLE_BOLD);
+				else
+					style |= STYLE_BOLD;
+				c += 3;
+			} else if (strncmp(c, "''", 2) == 0) {
+				if (style & STYLE_ITALIC)
+					style &= ~(STYLE_ITALIC);
+				else
+					style |= STYLE_ITALIC;
+				c += 2;
+			} else if (strncmp(c, "=====", 5) == 0) {
+				/* longest heading marker is tested first */
+				if (style & STYLE_H5)
+					style &= ~(STYLE_H5);
+				else
+					style |= STYLE_H5;
+				c += 5;
+			} else if (strncmp(c, "====", 4) == 0) {
+				if (style & STYLE_H4)
+					style &= ~(STYLE_H4);
+				else
+					style |= STYLE_H4;
+				c += 4;
+			} else if (strncmp(c, "===", 3) == 0) {
+				if (style & STYLE_H3)
+					style &= ~(STYLE_H3);
+				else
+					style |= STYLE_H3;
+				c += 3;
+			} else if (strncmp(c, "==", 2) == 0) {
+				if (style & STYLE_H2)
+					style &= ~(STYLE_H2);
+				else
+					style |= STYLE_H2;
+				c += 2;
+			} else if (*c == '=') {
+				if (style & STYLE_H1)
+					style &= ~(STYLE_H1);
+				else
+					style |= STYLE_H1;
+				c += 1;
+			} else if (strncmp(c, "[[", 2) == 0) {
+				style |= STYLE_LINK;
+				c += 2;
+			} else if (strncmp(c, "]]", 2) == 0) {
+				style &= ~(STYLE_LINK);
+				c += 2;
+			} else if (*c == '\r' && !printed) {
+				/* skip leading newlines */
+				c++;
+			} else if (strncmp(c, "&nbsp;", 6) == 0) {
+				/* non-breaking space entity becomes a plain space */
+				buf[buflen++] = ' ';
+				c += 6;
+			} else {
+				buf[buflen++] = *c;
+				c++;
+			}
+
+			/* emit the text collected so far whenever the style flips */
+			if (style != last_style) {
+				if (buflen) {
+					browser_print(wpr->browser, buf, buflen, last_style);
+					buflen = 0;
+					printed = true;
+				}
+				last_style = style;
+			}
+		}
+
+		/* whatever follows the last style change still has to be shown */
+		if (buflen)
+			browser_print(wpr->browser, buf, buflen, last_style);
+		if (buf != NULL)
+			xfree(&buf);
+
+		UpdateScrollbarForTE(wpr->browser->te_scroller,
+		    wpr->browser->te, true);
+		/* CtoPstr is handed normalized_title itself, not a copy */
+		SetWTitle(wpr->browser->win, CtoPstr(wpr->normalized_title));
+		wpr->state = WP_STATE_DONE;
+		progress(NULL);
+		break;
+	}
+	}
+}
+
+/*
+ * Cancel a request by releasing its HTTP request, if it still has one.
+ * Nothing else held by the request is touched here.
+ */
+void
+wikipedia_request_abort(struct wikipedia_request *wpr)
+{
+	/* already released (or never started) */
+	if (wpr->http_request == NULL)
+		return;
+
+	http_req_free(&wpr->http_request);
+}
--- wikipedia.h Thu Aug 18 21:06:48 2022
+++ wikipedia.h Wed Aug 24 17:20:19 2022
@@ -14,6 +14,13 @@
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
+#ifndef __WIKIPEDIA_H__
+#define __WIKIPEDIA_H__
+
+#include "browser.h"
+#include "http.h"
+#include "pdjson.h"
+
#define PROGRAM_NAME "Wikipedia"
#define MBAR_ID 128
@@ -32,4 +39,35 @@
extern MenuHandle file_menu, edit_menu;
-void menu_defaults(void);
+void menu_defaults(void);
+
+/*
+ * Values for wikipedia_request.state, in the order
+ * wikipedia_request_process() moves through them.
+ */
+enum {
+ WP_STATE_FIND_BODY, /* reading until the end of the HTTP headers */
+ WP_STATE_PARSE_JSON, /* pulling one JSON token per call */
+ WP_STATE_PARSE_WIKITEXT, /* rendering the article text to the browser */
+ WP_STATE_DONE /* finished, or stopped after a JSON error */
+};
+
+/* State for one article lookup, from HTTP request to rendered text. */
+struct wikipedia_request {
+ /* one of the WP_STATE_* values */
+ short state;
+ /* browser window the title, article and errors are printed into */
+ struct browser *browser;
+ /* in-flight HTTP request; released once JSON parsing ends or on abort */
+ struct http_request *http_request;
+ /* allocated title; starts as the requested name, replaced if the API normalizes it */
+ char *normalized_title;
+ /* response read buffer, bytes valid in it, and the next unread offset */
+ char buf[1024];
+ size_t buf_len;
+ size_t buf_off;
+ /* pdjson stream reading through wikipedia_json_get/peek */
+ json_stream json;
+ /* allocated article wikitext and its length in bytes */
+ char *article;
+ size_t article_len;
+ /* text of the token at each level of the current JSON path, and the level in use */
+ char json_context[PDJSON_STACK_MAX][32];
+ short json_context_depth;
+};
+
+/* Start fetching an article over HTTP; spaces in the name are replaced in place. */
+struct wikipedia_request * wikipedia_fetch_article(struct browser *,
+ char *);
+/* Build a request from a hard-coded local file instead of the network. */
+struct wikipedia_request * wikipedia_read_cached_article(struct browser *browser,
+ char *name);
+/* Run one step of the request's state machine. */
+void wikipedia_request_process(struct wikipedia_request *wpr);
+/* Release the request's HTTP request, if any. */
+void wikipedia_request_abort(struct wikipedia_request *wpr);
+
+#endif