/* * Copyright (c) 2021-2022 joshua stein * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include #include #include #include "wikipedia.h" #include "http.h" #include "utf8.h" #include "util.h" struct wikipedia_request * wikipedia_fetch_article(struct browser *browser, char *name) { static char url[256]; struct wikipedia_request *wpr; short state; char *nencoded, *hostname; unsigned char *uname; progress("Fetching \"%s\"...", name); wpr = xmalloczero(sizeof(struct wikipedia_request)); if (wpr == NULL) { progress(NULL); warn("Out of memory!"); return NULL; } wpr->browser = browser; uname = macroman_to_utf8_string((unsigned char *)name, strlen(name)); if (uname == NULL) { progress(NULL); xfree(&wpr); return NULL; } nencoded = url_encode(uname); xfree(&uname); if (nencoded == NULL) { progress(NULL); xfree(&wpr); return NULL; } hostname = xGetStringAsChar(STR_HOSTNAME_ID); if (hostname == NULL) { progress(NULL); warn("No Wikipedia hostname set, check Settings"); xfree(&wpr); return NULL; } snprintf(url, sizeof(url), "http://%s/w/api.php?action=query&" "prop=revisions&rvslots=*&rvprop=size|content&" "format=xml&titles=%s", hostname, nencoded); xfree(&nencoded); xfree(&hostname); wpr->http_request = http_get(url); if (wpr->http_request == NULL) { progress(NULL); xfree(&nencoded); xfree(&wpr); return NULL; } http_req_skip_header(wpr->http_request); wpr->read_len = wpr->http_request->chunk_len; wpr->normalized_title = xstrdup(name); if (wpr->normalized_title == NULL) { progress(NULL); warn("Out of memory!"); xfree(&nencoded); xfree(&wpr); return NULL; } wpr->state = WP_STATE_XML_INIT; browser_debug_print(wpr->browser, wpr->http_request->chunk, wpr->http_request->chunk_len); return wpr; } size_t wikipedia_fetch_search_results(struct browser *browser, char *query, char ***results) /* a triple-star program! */ { static char url[256]; struct http_request *req; char *qencoded, *hostname; char **rets = NULL, **trets = NULL; char *str = NULL, *nstr = NULL, c; unsigned char *uquery; short strings = 0; size_t nrets = 0, len, n, npos; utf8_char utf8 = { 0 }; enum xml_state { XML_DEFAULT, XML_IN_TAG, XML_IN_TEXT } xstate = XML_DEFAULT; char *buf, *obuf; size_t buf_size; size_t buf_idx; uquery = macroman_to_utf8_string((unsigned char *)query, strlen(query)); if (uquery == NULL) return 0; qencoded = url_encode(uquery); xfree(&uquery); if (qencoded == NULL) return 0; hostname = xGetStringAsChar(STR_HOSTNAME_ID); if (hostname == NULL) { warn("No Wikipedia hostname set, check Settings"); return 0; } len = snprintf(url, sizeof(url), "http://%s/w/api.php?" "action=opensearch&format=xml&namespace=0&limit=10&" "redirects=return&search=%s", hostname, qencoded); xfree(&qencoded); xfree(&hostname); if (len > sizeof(url)) return 0; req = http_get(url); if (req == NULL) return 0; http_req_skip_header(req); buf_size = 256; buf_idx = 0; buf = xmalloc(buf_size); if (buf == NULL) { warn("Out of memory!"); http_req_free(&req); return 0; } for (;;) { if (req->chunk_len == 0 || (req->chunk_off + 1 > req->chunk_len)) { req->chunk_len = http_req_read(req, req->chunk, sizeof(req->chunk)); req->chunk_off = 0; if (req->chunk_len < 1 || (req->chunk_off + 1 > req->chunk_len)) break; } c = req->chunk[req->chunk_off++]; if (c == '<') { if (xstate == XML_IN_TEXT) { nrets++; trets = xreallocarray(rets, sizeof(Ptr), nrets); if (trets == NULL) { warn("Out of memory!"); break; } rets = trets; nstr = xstrndup(buf, buf_idx); if (nstr == NULL) { warn("Out of memory!"); break; } rets[nrets - 1] = nstr; } buf[0] = '\0'; buf_idx = 0; xstate = XML_IN_TAG; } else if (c == '>') { if (xstate == XML_IN_TAG && strncmp(buf, "Text xml:", 9) == 0) xstate = XML_IN_TEXT; else xstate = XML_DEFAULT; buf[0] = '\0'; buf_idx = 0; } else if (xstate == XML_IN_TAG || xstate == XML_IN_TEXT) { if (xstate == XML_IN_TAG && buf_idx == 9 && strncmp(buf, "Text xml:", 9) != 0) { /* not an interesting tag, don't bother saving it */ xstate = XML_DEFAULT; buf[0] = '\0'; buf_idx = 0; continue; } if ((unsigned char)c >= UTF8_RANGE_START && (unsigned char)c <= UTF8_RANGE_END) { if (utf8[0] == 0) utf8[0] = c; else if (utf8[1] == 0) utf8[1] = c; else if (utf8[2] == 0) utf8[2] = c; else if (utf8[3] == 0) utf8[3] = c; else { /* bogus */ utf8[0] = 0; c = 0; } c = utf8_to_macroman(&utf8); if (c) memset(&utf8, 0, sizeof(utf8)); } if (c) { if (buf_idx >= buf_size) { buf_size *= 2; obuf = buf; buf = xrealloc(buf, buf_size); if (buf == NULL) { warn("Out of text buffer space, failed resizing " "to %d bytes", buf_size); xfree(&obuf); http_req_free(&req); return 0; } } buf[buf_idx++] = c; } } } http_req_free(&req); xfree(&buf); *results = rets; return nrets; } void wikipedia_request_present(struct wikipedia_request *wpr) { char title[255]; snprintf(title, sizeof(title), "%s: %s", PROGRAM_NAME, wpr->normalized_title); SetWTitle(wpr->browser->win, CtoPstr(title)); browser_clear(wpr->browser); browser_print(wpr->browser, wpr->normalized_title, strlen(wpr->normalized_title), STYLE_H1); } void wikipedia_request_process(struct wikipedia_request *wpr) { struct http_request *req = wpr->http_request; size_t len, n; short pct; char c, *last, *tbuf; enum xml_state { XML_DEFAULT, XML_IN_NORMALIZED } xstate = 0; utf8_char utf8 = { 0 }; get_char: if (req->chunk_len == 0 || (req->chunk_off >= req->chunk_len)) { req->chunk_len = http_req_read(req, req->chunk, sizeof(req->chunk)); req->chunk_off = 0; wpr->read_len += req->chunk_len; if (req->chunk_len < 1 || (req->chunk_off >= req->chunk_len)) { wpr->state = WP_STATE_DONE; goto done_parsing; } if (req->content_len > 0) { pct = (wpr->read_len * 100) / req->content_len; if (pct > 100) pct = 100; progress("Fetching \"%s\" (%d%%)...", wpr->normalized_title, pct); } browser_debug_print(wpr->browser, req->chunk, req->chunk_len); } switch (wpr->state) { case WP_STATE_XML_INIT: wpr->buf_size = 1024; wpr->buf_idx = 0; wpr->buf = xmalloc(wpr->buf_size); if (wpr->buf == NULL) { warn("Out of memory!"); wpr->state = WP_STATE_DONE; break; } wpr->state = WP_STATE_XML_PARSE; goto get_char; case WP_STATE_XML_PARSE: c = req->chunk[req->chunk_off++]; if (c == '<') { wpr->buf[0] = '\0'; wpr->buf_idx = 0; } else if (c == '>') { wpr->buf[wpr->buf_idx] = '\0'; if (xstate == XML_DEFAULT) { if (strcmp(wpr->buf, "normalized") == 0) { xstate = XML_IN_NORMALIZED; } else if (strncmp(wpr->buf, "slot ", 5) == 0) { wpr->state = WP_STATE_WIKITEXT_INIT; } else if (!req->content_len && strncmp(wpr->buf, "rev size=", 9) == 0) { if (sscanf(wpr->buf, "rev size=\"%ld\"", &len) == 1) req->content_len = len; } } else if (xstate == XML_IN_NORMALIZED) { char from_normalized[255], to_normalized[255]; size_t count; if (sscanf(wpr->buf, "n from=\"%254[^\"]\" to=\"%254[^\"]\"%n", &from_normalized, &to_normalized, &count) == 2 && count > 10) { if (wpr->normalized_title != NULL) xfree(&wpr->normalized_title); wpr->normalized_title = xstrdup(to_normalized); if (wpr->normalized_title == NULL) { warn("Out of memory!"); goto done_parsing; } } else xstate = XML_DEFAULT; } } else { if (wpr->buf_idx >= wpr->buf_size) panic("ran out of buf space parsing xml"); wpr->buf[wpr->buf_idx++] = c; } goto get_char; case WP_STATE_WIKITEXT_INIT: wpr->article_len = 0; wpr->buf_idx = 0; wpr->buf[0] = '\0'; wpr->curlys = 0; wpr->brackets = 0; wpr->refs = 0; wpr->style = 0; wpr->last_style = 0; wpr->trim_whitespace = true; wpr->state = WP_STATE_WIKITEXT_PARSE; /* FALLTHROUGH */ case WP_STATE_WIKITEXT_PARSE: { c = req->chunk[req->chunk_off]; last = wpr->buf + wpr->buf_idx - 1; if (c == '<' || c == '\0') { wpr->state = WP_STATE_DONE; goto done_parsing; } /* character conversions */ if (c == ';') { /* XML entity decode */ if (wpr->buf_idx >= 4 && last[-3] == '&' && last[-2] == 'a' && last[-1] == 'm' && last[0] == 'p') { c = '&'; wpr->buf_idx -= 4; } else if (wpr->buf_idx >= 5 && last[-4] == '&' && last[-3] == 'n' && last[-2] == 'b' && last[-1] == 's' && last[0] == 'p') { c = ' '; wpr->buf_idx -= 5; } else if (wpr->buf_idx >= 2 && last[-2] == '&' && last[-1] == 'l' && last[0] == 't') { c = '<'; wpr->buf_idx -= 3; } else if (wpr->buf_idx >= 2 && last[-2] == '&' && last[-1] == 'g' && last[0] == 't') { c = '>'; wpr->buf_idx -= 3; } last = wpr->buf + wpr->buf_idx - 1; } else if (c == '\n') { c = '\r'; } else if ((unsigned char)c >= UTF8_RANGE_START && (unsigned char)c <= UTF8_RANGE_END) { /* utf-8 */ if (utf8[0] == 0) utf8[0] = c; else if (utf8[1] == 0) utf8[1] = c; else if (utf8[2] == 0) utf8[2] = c; else if (utf8[3] == 0) utf8[3] = c; else { /* bogus */ utf8[0] = 0; c = 0; } c = utf8_to_macroman(&utf8); if (c) memset(&utf8, 0, sizeof(utf8)); } /* check for style changes */ if (wpr->buf_idx >= 1 && last[0] == '{' && (c == '{' || c == '|')) { wpr->curlys++; wpr->buf_idx--; wpr->style |= STYLE_TEMPLATE; c = 0; } else if (wpr->buf_idx >= 1 && (last[0] == '}' || last[0] == '|') && c == '}') { if (wpr->curlys) wpr->curlys--; wpr->buf_idx--; if (wpr->curlys == 0) wpr->style &= ~(STYLE_TEMPLATE); c = 0; } else if (wpr->buf_idx >= 1 && last[0] == '[' && c == '[') { if (wpr->brackets) wpr->brackets++; wpr->buf_idx--; wpr->style |= STYLE_LINK; c = 0; } else if (wpr->buf_idx >= 1 && last[0] == ']' && c == ']') { if (wpr->brackets) wpr->brackets--; wpr->buf_idx--; if (wpr->brackets == 0) wpr->style &= ~(STYLE_LINK); c = 0; } else if (wpr->buf_idx >= 2 && last[-1] == '\'' && last[0] == '\'' && c == '\'') { if (wpr->style & STYLE_BOLD) wpr->style &= ~(STYLE_BOLD); else wpr->style |= STYLE_BOLD; wpr->buf_idx -= 2; c = 0; } else if (wpr->buf_idx >= 2 && last[-1] == '\'' && last[0] == '\'' && c != '\'') { if (wpr->style & STYLE_ITALIC) wpr->style &= ~(STYLE_ITALIC); else wpr->style |= STYLE_ITALIC; wpr->buf_idx -= 2; /* keep c */ } else if (wpr->buf_idx >= 4 && last[-3] == '=' && last[-2] == '=' && last[-1] == '=' && last[0] == '=' && c == '=') { if (wpr->style & STYLE_H5) wpr->style &= ~(STYLE_H5); else wpr->style |= STYLE_H5; wpr->buf_idx -= 4; c = 0; } else if (wpr->buf_idx >= 4 && last[-3] == '=' && last[-2] == '=' && last[-1] == '=' && last[0] == '=' && c != '=') { if (wpr->style & STYLE_H4) wpr->style &= ~(STYLE_H4); else wpr->style |= STYLE_H4; wpr->buf_idx -= 4; /* keep c */ } else if (wpr->buf_idx >= 3 && last[-2] == '=' && last[-1] == '=' && last[0] == '=' && c != '=') { if (wpr->style & STYLE_H3) wpr->style &= ~(STYLE_H3); else wpr->style |= STYLE_H3; wpr->buf_idx -= 3; /* keep c */ } else if (wpr->buf_idx >= 2 && last[-1] == '=' && last[0] == '=' && c != '=') { if (wpr->style & STYLE_H2) wpr->style &= ~(STYLE_H2); else wpr->style |= STYLE_H2; wpr->buf_idx -= 2; /* keep c */ } else if (wpr->buf_idx >= 3 && last[-2] == '<' && last[-1] == 'r' && last[0] == 'e' && c == 'f') { /* refs++; wpr->style |= STYLE_REF; wpr->buf_idx -= 3; c = 0; } else if ((wpr->style & STYLE_REF) && ((wpr->buf_idx >= 5 && last[-4] == '<' && last[-3] == '/' && last[-2] == 'r' && last[-1] == 'e' && last[0] == 'f' && c == '>') || (wpr->buf_idx >= 1 && last[0] == '/' && c == '>'))) { /* or */ if (wpr->refs) wpr->refs--; if (wpr->refs == 0) wpr->style &= ~(STYLE_REF); c = 0; } /* * If our style changed as of this character, dump the buffer in * the previous style and clear the buffer. */ if (wpr->style != wpr->last_style) { if (wpr->last_style & STYLE_TEMPLATE) { if (strncmp(wpr->buf, "convert|", 8) == 0) { /* convert|5.1|lb|... */ /* convert|9|in|cm|adj=on */ char *conv, *conv2; size_t len; conv = xmalloc(wpr->buf_idx); if (conv == NULL) { warn("Failed allocating %ld", wpr->buf_idx); break; } conv2 = xmalloc(wpr->buf_idx); if (conv2 == NULL) { warn("Failed allocating %ld", wpr->buf_idx); xfree(&conv); break; } wpr->buf[wpr->buf_idx] = '\0'; if (sscanf(wpr->buf, "convert|%[^|]|%[^|]|%n", conv, conv2, &len) == 2 && len >= 13) wpr->buf_idx = snprintf(wpr->buf, wpr->buf_size, "%s %s ", conv, conv2); else wpr->buf_idx = 0; xfree(&conv); xfree(&conv2); } else wpr->buf_idx = 0; } /* maybe we can do something with these later */ if (wpr->last_style & STYLE_REF) wpr->buf_idx = 0; /* we can't show inline images */ if ((wpr->last_style & STYLE_LINK) && strncmp(wpr->buf, "File:", 5) == 0) { wpr->buf_idx = 0; wpr->trim_whitespace = true; } if (wpr->last_style & (STYLE_TEMPLATE | STYLE_H1 | STYLE_H2 | STYLE_H3 | STYLE_H4 | STYLE_H5)) wpr->trim_whitespace = true; if ((wpr->style & STYLE_LINK) && wpr->article_len == 0 && strncmp(wpr->buf, "#REDIRECT ", 10) == 0) { wpr->buf_idx = 0; wpr->redirect = true; } else if (wpr->redirect && !(wpr->style & STYLE_LINK) && (wpr->last_style & STYLE_LINK)) { if (wpr->normalized_title) xfree(&wpr->normalized_title); wpr->buf[wpr->buf_idx] = '\0'; wpr->normalized_title = xstrdup(wpr->buf); if (wpr->normalized_title == NULL) { warn("Out of memory!"); wpr->state = WP_STATE_DONE; } else wpr->state = WP_STATE_HAVE_REDIRECT; goto done_parsing; } if (wpr->buf_idx) { if (wpr->article_len == 0) wikipedia_request_present(wpr); if (!browser_print(wpr->browser, wpr->buf, wpr->buf_idx, wpr->last_style)) { wpr->state = WP_STATE_DONE; goto done_parsing; } wpr->article_len += wpr->buf_idx; wpr->buf_idx = 0; } wpr->last_style = wpr->style; } /* remove whitespace */ if (c != 0 && wpr->trim_whitespace) { if (c == '\r' || c == '\t' || c == ' ') /* trim whitespace after these */ c = 0; else wpr->trim_whitespace = false; } /* and finally, add the new character */ if (c != 0) { if (wpr->buf_idx >= wpr->buf_size) { tbuf = wpr->buf; wpr->buf = xrealloc(wpr->buf, wpr->buf_size * 2); if (wpr->buf == NULL) { wpr->buf = tbuf; warn("Failed resizing parse buffer to %ld bytes, " "not enough memory", wpr->buf_size * 2); wpr->state = WP_STATE_DONE; goto done_parsing; } wpr->buf_size *= 2; } wpr->buf[wpr->buf_idx++] = c; } req->chunk_off++; goto get_char; } } done_parsing: if (wpr->buf != NULL) xfree(&wpr->buf); if (wpr->http_request != NULL) http_req_free(&wpr->http_request); } void wikipedia_request_free(struct wikipedia_request **wprptr) { struct wikipedia_request *wpr = (struct wikipedia_request *)*wprptr; if (wpr == NULL) { *wprptr = NULL; return; } if (wpr->http_request != NULL) http_req_free(&wpr->http_request); *wprptr = NULL; }