AmendHub

jcs

/

wikipedia

/

amendments

/

33

wikipedia: Use XML API for searching too, get rid of pdjson


jcs made amendment 33 4 months ago
--- wikipedia.c Wed Sep 7 15:42:01 2022 +++ wikipedia.c Wed Sep 7 16:45:33 2022 @@ -20,7 +20,6 @@ #include "wikipedia.h" #include "http.h" -#include "pdjson.h" #include "utf8.h" #include "util.h" @@ -66,86 +65,99 @@ wikipedia_fetch_search_results(struct browser *browser char ***results) { static char url[256]; - json_stream json; struct http_request *req; char *qencoded; char **rets = NULL; char *str = NULL, *nstr = NULL, c; unsigned char *uquery; - enum json_type type; short strings = 0; size_t nrets = 0, len, n, npos; utf8_char utf8 = { 0 }; + enum xml_state { + XML_DEFAULT, + XML_IN_TAG, + XML_IN_TEXT + } xstate = 0; + char *buf; + size_t buf_size; + size_t buf_len; uquery = macroman_to_utf8_string((unsigned char *)query, strlen(query)); qencoded = url_encode(uquery); xfree(&uquery); snprintf(url, sizeof(url), "http://%s/w/api.php?action=opensearch&" - "format=json&formatversion=2&namespace=0&limit=10&" - "search=%s", WIKIPEDIA_HOST, qencoded); + "format=xml&namespace=0&limit=10&redirects=return&search=%s", + WIKIPEDIA_HOST, qencoded); xfree(&qencoded); req = http_get(url); http_req_skip_header(req); - - json_open_user(&json, http_req_chunk_read, http_req_chunk_peek, req); - + + buf_size = 256; + buf_len = 0; + buf = xmalloc(buf_size, "xml buf"); + for (;;) { - type = json_next(&json); + if (req->chunk_len == 0 || (req->chunk_off + 1 > req->chunk_len)) { + req->chunk_len = http_req_read(req, req->chunk, + sizeof(req->chunk)); + req->chunk_off = 0; + + if (req->chunk_len < 1 || (req->chunk_off + 1 > req->chunk_len)) + break; + } - if (type == JSON_ERROR || type == JSON_DONE || - type == JSON_ARRAY_END) - break; + c = req->chunk[req->chunk_off++]; + + if (c == '<') { + if (xstate == XML_IN_TEXT) { + nrets++; + rets = xreallocarray(rets, sizeof(Ptr), nrets); + nstr = xstrndup(buf, buf_len, "search result"); + rets[nrets - 1] = nstr; + } - if (type == JSON_STRING) { - strings++; - - /* skip first, it'll be our query */ - if (strings == 1) - continue; - - nrets++; - rets = xreallocarray(rets, sizeof(Ptr), nrets); - - str = (char *)json_get_string(&json, NULL); - len = strlen(str); - nstr = xmalloc(len + 1, "search result"); - - for (n = 0, npos = 0; n < len; n++) { - c = str[n]; + buf[0] = '\0'; + buf_len = 0; + xstate = XML_IN_TAG; + } else if (c == '>') { + if (xstate == XML_IN_TAG && + strncmp(buf, "Text xml:", 9) == 0) + xstate = XML_IN_TEXT; + else + xstate = XML_DEFAULT; - if ((unsigned char)c >= UTF8_RANGE_START && - (unsigned char)c <= UTF8_RANGE_END) { - if (utf8[0] == 0) - utf8[0] = c; - else if (utf8[1] == 0) - utf8[1] = c; - else if (utf8[2] == 0) - utf8[2] = c; - else if (utf8[3] == 0) - utf8[3] = c; - else { - /* bogus */ - utf8[0] = 0; - c = 0; - } - - c = utf8_to_macroman(&utf8); - if (c) - memset(&utf8, 0, sizeof(utf8)); + buf[0] = '\0'; + buf_len = 0; + } else if (buf_len < buf_size) { + if ((unsigned char)c >= UTF8_RANGE_START && + (unsigned char)c <= UTF8_RANGE_END) { + if (utf8[0] == 0) + utf8[0] = c; + else if (utf8[1] == 0) + utf8[1] = c; + else if (utf8[2] == 0) + utf8[2] = c; + else if (utf8[3] == 0) + utf8[3] = c; + else { + /* bogus */ + utf8[0] = 0; + c = 0; } + + c = utf8_to_macroman(&utf8); if (c) - nstr[npos++] = c; + memset(&utf8, 0, sizeof(utf8)); } - nstr[npos] = '\0'; - rets[nrets - 1] = nstr; - } else if (type == JSON_ARRAY_END) { - break; + + if (c) + buf[buf_len++] = c; } } - json_close(&json); http_req_free(&req); + http_req_free(&buf); *results = rets; --- wikipedia.h Tue Sep 6 23:06:18 2022 +++ wikipedia.h Wed Sep 7 16:47:24 2022 @@ -19,7 +19,6 @@ #include "browser.h" #include "http.h" -#include "pdjson.h" #define PROGRAM_NAME "Wikipedia"