AmendHub

Download:

jcs / wikipedia / amendments / 53

wikipedia: Only collect characters inside a <Text> tag

Other tags, such as those carrying image URLs, can be quite large, and
we're not doing anything with that data anyway. If we do run out of
room in the text buffer, try to grow it before failing.

jcs made amendment 53 about 1 year ago
--- wikipedia.c	Mon Oct 16 17:30:54 2023
+++ wikipedia.c	Mon Oct 30 13:08:05 2023
@@ -113,8 +113,8 @@ wikipedia_fetch_search_results(struct browser *browser
 		XML_DEFAULT,
 		XML_IN_TAG,
 		XML_IN_TEXT
-	} xstate = 0;
-	char *buf;
+	} xstate = XML_DEFAULT;
+	char *buf, *obuf;
 	size_t buf_size;
 	size_t buf_idx;
 
@@ -196,7 +196,16 @@ wikipedia_fetch_search_results(struct browser *browser
 
 			buf[0] = '\0';
 			buf_idx = 0;
-		} else if (buf_idx < buf_size) {
+		} else if (xstate == XML_IN_TAG || xstate == XML_IN_TEXT) {
+			if (xstate == XML_IN_TAG && buf_idx == 9 &&
+			    strncmp(buf, "Text xml:", 9) != 0) {
+				/* not an interesting tag, don't bother saving it */
+				xstate = XML_DEFAULT;
+				buf[0] = '\0';
+				buf_idx = 0;
+				continue;
+			}
+
 			if ((unsigned char)c >= UTF8_RANGE_START &&
 			    (unsigned char)c <= UTF8_RANGE_END) {
 				if (utf8[0] == 0)
@@ -217,11 +226,23 @@ wikipedia_fetch_search_results(struct browser *browser
 				if (c)
 					memset(&utf8, 0, sizeof(utf8));
 			}
-
-			if (c)
+
+			if (c) {
+				if (buf_idx >= buf_size) {
+					buf_size *= 2;
+					obuf = buf;
+					buf = xrealloc(buf, buf_size);
+					if (buf == NULL) {
+						warn("Out of text buffer space, failed resizing "
+						    "to %d bytes", buf_size);
+						xfree(&obuf);
+						http_req_free(&req);
+						return 0;
+					}
+				}
+
 				buf[buf_idx++] = c;
-		} else {
-			panic("out of buf space");
+			}
 		}
 	}
 