jcs
/wikipedia
/amendments
/53
wikipedia: Only collect characters inside a <Text> tag
Other tags, which hold image URLs and other things, can be quite large and
we're not doing anything with that data anyway. If we do run out of room,
try to grow the buffer before failing.
jcs made amendment 53 about 1 year ago
--- wikipedia.c Mon Oct 16 17:30:54 2023
+++ wikipedia.c Mon Oct 30 13:08:05 2023
@@ -113,8 +113,8 @@ wikipedia_fetch_search_results(struct browser *browser
XML_DEFAULT,
XML_IN_TAG,
XML_IN_TEXT
- } xstate = 0;
- char *buf;
+ } xstate = XML_DEFAULT;
+ char *buf, *obuf;
size_t buf_size;
size_t buf_idx;
@@ -196,7 +196,16 @@ wikipedia_fetch_search_results(struct browser *browser
buf[0] = '\0';
buf_idx = 0;
- } else if (buf_idx < buf_size) {
+ } else if (xstate == XML_IN_TAG || xstate == XML_IN_TEXT) {
+ if (xstate == XML_IN_TAG && buf_idx == 9 &&
+ strncmp(buf, "Text xml:", 9) != 0) {
+ /* not an interesting tag, don't bother saving it */
+ xstate = XML_DEFAULT;
+ buf[0] = '\0';
+ buf_idx = 0;
+ continue;
+ }
+
if ((unsigned char)c >= UTF8_RANGE_START &&
(unsigned char)c <= UTF8_RANGE_END) {
if (utf8[0] == 0)
@@ -217,11 +226,23 @@ wikipedia_fetch_search_results(struct browser *browser
if (c)
memset(&utf8, 0, sizeof(utf8));
}
-
- if (c)
+
+ if (c) {
+ if (buf_idx >= buf_size) {
+ buf_size *= 2;
+ obuf = buf;
+ buf = xrealloc(buf, buf_size);
+ if (buf == NULL) {
+ warn("Out of text buffer space, failed resizing "
+ "to %d bytes", buf_size);
+ xfree(&obuf);
+ http_req_free(&req);
+ return 0;
+ }
+ }
+
buf[buf_idx++] = c;
- } else {
- panic("out of buf space");
+ }
}
}