AmendHub

Download

jcs

/

wikipedia

/

wikipedia.c

 

(View History)

jcs   wikipedia: Try resizing buf if we run out of room, don't just dump it Latest amendment: 55 on 2023-10-31

1 /*
2 * Copyright (c) 2021-2022 joshua stein <jcs@jcs.org>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #include <stdarg.h>
18 #include <stdio.h>
19 #include <string.h>
20
21 #include "wikipedia.h"
22 #include "http.h"
23 #include "utf8.h"
24 #include "util.h"
25
26 struct wikipedia_request *
27 wikipedia_fetch_article(struct browser *browser, char *name)
28 {
29 static char url[256];
30 struct wikipedia_request *wpr;
31 short state;
32 char *nencoded, *hostname;
33 unsigned char *uname;
34
35 progress("Fetching \"%s\"...", name);
36
37 wpr = xmalloczero(sizeof(struct wikipedia_request));
38 if (wpr == NULL) {
39 progress(NULL);
40 warn("Out of memory!");
41 return NULL;
42 }
43 wpr->browser = browser;
44
45 uname = macroman_to_utf8_string((unsigned char *)name, strlen(name));
46 if (uname == NULL) {
47 progress(NULL);
48 xfree(&wpr);
49 return NULL;
50 }
51 nencoded = url_encode(uname);
52 xfree(&uname);
53 if (nencoded == NULL) {
54 progress(NULL);
55 xfree(&wpr);
56 return NULL;
57 }
58
59 hostname = xGetStringAsChar(STR_HOSTNAME_ID);
60 if (hostname == NULL) {
61 progress(NULL);
62 warn("No Wikipedia hostname set, check Settings");
63 xfree(&wpr);
64 return NULL;
65 }
66
67 snprintf(url, sizeof(url), "http://%s/w/api.php?action=query&"
68 "prop=revisions&rvslots=*&rvprop=size|content&"
69 "format=xml&titles=%s", hostname, nencoded);
70 xfree(&nencoded);
71 xfree(&hostname);
72 wpr->http_request = http_get(url);
73 if (wpr->http_request == NULL) {
74 progress(NULL);
75 xfree(&nencoded);
76 xfree(&wpr);
77 return NULL;
78 }
79
80 http_req_skip_header(wpr->http_request);
81 wpr->read_len = wpr->http_request->chunk_len;
82
83 wpr->normalized_title = xstrdup(name);
84 if (wpr->normalized_title == NULL) {
85 progress(NULL);
86 warn("Out of memory!");
87 xfree(&nencoded);
88 xfree(&wpr);
89 return NULL;
90 }
91 wpr->state = WP_STATE_XML_INIT;
92
93 browser_debug_print(wpr->browser, wpr->http_request->chunk,
94 wpr->http_request->chunk_len);
95
96 return wpr;
97 }
98
99 size_t
100 wikipedia_fetch_search_results(struct browser *browser, char *query,
101 char ***results) /* a triple-star program! */
102 {
103 static char url[256];
104 struct http_request *req;
105 char *qencoded, *hostname;
106 char **rets = NULL, **trets = NULL;
107 char *str = NULL, *nstr = NULL, c;
108 unsigned char *uquery;
109 short strings = 0;
110 size_t nrets = 0, len, n, npos;
111 utf8_char utf8 = { 0 };
112 enum xml_state {
113 XML_DEFAULT,
114 XML_IN_TAG,
115 XML_IN_TEXT
116 } xstate = XML_DEFAULT;
117 char *buf, *obuf;
118 size_t buf_size;
119 size_t buf_idx;
120
121 uquery = macroman_to_utf8_string((unsigned char *)query,
122 strlen(query));
123 if (uquery == NULL)
124 return 0;
125 qencoded = url_encode(uquery);
126 xfree(&uquery);
127 if (qencoded == NULL)
128 return 0;
129
130 hostname = xGetStringAsChar(STR_HOSTNAME_ID);
131 if (hostname == NULL) {
132 warn("No Wikipedia hostname set, check Settings");
133 return 0;
134 }
135
136 len = snprintf(url, sizeof(url), "http://%s/w/api.php?"
137 "action=opensearch&format=xml&namespace=0&limit=10&"
138 "redirects=return&search=%s", hostname, qencoded);
139 xfree(&qencoded);
140 xfree(&hostname);
141 if (len > sizeof(url))
142 return 0;
143
144 req = http_get(url);
145 if (req == NULL)
146 return 0;
147
148 http_req_skip_header(req);
149
150 buf_size = 256;
151 buf_idx = 0;
152 buf = xmalloc(buf_size);
153 if (buf == NULL) {
154 warn("Out of memory!");
155 http_req_free(&req);
156 return 0;
157 }
158
159 for (;;) {
160 if (req->chunk_len == 0 || (req->chunk_off + 1 > req->chunk_len)) {
161 req->chunk_len = http_req_read(req, req->chunk,
162 sizeof(req->chunk));
163 req->chunk_off = 0;
164
165 if (req->chunk_len < 1 || (req->chunk_off + 1 > req->chunk_len))
166 break;
167 }
168
169 c = req->chunk[req->chunk_off++];
170
171 if (c == '<') {
172 if (xstate == XML_IN_TEXT) {
173 nrets++;
174 trets = xreallocarray(rets, sizeof(Ptr), nrets);
175 if (trets == NULL) {
176 warn("Out of memory!");
177 break;
178 }
179 rets = trets;
180 nstr = xstrndup(buf, buf_idx);
181 if (nstr == NULL) {
182 warn("Out of memory!");
183 break;
184 }
185 rets[nrets - 1] = nstr;
186 }
187
188 buf[0] = '\0';
189 buf_idx = 0;
190 xstate = XML_IN_TAG;
191 } else if (c == '>') {
192 if (xstate == XML_IN_TAG && strncmp(buf, "Text xml:", 9) == 0)
193 xstate = XML_IN_TEXT;
194 else
195 xstate = XML_DEFAULT;
196
197 buf[0] = '\0';
198 buf_idx = 0;
199 } else if (xstate == XML_IN_TAG || xstate == XML_IN_TEXT) {
200 if (xstate == XML_IN_TAG && buf_idx == 9 &&
201 strncmp(buf, "Text xml:", 9) != 0) {
202 /* not an interesting tag, don't bother saving it */
203 xstate = XML_DEFAULT;
204 buf[0] = '\0';
205 buf_idx = 0;
206 continue;
207 }
208
209 if ((unsigned char)c >= UTF8_RANGE_START &&
210 (unsigned char)c <= UTF8_RANGE_END) {
211 if (utf8[0] == 0)
212 utf8[0] = c;
213 else if (utf8[1] == 0)
214 utf8[1] = c;
215 else if (utf8[2] == 0)
216 utf8[2] = c;
217 else if (utf8[3] == 0)
218 utf8[3] = c;
219 else {
220 /* bogus */
221 utf8[0] = 0;
222 c = 0;
223 }
224
225 c = utf8_to_macroman(&utf8);
226 if (c)
227 memset(&utf8, 0, sizeof(utf8));
228 }
229
230 if (c) {
231 if (buf_idx >= buf_size) {
232 buf_size *= 2;
233 obuf = buf;
234 buf = xrealloc(buf, buf_size);
235 if (buf == NULL) {
236 warn("Out of text buffer space, failed resizing "
237 "to %d bytes", buf_size);
238 xfree(&obuf);
239 http_req_free(&req);
240 return 0;
241 }
242 }
243
244 buf[buf_idx++] = c;
245 }
246 }
247 }
248
249 http_req_free(&req);
250 xfree(&buf);
251
252 *results = rets;
253
254 return nrets;
255 }
256
257 void
258 wikipedia_request_present(struct wikipedia_request *wpr)
259 {
260 char title[255];
261
262 snprintf(title, sizeof(title), "%s: %s", PROGRAM_NAME,
263 wpr->normalized_title);
264 SetWTitle(wpr->browser->win, CtoPstr(title));
265
266 browser_clear(wpr->browser);
267 browser_print(wpr->browser, wpr->normalized_title,
268 strlen(wpr->normalized_title), STYLE_H1);
269 }
270
271 void
272 wikipedia_request_process(struct wikipedia_request *wpr)
273 {
274 struct http_request *req = wpr->http_request;
275 size_t len, n;
276 short pct;
277 char c, *last, *tbuf;
278 enum xml_state {
279 XML_DEFAULT,
280 XML_IN_NORMALIZED
281 } xstate = 0;
282 utf8_char utf8 = { 0 };
283
284 get_char:
285 if (req->chunk_len == 0 || (req->chunk_off >= req->chunk_len)) {
286 req->chunk_len = http_req_read(req, req->chunk,
287 sizeof(req->chunk));
288 req->chunk_off = 0;
289 wpr->read_len += req->chunk_len;
290
291 if (req->chunk_len < 1 || (req->chunk_off >= req->chunk_len)) {
292 wpr->state = WP_STATE_DONE;
293 goto done_parsing;
294 }
295
296 if (req->content_len > 0) {
297 pct = (wpr->read_len * 100) / req->content_len;
298 if (pct > 100)
299 pct = 100;
300
301 progress("Fetching \"%s\" (%d%%)...",
302 wpr->normalized_title, pct);
303 }
304
305 browser_debug_print(wpr->browser, req->chunk, req->chunk_len);
306 }
307
308 switch (wpr->state) {
309 case WP_STATE_XML_INIT:
310 wpr->buf_size = 1024;
311 wpr->buf_idx = 0;
312 wpr->buf = xmalloc(wpr->buf_size);
313 if (wpr->buf == NULL) {
314 warn("Out of memory!");
315 wpr->state = WP_STATE_DONE;
316 break;
317 }
318 wpr->state = WP_STATE_XML_PARSE;
319 goto get_char;
320
321 case WP_STATE_XML_PARSE:
322 c = req->chunk[req->chunk_off++];
323
324 if (c == '<') {
325 wpr->buf[0] = '\0';
326 wpr->buf_idx = 0;
327 } else if (c == '>') {
328 wpr->buf[wpr->buf_idx] = '\0';
329 if (xstate == XML_DEFAULT) {
330 if (strcmp(wpr->buf, "normalized") == 0) {
331 xstate = XML_IN_NORMALIZED;
332 } else if (strncmp(wpr->buf, "slot ", 5) == 0) {
333 wpr->state = WP_STATE_WIKITEXT_INIT;
334 } else if (!req->content_len &&
335 strncmp(wpr->buf, "rev size=", 9) == 0) {
336 if (sscanf(wpr->buf, "rev size=\"%ld\"", &len) == 1)
337 req->content_len = len;
338 }
339 } else if (xstate == XML_IN_NORMALIZED) {
340 char from_normalized[255], to_normalized[255];
341 size_t count;
342
343 if (sscanf(wpr->buf, "n from=\"%254[^\"]\" to=\"%254[^\"]\"%n",
344 &from_normalized, &to_normalized, &count) == 2 &&
345 count > 10) {
346 if (wpr->normalized_title != NULL)
347 xfree(&wpr->normalized_title);
348 wpr->normalized_title = xstrdup(to_normalized);
349 if (wpr->normalized_title == NULL) {
350 warn("Out of memory!");
351 goto done_parsing;
352 }
353 } else
354 xstate = XML_DEFAULT;
355 }
356 } else {
357 if (wpr->buf_idx >= wpr->buf_size)
358 panic("ran out of buf space parsing xml");
359 wpr->buf[wpr->buf_idx++] = c;
360 }
361
362 goto get_char;
363
364 case WP_STATE_WIKITEXT_INIT:
365 wpr->article_len = 0;
366 wpr->buf_idx = 0;
367 wpr->buf[0] = '\0';
368
369 wpr->curlys = 0;
370 wpr->brackets = 0;
371 wpr->refs = 0;
372 wpr->style = 0;
373 wpr->last_style = 0;
374 wpr->trim_whitespace = true;
375
376 wpr->state = WP_STATE_WIKITEXT_PARSE;
377 /* FALLTHROUGH */
378
379 case WP_STATE_WIKITEXT_PARSE: {
380 c = req->chunk[req->chunk_off];
381 last = wpr->buf + wpr->buf_idx - 1;
382
383 if (c == '<' || c == '\0') {
384 wpr->state = WP_STATE_DONE;
385 goto done_parsing;
386 }
387
388 /* character conversions */
389
390 if (c == ';') {
391 /* XML entity decode */
392 if (wpr->buf_idx >= 4 &&
393 last[-3] == '&' && last[-2] == 'a' && last[-1] == 'm' &&
394 last[0] == 'p') {
395 c = '&';
396 wpr->buf_idx -= 4;
397 } else if (wpr->buf_idx >= 5 &&
398 last[-4] == '&' && last[-3] == 'n' && last[-2] == 'b' &&
399 last[-1] == 's' && last[0] == 'p') {
400 c = ' ';
401 wpr->buf_idx -= 5;
402 } else if (wpr->buf_idx >= 2 &&
403 last[-2] == '&' && last[-1] == 'l' && last[0] == 't') {
404 c = '<';
405 wpr->buf_idx -= 3;
406 } else if (wpr->buf_idx >= 2 &&
407 last[-2] == '&' && last[-1] == 'g' && last[0] == 't') {
408 c = '>';
409 wpr->buf_idx -= 3;
410 }
411 last = wpr->buf + wpr->buf_idx - 1;
412 } else if (c == '\n') {
413 c = '\r';
414 } else if ((unsigned char)c >= UTF8_RANGE_START &&
415 (unsigned char)c <= UTF8_RANGE_END) {
416 /* utf-8 */
417 if (utf8[0] == 0)
418 utf8[0] = c;
419 else if (utf8[1] == 0)
420 utf8[1] = c;
421 else if (utf8[2] == 0)
422 utf8[2] = c;
423 else if (utf8[3] == 0)
424 utf8[3] = c;
425 else {
426 /* bogus */
427 utf8[0] = 0;
428 c = 0;
429 }
430
431 c = utf8_to_macroman(&utf8);
432 if (c)
433 memset(&utf8, 0, sizeof(utf8));
434 }
435
436 /* check for style changes */
437
438 if (wpr->buf_idx >= 1 &&
439 last[0] == '{' && (c == '{' || c == '|')) {
440 wpr->curlys++;
441 wpr->buf_idx--;
442 wpr->style |= STYLE_TEMPLATE;
443 c = 0;
444 } else if (wpr->buf_idx >= 1 &&
445 (last[0] == '}' || last[0] == '|') && c == '}') {
446 if (wpr->curlys)
447 wpr->curlys--;
448 wpr->buf_idx--;
449 if (wpr->curlys == 0)
450 wpr->style &= ~(STYLE_TEMPLATE);
451 c = 0;
452 } else if (wpr->buf_idx >= 1 &&
453 last[0] == '[' && c == '[') {
454 if (wpr->brackets)
455 wpr->brackets++;
456 wpr->buf_idx--;
457 wpr->style |= STYLE_LINK;
458 c = 0;
459 } else if (wpr->buf_idx >= 1 &&
460 last[0] == ']' && c == ']') {
461 if (wpr->brackets)
462 wpr->brackets--;
463 wpr->buf_idx--;
464 if (wpr->brackets == 0)
465 wpr->style &= ~(STYLE_LINK);
466 c = 0;
467 } else if (wpr->buf_idx >= 2 &&
468 last[-1] == '\'' && last[0] == '\'' && c == '\'') {
469 if (wpr->style & STYLE_BOLD)
470 wpr->style &= ~(STYLE_BOLD);
471 else
472 wpr->style |= STYLE_BOLD;
473 wpr->buf_idx -= 2;
474 c = 0;
475 } else if (wpr->buf_idx >= 2 &&
476 last[-1] == '\'' && last[0] == '\'' && c != '\'') {
477 if (wpr->style & STYLE_ITALIC)
478 wpr->style &= ~(STYLE_ITALIC);
479 else
480 wpr->style |= STYLE_ITALIC;
481 wpr->buf_idx -= 2;
482 /* keep c */
483 } else if (wpr->buf_idx >= 4 &&
484 last[-3] == '=' && last[-2] == '=' && last[-1] == '=' &&
485 last[0] == '=' && c == '=') {
486 if (wpr->style & STYLE_H5)
487 wpr->style &= ~(STYLE_H5);
488 else
489 wpr->style |= STYLE_H5;
490 wpr->buf_idx -= 4;
491 c = 0;
492 } else if (wpr->buf_idx >= 4 &&
493 last[-3] == '=' && last[-2] == '=' && last[-1] == '=' &&
494 last[0] == '=' && c != '=') {
495 if (wpr->style & STYLE_H4)
496 wpr->style &= ~(STYLE_H4);
497 else
498 wpr->style |= STYLE_H4;
499 wpr->buf_idx -= 4;
500 /* keep c */
501 } else if (wpr->buf_idx >= 3 &&
502 last[-2] == '=' && last[-1] == '=' && last[0] == '=' &&
503 c != '=') {
504 if (wpr->style & STYLE_H3)
505 wpr->style &= ~(STYLE_H3);
506 else
507 wpr->style |= STYLE_H3;
508 wpr->buf_idx -= 3;
509 /* keep c */
510 } else if (wpr->buf_idx >= 2 &&
511 last[-1] == '=' && last[0] == '=' && c != '=') {
512 if (wpr->style & STYLE_H2)
513 wpr->style &= ~(STYLE_H2);
514 else
515 wpr->style |= STYLE_H2;
516 wpr->buf_idx -= 2;
517 /* keep c */
518 } else if (wpr->buf_idx >= 3 &&
519 last[-2] == '<' && last[-1] == 'r' && last[0] == 'e' &&
520 c == 'f') {
521 /* <ref */
522 wpr->refs++;
523 wpr->style |= STYLE_REF;
524 wpr->buf_idx -= 3;
525 c = 0;
526 } else if ((wpr->style & STYLE_REF) &&
527 ((wpr->buf_idx >= 5 && last[-4] == '<' && last[-3] == '/' &&
528 last[-2] == 'r' && last[-1] == 'e' && last[0] == 'f' &&
529 c == '>') ||
530 (wpr->buf_idx >= 1 && last[0] == '/' && c == '>'))) {
531 /* </ref> or <ref /> */
532 if (wpr->refs)
533 wpr->refs--;
534 if (wpr->refs == 0)
535 wpr->style &= ~(STYLE_REF);
536 c = 0;
537 }
538
539 /*
540 * If our style changed as of this character, dump the buffer in
541 * the previous style and clear the buffer.
542 */
543
544 if (wpr->style != wpr->last_style) {
545 if (wpr->last_style & STYLE_TEMPLATE) {
546 if (strncmp(wpr->buf, "convert|", 8) == 0) {
547 /* convert|5.1|lb|... */
548 /* convert|9|in|cm|adj=on */
549 char *conv, *conv2;
550 size_t len;
551
552 conv = xmalloc(wpr->buf_idx);
553 if (conv == NULL) {
554 warn("Failed allocating %ld", wpr->buf_idx);
555 break;
556 }
557 conv2 = xmalloc(wpr->buf_idx);
558 if (conv2 == NULL) {
559 warn("Failed allocating %ld", wpr->buf_idx);
560 xfree(&conv);
561 break;
562 }
563 wpr->buf[wpr->buf_idx] = '\0';
564 if (sscanf(wpr->buf, "convert|%[^|]|%[^|]|%n", conv,
565 conv2, &len) == 2 && len >= 13)
566 wpr->buf_idx = snprintf(wpr->buf, wpr->buf_size,
567 "%s %s ", conv, conv2);
568 else
569 wpr->buf_idx = 0;
570 xfree(&conv);
571 xfree(&conv2);
572 } else
573 wpr->buf_idx = 0;
574 }
575
576 /* maybe we can do something with these later */
577 if (wpr->last_style & STYLE_REF)
578 wpr->buf_idx = 0;
579
580 /* we can't show inline images */
581 if ((wpr->last_style & STYLE_LINK) &&
582 strncmp(wpr->buf, "File:", 5) == 0) {
583 wpr->buf_idx = 0;
584 wpr->trim_whitespace = true;
585 }
586
587 if (wpr->last_style & (STYLE_TEMPLATE |
588 STYLE_H1 | STYLE_H2 | STYLE_H3 | STYLE_H4 | STYLE_H5))
589 wpr->trim_whitespace = true;
590
591 if ((wpr->style & STYLE_LINK) && wpr->article_len == 0 &&
592 strncmp(wpr->buf, "#REDIRECT ", 10) == 0) {
593 wpr->buf_idx = 0;
594 wpr->redirect = true;
595 } else if (wpr->redirect &&
596 !(wpr->style & STYLE_LINK) && (wpr->last_style & STYLE_LINK)) {
597 if (wpr->normalized_title)
598 xfree(&wpr->normalized_title);
599 wpr->buf[wpr->buf_idx] = '\0';
600 wpr->normalized_title = xstrdup(wpr->buf);
601 if (wpr->normalized_title == NULL) {
602 warn("Out of memory!");
603 wpr->state = WP_STATE_DONE;
604 } else
605 wpr->state = WP_STATE_HAVE_REDIRECT;
606 goto done_parsing;
607 }
608
609 if (wpr->buf_idx) {
610 if (wpr->article_len == 0)
611 wikipedia_request_present(wpr);
612
613 if (!browser_print(wpr->browser, wpr->buf, wpr->buf_idx,
614 wpr->last_style)) {
615 wpr->state = WP_STATE_DONE;
616 goto done_parsing;
617 }
618 wpr->article_len += wpr->buf_idx;
619 wpr->buf_idx = 0;
620 }
621 wpr->last_style = wpr->style;
622 }
623
624 /* remove whitespace */
625 if (c != 0 && wpr->trim_whitespace) {
626 if (c == '\r' || c == '\t' || c == ' ')
627 /* trim whitespace after these */
628 c = 0;
629 else
630 wpr->trim_whitespace = false;
631 }
632
633 /* and finally, add the new character */
634 if (c != 0) {
635 if (wpr->buf_idx >= wpr->buf_size) {
636 tbuf = wpr->buf;
637 wpr->buf = xrealloc(wpr->buf, wpr->buf_size * 2);
638 if (wpr->buf == NULL) {
639 wpr->buf = tbuf;
640 warn("Failed resizing parse buffer to %ld bytes, "
641 "not enough memory", wpr->buf_size * 2);
642 wpr->state = WP_STATE_DONE;
643 goto done_parsing;
644 }
645 wpr->buf_size *= 2;
646 }
647 wpr->buf[wpr->buf_idx++] = c;
648 }
649
650 req->chunk_off++;
651 goto get_char;
652 }
653 }
654
655 done_parsing:
656 if (wpr->buf != NULL)
657 xfree(&wpr->buf);
658
659 if (wpr->http_request != NULL)
660 http_req_free(&wpr->http_request);
661 }
662
663 void
664 wikipedia_request_free(struct wikipedia_request **wprptr)
665 {
666 struct wikipedia_request *wpr = (struct wikipedia_request *)*wprptr;
667
668 if (wpr == NULL) {
669 *wprptr = NULL;
670 return;
671 }
672
673 if (wpr->http_request != NULL)
674 http_req_free(&wpr->http_request);
675
676 *wprptr = NULL;
677 }