AmendHub

Download

jcs

/

wikipedia

/

wikipedia.c

 

(View History)

jcs   wikipedia: Remove debugging code Latest amendment: 36 on 2022-09-28

1 /*
2 * Copyright (c) 2021-2022 joshua stein <jcs@jcs.org>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #include <stdarg.h>
18 #include <stdio.h>
19 #include <string.h>
20
21 #include "wikipedia.h"
22 #include "http.h"
23 #include "utf8.h"
24 #include "util.h"
25
/*
 * Host that every API URL in this file is built against, using plain
 * "http://".  en.wikipedia.org doesn't support non-TLS :( so requests go
 * to this host instead (presumably a plain-HTTP front end for the
 * Wikipedia API -- TODO confirm).
 */
#define WIKIPEDIA_HOST "wikipedia.jcs.org"
28
29 struct wikipedia_request *
30 wikipedia_fetch_article(struct browser *browser, char *name)
31 {
32 static char url[256];
33 struct wikipedia_request *wpr;
34 short state;
35 char *nencoded;
36 unsigned char *uname;
37
38 progress("Fetching article \"%s\"...", name);
39
40 wpr = xmalloczero(sizeof(struct wikipedia_request),
41 "fetch_article wpr");
42 wpr->browser = browser;
43
44 uname = macroman_to_utf8_string((unsigned char *)name, strlen(name));
45 nencoded = url_encode(uname);
46 xfree(&uname);
47
48 snprintf(url, sizeof(url), "http://%s/w/api.php?action=query&"
49 "prop=revisions&rvslots=*&rvprop=content&"
50 "format=xml&titles=%s", WIKIPEDIA_HOST, nencoded);
51 xfree(&nencoded);
52 wpr->http_request = http_get(url);
53 http_req_skip_header(wpr->http_request);
54 wpr->state = WP_STATE_XML_INIT;
55 wpr->normalized_title = xstrdup(name, "normalized_title");
56
57 browser_debug_print(wpr->browser, wpr->http_request->chunk,
58 wpr->http_request->chunk_len);
59
60 return wpr;
61 }
62
63 size_t
64 wikipedia_fetch_search_results(struct browser *browser, char *query,
65 char ***results)
66 {
67 static char url[256];
68 struct http_request *req;
69 char *qencoded;
70 char **rets = NULL;
71 char *str = NULL, *nstr = NULL, c;
72 unsigned char *uquery;
73 short strings = 0;
74 size_t nrets = 0, len, n, npos;
75 utf8_char utf8 = { 0 };
76 enum xml_state {
77 XML_DEFAULT,
78 XML_IN_TAG,
79 XML_IN_TEXT
80 } xstate = 0;
81 char *buf;
82 size_t buf_size;
83 size_t buf_len;
84
85 uquery = macroman_to_utf8_string((unsigned char *)query, strlen(query));
86 qencoded = url_encode(uquery);
87 xfree(&uquery);
88
89 snprintf(url, sizeof(url), "http://%s/w/api.php?action=opensearch&"
90 "format=xml&namespace=0&limit=10&redirects=return&search=%s",
91 WIKIPEDIA_HOST, qencoded);
92 xfree(&qencoded);
93 req = http_get(url);
94 http_req_skip_header(req);
95
96 buf_size = 256;
97 buf_len = 0;
98 buf = xmalloc(buf_size, "xml buf");
99
100 for (;;) {
101 if (req->chunk_len == 0 || (req->chunk_off + 1 > req->chunk_len)) {
102 req->chunk_len = http_req_read(req, req->chunk,
103 sizeof(req->chunk));
104 req->chunk_off = 0;
105
106 if (req->chunk_len < 1 || (req->chunk_off + 1 > req->chunk_len))
107 break;
108 }
109
110 c = req->chunk[req->chunk_off++];
111
112 if (c == '<') {
113 if (xstate == XML_IN_TEXT) {
114 nrets++;
115 rets = xreallocarray(rets, sizeof(Ptr), nrets);
116 nstr = xstrndup(buf, buf_len, "search result");
117 rets[nrets - 1] = nstr;
118 }
119
120 buf[0] = '\0';
121 buf_len = 0;
122 xstate = XML_IN_TAG;
123 } else if (c == '>') {
124 if (xstate == XML_IN_TAG &&
125 strncmp(buf, "Text xml:", 9) == 0)
126 xstate = XML_IN_TEXT;
127 else
128 xstate = XML_DEFAULT;
129
130 buf[0] = '\0';
131 buf_len = 0;
132 } else if (buf_len < buf_size) {
133 if ((unsigned char)c >= UTF8_RANGE_START &&
134 (unsigned char)c <= UTF8_RANGE_END) {
135 if (utf8[0] == 0)
136 utf8[0] = c;
137 else if (utf8[1] == 0)
138 utf8[1] = c;
139 else if (utf8[2] == 0)
140 utf8[2] = c;
141 else if (utf8[3] == 0)
142 utf8[3] = c;
143 else {
144 /* bogus */
145 utf8[0] = 0;
146 c = 0;
147 }
148
149 c = utf8_to_macroman(&utf8);
150 if (c)
151 memset(&utf8, 0, sizeof(utf8));
152 }
153
154 if (c)
155 buf[buf_len++] = c;
156 }
157 }
158
159 http_req_free(&req);
160 http_req_free(&buf);
161
162 *results = rets;
163
164 return nrets;
165 }
166
167 void
168 wikipedia_request_present(struct wikipedia_request *wpr)
169 {
170 char title[255];
171
172 snprintf(title, sizeof(title), "%s: %s", PROGRAM_NAME,
173 wpr->normalized_title);
174 SetWTitle(wpr->browser->win, CtoPstr(title));
175
176 browser_clear(wpr->browser);
177 browser_print(wpr->browser, wpr->normalized_title,
178 strlen(wpr->normalized_title), STYLE_H1);
179 }
180
181 void
182 wikipedia_request_process(struct wikipedia_request *wpr)
183 {
184 struct http_request *req = wpr->http_request;
185 size_t len, n;
186 char c, *last;
187 enum xml_state {
188 XML_DEFAULT,
189 XML_IN_NORMALIZED
190 } xstate = 0;
191 utf8_char utf8 = { 0 };
192
193 get_char:
194 if (req->chunk_len == 0 || (req->chunk_off + 1 > req->chunk_len)) {
195 req->chunk_len = http_req_read(req, req->chunk,
196 sizeof(req->chunk));
197 req->chunk_off = 0;
198
199 if (req->chunk_len < 1 || (req->chunk_off + 1 > req->chunk_len)) {
200 wpr->state = WP_STATE_DONE;
201 goto done_parsing;
202 }
203
204 browser_debug_print(wpr->browser, req->chunk, req->chunk_len);
205 }
206
207 switch (wpr->state) {
208 case WP_STATE_XML_INIT:
209 wpr->buf_size = 1024;
210 wpr->buf_len = 0;
211 wpr->buf = xmalloc(wpr->buf_size, "wpr buf");
212 wpr->state = WP_STATE_XML_PARSE;
213 goto get_char;
214
215 case WP_STATE_XML_PARSE:
216 c = req->chunk[req->chunk_off++];
217
218 if (c == '<') {
219 wpr->buf[0] = '\0';
220 wpr->buf_len = 0;
221 } else if (c == '>') {
222 wpr->buf[wpr->buf_len] = '\0';
223 if (xstate == XML_DEFAULT) {
224 if (strcmp(wpr->buf, "normalized") == 0) {
225 xstate = XML_IN_NORMALIZED;
226 } else if (strncmp(wpr->buf, "slot ", 5) == 0) {
227 wpr->state = WP_STATE_WIKITEXT_INIT;
228 }
229 } else if (xstate == XML_IN_NORMALIZED) {
230 char from_normalized[255], to_normalized[255];
231 size_t count;
232
233 if (sscanf(wpr->buf, "n from=\"%[^\"]\" to=\"%[^\"]\"%n",
234 &from_normalized, &to_normalized, &count) == 2 &&
235 count > 10) {
236 if (wpr->normalized_title != NULL)
237 xfree(&wpr->normalized_title);
238 wpr->normalized_title = xstrdup(to_normalized,
239 "to_normalized");
240 } else
241 xstate = XML_DEFAULT;
242 }
243 } else if (wpr->buf_len < wpr->buf_size) {
244 wpr->buf[wpr->buf_len++] = c;
245 }
246
247 goto get_char;
248
249 case WP_STATE_WIKITEXT_INIT:
250 wpr->article_len = 0;
251 wpr->buf_len = 0;
252 wpr->buf[0] = '\0';
253
254 wpr->curlys = 0;
255 wpr->brackets = 0;
256 wpr->refs = 0;
257 wpr->style = 0;
258 wpr->last_style = 0;
259 wpr->trim_whitespace = true;
260
261 wpr->state = WP_STATE_WIKITEXT_PARSE;
262 /* FALLTHROUGH */
263
264 case WP_STATE_WIKITEXT_PARSE: {
265 c = req->chunk[req->chunk_off];
266 last = wpr->buf + wpr->buf_len - 1;
267
268 if (c == '<' || c == '\0') {
269 wpr->state = WP_STATE_DONE;
270 goto done_parsing;
271 }
272
273 /* character conversions */
274
275 if (c == ';') {
276 /* XML entity decode */
277 if (last[-3] == '&' && last[-2] == 'a' && last[-1] == 'm' &&
278 last[0] == 'p') {
279 c = '&';
280 wpr->buf_len -= 4;
281 } else if (last[-4] == '&' && last[-3] == 'n' &&
282 last[-2] == 'b' && last[-1] == 's' && last[0] == 'p') {
283 c = ' ';
284 wpr->buf_len -= 5;
285 } else if (last[-2] == '&' && last[-1] == 'l' && last[0] == 't') {
286 c = '<';
287 wpr->buf_len -= 3;
288 } else if (last[-2] == '&' && last[-1] == 'g' && last[0] == 't') {
289 c = '>';
290 wpr->buf_len -= 3;
291 }
292 last = wpr->buf + wpr->buf_len - 1;
293 } else if (c == '\n') {
294 c = '\r';
295 } else if ((unsigned char)c >= UTF8_RANGE_START &&
296 (unsigned char)c <= UTF8_RANGE_END) {
297 /* utf-8 */
298 if (utf8[0] == 0)
299 utf8[0] = c;
300 else if (utf8[1] == 0)
301 utf8[1] = c;
302 else if (utf8[2] == 0)
303 utf8[2] = c;
304 else if (utf8[3] == 0)
305 utf8[3] = c;
306 else {
307 /* bogus */
308 utf8[0] = 0;
309 c = 0;
310 }
311
312 c = utf8_to_macroman(&utf8);
313 if (c)
314 memset(&utf8, 0, sizeof(utf8));
315 }
316
317 /* check for style changes */
318
319 if (last[0] == '{' && (c == '{' || c == '|')) {
320 wpr->curlys++;
321 wpr->buf_len--;
322 wpr->style |= STYLE_TEMPLATE;
323 c = 0;
324 } else if ((last[0] == '}' || last[0] == '|') && c == '}') {
325 if (wpr->curlys)
326 wpr->curlys--;
327 wpr->buf_len--;
328 if (wpr->curlys == 0)
329 wpr->style &= ~(STYLE_TEMPLATE);
330 c = 0;
331 } else if (last[0] == '[' && c == '[') {
332 if (wpr->brackets)
333 wpr->brackets++;
334 wpr->buf_len--;
335 wpr->style |= STYLE_LINK;
336 c = 0;
337 } else if (last[0] == ']' && c == ']') {
338 if (wpr->brackets)
339 wpr->brackets--;
340 wpr->buf_len--;
341 if (wpr->brackets == 0)
342 wpr->style &= ~(STYLE_LINK);
343 c = 0;
344 } else if (last[-1] == '\'' && last[0] == '\'' && c == '\'') {
345 if (wpr->style & STYLE_BOLD)
346 wpr->style &= ~(STYLE_BOLD);
347 else
348 wpr->style |= STYLE_BOLD;
349 wpr->buf_len -= 2;
350 c = 0;
351 } else if (last[-1] == '\'' && last[0] == '\'' && c != '\'') {
352 if (wpr->style & STYLE_ITALIC)
353 wpr->style &= ~(STYLE_ITALIC);
354 else
355 wpr->style |= STYLE_ITALIC;
356 wpr->buf_len -= 2;
357 /* keep c */
358 } else if (last[-3] == '=' && last[-2] == '=' && last[-1] == '=' &&
359 last[0] == '=' && c == '=') {
360 if (wpr->style & STYLE_H5)
361 wpr->style &= ~(STYLE_H5);
362 else
363 wpr->style |= STYLE_H5;
364 wpr->buf_len -= 4;
365 c = 0;
366 } else if (last[-3] == '=' && last[-2] == '=' && last[-1] == '=' &&
367 last[0] == '=' && c != '=') {
368 if (wpr->style & STYLE_H4)
369 wpr->style &= ~(STYLE_H4);
370 else
371 wpr->style |= STYLE_H4;
372 wpr->buf_len -= 4;
373 /* keep c */
374 } else if (last[-2] == '=' && last[-1] == '=' && last[0] == '=' &&
375 c != '=') {
376 if (wpr->style & STYLE_H3)
377 wpr->style &= ~(STYLE_H3);
378 else
379 wpr->style |= STYLE_H3;
380 wpr->buf_len -= 3;
381 /* keep c */
382 } else if (last[-1] == '=' && last[0] == '=' && c != '=') {
383 if (wpr->style & STYLE_H2)
384 wpr->style &= ~(STYLE_H2);
385 else
386 wpr->style |= STYLE_H2;
387 wpr->buf_len -= 2;
388 /* keep c */
389 } else if (last[-2] == '<' && last[-1] == 'r' && last[0] == 'e' &&
390 c == 'f') {
391 /* <ref */
392 wpr->refs++;
393 wpr->style |= STYLE_REF;
394 wpr->buf_len -= 3;
395 c = 0;
396 } else if ((wpr->style & STYLE_REF) &&
397 ((last[-4] == '<' && last[-3] == '/' && last[-2] == 'r' &&
398 last[-1] == 'e' && last[0] == 'f' && c == '>') ||
399 (last[0] == '/' && c == '>'))) {
400 /* </ref> or <ref /> */
401 if (wpr->refs)
402 wpr->refs--;
403 if (wpr->refs == 0)
404 wpr->style &= ~(STYLE_REF);
405 c = 0;
406 }
407
408 /*
409 * If our style changed as of this character, dump the buffer in
410 * the previous style and clear the buffer.
411 */
412
413 if (wpr->style != wpr->last_style) {
414 if (wpr->last_style & STYLE_TEMPLATE) {
415 if (strncmp(wpr->buf, "convert|", 8) == 0) {
416 /* convert|5.1|lb|... */
417 /* convert|9|in|cm|adj=on */
418 char *conv = xmalloc(wpr->buf_len, "convert");
419 char *conv2 = xmalloc(wpr->buf_len, "convert");
420 size_t len;
421 wpr->buf[wpr->buf_len] = '\0';
422 if (sscanf(wpr->buf, "convert|%[^|]|%[^|]|%n", conv, conv2,
423 &len) == 2 && len >= 13)
424 wpr->buf_len = sprintf(wpr->buf, "%s %s ", conv, conv2);
425 else
426 wpr->buf_len = 0;
427 xfree(&conv);
428 xfree(&conv2);
429 } else
430 wpr->buf_len = 0;
431 }
432
433 /* maybe we can do something with these later */
434 if (wpr->last_style & STYLE_REF)
435 wpr->buf_len = 0;
436
437 /* we can't show inline images */
438 if ((wpr->last_style & STYLE_LINK) &&
439 strncmp(wpr->buf, "File:", 5) == 0) {
440 wpr->buf_len = 0;
441 wpr->trim_whitespace = true;
442 }
443
444 if (wpr->last_style & (STYLE_TEMPLATE |
445 STYLE_H1 | STYLE_H2 | STYLE_H3 | STYLE_H4 | STYLE_H5))
446 wpr->trim_whitespace = true;
447
448 if ((wpr->style & STYLE_LINK) && wpr->article_len == 0 &&
449 strncmp(wpr->buf, "#REDIRECT ", 10) == 0) {
450 wpr->buf_len = 0;
451 wpr->redirect = true;
452 } else if (wpr->redirect &&
453 !(wpr->style & STYLE_LINK) &&
454 (wpr->last_style & STYLE_LINK)) {
455 if (wpr->normalized_title)
456 xfree(&wpr->normalized_title);
457 wpr->buf[wpr->buf_len] = '\0';
458 wpr->normalized_title = xstrdup(wpr->buf, "title");
459 wpr->state = WP_STATE_HAVE_REDIRECT;
460 goto done_parsing;
461 }
462
463 if (wpr->buf_len) {
464 if (wpr->article_len == 0)
465 wikipedia_request_present(wpr);
466
467 browser_print(wpr->browser, wpr->buf, wpr->buf_len,
468 wpr->last_style);
469 wpr->article_len += wpr->buf_len;
470 wpr->buf_len = 0;
471 }
472 wpr->last_style = wpr->style;
473 }
474
475 /* remove whitespace */
476 if (c != 0 && wpr->trim_whitespace) {
477 if (c == '\r' || c == '\t' || c == ' ')
478 /* trim whitespace after these */
479 c = 0;
480 else
481 wpr->trim_whitespace = false;
482 }
483
484 /* and finally, add the new character */
485 if (c != 0)
486 wpr->buf[wpr->buf_len++] = c;
487
488 req->chunk_off++;
489 goto get_char;
490 }
491 }
492
493 done_parsing:
494 if (wpr->buf != NULL)
495 xfree(&wpr->buf);
496
497 if (wpr->http_request != NULL)
498 http_req_free(&wpr->http_request);
499 }
500
501 void
502 wikipedia_request_abort(struct wikipedia_request *wpr)
503 {
504 if (wpr->http_request != NULL)
505 http_req_free(&wpr->http_request);
506 }