Download
jcs
/wikipedia
/wikipedia.c
(View History)
jcs wikipedia: Remove debugging code | Latest amendment: 36 on 2022-09-28 |
1 | /* |
2 | * Copyright (c) 2021-2022 joshua stein <jcs@jcs.org> |
3 | * |
4 | * Permission to use, copy, modify, and distribute this software for any |
5 | * purpose with or without fee is hereby granted, provided that the above |
6 | * copyright notice and this permission notice appear in all copies. |
7 | * |
8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
15 | */ |
16 | |
17 | #include <stdarg.h> |
18 | #include <stdio.h> |
19 | #include <string.h> |
20 | |
21 | #include "wikipedia.h" |
22 | #include "http.h" |
23 | #include "utf8.h" |
24 | #include "util.h" |
25 | |
26 | /* en.wikipedia.org doesn't support non-TLS :( so the http:// API URLs built in this file are sent to this host instead */ |
27 | #define WIKIPEDIA_HOST "wikipedia.jcs.org" |
28 | |
29 | struct wikipedia_request * |
30 | wikipedia_fetch_article(struct browser *browser, char *name) |
31 | { |
32 | static char url[256]; |
33 | struct wikipedia_request *wpr; |
34 | short state; |
35 | char *nencoded; |
36 | unsigned char *uname; |
37 | |
38 | progress("Fetching article \"%s\"...", name); |
39 | |
40 | wpr = xmalloczero(sizeof(struct wikipedia_request), |
41 | "fetch_article wpr"); |
42 | wpr->browser = browser; |
43 | |
44 | uname = macroman_to_utf8_string((unsigned char *)name, strlen(name)); |
45 | nencoded = url_encode(uname); |
46 | xfree(&uname); |
47 | |
48 | snprintf(url, sizeof(url), "http://%s/w/api.php?action=query&" |
49 | "prop=revisions&rvslots=*&rvprop=content&" |
50 | "format=xml&titles=%s", WIKIPEDIA_HOST, nencoded); |
51 | xfree(&nencoded); |
52 | wpr->http_request = http_get(url); |
53 | http_req_skip_header(wpr->http_request); |
54 | wpr->state = WP_STATE_XML_INIT; |
55 | wpr->normalized_title = xstrdup(name, "normalized_title"); |
56 | |
57 | browser_debug_print(wpr->browser, wpr->http_request->chunk, |
58 | wpr->http_request->chunk_len); |
59 | |
60 | return wpr; |
61 | } |
62 | |
63 | size_t |
64 | wikipedia_fetch_search_results(struct browser *browser, char *query, |
65 | char ***results) |
66 | { |
67 | static char url[256]; |
68 | struct http_request *req; |
69 | char *qencoded; |
70 | char **rets = NULL; |
71 | char *str = NULL, *nstr = NULL, c; |
72 | unsigned char *uquery; |
73 | short strings = 0; |
74 | size_t nrets = 0, len, n, npos; |
75 | utf8_char utf8 = { 0 }; |
76 | enum xml_state { |
77 | XML_DEFAULT, |
78 | XML_IN_TAG, |
79 | XML_IN_TEXT |
80 | } xstate = 0; |
81 | char *buf; |
82 | size_t buf_size; |
83 | size_t buf_len; |
84 | |
85 | uquery = macroman_to_utf8_string((unsigned char *)query, strlen(query)); |
86 | qencoded = url_encode(uquery); |
87 | xfree(&uquery); |
88 | |
89 | snprintf(url, sizeof(url), "http://%s/w/api.php?action=opensearch&" |
90 | "format=xml&namespace=0&limit=10&redirects=return&search=%s", |
91 | WIKIPEDIA_HOST, qencoded); |
92 | xfree(&qencoded); |
93 | req = http_get(url); |
94 | http_req_skip_header(req); |
95 | |
96 | buf_size = 256; |
97 | buf_len = 0; |
98 | buf = xmalloc(buf_size, "xml buf"); |
99 | |
100 | for (;;) { |
101 | if (req->chunk_len == 0 || (req->chunk_off + 1 > req->chunk_len)) { |
102 | req->chunk_len = http_req_read(req, req->chunk, |
103 | sizeof(req->chunk)); |
104 | req->chunk_off = 0; |
105 | |
106 | if (req->chunk_len < 1 || (req->chunk_off + 1 > req->chunk_len)) |
107 | break; |
108 | } |
109 | |
110 | c = req->chunk[req->chunk_off++]; |
111 | |
112 | if (c == '<') { |
113 | if (xstate == XML_IN_TEXT) { |
114 | nrets++; |
115 | rets = xreallocarray(rets, sizeof(Ptr), nrets); |
116 | nstr = xstrndup(buf, buf_len, "search result"); |
117 | rets[nrets - 1] = nstr; |
118 | } |
119 | |
120 | buf[0] = '\0'; |
121 | buf_len = 0; |
122 | xstate = XML_IN_TAG; |
123 | } else if (c == '>') { |
124 | if (xstate == XML_IN_TAG && |
125 | strncmp(buf, "Text xml:", 9) == 0) |
126 | xstate = XML_IN_TEXT; |
127 | else |
128 | xstate = XML_DEFAULT; |
129 | |
130 | buf[0] = '\0'; |
131 | buf_len = 0; |
132 | } else if (buf_len < buf_size) { |
133 | if ((unsigned char)c >= UTF8_RANGE_START && |
134 | (unsigned char)c <= UTF8_RANGE_END) { |
135 | if (utf8[0] == 0) |
136 | utf8[0] = c; |
137 | else if (utf8[1] == 0) |
138 | utf8[1] = c; |
139 | else if (utf8[2] == 0) |
140 | utf8[2] = c; |
141 | else if (utf8[3] == 0) |
142 | utf8[3] = c; |
143 | else { |
144 | /* bogus */ |
145 | utf8[0] = 0; |
146 | c = 0; |
147 | } |
148 | |
149 | c = utf8_to_macroman(&utf8); |
150 | if (c) |
151 | memset(&utf8, 0, sizeof(utf8)); |
152 | } |
153 | |
154 | if (c) |
155 | buf[buf_len++] = c; |
156 | } |
157 | } |
158 | |
159 | http_req_free(&req); |
160 | http_req_free(&buf); |
161 | |
162 | *results = rets; |
163 | |
164 | return nrets; |
165 | } |
166 | |
167 | void |
168 | wikipedia_request_present(struct wikipedia_request *wpr) |
169 | { |
170 | char title[255]; |
171 | |
172 | snprintf(title, sizeof(title), "%s: %s", PROGRAM_NAME, |
173 | wpr->normalized_title); |
174 | SetWTitle(wpr->browser->win, CtoPstr(title)); |
175 | |
176 | browser_clear(wpr->browser); |
177 | browser_print(wpr->browser, wpr->normalized_title, |
178 | strlen(wpr->normalized_title), STYLE_H1); |
179 | } |
180 | |
181 | void |
182 | wikipedia_request_process(struct wikipedia_request *wpr) |
183 | { |
184 | struct http_request *req = wpr->http_request; |
185 | size_t len, n; |
186 | char c, *last; |
187 | enum xml_state { |
188 | XML_DEFAULT, |
189 | XML_IN_NORMALIZED |
190 | } xstate = 0; |
191 | utf8_char utf8 = { 0 }; |
192 | |
193 | get_char: |
194 | if (req->chunk_len == 0 || (req->chunk_off + 1 > req->chunk_len)) { |
195 | req->chunk_len = http_req_read(req, req->chunk, |
196 | sizeof(req->chunk)); |
197 | req->chunk_off = 0; |
198 | |
199 | if (req->chunk_len < 1 || (req->chunk_off + 1 > req->chunk_len)) { |
200 | wpr->state = WP_STATE_DONE; |
201 | goto done_parsing; |
202 | } |
203 | |
204 | browser_debug_print(wpr->browser, req->chunk, req->chunk_len); |
205 | } |
206 | |
207 | switch (wpr->state) { |
208 | case WP_STATE_XML_INIT: |
209 | wpr->buf_size = 1024; |
210 | wpr->buf_len = 0; |
211 | wpr->buf = xmalloc(wpr->buf_size, "wpr buf"); |
212 | wpr->state = WP_STATE_XML_PARSE; |
213 | goto get_char; |
214 | |
215 | case WP_STATE_XML_PARSE: |
216 | c = req->chunk[req->chunk_off++]; |
217 | |
218 | if (c == '<') { |
219 | wpr->buf[0] = '\0'; |
220 | wpr->buf_len = 0; |
221 | } else if (c == '>') { |
222 | wpr->buf[wpr->buf_len] = '\0'; |
223 | if (xstate == XML_DEFAULT) { |
224 | if (strcmp(wpr->buf, "normalized") == 0) { |
225 | xstate = XML_IN_NORMALIZED; |
226 | } else if (strncmp(wpr->buf, "slot ", 5) == 0) { |
227 | wpr->state = WP_STATE_WIKITEXT_INIT; |
228 | } |
229 | } else if (xstate == XML_IN_NORMALIZED) { |
230 | char from_normalized[255], to_normalized[255]; |
231 | size_t count; |
232 | |
233 | if (sscanf(wpr->buf, "n from=\"%[^\"]\" to=\"%[^\"]\"%n", |
234 | &from_normalized, &to_normalized, &count) == 2 && |
235 | count > 10) { |
236 | if (wpr->normalized_title != NULL) |
237 | xfree(&wpr->normalized_title); |
238 | wpr->normalized_title = xstrdup(to_normalized, |
239 | "to_normalized"); |
240 | } else |
241 | xstate = XML_DEFAULT; |
242 | } |
243 | } else if (wpr->buf_len < wpr->buf_size) { |
244 | wpr->buf[wpr->buf_len++] = c; |
245 | } |
246 | |
247 | goto get_char; |
248 | |
249 | case WP_STATE_WIKITEXT_INIT: |
250 | wpr->article_len = 0; |
251 | wpr->buf_len = 0; |
252 | wpr->buf[0] = '\0'; |
253 | |
254 | wpr->curlys = 0; |
255 | wpr->brackets = 0; |
256 | wpr->refs = 0; |
257 | wpr->style = 0; |
258 | wpr->last_style = 0; |
259 | wpr->trim_whitespace = true; |
260 | |
261 | wpr->state = WP_STATE_WIKITEXT_PARSE; |
262 | /* FALLTHROUGH */ |
263 | |
264 | case WP_STATE_WIKITEXT_PARSE: { |
265 | c = req->chunk[req->chunk_off]; |
266 | last = wpr->buf + wpr->buf_len - 1; |
267 | |
268 | if (c == '<' || c == '\0') { |
269 | wpr->state = WP_STATE_DONE; |
270 | goto done_parsing; |
271 | } |
272 | |
273 | /* character conversions */ |
274 | |
275 | if (c == ';') { |
276 | /* XML entity decode */ |
277 | if (last[-3] == '&' && last[-2] == 'a' && last[-1] == 'm' && |
278 | last[0] == 'p') { |
279 | c = '&'; |
280 | wpr->buf_len -= 4; |
281 | } else if (last[-4] == '&' && last[-3] == 'n' && |
282 | last[-2] == 'b' && last[-1] == 's' && last[0] == 'p') { |
283 | c = ' '; |
284 | wpr->buf_len -= 5; |
285 | } else if (last[-2] == '&' && last[-1] == 'l' && last[0] == 't') { |
286 | c = '<'; |
287 | wpr->buf_len -= 3; |
288 | } else if (last[-2] == '&' && last[-1] == 'g' && last[0] == 't') { |
289 | c = '>'; |
290 | wpr->buf_len -= 3; |
291 | } |
292 | last = wpr->buf + wpr->buf_len - 1; |
293 | } else if (c == '\n') { |
294 | c = '\r'; |
295 | } else if ((unsigned char)c >= UTF8_RANGE_START && |
296 | (unsigned char)c <= UTF8_RANGE_END) { |
297 | /* utf-8 */ |
298 | if (utf8[0] == 0) |
299 | utf8[0] = c; |
300 | else if (utf8[1] == 0) |
301 | utf8[1] = c; |
302 | else if (utf8[2] == 0) |
303 | utf8[2] = c; |
304 | else if (utf8[3] == 0) |
305 | utf8[3] = c; |
306 | else { |
307 | /* bogus */ |
308 | utf8[0] = 0; |
309 | c = 0; |
310 | } |
311 | |
312 | c = utf8_to_macroman(&utf8); |
313 | if (c) |
314 | memset(&utf8, 0, sizeof(utf8)); |
315 | } |
316 | |
317 | /* check for style changes */ |
318 | |
319 | if (last[0] == '{' && (c == '{' || c == '|')) { |
320 | wpr->curlys++; |
321 | wpr->buf_len--; |
322 | wpr->style |= STYLE_TEMPLATE; |
323 | c = 0; |
324 | } else if ((last[0] == '}' || last[0] == '|') && c == '}') { |
325 | if (wpr->curlys) |
326 | wpr->curlys--; |
327 | wpr->buf_len--; |
328 | if (wpr->curlys == 0) |
329 | wpr->style &= ~(STYLE_TEMPLATE); |
330 | c = 0; |
331 | } else if (last[0] == '[' && c == '[') { |
332 | if (wpr->brackets) |
333 | wpr->brackets++; |
334 | wpr->buf_len--; |
335 | wpr->style |= STYLE_LINK; |
336 | c = 0; |
337 | } else if (last[0] == ']' && c == ']') { |
338 | if (wpr->brackets) |
339 | wpr->brackets--; |
340 | wpr->buf_len--; |
341 | if (wpr->brackets == 0) |
342 | wpr->style &= ~(STYLE_LINK); |
343 | c = 0; |
344 | } else if (last[-1] == '\'' && last[0] == '\'' && c == '\'') { |
345 | if (wpr->style & STYLE_BOLD) |
346 | wpr->style &= ~(STYLE_BOLD); |
347 | else |
348 | wpr->style |= STYLE_BOLD; |
349 | wpr->buf_len -= 2; |
350 | c = 0; |
351 | } else if (last[-1] == '\'' && last[0] == '\'' && c != '\'') { |
352 | if (wpr->style & STYLE_ITALIC) |
353 | wpr->style &= ~(STYLE_ITALIC); |
354 | else |
355 | wpr->style |= STYLE_ITALIC; |
356 | wpr->buf_len -= 2; |
357 | /* keep c */ |
358 | } else if (last[-3] == '=' && last[-2] == '=' && last[-1] == '=' && |
359 | last[0] == '=' && c == '=') { |
360 | if (wpr->style & STYLE_H5) |
361 | wpr->style &= ~(STYLE_H5); |
362 | else |
363 | wpr->style |= STYLE_H5; |
364 | wpr->buf_len -= 4; |
365 | c = 0; |
366 | } else if (last[-3] == '=' && last[-2] == '=' && last[-1] == '=' && |
367 | last[0] == '=' && c != '=') { |
368 | if (wpr->style & STYLE_H4) |
369 | wpr->style &= ~(STYLE_H4); |
370 | else |
371 | wpr->style |= STYLE_H4; |
372 | wpr->buf_len -= 4; |
373 | /* keep c */ |
374 | } else if (last[-2] == '=' && last[-1] == '=' && last[0] == '=' && |
375 | c != '=') { |
376 | if (wpr->style & STYLE_H3) |
377 | wpr->style &= ~(STYLE_H3); |
378 | else |
379 | wpr->style |= STYLE_H3; |
380 | wpr->buf_len -= 3; |
381 | /* keep c */ |
382 | } else if (last[-1] == '=' && last[0] == '=' && c != '=') { |
383 | if (wpr->style & STYLE_H2) |
384 | wpr->style &= ~(STYLE_H2); |
385 | else |
386 | wpr->style |= STYLE_H2; |
387 | wpr->buf_len -= 2; |
388 | /* keep c */ |
389 | } else if (last[-2] == '<' && last[-1] == 'r' && last[0] == 'e' && |
390 | c == 'f') { |
391 | /* <ref */ |
392 | wpr->refs++; |
393 | wpr->style |= STYLE_REF; |
394 | wpr->buf_len -= 3; |
395 | c = 0; |
396 | } else if ((wpr->style & STYLE_REF) && |
397 | ((last[-4] == '<' && last[-3] == '/' && last[-2] == 'r' && |
398 | last[-1] == 'e' && last[0] == 'f' && c == '>') || |
399 | (last[0] == '/' && c == '>'))) { |
400 | /* </ref> or <ref /> */ |
401 | if (wpr->refs) |
402 | wpr->refs--; |
403 | if (wpr->refs == 0) |
404 | wpr->style &= ~(STYLE_REF); |
405 | c = 0; |
406 | } |
407 | |
408 | /* |
409 | * If our style changed as of this character, dump the buffer in |
410 | * the previous style and clear the buffer. |
411 | */ |
412 | |
413 | if (wpr->style != wpr->last_style) { |
414 | if (wpr->last_style & STYLE_TEMPLATE) { |
415 | if (strncmp(wpr->buf, "convert|", 8) == 0) { |
416 | /* convert|5.1|lb|... */ |
417 | /* convert|9|in|cm|adj=on */ |
418 | char *conv = xmalloc(wpr->buf_len, "convert"); |
419 | char *conv2 = xmalloc(wpr->buf_len, "convert"); |
420 | size_t len; |
421 | wpr->buf[wpr->buf_len] = '\0'; |
422 | if (sscanf(wpr->buf, "convert|%[^|]|%[^|]|%n", conv, conv2, |
423 | &len) == 2 && len >= 13) |
424 | wpr->buf_len = sprintf(wpr->buf, "%s %s ", conv, conv2); |
425 | else |
426 | wpr->buf_len = 0; |
427 | xfree(&conv); |
428 | xfree(&conv2); |
429 | } else |
430 | wpr->buf_len = 0; |
431 | } |
432 | |
433 | /* maybe we can do something with these later */ |
434 | if (wpr->last_style & STYLE_REF) |
435 | wpr->buf_len = 0; |
436 | |
437 | /* we can't show inline images */ |
438 | if ((wpr->last_style & STYLE_LINK) && |
439 | strncmp(wpr->buf, "File:", 5) == 0) { |
440 | wpr->buf_len = 0; |
441 | wpr->trim_whitespace = true; |
442 | } |
443 | |
444 | if (wpr->last_style & (STYLE_TEMPLATE | |
445 | STYLE_H1 | STYLE_H2 | STYLE_H3 | STYLE_H4 | STYLE_H5)) |
446 | wpr->trim_whitespace = true; |
447 | |
448 | if ((wpr->style & STYLE_LINK) && wpr->article_len == 0 && |
449 | strncmp(wpr->buf, "#REDIRECT ", 10) == 0) { |
450 | wpr->buf_len = 0; |
451 | wpr->redirect = true; |
452 | } else if (wpr->redirect && |
453 | !(wpr->style & STYLE_LINK) && |
454 | (wpr->last_style & STYLE_LINK)) { |
455 | if (wpr->normalized_title) |
456 | xfree(&wpr->normalized_title); |
457 | wpr->buf[wpr->buf_len] = '\0'; |
458 | wpr->normalized_title = xstrdup(wpr->buf, "title"); |
459 | wpr->state = WP_STATE_HAVE_REDIRECT; |
460 | goto done_parsing; |
461 | } |
462 | |
463 | if (wpr->buf_len) { |
464 | if (wpr->article_len == 0) |
465 | wikipedia_request_present(wpr); |
466 | |
467 | browser_print(wpr->browser, wpr->buf, wpr->buf_len, |
468 | wpr->last_style); |
469 | wpr->article_len += wpr->buf_len; |
470 | wpr->buf_len = 0; |
471 | } |
472 | wpr->last_style = wpr->style; |
473 | } |
474 | |
475 | /* remove whitespace */ |
476 | if (c != 0 && wpr->trim_whitespace) { |
477 | if (c == '\r' || c == '\t' || c == ' ') |
478 | /* trim whitespace after these */ |
479 | c = 0; |
480 | else |
481 | wpr->trim_whitespace = false; |
482 | } |
483 | |
484 | /* and finally, add the new character */ |
485 | if (c != 0) |
486 | wpr->buf[wpr->buf_len++] = c; |
487 | |
488 | req->chunk_off++; |
489 | goto get_char; |
490 | } |
491 | } |
492 | |
493 | done_parsing: |
494 | if (wpr->buf != NULL) |
495 | xfree(&wpr->buf); |
496 | |
497 | if (wpr->http_request != NULL) |
498 | http_req_free(&wpr->http_request); |
499 | } |
500 | |
501 | void |
502 | wikipedia_request_abort(struct wikipedia_request *wpr) |
503 | { |
504 | if (wpr->http_request != NULL) |
505 | http_req_free(&wpr->http_request); |
506 | } |