AmendHub

Download

jcs

/

detritus

/

html.c

 

(View History)

jcs   html: Put all of this behind HTML_ENABLE Latest amendment: 68 on 2025-03-04

1 /*
2 * Copyright (c) 2024 joshua stein <jcs@jcs.org>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 /*
18 * Glue for tying browser_page to the tokenizer, and then on the other end
19 * handling tags and text output by the tree builder.
20 */
21
22 #include <stdarg.h>
23 #include <stdio.h>
24 #include <string.h>
25
26 #include "html.h"
27
28 #ifdef HTML_ENABLE
29
#ifdef HTML_ENABLE_DEBUGGING
/*
 * Page that html_debug() writes its output to.  html_init_page() points
 * this at the page it has just created.
 */
struct html_page *the_html = NULL;
#endif
33
34 struct html_page *
35 html_init_page(void *cookie)
36 {
37 struct html_page *html;
38
39 /* sanity check */
40 if (strcmp(html_tag_names[HTML_TAG_XMP], "xmp") != 0)
41 panic("html_tag_names is out of sync with HTML_TAGs");
42
43 html = xmalloczero(sizeof(struct html_page));
44 if (html == NULL)
45 return NULL;
46 html->cookie = cookie;
47 html->mode = HTML_MODE_INITIAL;
48 html->state = HTML_STATE_DATA;
49 html->frameset_ok = true;
50
51 html->new_token.doctype.public_identifier_len = -1;
52 html->new_token.doctype.system_identifier_len = -1;
53
54 #ifdef HTML_ENABLE_DEBUGGING
55 the_html = html;
56 #endif
57 return html;
58 }
59
60 bool
61 html_parse(struct html_page *html, char *str, size_t len)
62 {
63 size_t n;
64 register char cc;
65
66 for (n = 0; n < len; n++) {
67 cc = str[n];
68
69 /* https://infra.spec.whatwg.org/#normalize-newlines */
70 if (html->parse_last_cr) {
71 html->parse_last_cr = false;
72 if (cc != '\n') {
73 cc = '\n';
74 n--;
75 }
76 }
77 if (cc == '\r') {
78 html->parse_last_cr = true;
79 continue;
80 }
81
82 html_tokenize(html, cc);
83
84 if (html->eof) {
85 HTML_DEBUG(("\rEOF\r"));
86 break;
87 }
88 }
89
90 if (html->eof)
91 return false;
92
93 return true;
94 }
95
/*
 * Finish a page: let the tokenizer wrap up via html_tokenize_finish(),
 * then release the page through html_xfree().
 */
void
html_page_finish(struct html_page **htmlp)
{
	html_tokenize_finish(*htmlp);
	html_xfree(htmlp);
}
104
105 void
106 html_xfree(struct html_page **htmlp)
107 {
108 struct html_page *html = *htmlp;
109
110 if (html->escaped_buf)
111 xfree(&html->escaped_buf);
112
113 xfree(&html);
114 }
115
116 void
117 html_parse_error(struct html_page *html)
118 {
119 HTML_DEBUG((": [[PARSE ERROR at %d]]", html->input_pos));
120 }
121
#if 0
/*
 * Receives tokens emitted by html_tokenize as it runs each byte of html
 * through its state machine.
 *
 * https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
 */
void
html_emit_token(struct html_page *html, html_token *token)
{
	/*
	 * At some point we might collect tags and process them all at once,
	 * or keep a small buffer of them, before handing them off to the tree
	 * constructor.  For now each token goes straight to the tree
	 * constructor as soon as it arrives.
	 */
	html_process_token(html, token);
}
#endif
145
146 void
147 html_insert_character(struct html_page *html, short cc)
148 {
149 register unsigned char c = cc;
150
151 if (html->current_node == NULL) {
152 Debugger();
153 return;
154 }
155
156 if (html->current_node->text == NULL) {
157 html->current_node->text_size = HTML_TAG_TEXT_CHUNK_SIZE;
158 html->current_node->text = xmalloc(HTML_TAG_TEXT_CHUNK_SIZE);
159 } else if (html->current_node->text_len >=
160 html->current_node->text_size) {
161 html->current_node->text_size += HTML_TAG_TEXT_CHUNK_SIZE;
162 html->current_node->text = xrealloc(html->current_node->text,
163 html->current_node->text_size);
164 }
165 if (html->current_node->text == NULL)
166 panic("OOM");
167
168 if (html->current_node->type == HTML_TAG_TEXTAREA ||
169 html->current_node->type == HTML_TAG_PRE) {
170 /* TODO: still remove leading newlines */
171 goto append;
172 }
173
174 if (c == '\t' || c == '\n' || c == '\f' || c == '\r')
175 c = ' ';
176
177 if (c == ' ') {
178 /* collapse multiple whitespaces */
179 if (html->current_node->text_len && html->current_node->text[
180 html->current_node->text_len - 1] == ' ')
181 return;
182 }
183
184 append:
185 html->current_node->text[html->current_node->text_len++] = c;
186 }
187
188 bool
189 html_is_block_tag(struct html_page *html, html_tag_type tag)
190 {
191 /* https://html.spec.whatwg.org/multipage/sections.html#sections */
192 switch (tag) {
193 case HTML_TAG_ADDRESS:
194 case HTML_TAG_ARTICLE:
195 case HTML_TAG_ASIDE:
196 case HTML_TAG_BLOCKQUOTE:
197 case HTML_TAG_BODY:
198 case HTML_TAG_DD:
199 case HTML_TAG_DIV:
200 case HTML_TAG_DL:
201 case HTML_TAG_DT:
202 case HTML_TAG_FIGCAPTION:
203 case HTML_TAG_FIGURE:
204 case HTML_TAG_FOOTER:
205 case HTML_TAG_H1:
206 case HTML_TAG_H2:
207 case HTML_TAG_H3:
208 case HTML_TAG_H4:
209 case HTML_TAG_H5:
210 case HTML_TAG_H6:
211 case HTML_TAG_HEADER:
212 case HTML_TAG_HGROUP:
213 case HTML_TAG_HR:
214 case HTML_TAG_LI:
215 case HTML_TAG_MAIN:
216 case HTML_TAG_MENU:
217 case HTML_TAG_NAV:
218 case HTML_TAG_OL:
219 case HTML_TAG_P:
220 case HTML_TAG_PRE:
221 case HTML_TAG_SEARCH:
222 case HTML_TAG_SECTION:
223 case HTML_TAG_UL:
224 return true;
225 case HTML_TAG_CENTER:
226 return true;
227 default:
228 return false;
229 }
230 }
231
232 long
233 html_get_attribute_value(struct html_page *html,
234 struct html_element *element, char *name, char **ret)
235 {
236 short n, namelen;
237
238 namelen = strlen(name);
239
240 for (n = 0; n < element->attrs_count; n++) {
241 if (element->attrs[n].name_len != namelen)
242 continue;
243
244 if (strcasecmp(element->attrs[n].name, name) == 0) {
245 *ret = (char *)&element->attrs[n].val;
246 return element->attrs[n].val_len;
247 }
248 }
249
250 *ret = NULL;
251 return 0;
252 }
253
254 void
255 html_render_current_node(struct html_page *html, bool popping)
256 {
257 struct html_element *el = html->current_node;
258 struct html_element *list_parent;
259 short n, len;
260 char ol_li[10];
261 char *val;
262 bool have_height = false;
263 bool found;
264
265 el->renders++;
266
267 /* trim trailing whitespace */
268 if (popping) {
269 while (el->text_len && el->text[el->text_len - 1] == ' ')
270 el->text_len--;
271 }
272
273 /* ignore non-title tags before <body> */
274 if (!html->render_in_body) {
275 for (n = 0; n < html->open_count; n++) {
276 if (html->open[n]->type == HTML_TAG_BODY) {
277 html->render_in_body = true;
278 break;
279 }
280
281 if (n == html->open_count - 1) {
282 if (el->type == HTML_TAG_TITLE)
283 html_have_title(html->cookie, html, el->text,
284 el->text_len);
285 return;
286 }
287 }
288 }
289
290 if (el->renders == 1) {
291 /* block elements should start on a new line */
292 if (html->last_output != '\r' && html->last_output != '\0' &&
293 html_is_block_tag(html, el->type)) {
294 HTML_DEBUG(("[block-separate:%s\\r]", html_tag_names[el->type]));
295 html_output(html->cookie, html, "\r", 1);
296 }
297
298 /* if the element has a top margin, add more space */
299 if (el->margin_top) {
300 /* unless the last element had a bottom margin */
301 if (html->last_margin_bottom || html->last_output == '\0') {
302 HTML_DEBUG(("[margin-top-but-merging:%s]",
303 html_tag_names[el->type]));
304 } else {
305 HTML_DEBUG(("[margin-top:%s\\r]", html_tag_names[el->type]));
306 html_output_margin(html->cookie, html);
307 }
308 html->last_margin_bottom = 0;
309 }
310
311 html->last_margin_top = el->margin_top;
312
313 switch (el->type) {
314 case HTML_TAG_OL:
315 case HTML_TAG_UL:
316 html->render_list_depth++;
317 break;
318 case HTML_TAG_INPUT:
319 have_height = true;
320 html_output_field(html->cookie, html, el);
321 break;
322 case HTML_TAG_IMG:
323 have_height = true;
324 html_output(html->cookie, html, "[ img: ", 7);
325 /* show img alt text */
326 len = html_get_attribute_value(html, el, "alt", &val);
327 if (!val || !len)
328 /* try img title */
329 len = html_get_attribute_value(html, el, "title", &val);
330 if (val && len)
331 html_output(html->cookie, html, val, len);
332 else {
333 /* last resort, show img src filename */
334 len = html_get_attribute_value(html, el, "src", &val);
335 if (val && len) {
336 for (n = len; n >= 0; n--) {
337 if (val[n] == '/') {
338 html_output(html->cookie, html, val + n + 1,
339 len - n - 1);
340 break;
341 }
342 }
343 }
344 }
345 html_output(html->cookie, html, " ]", 2);
346 break;
347 }
348 }
349
350 /* remove leading whitespace */
351 if (el->text_len &&
352 (html->last_output == ' ' || html->last_output == '\r' ||
353 html->last_output == '\0')) {
354 while (el->text_len && el->text[el->text_off] == ' ') {
355 el->text_off++;
356 el->text_len--;
357 }
358 }
359
360 if (html->render_list_depth) {
361 if (el->type == HTML_TAG_LI && el->renders == 1) {
362 for (n = 1; n < html->render_list_depth; n++)
363 html_output(html->cookie, html, "\t", 1);
364
365 list_parent = NULL;
366 for (n = html->open_count - 1; n >= 0; n--) {
367 if (html->open[n]->type == HTML_TAG_OL ||
368 html->open[n]->type == HTML_TAG_UL) {
369 list_parent = html->open[n];
370 break;
371 }
372 }
373
374 if (list_parent && list_parent->type == HTML_TAG_UL) {
375 if (html->render_list_depth == 1)
376 html_output(html->cookie, html, " •\t", 5);
377 else if (html->render_list_depth == 2)
378 html_output(html->cookie, html, " o\t", 5);
379 else
380 html_output(html->cookie, html, " ◊\t", 5);
381 } else if (list_parent && list_parent->type == HTML_TAG_OL) {
382 list_parent->ol_count++;
383 len = snprintf(ol_li, sizeof(ol_li), "% 4d.\t",
384 list_parent->ol_count);
385 html_output(html->cookie, html, ol_li, len);
386 }
387
388 html->last_output = ' ';
389 have_height = true;
390 } else if (el->text_len) {
391 /* in a list but not a direct child of <li>, what are we in? */
392 for (n = html->open_count - 1; n >= 0; n--) {
393 if (html->open[n]->type == HTML_TAG_OL ||
394 html->open[n]->type == HTML_TAG_UL) {
395 /* text in root of list not in an li, ident it */
396 for (n = 0; n < html->render_list_depth; n++)
397 html_output(html->cookie, html, "\t", 1);
398 break;
399 }
400
401 if (html->open[n]->type == HTML_TAG_LI) {
402 if (html->last_output == '\r') {
403 /* text after a <br> inside an <li>, re-indent */
404 for (n = 0; n < html->render_list_depth; n++)
405 html_output(html->cookie, html, "\t", 1);
406 }
407 break;
408 }
409 }
410
411 html->last_output = ' ';
412 }
413 }
414
415 /* print inner text */
416 if (el->text_len) {
417 html_output(html->cookie, html, el->text + el->text_off,
418 el->text_len);
419 have_height = true;
420 }
421
422 /* brrrr */
423 if (el->type == HTML_TAG_BR) {
424 HTML_DEBUG(("[br\\r]"));
425 html_output(html->cookie, html, "\r", 1);
426 have_height = true;
427 }
428
429 /* mark this block (or its nearest parent block) as having height */
430 if (have_height) {
431 if (html_is_block_tag(html, el->type))
432 el->has_height = true;
433 else {
434 /* find parent block */
435 for (n = html->open_count - 1; n >= 0; n--) {
436 if (html_is_block_tag(html, html->open[n]->type)) {
437 html->open[n]->has_height = true;
438 break;
439 }
440 }
441 }
442 }
443
444 if (popping) {
445 /* block elements that had text (or br) get a separating newline */
446 if (el->has_height &&
447 !(el->type == HTML_TAG_OL || el->type == HTML_TAG_UL) &&
448 !(el->type == HTML_TAG_LI && html->last_output == '\r')) {
449 HTML_DEBUG(("[end-block:/%s\\r]", html_tag_names[el->type]));
450 html_output(html->cookie, html, "\r", 1);
451 }
452
453 if (el->margin_bottom) {
454 /* unless the last element had a bottom margin */
455 if (!html->last_margin_bottom) {
456 HTML_DEBUG(("[margin-bottom\\r]"));
457 html_output_margin(html->cookie, html);
458 html->last_margin_bottom = el->margin_bottom;
459 }
460 }
461
462 if (el->has_height) {
463 HTML_DEBUG(("[new-last-margin-bottom:%d]", el->margin_bottom));
464 html->last_margin_bottom = el->margin_bottom;
465 }
466
467 if (el->type == HTML_TAG_OL || el->type == HTML_TAG_UL)
468 html->render_list_depth--;
469 }
470
471 el->text_off = 0;
472 el->text_len = 0;
473 }
474
#ifdef HTML_ENABLE_DEBUGGING
/*
 * printf-style debug output, sent through html_output() on the page
 * recorded in the_html.
 *
 * The message is formatted into a fixed 512-byte static buffer; anything
 * longer is truncated.  Nothing is output if no page has been recorded or
 * if formatting fails.
 */
void
html_debug(const char *fmt, ...)
{
	static char buf[512];
	va_list args;
	size_t len;
	int ret;

	if (the_html == NULL)
		return;

	va_start(args, fmt);
	ret = vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);

	/* a negative return is an error, not a length */
	if (ret < 0)
		return;

	/*
	 * vsnprintf returns the length it wanted to write; on truncation only
	 * sizeof(buf) - 1 characters (plus the terminator) are in buf.
	 */
	len = (size_t)ret;
	if (len >= sizeof(buf))
		len = sizeof(buf) - 1;

	html_output(the_html->cookie, the_html, buf, len);
}
#endif
493
494 #endif /* HTML_ENABLE */