AmendHub

Download

jcs

/

detritus

/

html.h

 

(View History)

jcs   html: Put all of this behind HTML_ENABLE Latest amendment: 68 on 2025-03-04

1 /*
2 * Copyright (c) 2024 joshua stein <jcs@jcs.org>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #include <stdlib.h>
18 #include <stdarg.h>
19 #include <stdio.h>
20 #include <string.h>
21 #include "stdint.h"
22
23 #include "util.h"
24
25 //#define HTML_ENABLE
26
27 #ifdef HTML_ENABLE
/*
 * Forward declarations.  The structures are defined further down, but the
 * prototypes below mention them first.  A struct tag that is first seen
 * inside a parameter list only has prototype scope, which makes the
 * prototype incompatible with any later declaration that uses the real
 * file-scope type, so the tags are introduced at file scope here.
 */
struct html_page;
struct html_element;

/*
 * Functions declared but not defined by this header.
 * NOTE(review): presumably hooks supplied by the embedding application,
 * with `cookie` being the opaque pointer stored in html_page.cookie --
 * confirm against the callers.
 */
void html_output(void *cookie, struct html_page *html, char *str,
    size_t len);
void html_output_margin(void *cookie, struct html_page *html);
void html_output_field(void *cookie, struct html_page *html,
    struct html_element *el);
void html_debug(const char *fmt, ...);
void html_have_title(void *cookie, struct html_page *html, char *str,
    size_t len);
36
/*
 * Debug tracing.  Call with doubled parentheses so the whole printf-style
 * argument list travels as a single macro argument:
 *
 *	HTML_DEBUG(("state %d\n", state));
 *
 * Both variants expand to exactly one statement that takes the trailing
 * ';', so the macro can be used in an un-braced if/else.
 */
//#define HTML_ENABLE_DEBUGGING
#ifdef HTML_ENABLE_DEBUGGING
extern struct html_page *the_html;
# define HTML_DEBUG(x) do { html_debug x; } while (0)
#else
# define HTML_DEBUG(x) do { } while (0)
#endif
44
45 /*
46 * tunables: compile-time sizes for the fixed buffers used by the parser
47 */
48
49 #define HTML_STACK_DEPTH 128 /* slots in html_page.open[] and html_page.active_formatting[] */
50
51 /* this should in theory be the max size of an html_entity but that's huge */
52 #define HTML_LOOKAHEAD_SIZE 10 /* bytes in html_page.lookahead[] */
53
54 #define HTML_OUTPUT_BUF_SIZE 64 /* not referenced in this header; presumably sizes the buffer behind html_buffer_output() -- confirm in html.c */
55 #define HTML_TAG_TEXT_CHUNK_SIZE 512 /* not referenced in this header; presumably the growth step for html_element.text -- confirm */
56
57 /*
58 * helpers
59 */
60
/*
 * ASCII character-class predicates.
 *
 * Every macro evaluates its argument more than once, so never pass an
 * expression with side effects (e.g. IS_DIGIT(*p++)).
 */
#define IS_WHITESPACE(c) ((c) == '\t' || (c) == '\n' || (c) == '\f' || \
	(c) == '\r' || (c) == ' ')
#define IS_LOWER_ALPHA(c) ((c) >= 'a' && (c) <= 'z')
#define IS_UPPER_ALPHA(c) ((c) >= 'A' && (c) <= 'Z')
#define IS_ALPHA(c) (IS_LOWER_ALPHA((c)) || IS_UPPER_ALPHA((c)))
#define IS_DIGIT(c) (((c) >= '0' && (c) <= '9'))
#define IS_ALPHANUMERIC(c) (IS_ALPHA((c)) || IS_DIGIT((c)))

/* letters only (a-f / A-F); decimal digits are matched by IS_DIGIT */
#define IS_LOWER_HEX_DIGIT(c) ((c) >= 'a' && (c) <= 'f')
#define IS_UPPER_HEX_DIGIT(c) ((c) >= 'A' && (c) <= 'F')

/* any hexadecimal digit: 0-9, a-f or A-F */
#define IS_HEX_DIGIT(c) (IS_DIGIT(c) || IS_LOWER_HEX_DIGIT(c) || \
	IS_UPPER_HEX_DIGIT(c))
/*
 * UTF-16 surrogate code points:
 *   leading  (high) surrogates are U+D800..U+DBFF
 *   trailing (low)  surrogates are U+DC00..U+DFFF
 * The argument is evaluated more than once.
 */
#define IS_LEADING_SURROGATE(c) ((c) >= 0xd800 && (c) <= 0xdbff)
#define IS_TRAILING_SURROGATE(c) ((c) >= 0xdc00 && (c) <= 0xdfff)
#define IS_SURROGATE(c) (IS_LEADING_SURROGATE(c) || IS_TRAILING_SURROGATE(c))
/*
 * True when c is U+FDD0..U+FDEF or the last two code points (xFFFE, xFFFF)
 * of any plane from 0 through 0x10.  The argument is evaluated many times,
 * so it must be free of side effects.  Constants above 0xffff need a type
 * wider than 16 bits; pass a value of a wide enough integer type.
 */
74 #define IS_NONCHARACTER(c) (\
75 ((c) >= 0xfdd0 && (c) <= 0xfdef) || \
76 (c) == 0xfffe || (c) == 0xffff || \
77 (c) == 0x1fffe || (c) == 0x1ffff || \
78 (c) == 0x2fffe || (c) == 0x2ffff || \
79 (c) == 0x3fffe || (c) == 0x3ffff || \
80 (c) == 0x4fffe || (c) == 0x4ffff || \
81 (c) == 0x5fffe || (c) == 0x5ffff || \
82 (c) == 0x6fffe || (c) == 0x6ffff || \
83 (c) == 0x7fffe || (c) == 0x7ffff || \
84 (c) == 0x8fffe || (c) == 0x8ffff || \
85 (c) == 0x9fffe || (c) == 0x9ffff || \
86 (c) == 0xafffe || (c) == 0xaffff || \
87 (c) == 0xbfffe || (c) == 0xbffff || \
88 (c) == 0xcfffe || (c) == 0xcffff || \
89 (c) == 0xdfffe || (c) == 0xdffff || \
90 (c) == 0xefffe || (c) == 0xeffff || \
91 (c) == 0xffffe || (c) == 0xfffff || \
92 (c) == 0x10fffe || (c) == 0x10ffff)
/* C0 controls are 0x00..0x1f; IS_CONTROL additionally accepts 0x7f..0x9f. */
93 #define IS_C0_CONTROL(c) ((c) >= 0 && (c) <= 0x1f)
94 #define IS_CONTROL(c) (IS_C0_CONTROL((c)) || ((c) >= 0x7f && (c) <= 0x9f))
/*
 * NOTE(review): HTML_TAG_LAST_BLOCK is not an enumerator of html_tag_type in
 * this header, so IS_BLOCK only compiles if that name is defined elsewhere.
 * html_is_block_tag() is declared below and may be the intended replacement
 * -- confirm before using this macro.
 */
95 #define IS_BLOCK(tag) ((tag) < HTML_TAG_LAST_BLOCK)
96
/*
 * Most recently added attribute of the tag token being built.  Requires a
 * variable named `html` (struct html_page *) in scope at the point of use,
 * and indexes attrs[-1] when attrs_count is 0, so only use it after at least
 * one attribute has been added.
 */
97 #define NEW_TOKEN_LAST_ATTR (html->new_token.tag.attrs[html->new_token.tag.attrs_count - 1])
98
/*
 * Append ch to the NUL-terminated string stored in `field` and bump `len`.
 *
 * only works on fixed-size char arrays: sizeof(field) must be the array's
 * capacity, so never pass a pointer.  One byte is always kept free for the
 * terminator; once the array holds sizeof(field) - 1 characters further
 * characters are silently dropped.  A negative `len` converts to a huge
 * size_t and is therefore rejected as well.
 *
 * `len` must be a modifiable lvalue and is evaluated more than once.  The
 * expansion is a bare `if` statement (kept for compatibility with existing
 * call sites), so brace it when using it as the body of an if/else.
 */
#define STR_APPEND(field, len, ch) \
	if ((size_t)(len) + 1 < sizeof(field)) { \
		(field)[(len)++] = (ch); \
		(field)[(len)] = '\0'; \
	}
105
/*
 * True when html->return_state is one of the three attribute-value states.
 * Requires a variable named `html` (struct html_page *) in scope.
 */
106 #define CONSUMED_AS_PART_OF_AN_ATTRIBUTE \
107 (html->return_state == HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED || \
108 html->return_state == HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED || \
109 html->return_state == HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED)
110
/* Element count of a true array; gives a wrong answer for a pointer. */
111 #ifndef nitems
112 #define nitems(what) (sizeof((what)) / sizeof((what)[0]))
113 #endif
114
115 /* insertion mode */
116 /* https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode */
/*
 * Stored in html_page.mode and html_page.original_mode.
 * NOTE(review): html_mode_names[] is presumably indexed by these values, so
 * enumerators should not be reordered without updating that table -- confirm
 * where the table is defined.
 */
117 extern const char *html_mode_names[];
118 typedef enum {
119 HTML_MODE_NONE = 0, /* extra value with no counterpart in the spec's list; presumably "unset" */
120 HTML_MODE_INITIAL,
121 HTML_MODE_BEFORE_HTML,
122 HTML_MODE_BEFORE_HEAD,
123 HTML_MODE_IN_HEAD,
124 HTML_MODE_IN_HEAD_NOSCRIPT,
125 HTML_MODE_AFTER_HEAD,
126 HTML_MODE_IN_BODY,
127 HTML_MODE_TEXT,
128 HTML_MODE_IN_TABLE,
129 HTML_MODE_IN_TABLE_TEXT,
130 HTML_MODE_IN_CAPTION,
131 HTML_MODE_IN_COLUMN_GROUP,
132 HTML_MODE_IN_TABLE_BODY,
133 HTML_MODE_IN_ROW,
134 HTML_MODE_IN_CELL,
135 HTML_MODE_IN_SELECT,
136 HTML_MODE_IN_SELECT_IN_TABLE,
137 HTML_MODE_IN_TEMPLATE,
138 HTML_MODE_AFTER_BODY,
139 HTML_MODE_IN_FRAMESET,
140 HTML_MODE_AFTER_FRAMESET,
141 HTML_MODE_AFTER_AFTER_BODY,
142 HTML_MODE_AFTER_AFTER_FRAMESET
143 } html_mode;
144
145 /* tokenization state */
146 /* https://html.spec.whatwg.org/multipage/parsing.html#tokenization */
/*
 * Stored in html_page.state and html_page.return_state.
 * NOTE(review): html_state_names[] is presumably indexed by these values, so
 * keep the enumerator order in step with that table -- confirm where the
 * table is defined.
 */
147 extern const char *html_state_names[];
148 typedef enum {
149 HTML_STATE_NONE = 0, /* extra value with no counterpart in the spec's list; presumably "unset" */
150 HTML_STATE_DATA,
151 HTML_STATE_RCDATA,
152 HTML_STATE_RAWTEXT,
153 HTML_STATE_SCRIPT_DATA,
154 HTML_STATE_PLAINTEXT,
155 HTML_STATE_TAG_OPEN,
156 HTML_STATE_END_TAG_OPEN,
157 HTML_STATE_TAG_NAME,
158 HTML_STATE_RCDATA_LESS_THAN_SIGN,
159 HTML_STATE_RCDATA_END_TAG_OPEN,
160 HTML_STATE_RCDATA_END_TAG_NAME,
161 HTML_STATE_RAWTEXT_LESS_THAN_SIGN,
162 HTML_STATE_RAWTEXT_END_TAG_OPEN,
163 HTML_STATE_RAWTEXT_END_TAG_NAME,
164 HTML_STATE_SCRIPT_DATA_LESS_THAN_SIGN,
165 HTML_STATE_SCRIPT_DATA_END_TAG_OPEN,
166 HTML_STATE_SCRIPT_DATA_END_TAG_NAME,
167 HTML_STATE_SCRIPT_DATA_ESCAPE_START,
168 HTML_STATE_SCRIPT_DATA_ESCAPE_START_DASH,
169 HTML_STATE_SCRIPT_DATA_ESCAPED,
170 HTML_STATE_SCRIPT_DATA_ESCAPED_DASH,
171 HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH,
172 HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN,
173 HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN,
174 HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME,
175 HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START,
176 HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED,
177 HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH,
178 HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH,
179 HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN,
180 HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END,
181 HTML_STATE_BEFORE_ATTRIBUTE_NAME,
182 HTML_STATE_ATTRIBUTE_NAME,
183 HTML_STATE_AFTER_ATTRIBUTE_NAME,
184 HTML_STATE_BEFORE_ATTRIBUTE_VALUE,
185 HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED, /* tested by CONSUMED_AS_PART_OF_AN_ATTRIBUTE */
186 HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED, /* tested by CONSUMED_AS_PART_OF_AN_ATTRIBUTE */
187 HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED, /* tested by CONSUMED_AS_PART_OF_AN_ATTRIBUTE */
188 HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED,
189 HTML_STATE_SELF_CLOSING_START_TAG,
190 HTML_STATE_BOGUS_COMMENT,
191 HTML_STATE_MARKUP_DECLARATION_OPEN,
192 HTML_STATE_COMMENT_START,
193 HTML_STATE_COMMENT_START_DASH,
194 HTML_STATE_COMMENT,
195 HTML_STATE_COMMENT_LESS_THAN_SIGN,
196 HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG,
197 HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH,
198 HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH,
199 HTML_STATE_COMMENT_END_DASH,
200 HTML_STATE_COMMENT_END,
201 HTML_STATE_COMMENT_END_BANG,
202 HTML_STATE_DOCTYPE,
203 HTML_STATE_BEFORE_DOCTYPE_NAME,
204 HTML_STATE_DOCTYPE_NAME,
205 HTML_STATE_AFTER_DOCTYPE_NAME,
206 HTML_STATE_AFTER_DOCTYPE_PUBLIC_KEYWORD,
207 HTML_STATE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
208 HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED,
209 HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED,
210 HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER,
211 HTML_STATE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
212 HTML_STATE_AFTER_DOCTYPE_SYSTEM_KEYWORD,
213 HTML_STATE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
214 HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED,
215 HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED,
216 HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
217 HTML_STATE_BOGUS_DOCTYPE,
218 HTML_STATE_CDATA_SECTION,
219 HTML_STATE_CDATA_SECTION_BRACKET,
220 HTML_STATE_CDATA_SECTION_END,
221 HTML_STATE_CHARACTER_REFERENCE,
222 HTML_STATE_NAMED_CHARACTER_REFERENCE,
223 HTML_STATE_AMBIGUOUS_AMPERSAND,
224 HTML_STATE_NUMERIC_CHARACTER_REFERENCE,
225 HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE_START,
226 HTML_STATE_DECIMAL_CHARACTER_REFERENCE_START,
227 HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE,
228 HTML_STATE_DECIMAL_CHARACTER_REFERENCE,
229 HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END
230 } html_state;
231
232 /* tokenization output */
233 /* https://html.spec.whatwg.org/multipage/parsing.html#tokenization */
/*
 * Discriminator for union html_token; each token struct reserves its first
 * member for it.  Values start at 1, so a zero-filled token matches none of
 * them.  The name table is only declared in debugging builds.
 */
234 #ifdef HTML_ENABLE_DEBUGGING
235 extern const char *html_token_names[];
236 #endif
237 typedef enum {
238 HTML_TOKEN_DOCTYPE = 1,
239 HTML_TOKEN_START_TAG,
240 HTML_TOKEN_END_TAG,
241 HTML_TOKEN_COMMENT,
242 HTML_TOKEN_CHARACTER,
243 HTML_TOKEN_EOF
244 } html_token_type;
245
246 /* html_process_token return states */
/*
 * NOTE(review): in this header html_process_token() is declared void; the
 * only prototype that returns html_token_act is
 * html_process_token_in_foreign_content().  The comment above may describe
 * helpers that are private to html_tree.c -- confirm.
 */
247 typedef enum {
248 HTML_TOKEN_REPROCESS = 1,
249 HTML_TOKEN_PROCESSED
250 } html_token_act;
251
252 /* parse errors */
253 /* https://html.spec.whatwg.org/multipage/parsing.html#parse-errors */
/*
 * Stored in html_page.error.  HTML_ERROR_NONE is 0.  The string table is
 * only declared in debugging builds.
 * NOTE(review): html_error_strings[] is presumably indexed by these values;
 * keep the order in step with it -- confirm where the table is defined.
 */
254 #ifdef HTML_ENABLE_DEBUGGING
255 extern const char *html_error_strings[];
256 #endif
257 typedef enum {
258 HTML_ERROR_NONE,
259 HTML_ERROR_ABRUPT_CLOSING_OF_EMPTY_COMMENT,
260 HTML_ERROR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER,
261 HTML_ERROR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER,
262 HTML_ERROR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
263 HTML_ERROR_CDATA_IN_HTML_CONTENT,
264 HTML_ERROR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
265 HTML_ERROR_CONTROL_CHARACTER_IN_INPUT_STREAM,
266 HTML_ERROR_CONTROL_CHARACTER_REFERENCE,
267 HTML_ERROR_DUPLICATE_ATTRIBUTE,
268 HTML_ERROR_END_TAG_WITH_ATTRIBUTES,
269 HTML_ERROR_END_TAG_WITH_TRAILING_SOLIDUS,
270 HTML_ERROR_EOF_BEFORE_TAG_NAME,
271 HTML_ERROR_EOF_IN_CDATA,
272 HTML_ERROR_EOF_IN_COMMENT,
273 HTML_ERROR_EOF_IN_DOCTYPE,
274 HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT,
275 HTML_ERROR_EOF_IN_TAG,
276 HTML_ERROR_INCORRECTLY_CLOSED_COMMENT,
277 HTML_ERROR_INCORRECTLY_OPENED_COMMENT,
278 HTML_ERROR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME,
279 HTML_ERROR_INVALID_FIRST_CHARACTER_OF_TAG_NAME,
280 HTML_ERROR_MISSING_ATTRIBUTE_VALUE,
281 HTML_ERROR_MISSING_DOCTYPE_NAME,
282 HTML_ERROR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER,
283 HTML_ERROR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER,
284 HTML_ERROR_MISSING_END_TAG_NAME,
285 HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
286 HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
287 HTML_ERROR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
288 HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD,
289 HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD,
290 HTML_ERROR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME,
291 HTML_ERROR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES,
292 HTML_ERROR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
293 HTML_ERROR_NESTED_COMMENT,
294 HTML_ERROR_NONCHARACTER_CHARACTER_REFERENCE,
295 HTML_ERROR_NONCHARACTER_IN_INPUT_STREAM,
296 HTML_ERROR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS,
297 HTML_ERROR_NULL_CHARACTER_REFERENCE,
298 HTML_ERROR_SURROGATE_CHARACTER_REFERENCE,
299 HTML_ERROR_SURROGATE_IN_INPUT_STREAM,
300 HTML_ERROR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
301 HTML_ERROR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME,
302 HTML_ERROR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE,
303 HTML_ERROR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME,
304 HTML_ERROR_UNEXPECTED_NULL_CHARACTER,
305 HTML_ERROR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME,
306 HTML_ERROR_UNEXPECTED_SOLIDUS_IN_TAG,
307 HTML_ERROR_UNKNOWN_NAMED_CHARACTER_REFERENCE
308 } html_error;
309
/*
 * Identifiers for the tag names this parser knows.  HTML_TAG__NONE is 0 and
 * HTML_TAG_MAX_ID, being last, equals the number of values before it.
 */
310 /* keep this in same order as html_tag_names[] */
311 extern const char *html_tag_names[];
312 typedef enum {
313 HTML_TAG__NONE = 0,
314 HTML_TAG_A,
315 HTML_TAG_ADDRESS,
316 HTML_TAG_APPLET,
317 HTML_TAG_AREA,
318 HTML_TAG_ARTICLE,
319 HTML_TAG_ASIDE,
320 HTML_TAG_B,
321 HTML_TAG_BASE,
322 HTML_TAG_BASEFONT,
323 HTML_TAG_BGSOUND,
324 HTML_TAG_BIG,
325 HTML_TAG_BLOCKQUOTE,
326 HTML_TAG_BODY,
327 HTML_TAG_BR,
328 HTML_TAG_BUTTON,
329 HTML_TAG_CAPTION,
330 HTML_TAG_CENTER,
331 HTML_TAG_CITE,
332 HTML_TAG_CODE,
333 HTML_TAG_COL,
334 HTML_TAG_COLGROUP,
335 HTML_TAG_DD,
336 HTML_TAG_DETAILS,
337 HTML_TAG_DFN,
338 HTML_TAG_DIALOG,
339 HTML_TAG_DIR,
340 HTML_TAG_DIV,
341 HTML_TAG_DL,
342 HTML_TAG_DT,
343 HTML_TAG_EM,
344 HTML_TAG_EMBED,
345 HTML_TAG_FIELDSET,
346 HTML_TAG_FIGCAPTION,
347 HTML_TAG_FIGURE,
348 HTML_TAG_FONT,
349 HTML_TAG_FOOTER,
350 HTML_TAG_FORM,
351 HTML_TAG_FRAME,
352 HTML_TAG_FRAMESET,
353 HTML_TAG_H1,
354 HTML_TAG_H2,
355 HTML_TAG_H3,
356 HTML_TAG_H4,
357 HTML_TAG_H5,
358 HTML_TAG_H6,
359 HTML_TAG_HEAD,
360 HTML_TAG_HEADER,
361 HTML_TAG_HGROUP,
362 HTML_TAG_HR,
363 HTML_TAG_HTML,
364 HTML_TAG_I,
365 HTML_TAG_IFRAME,
366 HTML_TAG_IMAGE,
367 HTML_TAG_IMG,
368 HTML_TAG_INPUT,
369 HTML_TAG_INS,
370 HTML_TAG_KBD,
371 HTML_TAG_KEYGEN,
372 HTML_TAG_LI,
373 HTML_TAG_LINK,
374 HTML_TAG_LISTING,
375 HTML_TAG_MAIN,
376 HTML_TAG_MARQUEE,
377 HTML_TAG_MATH,
378 HTML_TAG_MENU,
379 HTML_TAG_META,
380 HTML_TAG_NAV,
381 HTML_TAG_NOBR,
382 HTML_TAG_NOEMBED,
383 HTML_TAG_NOFRAMES,
384 HTML_TAG_NOSCRIPT,
385 HTML_TAG_OBJECT,
386 HTML_TAG_OL,
387 HTML_TAG_OPTGROUP,
388 HTML_TAG_OPTION,
389 HTML_TAG_P,
390 HTML_TAG_PARAM,
391 HTML_TAG_PLAINTEXT,
392 HTML_TAG_PRE,
393 HTML_TAG_RB,
394 HTML_TAG_RP,
395 HTML_TAG_RT,
396 HTML_TAG_RTC,
397 HTML_TAG_RUBY,
398 HTML_TAG_S,
399 HTML_TAG_SAMP,
400 HTML_TAG_SCRIPT,
401 HTML_TAG_SEARCH,
402 HTML_TAG_SECTION,
403 HTML_TAG_SELECT,
404 HTML_TAG_SMALL,
405 HTML_TAG_SOURCE,
406 HTML_TAG_SPAN,
407 HTML_TAG_STRIKE,
408 HTML_TAG_STRONG,
409 HTML_TAG_STYLE,
410 HTML_TAG_SUB,
411 HTML_TAG_SUP,
412 HTML_TAG_SUMMARY, /* listed after SUP, i.e. not strictly alphabetical; do not re-sort without html_tag_names[] */
413 HTML_TAG_SVG,
414 HTML_TAG_TABLE,
415 HTML_TAG_TBODY,
416 HTML_TAG_TD,
417 HTML_TAG_TEMPLATE,
418 HTML_TAG_TEXTAREA,
419 HTML_TAG_TFOOT,
420 HTML_TAG_TH,
421 HTML_TAG_THEAD,
422 HTML_TAG_TITLE,
423 HTML_TAG_TR,
424 HTML_TAG_TRACK,
425 HTML_TAG_TT,
426 HTML_TAG_U,
427 HTML_TAG_UL,
428 HTML_TAG_VAR,
429 HTML_TAG_WBR,
430 HTML_TAG_XMP,
431
432 HTML_TAG_MAX_ID /* sentinel, not a tag: must stay last */
433 } html_tag_type;
434
/*
 * NOTE(review): presumably selects which of the spec's "has an element in
 * ... scope" variants a stack search uses -- no user is visible in this
 * header, confirm in html_tree.c.
 */
435 typedef enum {
436 HTML_SCOPE_DEFAULT,
437 HTML_SCOPE_LIST_ITEM,
438 HTML_SCOPE_BUTTON,
439 HTML_SCOPE_TABLE,
440 HTML_SCOPE_SELECT
441 } html_scope;
442
/* Namespace recorded in html_tag.ns and html_element.ns; HTML is 0. */
443 typedef enum {
444 HTML_NAMESPACE_HTML,
445 HTML_NAMESPACE_MATHML,
446 HTML_NAMESPACE_SVG,
447 HTML_NAMESPACE_XLINK,
448 HTML_NAMESPACE_XML,
449 HTML_NAMESPACE_XMLNS
450 } html_namespace;
451
/*
 * One entry of the entity table: a name string paired with a single 32-bit
 * code point.
 * NOTE(review): the length of html_entities[] and how its end is marked are
 * not visible here -- check the definition before iterating.
 */
452 typedef struct {
453 const char *entity;
454 uint32_t codepoint;
455 } html_entity;
456
457 extern const html_entity html_entities[];
458
/*
 * One attribute, stored inline in fixed-size buffers (24 bytes for the name,
 * 128 for the value), each paired with a length counter.
 * NOTE(review): presumably filled via STR_APPEND, which would truncate
 * longer input -- confirm in html_tokenize.c.
 */
459 struct html_attr {
460 char name[24];
461 short name_len;
462 char val[128];
463 short val_len;
464 };
465
466 struct html_tag {
467 /* this must be first */
468 html_tag_type token_type;
469
470 html_tag_type type;
471 html_namespace ns;
472 char name[16];
473 short name_len;
474 struct html_attr attrs[16];
475 short attrs_count;
476 bool emitted;
477 bool self_closing;
478 bool self_closing_acked;
479 };
480
/*
 * A parsed element, as referenced from html_page.open[], current_node,
 * head, form and the need-free list.  Holds at most 8 attributes, fewer
 * than the 16 an html_tag token can carry.
 */
481 struct html_element {
482 html_tag_type type;
483
484 html_namespace ns;
485 char name[16];
486 short name_len;
487 struct html_attr attrs[8];
488 short attrs_count;
489
/* NOTE(review): presumably a heap text buffer with used length, read offset and allocated size -- confirm in html.c */
490 char *text;
491 size_t text_len;
492 size_t text_off;
493 size_t text_size;
494 bool has_height;
495 short margin_top;
496 short margin_bottom;
497 short ol_count;
498 short renders;
499
/* TEHandle is not declared in this header; presumably the classic Mac OS TextEdit handle from the toolbox headers */
500 TEHandle input_te;
501
/* NOTE(review): presumably a reference count plus the link used by html_page.need_free_list -- confirm */
502 short refs;
503 struct html_element *next_need_free;
504 };
505
/* Comment token: one of the members of union html_token.  Only 8 bytes of comment text are kept inline. */
506 struct html_comment {
507 /* this must be first */
508 html_token_type token_type;
509
510 char data[8];
511 short len;
512 };
513
/* Character token: one of the members of union html_token, carrying a single char. */
514 struct html_char {
515 /* this must be first */
516 html_token_type token_type;
517
518 char c;
519 };
520
/*
 * DOCTYPE token: one of the members of union html_token.  Name and
 * identifiers are kept in 16-byte inline buffers.
 */
521 struct html_doctype {
522 /* this must be first */
/* occupies the token-type slot shared with the other token structs, even though it is named _pad here */
523 html_token_type _pad;
524
525 char name[16];
526 short name_len;
527 char public_identifier[16];
528 short public_identifier_len;
529 char system_identifier[16];
530 short system_identifier_len;
531 bool system_identifier_found;
532 bool force_quirks;
533 };
534
535 /*
536 * THINK C doesn't support anonymous unions so we can't have a
537 * struct html_token with tag/doctype/comment at the root
538 *
 * Instead the token is a union: read `type` to find out which of the
 * struct members is the live one.
539 */
539 union html_token {
540 /* every other type has html_token_type as its first member */
541 html_token_type type;
542
543 struct html_tag tag;
544 struct html_doctype doctype;
545 struct html_comment comment;
546 struct html_char ch;
547 };
548 typedef union html_token html_token;
549
/*
 * One slot of html_page.active_formatting[].
 * NOTE(review): `marker` presumably flags the spec's marker entries (with
 * `element` unused in that case), and `token` records only the token type,
 * not the full token -- confirm in html_tree.c.
 */
550 struct html_formatting {
551 bool marker;
552 struct html_element *element;
553 html_token_type token;
554 };
555
/*
 * All state for one document being parsed: tokenizer state, tree-builder
 * state, rendering flags and scratch buffers.  Created by html_init_page()
 * and passed to every other function declared in this header.
 */
556 struct html_page {
/* opaque caller pointer; html_init_page() takes one and the html_output*() prototypes receive one */
557 void *cookie;
558
559 size_t input_pos;
560 bool eof;
561
562 /* insertion mode */
563 html_mode mode;
564 html_mode original_mode;
565
/* tokenizer state, and the state to go back to (see CONSUMED_AS_PART_OF_AN_ATTRIBUTE) */
566 html_state state;
567 html_state return_state;
568
569 html_error error;
570
571 char *escaped_buf;
572 size_t escaped_size;
573
574 long char_ref_code;
575
576 bool parse_last_cr;
577 bool frameset_ok;
578 bool parser_cannot_change_mode;
579 bool foster_parenting;
580 bool quirks_mode;
581
582 /* rendering */
583 bool render_in_body;
584 short render_list_depth;
585 char last_output;
586 bool last_margin_top;
587 bool last_margin_bottom;
588
589 /* configurables */
590 bool ignore_script_data;
591 bool ignore_comment_data;
592 bool scripting;
593 bool styling;
594
595 /* if the next character token should be skipped if it's \n */
596 bool skip_newline_char_token;
597
598 /* "stack of open elements" */
599 struct html_element *open[HTML_STACK_DEPTH];
600 short open_count;
601 struct html_element *current_node;
/* head and tail of a list chained through html_element.next_need_free */
602 struct html_element *need_free_list;
603 struct html_element *need_free_tail;
604
605 /* https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements */
606 struct html_formatting active_formatting[HTML_STACK_DEPTH];
607 short active_formatting_count;
608
609 /* https://html.spec.whatwg.org/multipage/parsing.html#the-element-pointers */
610 struct html_element *head;
611 struct html_element *form;
612
/* token currently being assembled (see NEW_TOKEN_LAST_ATTR) */
613 union html_token new_token;
614
615 /* we'll queue some characters up before actually parsing */
616 char lookahead[HTML_LOOKAHEAD_SIZE];
617 unsigned char lookahead_len;
618
619 /* some tokens need a temporary buffer to store text */
620 char tmp[128];
/* unsigned char counter: can index all 128 bytes of tmp[] but cannot exceed 255 */
621 unsigned char tmp_len;
622 };
623
/*
 * NOTE(review): a single byte rather than U+FFFD; presumably the stand-in
 * emitted where the spec calls for the replacement character in the
 * single-byte output encoding -- confirm against the renderer.
 */
624 #define HTML_REPLACEMENT_CHARACTER 0xff
625
/*
 * These three prototypes repeat declarations that already appear near the
 * top of this header; they are redundant but harmless as long as both
 * copies stay identical.
 */
626 void html_output(void *cookie, struct html_page *html, char *str,
627 size_t len);
628 void html_debug(const char *fmt, ...);
629 void html_have_title(void *cookie, struct html_page *html, char *str,
630 size_t len);
631
/* Prototypes grouped by the source file named in each heading. */
632 /* html.c */
633 struct html_page * html_init_page(void *cookie);
/* these two take the address of the caller's pointer; presumably so it can be cleared -- confirm in html.c */
634 void html_page_finish(struct html_page **htmlp);
635 void html_xfree(struct html_page **htmlp);
636 bool html_parse(struct html_page *html, char *str, size_t len);
637 void html_insert_character(struct html_page *html, short cc);
638 bool html_is_block_tag(struct html_page *html, html_tag_type tag);
639 long html_get_attribute_value(struct html_page *html,
640 struct html_element *element, char *name, char **ret);
641 void html_render_current_node(struct html_page *html, bool popping);
642 void html_parse_error(struct html_page *html);
/* html_debug is also declared earlier in this header; this repeat is redundant */
643 void html_debug(const char *fmt, ...);
/* html_emit_token is currently a macro alias for html_process_token(); the function prototype is compiled out */
644 #if 0
645 void html_emit_token(struct html_page *html, html_token *token);
646 #else
647 #define html_emit_token(a, b) html_process_token(a, b)
648 #endif
649 void html_buffer_output(struct html_page *html, char *str, size_t len);
650 void html_flush_output_buffer(struct html_page *html);
651
652 /* html_tokenize.c */
/* cc is a short rather than a char; presumably so out-of-band values such as EOF can be passed -- confirm */
653 void html_tokenize(struct html_page *html, short cc);
654 void html_prep_new_token(struct html_page *html, html_token_type token_type);
655 struct html_attr * html_prep_new_attribute(struct html_page *html,
656 struct html_tag *tag);
657 void html_tokenize_finish(struct html_page *html);
658 html_token_act html_process_token_in_foreign_content(struct html_page *html,
659 html_token *token);
660
661 /* html_tree.c */
662 void html_process_token(struct html_page *html, html_token *token);
663 void html_append_comment(struct html_page *html, struct html_comment *comment);
664 void html_stop_parsing(struct html_page *html);
665 char * html_escape_string(struct html_page *html, char *str, size_t *len,
666 bool attribute_mode);
667 void html_emit_char_token(struct html_page *html, short cc);
668 void html_emit_eof_token(struct html_page *html);
669 void html_emit_comment(struct html_page *html, struct html_comment *comment);
651
652 /* html_tokenize.c */
653 void html_tokenize(struct html_page *html, short cc);
654 void html_prep_new_token(struct html_page *html, html_token_type token_type);
655 struct html_attr * html_prep_new_attribute(struct html_page *html,
656 struct html_tag *tag);
657 void html_tokenize_finish(struct html_page *html);
658 html_token_act html_process_token_in_foreign_content(struct html_page *html,
659 html_token *token);
660
661 /* html_tree.c */
662 void html_process_token(struct html_page *html, html_token *token);
663 void html_append_comment(struct html_page *html, struct html_comment *comment);
664 void html_stop_parsing(struct html_page *html);
665 char * html_escape_string(struct html_page *html, char *str, size_t *len,
666 bool attribute_mode);
667 void html_emit_char_token(struct html_page *html, short cc);
668 void html_emit_eof_token(struct html_page *html);
669 void html_emit_comment(struct html_page *html, struct html_comment *comment);
670
671 #endif /* HTML_ENABLE */