/*
 * Copyright (c) 2024 joshua stein
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Tree construction
 * https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
 *
 * html_tokenize() outputs tokens of various types to the html_emit_*token()
 * functions, which then output them to html_process_token() here for tree
 * building, tag order manipulation, tag closing, etc.
 */

#include "html.h"

#ifdef HTML_ENABLE

void html_deref_element(struct html_page *html, struct html_element *element);
void html_append_element(struct html_page *html, struct html_element *element);
struct html_element * html_create_element_for_token(struct html_page *html,
    html_token *token);
struct html_element * html_append_element_for_token(struct html_page *html,
    html_token *token, html_namespace ns);
bool html_remove_open_element(struct html_page *html,
    struct html_element *element);

html_token_act html_process_token_initial(struct html_page *html,
    html_token *token);
html_token_act html_process_token_before_html(struct html_page *html,
    html_token *token);
html_token_act html_process_token_before_head(struct html_page *html,
    html_token *token);
html_token_act html_process_token_in_head(struct html_page *html,
    html_token *token);
html_token_act html_process_token_in_head_noscript(struct html_page *html,
    html_token *token);
html_token_act html_process_token_after_head(struct html_page *html,
    html_token *token);
html_token_act html_process_token_in_body(struct html_page *html,
    html_token *token);
html_token_act html_process_token_text(struct html_page *html,
    html_token *token);
html_token_act html_process_token_in_table(struct html_page *html,
    html_token *token);
html_token_act html_process_token_in_table_text(struct html_page *html,
    html_token *token);
html_token_act html_process_token_in_caption(struct html_page *html,
    html_token *token);
html_token_act html_process_token_in_column_group(struct html_page *html,
    html_token *token);
html_token_act html_process_token_in_table_body(struct html_page *html,
    html_token *token);
html_token_act html_process_token_in_row(struct html_page *html,
    html_token *token);
html_token_act html_process_token_in_cell(struct html_page *html,
    html_token *token);
html_token_act html_process_token_in_select(struct html_page *html,
    html_token *token);
html_token_act html_process_token_in_select_in_table(struct html_page *html,
    html_token *token);
html_token_act html_process_token_in_template(struct html_page *html,
    html_token *token);
html_token_act html_process_token_after_body(struct html_page *html,
    html_token *token);
html_token_act html_process_token_in_frameset(struct html_page *html,
    html_token *token);
html_token_act html_process_token_after_frameset(struct html_page *html,
    html_token *token);
html_token_act html_process_token_after_after_body(struct html_page *html,
    html_token *token);
html_token_act html_process_token_after_after_frameset(struct html_page *html,
    html_token *token);

void html_pop_current_element(struct html_page *html);
void html_pop_nodes_until_past_tag(struct html_page *html,
    html_tag_type stop_after);
void html_pop_nodes_until_past_element(struct html_page *html,
    struct html_element *element);
void html_close_p(struct html_page *html);
void html_generate_implied_end_tags(struct html_page *html, char *except,
    bool thoroughly);

/* active formatting */
void html_push_active_formatting_element(struct html_page *html,
    struct html_element *element, html_token_type token_type);
void html_push_active_formatting_marker(struct html_page *html,
    html_token_type token_type);
bool html_is_tag_in_active_formatting(struct html_page *html,
    html_tag_type tag);
bool html_is_element_in_active_formatting(struct html_page *html,
    struct html_element *element);
bool html_remove_active_formatting_element(struct html_page *html,
    struct html_element *element);
void html_reconstruct_active_formatting(struct html_page *html);
void html_clear_active_formatting_to_last_marker(struct html_page *html);
bool html_run_adoption_agency(struct html_page *html, html_token *token);

/* helpers */
bool html_is_element_special(struct html_page *html, struct html_element *el);
bool html_is_element_formatting(struct html_page *html,
    struct html_element *el);
bool html_is_element_open(struct html_page *html, struct html_element *el);
bool html_has_tag_open(struct html_page *html, html_tag_type tag);
bool html_has_element_in_scope(struct html_page *html,
    struct html_element *element, html_scope scope);
bool html_has_element_with_tag_open_in_scope(struct html_page *html,
    html_tag_type tag, html_scope scope);
bool html_has_element_or_one_with_tag_open_in_scope(struct html_page *html,
    struct html_element *element, html_tag_type tag, html_scope scope);
bool html_element_serializes_as_void(struct html_page *html,
    struct html_element *element);

/*
 * Push element onto the stack of open elements and make it the current
 * node.
 *
 * The previous current node (if any) is rendered first.  The element gains
 * one reference for its slot on the stack.  Some block-level tags are given
 * default top/bottom margins here.  Panics if the open-element stack is
 * already full.
 */
void
html_append_element(struct html_page *html, struct html_element *element)
{
	short n;

	if (html->open_count >= nitems(html->open))
		panic("ran out of tag stack space");

	if (html->current_node) {
		HTML_DEBUG((": rendering current before-append <%s>",
		    html->current_node->name));
		html_render_current_node(html, false);
	}

	HTML_DEBUG((": appending element"));
	if (element->ns != HTML_NAMESPACE_HTML)
		HTML_DEBUG((" in namespace %d", element->ns));
	HTML_DEBUG((": %d: <%s>", html->open_count, element->name));

	html->open[html->open_count++] = element;
	element->refs++;
	html->current_node = element;

	switch (element->type) {
	case HTML_TAG_BLOCKQUOTE:
	case HTML_TAG_CENTER:
	case HTML_TAG_DL:
	case HTML_TAG_H1:
	case HTML_TAG_H2:
	case HTML_TAG_H3:
	case HTML_TAG_H4:
	case HTML_TAG_H5:
	case HTML_TAG_H6:
	case HTML_TAG_MENU:
	case HTML_TAG_P:
		element->margin_top = 1;
		element->margin_bottom = 1;
		break;
	case HTML_TAG_OL:
	case HTML_TAG_UL:
		/*
		 * only give margins if not inside another list: walk the
		 * ancestors from the parent down to the bottom of the stack and
		 * bail out as soon as a list is found
		 */
		for (n = html->open_count - 2; n >= 0; n--) {
			if (html->open[n]->type == HTML_TAG_OL ||
			    html->open[n]->type == HTML_TAG_UL)
				break;

			if (n == 0) {
				element->margin_top = 1;
				element->margin_bottom = 1;
			}
		}
		break;
	}

	HTML_DEBUG((": now open: "));
	for (n = 0; n <= html->open_count - 1; n++)
		HTML_DEBUG(("<%s>", html->open[n]->name));
}

/*
 * Handle a comment token.  The body is compiled out with #if 0, so this is
 * currently a no-op and comments are dropped.
 */
void
html_append_comment(struct html_page *html, struct html_comment *comment)
{
#if 0
	size_t esclen;
	char *esc;

	/*
	 * https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments:comment-2
	 */
	esclen = comment->len;
	esc = html_escape_string(html, comment->data, &esclen, false);
	/*
	 * NOTE(review): an empty string with a length of 3 looks like a
	 * truncated comment-delimiter output sequence; confirm before
	 * re-enabling this block.
	 */
	html_buffer_output(html, "", 3);
#endif
}

/*
 * Allocate a new, zeroed element and fill in its type, name and attributes
 * from a tag token.
 *
 * If the token has no name yet (as with tokens synthesized by the tree
 * builder), the name is filled in on the token from html_tag_names[] first.
 * The element is returned with whatever reference count xmalloczero() left
 * it with; it is not placed on the open-element stack.
 */
struct html_element *
html_create_element_for_token(struct html_page *html, html_token *token)
{
	struct html_element *element;

	if (token->tag.name[0] == '\0')
		token->tag.name_len = strlcpy(token->tag.name,
		    html_tag_names[token->tag.type], sizeof(token->tag.name));

	/* TODO: do an optimized allocation only the size we need */
	element = xmalloczero(sizeof(struct html_element));
	element->type = token->tag.type;
	memcpy(element->name, token->tag.name, sizeof(element->name));
	element->name_len = token->tag.name_len;
	memcpy(element->attrs, token->tag.attrs, sizeof(element->attrs));
	element->attrs_count = token->tag.attrs_count;

	return element;
}

/*
 * Drop one reference to element.  When the count reaches zero the element
 * is queued on html->need_free_list; html_process_token() frees the queue
 * at the start of its next call.
 */
void
html_deref_element(struct html_page *html, struct html_element *element)
{
	if (element->refs == 0) {
		/*
		 * Over-release.  Trap for the developer, but do not queue the
		 * element: it may already be on the need-free list, and
		 * linking it a second time would create a loop or a double
		 * free when the list is drained.
		 */
		Debugger();
		return;
	}

	element->refs--;
	if (element->refs != 0)
		return;

	if (html->need_free_list) {
		html->need_free_tail->next_need_free = element;
		html->need_free_tail = element;
	} else {
		html->need_free_list = element;
		html->need_free_tail = element;
	}
}

/*
 * Create an element for token in namespace ns, push it onto the stack of
 * open elements, and return it.
 */
struct html_element *
html_append_element_for_token(struct html_page *html, html_token *token,
    html_namespace ns)
{
	struct html_element *element;

	element = html_create_element_for_token(html, token);
	element->ns = ns;
	html_append_element(html, element);
	return element;
}

/*
 * Tree-construction entry point: feed one token through the handler for
 * the current insertion mode, repeating for as long as a handler returns
 * HTML_TOKEN_REPROCESS (usually after it has switched html->mode).
 *
 * Elements queued by html_deref_element() are freed before the token is
 * looked at.
 */
void
html_process_token(struct html_page *html, html_token *token)
{
	html_token_act ret;
	struct html_element *el;

	while (html->need_free_list) {
		HTML_DEBUG((": freeing deref'd <%s>",
		    html->need_free_list->name));
		el = html->need_free_list->next_need_free;
		if (html->need_free_list->text)
			xfree(&html->need_free_list->text);
		xfree(&html->need_free_list);
		html->need_free_list = el;
		html->need_free_tail = NULL;
	}

	/*
	 * https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhtml
	 */
	if (token->type == HTML_TOKEN_CHARACTER && token->ch.c == '\n' &&
	    html->skip_newline_char_token) {
		html->skip_newline_char_token = false;
		return;
	}

	HTML_DEBUG((" => token %s,", html_token_names[token->type]));

reprocess:
	HTML_DEBUG((" mode %s", html_mode_names[html->mode]));

	if (!(html->current_node == NULL ||
	    html->current_node->ns == HTML_NAMESPACE_HTML ||
	    token->type == HTML_TOKEN_EOF)) {
		/*
		 * Process the token according to the rules given in the section for
		 * parsing tokens in foreign content.
		 */
		/* TODO mathml checks */
		ret = html_process_token_in_foreign_content(html, token);
		if (ret != HTML_TOKEN_REPROCESS)
			return;
		HTML_DEBUG((" -R->"));
		/* fallthrough */
	}

	/*
	 * Process the token according to the rules given in the section
	 * corresponding to the current insertion mode in HTML content.
	 */
	switch (html->mode) {
	case HTML_MODE_INITIAL:
		ret = html_process_token_initial(html, token);
		break;
	case HTML_MODE_BEFORE_HTML:
		ret = html_process_token_before_html(html, token);
		break;
	case HTML_MODE_BEFORE_HEAD:
		ret = html_process_token_before_head(html, token);
		break;
	case HTML_MODE_IN_HEAD:
		ret = html_process_token_in_head(html, token);
		break;
	case HTML_MODE_IN_HEAD_NOSCRIPT:
		ret = html_process_token_in_head_noscript(html, token);
		break;
	case HTML_MODE_AFTER_HEAD:
		ret = html_process_token_after_head(html, token);
		break;
	case HTML_MODE_IN_BODY:
		ret = html_process_token_in_body(html, token);
		break;
	case HTML_MODE_TEXT:
		ret = html_process_token_text(html, token);
		break;
	case HTML_MODE_IN_TABLE:
		ret = html_process_token_in_table(html, token);
		break;
	case HTML_MODE_IN_TABLE_TEXT:
		ret = html_process_token_in_table_text(html, token);
		break;
	case HTML_MODE_IN_CAPTION:
		ret = html_process_token_in_caption(html, token);
		break;
	case HTML_MODE_IN_COLUMN_GROUP:
		ret = html_process_token_in_column_group(html, token);
		break;
	case HTML_MODE_IN_TABLE_BODY:
		ret = html_process_token_in_table_body(html, token);
		break;
	case HTML_MODE_IN_ROW:
		ret = html_process_token_in_row(html, token);
		break;
	case HTML_MODE_IN_CELL:
		ret = html_process_token_in_cell(html, token);
		break;
	case HTML_MODE_IN_SELECT:
		ret = html_process_token_in_select(html, token);
		break;
	case HTML_MODE_IN_SELECT_IN_TABLE:
		/* this mode has its own handler; do not reuse "in table" */
		ret = html_process_token_in_select_in_table(html, token);
		break;
	case HTML_MODE_IN_TEMPLATE:
		ret = html_process_token_in_template(html, token);
		break;
	case HTML_MODE_AFTER_BODY:
		ret = html_process_token_after_body(html, token);
		break;
	case HTML_MODE_IN_FRAMESET:
		ret = html_process_token_in_frameset(html, token);
		break;
	case HTML_MODE_AFTER_FRAMESET:
		ret = html_process_token_after_frameset(html, token);
		break;
	case HTML_MODE_AFTER_AFTER_BODY:
		ret = html_process_token_after_after_body(html, token);
		break;
	case HTML_MODE_AFTER_AFTER_FRAMESET:
		ret = html_process_token_after_after_frameset(html, token);
		break;
	default:
		panic("bogus mode");
	}

	if (ret == HTML_TOKEN_REPROCESS) {
		HTML_DEBUG((" -R->"));
		goto reprocess;
	}
}

/*
 * "initial" insertion mode.
 *
 * Whitespace, comments and DOCTYPE tokens are consumed (a DOCTYPE moves to
 * "before html").  Anything else records a parse error, may turn on quirks
 * mode, switches to "before html" and asks for the token to be reprocessed.
 */
html_token_act
html_process_token_initial(struct html_page *html, html_token *token)
{
	/*
	 * https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode
	 */
	if (token->type == HTML_TOKEN_CHARACTER &&
	    (token->ch.c == '\t' || token->ch.c == '\n' ||
	    token->ch.c == '\f' || token->ch.c == '\r' ||
	    token->ch.c == ' ')) {
		/* ignore */
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_COMMENT) {
		/* XXX: insert as "last child of the Document object" */
		html_append_comment(html, &token->comment);
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_DOCTYPE) {
		/* TODO: handle if doctype is not "html" */
		html->mode = HTML_MODE_BEFORE_HTML;
		return HTML_TOKEN_PROCESSED;
	}

	/* TODO: check if "document is not an iframe srcdoc document" */
	if (true) {
		html_parse_error(html);
		if (!html->parser_cannot_change_mode)
			html->quirks_mode = true;
	}

	html->mode = HTML_MODE_BEFORE_HTML;
	return HTML_TOKEN_REPROCESS;
}

/*
 * "before html" insertion mode.
 *
 * An explicit <html> start tag creates the root element.  Any other token
 * that is not ignored causes an implied <html> element to be created from a
 * synthesized start tag, after which the token is reprocessed in "before
 * head".
 */
html_token_act
html_process_token_before_html(struct html_page *html, html_token *token)
{
	html_token ttoken;

	/*
	 * https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode
	 */
	if (token->type == HTML_TOKEN_DOCTYPE) {
		/* parse error, ignore */
		html_parse_error(html);
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_COMMENT) {
		html_append_comment(html, &token->comment);
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_CHARACTER &&
	    (token->ch.c == '\t' || token->ch.c == '\n' ||
	    token->ch.c == '\f' || token->ch.c == '\r' ||
	    token->ch.c == ' ')) {
		/* ignore */
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_START_TAG &&
	    token->tag.type == HTML_TAG_HTML) {
		html_append_element_for_token(html, token, HTML_NAMESPACE_HTML);
		html->mode = HTML_MODE_BEFORE_HEAD;
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_END_TAG &&
	    (token->tag.type == HTML_TAG_HEAD ||
	    token->tag.type == HTML_TAG_BODY ||
	    token->tag.type == HTML_TAG_HTML ||
	    token->tag.type == HTML_TAG_BR)) {
		goto anything_else;
	}

	if (token->type == HTML_TOKEN_END_TAG) {
		/* parse error, ignore */
		html_parse_error(html);
		return HTML_TOKEN_PROCESSED;
	}

anything_else:
	memset(&ttoken, 0, sizeof(html_token));
	ttoken.type = HTML_TOKEN_START_TAG;
	ttoken.tag.type = HTML_TAG_HTML;
	html_append_element_for_token(html, &ttoken, HTML_NAMESPACE_HTML);
	html->mode = HTML_MODE_BEFORE_HEAD;
	return HTML_TOKEN_REPROCESS;
}

/*
 * "before head" insertion mode.
 *
 * A <head> start tag creates the head element.  Any other token that is
 * not ignored causes an implied <head> to be created from a synthesized
 * start tag, and the token is reprocessed in "in head".  In both cases the
 * new element is remembered in html->head.
 */
html_token_act
html_process_token_before_head(struct html_page *html, html_token *token)
{
	html_token ttoken;

	/*
	 * https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode
	 */
	if (token->type == HTML_TOKEN_CHARACTER &&
	    (token->ch.c == '\t' || token->ch.c == '\n' ||
	    token->ch.c == '\f' || token->ch.c == '\r' ||
	    token->ch.c == ' ')) {
		/* ignore */
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_COMMENT) {
		html_append_comment(html, &token->comment);
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_DOCTYPE) {
		/* parse error, ignore */
		html_parse_error(html);
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_START_TAG &&
	    token->tag.type == HTML_TAG_HTML) {
		/* process as "in body" */
		html_process_token_in_body(html, token);
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_START_TAG &&
	    token->tag.type == HTML_TAG_HEAD) {
		html->head = html_append_element_for_token(html, token,
		    HTML_NAMESPACE_HTML);
		html->mode = HTML_MODE_IN_HEAD;
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_END_TAG &&
	    !(token->tag.type == HTML_TAG_HEAD ||
	    token->tag.type == HTML_TAG_BODY ||
	    token->tag.type == HTML_TAG_HTML ||
	    token->tag.type == HTML_TAG_BR)) {
		/* parse error, ignore */
		html_parse_error(html);
		return HTML_TOKEN_PROCESSED;
	}

	/*
	 * anything else: insert an implied head element and, as with an
	 * explicit <head>, set the head element pointer to it
	 */
	memset(&ttoken, 0, sizeof(html_token));
	ttoken.type = HTML_TOKEN_START_TAG;
	ttoken.tag.type = HTML_TAG_HEAD;
	html->head = html_append_element_for_token(html, &ttoken,
	    HTML_NAMESPACE_HTML);
	html->mode = HTML_MODE_IN_HEAD;
	return HTML_TOKEN_REPROCESS;
}

/*
 * "in head" insertion mode.
 *
 * Handles the elements that live in <head>: void metadata elements are
 * inserted and immediately popped, while title/style/script/noframes (and
 * noscript when scripting is on) switch the tokenizer state and move to
 * "text" mode, remembering the current mode in html->original_mode.  Any
 * token not handled here pops the current node (expected to be head),
 * switches to "after head" and is reprocessed.
 */
html_token_act
html_process_token_in_head(struct html_page *html, html_token *token)
{
	/*
	 * https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhead
	 */
	if (token->type == HTML_TOKEN_CHARACTER &&
	    (token->ch.c == '\t' || token->ch.c == '\n' ||
	    token->ch.c == '\f' || token->ch.c == '\r' ||
	    token->ch.c == ' ')) {
		html_insert_character(html, token->ch.c);
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_COMMENT) {
		html_append_comment(html, &token->comment);
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_DOCTYPE) {
		/* parse error, ignore */
		html_parse_error(html);
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_START_TAG &&
	    token->tag.type == HTML_TAG_HTML) {
		/* process as "in body" */
		html_process_token_in_body(html, token);
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_START_TAG &&
	    (token->tag.type == HTML_TAG_BASE ||
	    token->tag.type == HTML_TAG_BASEFONT ||
	    token->tag.type == HTML_TAG_BGSOUND ||
	    token->tag.type == HTML_TAG_LINK)) {
		html_append_element_for_token(html, token, HTML_NAMESPACE_HTML);
		html_pop_current_element(html);

		if (token->tag.self_closing)
			token->tag.self_closing_acked = true;
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_START_TAG &&
	    token->tag.type == HTML_TAG_META) {
		html_append_element_for_token(html, token, HTML_NAMESPACE_HTML);
		html_pop_current_element(html);

		if (token->tag.self_closing)
			token->tag.self_closing_acked = true;

		/* TODO: check "charset" and "http-equiv" and change encoding */
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_START_TAG &&
	    token->tag.type == HTML_TAG_TITLE) {
		/* "RCDATA element parsing algorithm" */
		html_append_element_for_token(html, token, HTML_NAMESPACE_HTML);
		html->state = HTML_STATE_RCDATA;
		html->original_mode = html->mode;
		html->mode = HTML_MODE_TEXT;
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_START_TAG &&
	    ((token->tag.type == HTML_TAG_NOSCRIPT && html->scripting) ||
	    (token->tag.type == HTML_TAG_NOFRAMES ||
	    token->tag.type == HTML_TAG_STYLE))) {
		/* "raw text element parsing algorithm" */
		html_append_element_for_token(html, token, HTML_NAMESPACE_HTML);
		html->state = HTML_STATE_RAWTEXT;
		html->original_mode = html->mode;
		html->mode = HTML_MODE_TEXT;
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_START_TAG &&
	    token->tag.type == HTML_TAG_NOSCRIPT && !html->scripting) {
		html_append_element_for_token(html, token, HTML_NAMESPACE_HTML);
		html->mode = HTML_MODE_IN_HEAD_NOSCRIPT;
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_START_TAG &&
	    token->tag.type == HTML_TAG_SCRIPT) {
		/* TODO: more stuff according to docs */
		html_append_element_for_token(html, token, HTML_NAMESPACE_HTML);
		html->state = HTML_STATE_SCRIPT_DATA;
		html->original_mode = html->mode;
		html->mode = HTML_MODE_TEXT;
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_END_TAG &&
	    token->tag.type == HTML_TAG_HEAD) {
		/* this should be head */
		html_pop_current_element(html);
		html->mode = HTML_MODE_AFTER_HEAD;
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_END_TAG &&
	    (token->tag.type == HTML_TAG_BODY ||
	    token->tag.type == HTML_TAG_HTML ||
	    token->tag.type == HTML_TAG_BR)) {
		goto anything_else;
	}

	if (token->type == HTML_TOKEN_START_TAG &&
	    token->tag.type == HTML_TAG_TEMPLATE) {
		html_push_active_formatting_marker(html, token->type);
		html->frameset_ok = false;
		html->mode = HTML_MODE_IN_TEMPLATE;
		/* TODO: draw the rest of the owl */
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_END_TAG &&
	    token->tag.type == HTML_TAG_TEMPLATE) {
		if (!html_has_tag_open(html, HTML_TAG_TEMPLATE)) {
			/* parse error, ignore */
			html_parse_error(html);
			return HTML_TOKEN_PROCESSED;
		}

		html_generate_implied_end_tags(html, NULL, true);

		/*
		 * "If the current node is not a template element, then this
		 * is a parse error."  The token is known to be a template end
		 * tag here, so it is the current node that must be tested.
		 */
		if (html->current_node == NULL ||
		    html->current_node->type != HTML_TAG_TEMPLATE) {
			/* parse error */
			html_parse_error(html);
		}

		html_pop_nodes_until_past_tag(html, HTML_TAG_TEMPLATE);

		/*
		 * TODO: "Clear the list of active formatting elements up to the last
		 * marker."
		 */

		/*
		 * TODO: "Pop the current template insertion mode off the stack of
		 * template insertion modes."
		 */

		/* TODO: "Reset the insertion mode appropriately." */
		return HTML_TOKEN_PROCESSED;
	}

	if ((token->type == HTML_TOKEN_START_TAG &&
	    token->tag.type == HTML_TAG_HEAD) ||
	    token->type == HTML_TOKEN_END_TAG) {
		/* parse error, ignore */
		html_parse_error(html);
		return HTML_TOKEN_PROCESSED;
	}

anything_else:
	/* this should be head */
	html_pop_current_element(html);
	html->mode = HTML_MODE_AFTER_HEAD;
	return HTML_TOKEN_REPROCESS;
}

html_token_act
html_process_token_in_head_noscript(struct html_page *html, html_token *token)
{
	/*
	 * https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inheadnoscript
	 */
	if (token->type == HTML_TOKEN_DOCTYPE) {
		/* parse error, ignore */
		html_parse_error(html);
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_START_TAG &&
	    token->tag.type == HTML_TAG_HTML) {
		/* process as "in body" */
		html_process_token_in_body(html, token);
		return HTML_TOKEN_PROCESSED;
	}

	if (token->type == HTML_TOKEN_END_TAG &&
	    token->tag.type == HTML_TAG_NOSCRIPT) {
		/* this should be