AmendHub

Download:

jcs

/

detritus

/

amendments

/

58

html_tree: Import HTML tree constructor written to WHATWG specs


jcs made amendment 58 about 1 year ago
--- html_tree.c Tue Dec 10 23:13:13 2024 +++ html_tree.c Tue Dec 10 23:13:13 2024 @@ -0,0 +1,3567 @@ +/* + * Copyright (c) 2024 joshua stein <jcs@jcs.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * Tree construction + * https://html.spec.whatwg.org/multipage/parsing.html#tree-construction + * + * html_tokenize() outputs tokens of various types to the html_emit_*token() + * functions, which then output them to html_process_token() here for tree + * building, tag order manipulation, tag closing, etc. 
+ */ + +#include "html.h" + +void html_deref_element(struct html_page *html, + struct html_element *element); +void html_append_element(struct html_page *html, + struct html_element *element); +struct html_element * html_create_element_for_token(struct html_page *html, + html_token *token); +struct html_element * html_append_element_for_token(struct html_page *html, + html_token *token, html_namespace ns); +bool html_remove_open_element(struct html_page *html, + struct html_element *element); + +html_token_act html_process_token_initial(struct html_page *html, + html_token *token); +html_token_act html_process_token_before_html(struct html_page *html, + html_token *token); +html_token_act html_process_token_before_head(struct html_page *html, + html_token *token); +html_token_act html_process_token_in_head(struct html_page *html, + html_token *token); +html_token_act html_process_token_in_head_noscript(struct html_page *html, + html_token *token); +html_token_act html_process_token_after_head(struct html_page *html, + html_token *token); +html_token_act html_process_token_in_body(struct html_page *html, + html_token *token); +html_token_act html_process_token_text(struct html_page *html, + html_token *token); +html_token_act html_process_token_in_table(struct html_page *html, + html_token *token); +html_token_act html_process_token_in_table_text(struct html_page *html, + html_token *token); +html_token_act html_process_token_in_caption(struct html_page *html, + html_token *token); +html_token_act html_process_token_in_column_group(struct html_page *html, + html_token *token); +html_token_act html_process_token_in_table_body(struct html_page *html, + html_token *token); +html_token_act html_process_token_in_row(struct html_page *html, + html_token *token); +html_token_act html_process_token_in_cell(struct html_page *html, + html_token *token); +html_token_act html_process_token_in_select(struct html_page *html, + html_token *token); +html_token_act 
html_process_token_in_select_in_table(struct html_page *html, + html_token *token); +html_token_act html_process_token_in_template(struct html_page *html, + html_token *token); +html_token_act html_process_token_after_body(struct html_page *html, + html_token *token); +html_token_act html_process_token_in_frameset(struct html_page *html, + html_token *token); +html_token_act html_process_token_after_frameset(struct html_page *html, + html_token *token); +html_token_act html_process_token_after_after_body(struct html_page *html, + html_token *token); +html_token_act html_process_token_after_after_frameset(struct html_page *html, + html_token *token); + +void html_pop_current_element(struct html_page *html); +void html_pop_nodes_until_past_tag(struct html_page *html, + html_tag_type stop_after); +void html_pop_nodes_until_past_element(struct html_page *html, + struct html_element *element); +void html_close_p(struct html_page *html); +void html_generate_implied_end_tags(struct html_page *html, char *except, + bool thoroughly); + +/* active formatting */ +void html_push_active_formatting_element(struct html_page *html, + struct html_element *element, html_token_type token_type); +void html_push_active_formatting_marker(struct html_page *html, + html_token_type token_type); +bool html_is_tag_in_active_formatting(struct html_page *html, + html_tag_type tag); +bool html_is_element_in_active_formatting(struct html_page *html, + struct html_element *element); +bool html_remove_active_formatting_element(struct html_page *html, + struct html_element *element); +void html_reconstruct_active_formatting(struct html_page *html); +void html_clear_active_formatting_to_last_marker(struct html_page *html); +bool html_run_adoption_agency(struct html_page *html, html_token *token); + +/* helpers */ +bool html_is_element_special(struct html_page *html, struct html_element *el); +bool html_is_element_formatting(struct html_page *html, + struct html_element *el); +bool 
html_is_element_open(struct html_page *html, struct html_element *el); +bool html_has_tag_open(struct html_page *html, html_tag_type tag); +bool html_has_element_in_scope(struct html_page *html, + struct html_element *element, html_scope scope); +bool html_has_element_with_tag_open_in_scope(struct html_page *html, + html_tag_type tag, html_scope scope); +bool html_has_element_or_one_with_tag_open_in_scope(struct html_page *html, + struct html_element *element, html_tag_type tag, html_scope scope); +bool html_element_serializes_as_void(struct html_page *html, + struct html_element *element); + +void +html_append_element(struct html_page *html, struct html_element *element) +{ + short n; + + if (html->open_count >= nitems(html->open)) + panic("ran out of tag stack space"); + + if (html->current_node) { + HTML_DEBUG((": rendering current before-append <%s>", + html->current_node->name)); + html_render_current_node(html, false); + } + + HTML_DEBUG((": appending element")); + if (element->ns != HTML_NAMESPACE_HTML) + HTML_DEBUG((" in namespace %d", element->ns)); + HTML_DEBUG((": %d: <%s>", html->open_count, element->name)); + + html->open[html->open_count++] = element; + element->refs++; + html->current_node = element; + + switch (element->type) { + case HTML_TAG_BLOCKQUOTE: + case HTML_TAG_CENTER: + case HTML_TAG_DL: + case HTML_TAG_H1: + case HTML_TAG_H2: + case HTML_TAG_H3: + case HTML_TAG_H4: + case HTML_TAG_H5: + case HTML_TAG_H6: + case HTML_TAG_MENU: + case HTML_TAG_P: + element->margin_top = 1; + element->margin_bottom = 1; + break; + case HTML_TAG_OL: + case HTML_TAG_UL: + /* only give margins if not inside another list */ + for (n = html->open_count - 2; n >= 0; n--) { + if (html->open[n]->type == HTML_TAG_OL || + html->open[n]->type == HTML_TAG_UL) + break; + + if (n == 0) { + element->margin_top = 1; + element->margin_bottom = 1; + } + } + break; + } + + HTML_DEBUG((": now open: ")); + for (n = 0; n <= html->open_count - 1; n++) + HTML_DEBUG(("<%s>", 
html->open[n]->name)); +} + +void +html_append_comment(struct html_page *html, struct html_comment *comment) +{ +#if 0 + size_t esclen; + char *esc; + + /* + * https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments:comment-2 + */ + + esclen = comment->len; + esc = html_escape_string(html, comment->data, &esclen, false); + html_buffer_output(html, "<!--", 4); + html_buffer_output(html, esc, esclen); + html_buffer_output(html, "-->", 3); +#endif +} + +struct html_element * +html_create_element_for_token(struct html_page *html, html_token *token) +{ + struct html_element *element; + + if (token->tag.name[0] == '\0') + token->tag.name_len = strlcpy(token->tag.name, + html_tag_names[token->tag.type], sizeof(token->tag.name)); + + /* TODO: do an optimized allocation only the size we need */ + element = xmalloczero(sizeof(struct html_element)); + element->type = token->tag.type; + memcpy(element->name, token->tag.name, sizeof(element->name)); + element->name_len = token->tag.name_len; + memcpy(element->attrs, token->tag.attrs, sizeof(element->attrs)); + element->attrs_count = token->tag.attrs_count; + + return element; +} + +void +html_deref_element(struct html_page *html, struct html_element *element) +{ + if (element->refs == 0) + Debugger(); + else + element->refs--; + + if (element->refs == 0) { + if (html->need_free_list) { + html->need_free_tail->next_need_free = element; + html->need_free_tail = element; + } else { + html->need_free_list = element; + html->need_free_tail = element; + } + } +} + +struct html_element * +html_append_element_for_token(struct html_page *html, html_token *token, + html_namespace ns) +{ + struct html_element *element; + + element = html_create_element_for_token(html, token); + element->ns = ns; + html_append_element(html, element); + return element; +} + +void +html_process_token(struct html_page *html, html_token *token) +{ + html_token_act ret; + struct html_element *el; + + while (html->need_free_list) { + 
HTML_DEBUG((": freeing deref'd <%s>", html->need_free_list->name)); + el = html->need_free_list->next_need_free; + if (html->need_free_list->text) + xfree(&html->need_free_list->text); + xfree(&html->need_free_list); + html->need_free_list = el; + html->need_free_tail = NULL; + } + + /* + * https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhtml + */ + + if (token->type == HTML_TOKEN_CHARACTER && token->ch.c == '\n' && + html->skip_newline_char_token) { + html->skip_newline_char_token = false; + return; + } + + HTML_DEBUG((" => token %s,", html_token_names[token->type])); + +reprocess: + HTML_DEBUG((" mode %s", html_mode_names[html->mode])); + + if (!(html->current_node == NULL || + html->current_node->ns == HTML_NAMESPACE_HTML || + token->type == HTML_TOKEN_EOF)) { + /* + * Process the token according to the rules given in the section for + * parsing tokens in foreign content. + */ + + /* TODO mathml checks */ + + ret = html_process_token_in_foreign_content(html, token); + if (ret != HTML_TOKEN_REPROCESS) + return; + + HTML_DEBUG((" -R->")); + /* fallthrough */ + } + + /* + * Process the token according to the rules given in the section + * corresponding to the current insertion mode in HTML content. 
+ */ + switch (html->mode) { + case HTML_MODE_INITIAL: + ret = html_process_token_initial(html, token); + break; + case HTML_MODE_BEFORE_HTML: + ret = html_process_token_before_html(html, token); + break; + case HTML_MODE_BEFORE_HEAD: + ret = html_process_token_before_head(html, token); + break; + case HTML_MODE_IN_HEAD: + ret = html_process_token_in_head(html, token); + break; + case HTML_MODE_IN_HEAD_NOSCRIPT: + ret = html_process_token_in_head_noscript(html, token); + break; + case HTML_MODE_AFTER_HEAD: + ret = html_process_token_after_head(html, token); + break; + case HTML_MODE_IN_BODY: + ret = html_process_token_in_body(html, token); + break; + case HTML_MODE_TEXT: + ret = html_process_token_text(html, token); + break; + case HTML_MODE_IN_TABLE: + ret = html_process_token_in_table(html, token); + break; + case HTML_MODE_IN_TABLE_TEXT: + ret = html_process_token_in_table_text(html, token); + break; + case HTML_MODE_IN_CAPTION: + ret = html_process_token_in_caption(html, token); + break; + case HTML_MODE_IN_COLUMN_GROUP: + ret = html_process_token_in_column_group(html, token); + break; + case HTML_MODE_IN_TABLE_BODY: + ret = html_process_token_in_table_body(html, token); + break; + case HTML_MODE_IN_ROW: + ret = html_process_token_in_row(html, token); + break; + case HTML_MODE_IN_CELL: + ret = html_process_token_in_cell(html, token); + break; + case HTML_MODE_IN_SELECT: + ret = html_process_token_in_select(html, token); + break; + case HTML_MODE_IN_SELECT_IN_TABLE: + ret = html_process_token_in_table(html, token); + break; + case HTML_MODE_IN_TEMPLATE: + ret = html_process_token_in_template(html, token); + break; + case HTML_MODE_AFTER_BODY: + ret = html_process_token_after_body(html, token); + break; + case HTML_MODE_IN_FRAMESET: + ret = html_process_token_in_frameset(html, token); + break; + case HTML_MODE_AFTER_FRAMESET: + ret = html_process_token_after_frameset(html, token); + break; + case HTML_MODE_AFTER_AFTER_BODY: + ret = 
html_process_token_after_after_body(html, token); + break; + case HTML_MODE_AFTER_AFTER_FRAMESET: + ret = html_process_token_after_after_frameset(html, token); + break; + default: + panic("bogus mode"); + } + + if (ret == HTML_TOKEN_REPROCESS) { + HTML_DEBUG((" -R->")); + goto reprocess; + } +} + +html_token_act +html_process_token_initial(struct html_page *html, html_token *token) +{ + /* + * https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode + */ + + if (token->type == HTML_TOKEN_CHARACTER && + (token->ch.c == '\t' || token->ch.c == '\n' || token->ch.c == '\f' || + token->ch.c == '\r' || token->ch.c == ' ')) { + /* ignore */ + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_COMMENT) { + /* XXX: insert as "last child of the Document object" */ + html_append_comment(html, &token->comment); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_DOCTYPE) { + /* TODO: handle if doctype is not "html" */ + + html->mode = HTML_MODE_BEFORE_HTML; + return HTML_TOKEN_PROCESSED; + } + + /* TODO: check if "document is not an iframe srcdoc document" */ + if (true) { + html_parse_error(html); + if (!html->parser_cannot_change_mode) + html->quirks_mode = true; + } + + html->mode = HTML_MODE_BEFORE_HTML; + return HTML_TOKEN_REPROCESS; +} + +html_token_act +html_process_token_before_html(struct html_page *html, html_token *token) +{ + html_token ttoken; + + /* + * https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode + */ + + if (token->type == HTML_TOKEN_DOCTYPE) { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_COMMENT) { + html_append_comment(html, &token->comment); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_CHARACTER && + (token->ch.c == '\t' || token->ch.c == '\n' || token->ch.c == '\f' || + token->ch.c == '\r' || token->ch.c == ' ')) { + /* ignore */ + return HTML_TOKEN_PROCESSED; + } + 
+ if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_HTML) { + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html->mode = HTML_MODE_BEFORE_HEAD; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && + (token->tag.type == HTML_TAG_HEAD || + token->tag.type == HTML_TAG_BODY || + token->tag.type == HTML_TAG_HTML || + token->tag.type == HTML_TAG_BR)) { + goto anything_else; + } + + if (token->type == HTML_TOKEN_END_TAG) { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + +anything_else: + memset(&ttoken, 0, sizeof(html_token)); + ttoken.type = HTML_TOKEN_START_TAG; + ttoken.tag.type = HTML_TAG_HTML; + html_append_element_for_token(html, &ttoken, HTML_NAMESPACE_HTML); + + html->mode = HTML_MODE_BEFORE_HEAD; + return HTML_TOKEN_REPROCESS; +} + +html_token_act +html_process_token_before_head(struct html_page *html, html_token *token) +{ + html_token ttoken; + + /* + * https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode + */ + + if (token->type == HTML_TOKEN_CHARACTER && + (token->ch.c == '\t' || token->ch.c == '\n' || token->ch.c == '\f' || + token->ch.c == '\r' || token->ch.c == ' ')) { + /* ignore */ + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_COMMENT) { + html_append_comment(html, &token->comment); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_DOCTYPE) { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_HTML) { + /* process as "in body" */ + html_process_token_in_body(html, token); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_HEAD) { + html->head = html_append_element_for_token(html, token, + HTML_NAMESPACE_HTML); + html->mode = HTML_MODE_IN_HEAD; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == 
HTML_TOKEN_END_TAG &&
+	    !(token->tag.type == HTML_TAG_HEAD ||
+	    token->tag.type == HTML_TAG_BODY ||
+	    token->tag.type == HTML_TAG_HTML ||
+	    token->tag.type == HTML_TAG_BR)) {
+		/* parse error, ignore */
+		html_parse_error(html);
+		return HTML_TOKEN_PROCESSED;
+	}
+
+	/*
+	 * Anything else: act as if a <head> start tag had been seen.  The
+	 * implied element must be remembered in html->head just like an
+	 * explicit <head> is, because the "after head" mode later pushes
+	 * html->head back onto the stack of open elements.
+	 */
+	memset(&ttoken, 0, sizeof(html_token));
+	ttoken.type = HTML_TOKEN_START_TAG;
+	ttoken.tag.type = HTML_TAG_HEAD;
+	html->head = html_append_element_for_token(html, &ttoken,
+	    HTML_NAMESPACE_HTML);
+
+	html->mode = HTML_MODE_IN_HEAD;
+	return HTML_TOKEN_REPROCESS;
+}
+
+html_token_act
+html_process_token_in_head(struct html_page *html, html_token *token)
+{
+	/*
+	 * https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhead
+	 */
+
+	if (token->type == HTML_TOKEN_CHARACTER &&
+	    (token->ch.c == '\t' || token->ch.c == '\n' || token->ch.c == '\f' ||
+	    token->ch.c == '\r' || token->ch.c == ' ')) {
+		html_insert_character(html, token->ch.c);
+		return HTML_TOKEN_PROCESSED;
+	}
+
+	if (token->type == HTML_TOKEN_COMMENT) {
+		html_append_comment(html, &token->comment);
+		return HTML_TOKEN_PROCESSED;
+	}
+
+	if (token->type == HTML_TOKEN_DOCTYPE) {
+		/* parse error, ignore */
+		html_parse_error(html);
+		return HTML_TOKEN_PROCESSED;
+	}
+
+	if (token->type == HTML_TOKEN_START_TAG &&
+	    token->tag.type == HTML_TAG_HTML) {
+		/* process as "in body" */
+		html_process_token_in_body(html, token);
+		return HTML_TOKEN_PROCESSED;
+	}
+
+	if (token->type == HTML_TOKEN_START_TAG &&
+	    (token->tag.type == HTML_TAG_BASE ||
+	    token->tag.type == HTML_TAG_BASEFONT ||
+	    token->tag.type == HTML_TAG_BGSOUND ||
+	    token->tag.type == HTML_TAG_LINK)) {
+		/* void elements: insert, then immediately pop off the stack */
+		html_append_element_for_token(html, token, HTML_NAMESPACE_HTML);
+		html_pop_current_element(html);
+
+		if (token->tag.self_closing)
+			token->tag.self_closing_acked = true;
+		return HTML_TOKEN_PROCESSED;
+	}
+
+	if (token->type == HTML_TOKEN_START_TAG &&
+	    token->tag.type == HTML_TAG_META) {
+		html_append_element_for_token(html, token, HTML_NAMESPACE_HTML);
+		html_pop_current_element(html);
+ + if (token->tag.self_closing) + token->tag.self_closing_acked = true; + + /* TODO: check "charset" and "http-equiv" and change encoding */ + + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_TITLE) { + /* "RCDATA element parsing algorithm" */ + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html->state = HTML_STATE_RCDATA; + html->original_mode = html->mode; + html->mode = HTML_MODE_TEXT; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + ((token->tag.type == HTML_TAG_NOSCRIPT && html->scripting) || + (token->tag.type == HTML_TAG_NOFRAMES || + token->tag.type == HTML_TAG_STYLE))) { + /* "raw text element parsing algorithm" */ + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html->state = HTML_STATE_RAWTEXT; + html->original_mode = html->mode; + html->mode = HTML_MODE_TEXT; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_NOSCRIPT && !html->scripting) { + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html->mode = HTML_MODE_IN_HEAD_NOSCRIPT; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_SCRIPT) { + /* TODO: more stuff according to docs */ + + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html->state = HTML_STATE_SCRIPT_DATA; + html->original_mode = html->mode; + html->mode = HTML_MODE_TEXT; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && token->tag.type == HTML_TAG_HEAD) { + /* this should be head */ + html_pop_current_element(html); + html->mode = HTML_MODE_AFTER_HEAD; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && + (token->tag.type == HTML_TAG_BODY || + token->tag.type == HTML_TAG_HTML || + token->tag.type == HTML_TAG_BR)) { + goto anything_else; + } + + if (token->type == HTML_TOKEN_START_TAG && 
+	    token->tag.type == HTML_TAG_TEMPLATE) {
+		html_push_active_formatting_marker(html, token->type);
+		html->frameset_ok = false;
+		html->mode = HTML_MODE_IN_TEMPLATE;
+
+		/* TODO: draw the rest of the owl */
+
+		return HTML_TOKEN_PROCESSED;
+	}
+
+	if (token->type == HTML_TOKEN_END_TAG &&
+	    token->tag.type == HTML_TAG_TEMPLATE) {
+		if (!html_has_tag_open(html, HTML_TAG_TEMPLATE)) {
+			/* parse error, ignore */
+			html_parse_error(html);
+			return HTML_TOKEN_PROCESSED;
+		}
+
+		html_generate_implied_end_tags(html, NULL, true);
+
+		/*
+		 * "If the current node is not a template element, then this is
+		 * a parse error."  This has to look at the current node; the
+		 * token's own tag type is always template in this branch.
+		 */
+		if (html->current_node == NULL ||
+		    html->current_node->type != HTML_TAG_TEMPLATE) {
+			/* parse error */
+			html_parse_error(html);
+		}
+
+		html_pop_nodes_until_past_tag(html, HTML_TAG_TEMPLATE);
+
+		/*
+		 * TODO: "Clear the list of active formatting elements up to the last
+		 * marker."
+		 */
+
+		/*
+		 * TODO: "Pop the current template insertion mode off the stack of
+		 * template insertion modes."
+		 */
+
+		/* TODO: "Reset the insertion mode appropriately." */
+
+		return HTML_TOKEN_PROCESSED;
+	}
+
+	if ((token->type == HTML_TOKEN_START_TAG &&
+	    token->tag.type == HTML_TAG_HEAD) ||
+	    token->type == HTML_TOKEN_END_TAG) {
+		/* parse error, ignore */
+		html_parse_error(html);
+		return HTML_TOKEN_PROCESSED;
+	}
+
+anything_else:
+	/* this should be head */
+	html_pop_current_element(html);
+	html->mode = HTML_MODE_AFTER_HEAD;
+	return HTML_TOKEN_REPROCESS;
+}
+
+html_token_act
+html_process_token_in_head_noscript(struct html_page *html, html_token *token)
+{
+	/*
+	 * https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inheadnoscript
+	 */
+
+	if (token->type == HTML_TOKEN_DOCTYPE) {
+		/* parse error, ignore */
+		html_parse_error(html);
+		return HTML_TOKEN_PROCESSED;
+	}
+
+	if (token->type == HTML_TOKEN_START_TAG &&
+	    token->tag.type == HTML_TAG_HTML) {
+		/* process as "in body" */
+		html_process_token_in_body(html, token);
+		return HTML_TOKEN_PROCESSED;
+	}
+
+	if (token->type == HTML_TOKEN_END_TAG &&
+	    token->tag.type == HTML_TAG_NOSCRIPT) {
+		/* this should be <noscript> */
+
html_pop_current_element(html); + /* current tag should now be <head> */ + html->mode = HTML_MODE_IN_HEAD; + return HTML_TOKEN_PROCESSED; + } + + if ((token->type == HTML_TOKEN_CHARACTER && + (token->ch.c == '\t' || token->ch.c == '\n' || token->ch.c == '\f' || + token->ch.c == '\r' || token->ch.c == ' ')) || + (token->type == HTML_TOKEN_COMMENT) || + (token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_BASEFONT || + token->tag.type == HTML_TAG_BGSOUND || + token->tag.type == HTML_TAG_LINK || + token->tag.type == HTML_TAG_META || + token->tag.type == HTML_TAG_NOFRAMES || + token->tag.type == HTML_TAG_STYLE))) { + /* process as "in head" */ + html_process_token_in_head(html, token); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && token->tag.type == HTML_TAG_BR) + goto anything_else; + + if ((token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_HEAD || + token->tag.type == HTML_TAG_NOSCRIPT)) || + token->type == HTML_TOKEN_END_TAG) { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + +anything_else: + /* parse error */ + html_parse_error(html); + + /* this should be noscript */ + html_pop_current_element(html); + /* current tag should now be <head> */ + html->mode = HTML_MODE_IN_HEAD; + return HTML_TOKEN_REPROCESS; +} + +html_token_act +html_process_token_after_head(struct html_page *html, html_token *token) +{ + html_token ttoken; + + /* + * https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode + */ + + if (token->type == HTML_TOKEN_CHARACTER && + (token->ch.c == '\t' || token->ch.c == '\n' || token->ch.c == '\f' || + token->ch.c == '\r' || token->ch.c == ' ')) { + html_insert_character(html, token->ch.c); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_COMMENT) { + html_append_comment(html, &token->comment); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_DOCTYPE) { + /* parse error, ignore 
*/ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_HTML) { + /* process as "in body" */ + html_process_token_in_body(html, token); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_BODY) { + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html->frameset_ok = false; + html->mode = HTML_MODE_IN_BODY; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_FRAMESET) { + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html->mode = HTML_MODE_IN_FRAMESET; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_BASE || + token->tag.type == HTML_TAG_BASEFONT || + token->tag.type == HTML_TAG_BGSOUND || + token->tag.type == HTML_TAG_LINK || + token->tag.type == HTML_TAG_META || + token->tag.type == HTML_TAG_NOFRAMES || + token->tag.type == HTML_TAG_SCRIPT || + token->tag.type == HTML_TAG_STYLE || + token->tag.type == HTML_TAG_TEMPLATE || + token->tag.type == HTML_TAG_TITLE)) { + /* parse error */ + html_parse_error(html); + + html_append_element(html, html->head); + + /* process as "in head" */ + html_process_token_in_head(html, token); + + html_remove_open_element(html, html->head); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && + token->tag.type == HTML_TAG_TEMPLATE) { + /* process as "in head" */ + html_process_token_in_head(html, token); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && + (token->tag.type == HTML_TAG_BODY || + token->tag.type == HTML_TAG_HTML || + token->tag.type == HTML_TAG_BR)) + goto anything_else; + + if ((token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_HEAD) || + token->type == HTML_TOKEN_END_TAG) { + /* parse error, ignore */ + html_parse_error(html); + return 
HTML_TOKEN_PROCESSED; + } + +anything_else: + memset(&ttoken, 0, sizeof(html_token)); + ttoken.type = HTML_TOKEN_START_TAG; + ttoken.tag.type = HTML_TAG_BODY; + html_append_element_for_token(html, &ttoken, HTML_NAMESPACE_HTML); + + html->mode = HTML_MODE_IN_BODY; + return HTML_TOKEN_REPROCESS; +} + +html_token_act +html_process_token_in_body(struct html_page *html, html_token *token) +{ + html_token ttoken; + struct html_element *element, *node; + short n; + + /* + * https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody + */ + + if (token->type == HTML_TOKEN_CHARACTER && token->ch.c == '\0') { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_CHARACTER && + (token->ch.c == '\t' || token->ch.c == '\n' || token->ch.c == '\f' || + token->ch.c == '\r' || token->ch.c == ' ')) { + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + html_insert_character(html, token->ch.c); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_CHARACTER) { + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + html_insert_character(html, token->ch.c); + html->frameset_ok = false; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_COMMENT) { + html_append_comment(html, &token->comment); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_DOCTYPE) { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_HTML) { + /* parse error */ + html_parse_error(html); + if (html_has_tag_open(html, HTML_TAG_TEMPLATE)) { + /* ignore */ + return HTML_TOKEN_PROCESSED; + } + + /* TODO: add attrs to first html tag it doesn't already have */ + + return HTML_TOKEN_PROCESSED; + } + + if ((token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_BASE || + token->tag.type == 
HTML_TAG_BASEFONT || + token->tag.type == HTML_TAG_BGSOUND || + token->tag.type == HTML_TAG_LINK || + token->tag.type == HTML_TAG_META || + token->tag.type == HTML_TAG_NOFRAMES || + token->tag.type == HTML_TAG_SCRIPT || + token->tag.type == HTML_TAG_STYLE || + token->tag.type == HTML_TAG_TEMPLATE || + token->tag.type == HTML_TAG_TITLE)) || + (token->type == HTML_TOKEN_END_TAG && + token->tag.type == HTML_TAG_TEMPLATE)) { + /* process as "in head" */ + html_process_token_in_head(html, token); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_BODY) { + /* parse error */ + html_parse_error(html); + if (html->open_count == 1 || html->open[1]->type != HTML_TAG_BODY || + html_has_tag_open(html, HTML_TAG_TEMPLATE)) { + /* ignore */ + return HTML_TOKEN_PROCESSED; + } + html->frameset_ok = false; + + /* TODO: add attrs to first body tag it doesn't already have */ + + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_FRAMESET) { + /* parse error */ + html_parse_error(html); + if (html->open_count == 1 || html->open[1]->type != HTML_TAG_BODY || + html_has_tag_open(html, HTML_TAG_TEMPLATE)) { + /* ignore */ + return HTML_TOKEN_PROCESSED; + } + + if (!html->frameset_ok) { + /* ignore */ + return HTML_TOKEN_PROCESSED; + } + + /* pop all nodes except root html */ + while (html->open_count != 1) + html_pop_current_element(html); + + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html->mode = HTML_MODE_IN_FRAMESET; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_EOF) { + /* supposed to do more here but it all ends up the same */ + html->eof = true; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && + (token->tag.type == HTML_TAG_BODY || + token->tag.type == HTML_TAG_HTML)) { + if (!html_has_element_with_tag_open_in_scope(html, HTML_TAG_BODY, + HTML_SCOPE_DEFAULT)) { + /* parse error, ignore */ 
+ html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + for (n = 0; n < html->open_count; n++) { + if (html->open[n]->type == HTML_TAG_DD || + html->open[n]->type == HTML_TAG_DT || + html->open[n]->type == HTML_TAG_LI || + html->open[n]->type == HTML_TAG_OPTGROUP || + html->open[n]->type == HTML_TAG_OPTION || + html->open[n]->type == HTML_TAG_P || + html->open[n]->type == HTML_TAG_RB || + html->open[n]->type == HTML_TAG_RP || + html->open[n]->type == HTML_TAG_RT || + html->open[n]->type == HTML_TAG_RTC || + html->open[n]->type == HTML_TAG_TBODY || + html->open[n]->type == HTML_TAG_TD || + html->open[n]->type == HTML_TAG_TFOOT || + html->open[n]->type == HTML_TAG_TH || + html->open[n]->type == HTML_TAG_THEAD || + html->open[n]->type == HTML_TAG_TR || + html->open[n]->type == HTML_TAG_BODY || + html->open[n]->type == HTML_TAG_HTML) + continue; + + html_parse_error(html); + break; + } + + html->mode = HTML_MODE_AFTER_BODY; + + if (token->tag.type == HTML_TAG_HTML) + return HTML_TOKEN_REPROCESS; + + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_ADDRESS || + token->tag.type == HTML_TAG_ARTICLE || + token->tag.type == HTML_TAG_ASIDE || + token->tag.type == HTML_TAG_BLOCKQUOTE || + token->tag.type == HTML_TAG_CENTER || + token->tag.type == HTML_TAG_DETAILS || + token->tag.type == HTML_TAG_DIALOG || + token->tag.type == HTML_TAG_DIR || + token->tag.type == HTML_TAG_DIV || + token->tag.type == HTML_TAG_DL || + token->tag.type == HTML_TAG_FIELDSET || + token->tag.type == HTML_TAG_FIGCAPTION || + token->tag.type == HTML_TAG_FIGURE || + token->tag.type == HTML_TAG_FOOTER || + token->tag.type == HTML_TAG_HEADER || + token->tag.type == HTML_TAG_HGROUP || + token->tag.type == HTML_TAG_MAIN || + token->tag.type == HTML_TAG_MENU || + token->tag.type == HTML_TAG_NAV || + token->tag.type == HTML_TAG_OL || + token->tag.type == HTML_TAG_P || + token->tag.type == HTML_TAG_SEARCH || + token->tag.type == 
HTML_TAG_SECTION || + token->tag.type == HTML_TAG_SUMMARY || + token->tag.type == HTML_TAG_UL)) { + if (html_has_element_with_tag_open_in_scope(html, HTML_TAG_P, + HTML_SCOPE_BUTTON)) + html_close_p(html); + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_H1 || + token->tag.type == HTML_TAG_H2 || + token->tag.type == HTML_TAG_H3 || + token->tag.type == HTML_TAG_H4 || + token->tag.type == HTML_TAG_H5 || + token->tag.type == HTML_TAG_H6)) { + if (html_has_element_with_tag_open_in_scope(html, HTML_TAG_P, + HTML_SCOPE_BUTTON)) + html_close_p(html); + + if (token->type == HTML_TOKEN_START_TAG && + (html->current_node->type == HTML_TAG_H1 || + html->current_node->type == HTML_TAG_H2 || + html->current_node->type == HTML_TAG_H3 || + html->current_node->type == HTML_TAG_H4 || + html->current_node->type == HTML_TAG_H5 || + html->current_node->type == HTML_TAG_H6)) { + /* parse error */ + html_parse_error(html); + html_pop_current_element(html); + } + + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_PRE || + token->tag.type == HTML_TAG_LISTING)) { + if (html_has_element_with_tag_open_in_scope(html, HTML_TAG_P, + HTML_SCOPE_BUTTON)) + html_close_p(html); + + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + + html->skip_newline_char_token = true; + html->frameset_ok = false; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_FORM) { + if (html->form && !html_has_tag_open(html, HTML_TAG_TEMPLATE)) { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + if (html_has_element_with_tag_open_in_scope(html, HTML_TAG_P, + HTML_SCOPE_BUTTON)) + html_close_p(html); + + element = html_append_element_for_token(html, 
token, + HTML_NAMESPACE_HTML); + + if (!html_has_tag_open(html, HTML_TAG_TEMPLATE)) + html->form = element; + + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_LI) { + html->frameset_ok = false; + + /* TODO: docs say to run a loop doing stuff here */ + + if (html_has_element_with_tag_open_in_scope(html, HTML_TAG_P, + HTML_SCOPE_BUTTON)) + html_close_p(html); + + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_DD || + token->tag.type == HTML_TAG_DT)) { + html->frameset_ok = false; + + /* TODO: docs say to run a loop doing stuff here */ + + if (html_has_element_with_tag_open_in_scope(html, HTML_TAG_P, + HTML_SCOPE_BUTTON)) + html_close_p(html); + + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_PLAINTEXT) { + if (html_has_element_with_tag_open_in_scope(html, HTML_TAG_P, + HTML_SCOPE_BUTTON)) + html_close_p(html); + + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html->state = HTML_STATE_PLAINTEXT; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_BUTTON) { + if (html_has_element_with_tag_open_in_scope(html, HTML_TAG_BUTTON, + HTML_SCOPE_DEFAULT)) { + /* parse error */ + html_parse_error(html); + html_generate_implied_end_tags(html, NULL, false); + html_pop_nodes_until_past_tag(html, HTML_TAG_BUTTON); + } + + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html->frameset_ok = false; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && + (token->tag.type == HTML_TAG_ADDRESS || + token->tag.type == HTML_TAG_ARTICLE || + token->tag.type == 
HTML_TAG_ASIDE || + token->tag.type == HTML_TAG_BLOCKQUOTE || + token->tag.type == HTML_TAG_BUTTON || + token->tag.type == HTML_TAG_CENTER || + token->tag.type == HTML_TAG_DETAILS || + token->tag.type == HTML_TAG_DIALOG || + token->tag.type == HTML_TAG_DIR || + token->tag.type == HTML_TAG_DIV || + token->tag.type == HTML_TAG_DL || + token->tag.type == HTML_TAG_FIELDSET || + token->tag.type == HTML_TAG_FIGCAPTION || + token->tag.type == HTML_TAG_FIGURE || + token->tag.type == HTML_TAG_FOOTER || + token->tag.type == HTML_TAG_HEADER || + token->tag.type == HTML_TAG_HGROUP || + token->tag.type == HTML_TAG_LISTING || + token->tag.type == HTML_TAG_MAIN || + token->tag.type == HTML_TAG_MENU || + token->tag.type == HTML_TAG_NAV || + token->tag.type == HTML_TAG_OL || + token->tag.type == HTML_TAG_PRE || + token->tag.type == HTML_TAG_SEARCH || + token->tag.type == HTML_TAG_SECTION || + token->tag.type == HTML_TAG_SUMMARY || + token->tag.type == HTML_TAG_UL)) { + if (!html_has_element_with_tag_open_in_scope(html, + token->tag.type, HTML_SCOPE_DEFAULT)) { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + html_generate_implied_end_tags(html, NULL, false); + + if (!html_has_tag_open(html, token->tag.type)) { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + html_pop_nodes_until_past_tag(html, token->tag.type); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && token->tag.type == HTML_TAG_FORM) { + if (!html_has_tag_open(html, HTML_TAG_TEMPLATE)) { + /* TODO */ + } else { + if (!html_has_tag_open(html, HTML_TAG_FORM)) { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + html_generate_implied_end_tags(html, NULL, false); + + if (html->current_node->type != HTML_TAG_FORM) { + /* parse error */ + html_parse_error(html); + } + + html_pop_nodes_until_past_tag(html, HTML_TAG_FORM); + } + return HTML_TOKEN_PROCESSED; + } + + if 
(token->type == HTML_TOKEN_END_TAG && token->tag.type == HTML_TAG_P) { + if (!html_has_element_with_tag_open_in_scope(html, HTML_TAG_P, + HTML_SCOPE_BUTTON)) { + /* parse error */ + html_parse_error(html); + memset(&ttoken, 0, sizeof(html_token)); + ttoken.type = HTML_TOKEN_START_TAG; + ttoken.tag.type = HTML_TAG_P; + html_append_element_for_token(html, &ttoken, HTML_NAMESPACE_HTML); + } + + html_close_p(html); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && token->tag.type == HTML_TAG_LI) { + if (!html_has_element_with_tag_open_in_scope(html, HTML_TAG_LI, + HTML_SCOPE_LIST_ITEM)) { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + html_generate_implied_end_tags(html, "li", false); + + if (html->current_node->type != HTML_TAG_LI) { + /* parse error */ + html_parse_error(html); + } + + html_pop_nodes_until_past_tag(html, HTML_TAG_LI); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && + (token->tag.type == HTML_TAG_DD || + token->tag.type == HTML_TAG_DT)) { + if (!html_has_tag_open(html, token->tag.type)) { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + html_generate_implied_end_tags(html, token->tag.name, false); + + if (html->current_node->type != token->tag.type) { + /* parse error */ + html_parse_error(html); + } + + html_pop_nodes_until_past_tag(html, token->tag.type); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && + (token->tag.type == HTML_TAG_H1 || + token->tag.type == HTML_TAG_H2 || + token->tag.type == HTML_TAG_H3 || + token->tag.type == HTML_TAG_H4 || + token->tag.type == HTML_TAG_H5 || + token->tag.type == HTML_TAG_H6)) { + if (!(html_has_element_with_tag_open_in_scope(html, HTML_TAG_H1, + HTML_SCOPE_DEFAULT) || + html_has_element_with_tag_open_in_scope(html, HTML_TAG_H2, + HTML_SCOPE_DEFAULT) || + html_has_element_with_tag_open_in_scope(html, HTML_TAG_H3, + 
HTML_SCOPE_DEFAULT) || + html_has_element_with_tag_open_in_scope(html, HTML_TAG_H4, + HTML_SCOPE_DEFAULT) || + html_has_element_with_tag_open_in_scope(html, HTML_TAG_H5, + HTML_SCOPE_DEFAULT) || + html_has_element_with_tag_open_in_scope(html, HTML_TAG_H6, + HTML_SCOPE_DEFAULT))) { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + html_generate_implied_end_tags(html, NULL, false); + + if (html->current_node->type != token->tag.type) { + /* parse error */ + html_parse_error(html); + } + + html_pop_nodes_until_past_tag(html, token->tag.type); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && + strcmp(token->tag.name, "sarcasm") == 0) { + /* TODO: take a deep breath */ + goto any_other_end_tag; + } + + if (token->type == HTML_TOKEN_START_TAG && token->tag.type == HTML_TAG_A) { + short last_marker = 0; + struct html_element *found_a; + + for (n = 0; n < html->active_formatting_count; n++) { + if (html->active_formatting[n].marker) { + HTML_DEBUG((": af[%d]=marker", n)); + } else { + HTML_DEBUG((": af[%d]=<%s>", n, + html->active_formatting[n].element->name)); + } + } + + /* find last marker, if any */ + for (n = html->active_formatting_count - 1; n >= 0; n--) { + if (html->active_formatting[n].marker) { + last_marker = n; + break; + } + } + + /* + * "If the list of active formatting elements contains an a element + * between the end of the list and the last marker on the list (or the + * start of the list if there is no marker on the list), then this is a + * parse error;" + */ + for (n = last_marker; n < html->active_formatting_count; n++) { + if (!html->active_formatting[n].element || + html->active_formatting[n].element->type != HTML_TAG_A) + continue; + + found_a = html->active_formatting[n].element; + html_parse_error(html); + + /* + * "then remove that element from the list of active formatting + * elements and the stack of open elements if the adoption + * agency algorithm didn't already 
remove it" + */ + html_run_adoption_agency(html, token); + html_remove_active_formatting_element(html, found_a); + break; + } + + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + element = html_append_element_for_token(html, token, + HTML_NAMESPACE_HTML); + html_push_active_formatting_element(html, element, token->type); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_B || + token->tag.type == HTML_TAG_BIG || + token->tag.type == HTML_TAG_CODE || + token->tag.type == HTML_TAG_EM || + token->tag.type == HTML_TAG_FONT || + token->tag.type == HTML_TAG_I || + token->tag.type == HTML_TAG_S || + token->tag.type == HTML_TAG_SMALL || + token->tag.type == HTML_TAG_STRIKE || + token->tag.type == HTML_TAG_STRONG || + token->tag.type == HTML_TAG_TT || + token->tag.type == HTML_TAG_U)) { + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + element = html_append_element_for_token(html, token, + HTML_NAMESPACE_HTML); + html_push_active_formatting_element(html, element, token->type); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_NOBR) { + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + if (html_has_element_with_tag_open_in_scope(html, HTML_TAG_NOBR, + HTML_SCOPE_DEFAULT)) { + /* parse error */ + html_parse_error(html); + html_run_adoption_agency(html, token); + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + } + + element = html_append_element_for_token(html, token, + HTML_NAMESPACE_HTML); + html_push_active_formatting_element(html, element, token->type); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && + (token->tag.type == HTML_TAG_A || + token->tag.type == HTML_TAG_B || + token->tag.type == HTML_TAG_BIG || + token->tag.type == HTML_TAG_CODE || + token->tag.type == HTML_TAG_EM 
|| + token->tag.type == HTML_TAG_FONT || + token->tag.type == HTML_TAG_I || + token->tag.type == HTML_TAG_NOBR || + token->tag.type == HTML_TAG_S || + token->tag.type == HTML_TAG_SMALL || + token->tag.type == HTML_TAG_STRIKE || + token->tag.type == HTML_TAG_STRONG || + token->tag.type == HTML_TAG_TT || + token->tag.type == HTML_TAG_U)) { + if (!html_run_adoption_agency(html, token)) + goto any_other_end_tag; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_APPLET || + token->tag.type == HTML_TAG_MARQUEE || + token->tag.type == HTML_TAG_OBJECT)) { + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + element = html_append_element_for_token(html, token, + HTML_NAMESPACE_HTML); + html_push_active_formatting_element(html, element, token->type); + html->frameset_ok = false; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && + (token->tag.type == HTML_TAG_APPLET || + token->tag.type == HTML_TAG_MARQUEE || + token->tag.type == HTML_TAG_OBJECT)) { + if (!html_has_tag_open(html, token->tag.type)) { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + html_generate_implied_end_tags(html, NULL, false); + + if (html->current_node->type != token->tag.type) { + /* parse error */ + html_parse_error(html); + } + + html_pop_nodes_until_past_tag(html, token->tag.type); + + /* TODO: clear list of active formatting elements up to last marker */ + + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_TABLE) { + if (html_has_element_with_tag_open_in_scope(html, HTML_TAG_P, + HTML_SCOPE_BUTTON)) { + /* TODO: only do this if document is not set to quirks mode */ + html_close_p(html); + } + + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html->frameset_ok = false; + html->mode = HTML_MODE_IN_TABLE; + return HTML_TOKEN_PROCESSED; + } + + if 
(token->type == HTML_TOKEN_END_TAG && token->tag.type == HTML_TAG_BR) { + /* parse error, drop attributes and turn into start */ + html_parse_error(html); + + token->tag.attrs_count = 0; + token->type = HTML_TOKEN_START_TAG; + + /* fall through */ + } + + if (token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_AREA || + token->tag.type == HTML_TAG_BR || + token->tag.type == HTML_TAG_EMBED || + token->tag.type == HTML_TAG_IMG || + token->tag.type == HTML_TAG_KEYGEN || + token->tag.type == HTML_TAG_WBR)) { + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html_pop_current_element(html); + + if (token->tag.self_closing) + token->tag.self_closing_acked = true; + + html->frameset_ok = false; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_INPUT) { + bool found_hidden; + + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html_pop_current_element(html); + + if (token->tag.self_closing) + token->tag.self_closing_acked = true; + + for (n = 0, found_hidden = false; n < token->tag.attrs_count; n++) { + if (strcasecmp(token->tag.attrs[n].name, "type") == 0 && + strcasecmp(token->tag.attrs[n].val, "hidden") == 0) { + found_hidden = true; + break; + } + } + if (!found_hidden) + html->frameset_ok = false; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_PARAM || + token->tag.type == HTML_TAG_SOURCE || + token->tag.type == HTML_TAG_TRACK)) { + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html_pop_current_element(html); + + if (token->tag.self_closing) + token->tag.self_closing_acked = true; + + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_HR) { 
+ if (html_has_element_with_tag_open_in_scope(html, HTML_TAG_P, + HTML_SCOPE_BUTTON)) + html_close_p(html); + + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html_pop_current_element(html); + + if (token->tag.self_closing) + token->tag.self_closing_acked = true; + + html->frameset_ok = false; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_IMAGE) { + /* parse error */ + html_parse_error(html); + + /* "Don't ask." */ + token->tag.name_len = strlcpy(token->tag.name, "img", + sizeof(token->tag.name)); + token->tag.type = HTML_TAG_IMG; + + return HTML_TOKEN_REPROCESS; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_TEXTAREA) { + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + + html->skip_newline_char_token = true; + html->state = HTML_STATE_RCDATA; + html->original_mode = html->mode; + html->frameset_ok = false; + html->mode = HTML_MODE_TEXT; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_XMP) { + if (html_has_element_with_tag_open_in_scope(html, HTML_TAG_P, + HTML_SCOPE_BUTTON)) + html_close_p(html); + + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + html->frameset_ok = false; + + /* "raw text element parsing algorithm" */ + html->state = HTML_STATE_RAWTEXT; + html->original_mode = html->mode; + html->mode = HTML_MODE_TEXT; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_IFRAME) { + html->frameset_ok = false; + + /* "raw text element parsing algorithm" */ + html->state = HTML_STATE_RAWTEXT; + html->original_mode = html->mode; + html->mode = HTML_MODE_TEXT; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_NOEMBED || + (token->tag.type == HTML_TAG_NOSCRIPT && html->scripting))) { + /* "raw text element 
parsing algorithm" */ + html->state = HTML_STATE_RAWTEXT; + html->original_mode = html->mode; + html->mode = HTML_MODE_TEXT; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_SELECT) { + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html->frameset_ok = false; + + if (html->mode == HTML_MODE_IN_TABLE || + html->mode == HTML_MODE_IN_CAPTION || + html->mode == HTML_MODE_IN_TABLE_BODY || + html->mode == HTML_MODE_IN_ROW || + html->mode == HTML_MODE_IN_CELL) + html->mode = HTML_MODE_IN_SELECT_IN_TABLE; + else + html->mode = HTML_MODE_IN_SELECT; + + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_OPTGROUP || + token->tag.type == HTML_TAG_OPTION)) { + if (token->tag.type == HTML_TAG_OPTION) + html_pop_current_element(html); + + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_RB || + token->tag.type == HTML_TAG_RTC)) { + if (html_has_tag_open(html, HTML_TAG_RUBY)) { + html_generate_implied_end_tags(html, "rtc", false); + + if (token->tag.type == HTML_TAG_RUBY) { + /* parse error */ + html_parse_error(html); + } + } + + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_MATH) { + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + + /* TODO: "adjust MathML attributes" */ + + element = html_append_element_for_token(html, token, + HTML_NAMESPACE_MATHML); + + if (token->tag.self_closing) { + html_pop_current_element(html); + token->tag.self_closing_acked = true; + } + + return 
HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_SVG) { + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + + /* TODO: "adjust SVG attributes" */ + + element = html_append_element_for_token(html, token, + HTML_NAMESPACE_SVG); + + if (token->tag.self_closing) { + html_pop_current_element(html); + token->tag.self_closing_acked = true; + } + + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_CAPTION || + token->tag.type == HTML_TAG_COL || + token->tag.type == HTML_TAG_COLGROUP || + token->tag.type == HTML_TAG_FRAME || + token->tag.type == HTML_TAG_HEAD || + token->tag.type == HTML_TAG_TBODY || + token->tag.type == HTML_TAG_TD || + token->tag.type == HTML_TAG_TFOOT || + token->tag.type == HTML_TAG_TH || + token->tag.type == HTML_TAG_THEAD || + token->tag.type == HTML_TAG_TR)) { + /* parse error, ignore */ + html_parse_error(html); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG) { + /* any other tag */ + if (html->active_formatting_count > 0) + html_reconstruct_active_formatting(html); + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG) { +any_other_end_tag: + /* + * 1. Initialize node to be the current node (the bottommost node of + * the stack). + */ + /* 2. Loop: */ + for (n = html->open_count - 1; n >= 0; n--) { + node = html->open[n]; + + /* + * 2. If node is an HTML element with the same tag name as the + * token, then: + */ + if (strcmp(node->name, token->tag.name) == 0) { + /* + * 1. Generate implied end tags, except for HTML elements with + * the same tag name as the token. + */ + html_generate_implied_end_tags(html, token->tag.name, false); + + /* + * 2. If node is not the current node, then this is a parse + * error. 
+ */ + if (node != html->current_node) { + html_parse_error(html); + } + + /* + * 3. Pop all the nodes from the current node up to node, + * including node, then stop these steps. + */ + html_pop_nodes_until_past_element(html, node); + break; + } + + /* + * 3. Otherwise, if node is in the special category, then this + * is a parse error; ignore the token, and return. + */ + if (node->type && html_is_element_special(html, node)) { + html_parse_error(html); + /* ignore */ + return HTML_TOKEN_PROCESSED; + } + + /* + * 4. Set node to the previous entry in the stack of open + * elements. + */ + /* 5. Return to the step labeled loop. */ + } + + return HTML_TOKEN_PROCESSED; + } + + panic("we shouldn't get to default case in 'in body' parser"); + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_text(struct html_page *html, html_token *token) +{ + /* + * https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incdata + */ + + if (token->type == HTML_TOKEN_CHARACTER) { + html_insert_character(html, token->ch.c); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_EOF) { + /* parse error */ + html_parse_error(html); + if (token->tag.type == HTML_TAG_SCRIPT) { + /* TODO: "set its already started to true" */ + } + + html_pop_current_element(html); + + html->mode = html->original_mode; + html->original_mode = HTML_MODE_NONE; + return HTML_TOKEN_REPROCESS; + } + + if (token->type == HTML_TOKEN_END_TAG && + token->tag.type == HTML_TAG_SCRIPT) { + html_pop_current_element(html); + + html->mode = html->original_mode; + html->original_mode = HTML_MODE_NONE; + + /* TODO: some more stuff related to scripting engine */ + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG) { + html_pop_current_element(html); + + html->mode = html->original_mode; + html->original_mode = HTML_MODE_NONE; + return HTML_TOKEN_REPROCESS; + } + + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_in_table(struct html_page 
*html, html_token *token) +{ + HTML_DEBUG(("in_table: TODO")); + /* TODO */ + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_in_table_text(struct html_page *html, html_token *token) +{ + HTML_DEBUG(("in_table_text: TODO")); + /* TODO */ + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_in_caption(struct html_page *html, html_token *token) +{ + HTML_DEBUG(("in_caption: TODO")); + /* TODO */ + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_in_column_group(struct html_page *html, html_token *token) +{ + HTML_DEBUG(("in_column_group: TODO")); + /* TODO */ + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_in_table_body(struct html_page *html, html_token *token) +{ + HTML_DEBUG(("in_table_body: TODO")); + /* TODO */ + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_in_row(struct html_page *html, html_token *token) +{ + HTML_DEBUG(("in_row: TODO")); + /* TODO */ + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_in_cell(struct html_page *html, html_token *token) +{ + HTML_DEBUG(("in_cell: TODO")); + /* TODO */ + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_in_select(struct html_page *html, html_token *token) +{ + HTML_DEBUG(("in_select: TODO")); + /* TODO */ + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_in_select_in_table(struct html_page *html, + html_token *token) +{ + HTML_DEBUG(("in_select_in_table: TODO")); + /* TODO */ + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_in_template(struct html_page *html, html_token *token) +{ + HTML_DEBUG(("in_template: TODO")); + /* TODO */ + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_after_body(struct html_page *html, html_token *token) +{ + if (token->type == HTML_TOKEN_CHARACTER && + (token->ch.c == '\t' || token->ch.c == '\n' || token->ch.c == '\f' || + token->ch.c == '\r' || token->ch.c == ' ')) { + 
/* process as "in body" */ + html_process_token_in_body(html, token); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_COMMENT) { + html_append_comment(html, &token->comment); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_DOCTYPE) { + html_parse_error(html); + /* ignore */ + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_HTML) { + /* process as "in body" */ + html_process_token_in_body(html, token); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && + token->tag.type == HTML_TAG_HTML) { + html->mode = HTML_MODE_AFTER_AFTER_BODY; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_EOF) { + html_stop_parsing(html); + return HTML_TOKEN_PROCESSED; + } + + html_parse_error(html); + html->mode = HTML_MODE_IN_BODY; + return HTML_TOKEN_REPROCESS; +} + +html_token_act +html_process_token_in_frameset(struct html_page *html, html_token *token) +{ + if (token->type == HTML_TOKEN_CHARACTER && + (token->ch.c == '\t' || token->ch.c == '\n' || token->ch.c == '\f' || + token->ch.c == '\r' || token->ch.c == ' ')) { + html_insert_character(html, token->ch.c); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_COMMENT) { + html_append_comment(html, &token->comment); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_DOCTYPE) { + html_parse_error(html); + /* ignore */ + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_HTML) { + /* process as "in body" */ + html_process_token_in_body(html, token); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_FRAMESET) { + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && + token->tag.type == HTML_TAG_FRAMESET) { + if (html->current_node->type == 
HTML_TAG_HTML) { + html_parse_error(html); + /* ignore */ + return HTML_TOKEN_PROCESSED; + } + + html_pop_current_element(html); + if (html->current_node->type != HTML_TAG_FRAMESET) + html->mode = HTML_MODE_AFTER_FRAMESET; + + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_FRAME) { + html_append_element_for_token(html, token, HTML_NAMESPACE_HTML); + html_pop_current_element(html); + if (token->tag.self_closing) + token->tag.self_closing_acked = true; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_NOFRAMES) { + /* process as "in head" */ + html_process_token_in_head(html, token); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_EOF) { + if (html->current_node->type != HTML_TAG_HTML) + html_parse_error(html); + html_stop_parsing(html); + return HTML_TOKEN_PROCESSED; + } + + html_parse_error(html); + /* ignore */ + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_after_frameset(struct html_page *html, html_token *token) +{ + if (token->type == HTML_TOKEN_CHARACTER && + (token->ch.c == '\t' || token->ch.c == '\n' || token->ch.c == '\f' || + token->ch.c == '\r' || token->ch.c == ' ')) { + html_insert_character(html, token->ch.c); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_COMMENT) { + html_append_comment(html, &token->comment); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_DOCTYPE) { + html_parse_error(html); + /* ignore */ + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_HTML) { + /* process as "in body" */ + html_process_token_in_body(html, token); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG && + token->tag.type == HTML_TAG_HTML) { + html->mode = HTML_MODE_AFTER_AFTER_FRAMESET; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_START_TAG && + 
token->tag.type == HTML_TAG_NOFRAMES) { + /* process as "in head" */ + html_process_token_in_head(html, token); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_EOF) { + html_stop_parsing(html); + return HTML_TOKEN_PROCESSED; + } + + html_parse_error(html); + /* ignore */ + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_after_after_body(struct html_page *html, html_token *token) +{ + if (token->type == HTML_TOKEN_COMMENT) { + /* doc says "as the last child of the Document object */ + html_append_comment(html, &token->comment); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_DOCTYPE || + (token->type == HTML_TOKEN_CHARACTER && + (token->ch.c == '\t' || token->ch.c == '\n' || token->ch.c == '\f' || + token->ch.c == '\r' || token->ch.c == ' ')) || + (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_HTML)) { + /* process as "in body" */ + html_process_token_in_body(html, token); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_EOF) { + html_stop_parsing(html); + return HTML_TOKEN_PROCESSED; + } + + html_parse_error(html); + html->mode = HTML_MODE_IN_BODY; + return HTML_TOKEN_REPROCESS; +} + +html_token_act +html_process_token_after_after_frameset(struct html_page *html, + html_token *token) +{ + if (token->type == HTML_TOKEN_COMMENT) { + /* doc says "as the last child of the Document object */ + html_append_comment(html, &token->comment); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_DOCTYPE || + (token->type == HTML_TOKEN_CHARACTER && + (token->ch.c == '\t' || token->ch.c == '\n' || token->ch.c == '\f' || + token->ch.c == '\r' || token->ch.c == ' ')) || + (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_HTML)) { + /* process as "in body" */ + html_process_token_in_body(html, token); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_EOF) { + html_stop_parsing(html); + return HTML_TOKEN_PROCESSED; + } + + 
if (token->type == HTML_TOKEN_START_TAG && + token->tag.type == HTML_TAG_NOFRAMES) { + /* process as "in head" */ + html_process_token_in_head(html, token); + return HTML_TOKEN_PROCESSED; + } + + html_parse_error(html); + /* ignore */ + return HTML_TOKEN_PROCESSED; +} + +html_token_act +html_process_token_in_foreign_content(struct html_page *html, + html_token *token) +{ + struct html_element *node; + short n; + + if (token->type == HTML_TOKEN_CHARACTER && token->ch.c == '\0') { + html_parse_error(html); + html_insert_character(html, HTML_REPLACEMENT_CHARACTER); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_CHARACTER && + (token->ch.c == '\t' || token->ch.c == '\n' || token->ch.c == '\f' || + token->ch.c == '\r' || token->ch.c == ' ')) { + html_insert_character(html, token->ch.c); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_CHARACTER) { + html_insert_character(html, token->ch.c); + html->frameset_ok = false; + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_COMMENT) { + html_append_comment(html, &token->comment); + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_DOCTYPE) { + html_parse_error(html); + /* ignore */ + return HTML_TOKEN_PROCESSED; + } + + if ((token->type == HTML_TOKEN_START_TAG && + (token->tag.type == HTML_TAG_B || + token->tag.type == HTML_TAG_BIG || + token->tag.type == HTML_TAG_BLOCKQUOTE || + token->tag.type == HTML_TAG_BODY || + token->tag.type == HTML_TAG_BR || + token->tag.type == HTML_TAG_CENTER || + token->tag.type == HTML_TAG_CODE || + token->tag.type == HTML_TAG_DD || + token->tag.type == HTML_TAG_DIV || + token->tag.type == HTML_TAG_DL || + token->tag.type == HTML_TAG_DT || + token->tag.type == HTML_TAG_EM || + token->tag.type == HTML_TAG_EMBED || + token->tag.type == HTML_TAG_H1 || + token->tag.type == HTML_TAG_H2 || + token->tag.type == HTML_TAG_H3 || + token->tag.type == HTML_TAG_H4 || + token->tag.type == HTML_TAG_H5 || + token->tag.type == 
HTML_TAG_H6 || + token->tag.type == HTML_TAG_HEAD || + token->tag.type == HTML_TAG_HR || + token->tag.type == HTML_TAG_I || + token->tag.type == HTML_TAG_IMG || + token->tag.type == HTML_TAG_LI || + token->tag.type == HTML_TAG_LISTING || + token->tag.type == HTML_TAG_MENU || + token->tag.type == HTML_TAG_META || + token->tag.type == HTML_TAG_NOBR || + token->tag.type == HTML_TAG_OL || + token->tag.type == HTML_TAG_P || + token->tag.type == HTML_TAG_PRE || + token->tag.type == HTML_TAG_RUBY || + token->tag.type == HTML_TAG_S || + token->tag.type == HTML_TAG_SMALL || + token->tag.type == HTML_TAG_SPAN || + token->tag.type == HTML_TAG_STRONG || + token->tag.type == HTML_TAG_STRIKE || + token->tag.type == HTML_TAG_SUB || + token->tag.type == HTML_TAG_SUP || + token->tag.type == HTML_TAG_TABLE || + token->tag.type == HTML_TAG_TT || + token->tag.type == HTML_TAG_U || + token->tag.type == HTML_TAG_UL || + token->tag.type == HTML_TAG_VAR)) || + (token->type == HTML_TOKEN_END_TAG && + (token->tag.type == HTML_TAG_BR || + token->tag.type == HTML_TAG_P))) { + html_parse_error(html); + + /* TODO: check mathml */ + + while (html->current_node->ns != HTML_NAMESPACE_HTML) + html_pop_current_element(html); + + /* + * Reprocess the token according to the rules given in the section + * corresponding to the current insertion mode in HTML content. + */ + return HTML_TOKEN_REPROCESS; + } + + if (token->type == HTML_TOKEN_START_TAG) { + /* TODO: check mathml */ + + if (html->current_node->ns == HTML_NAMESPACE_SVG) { + /* TODO: check svg tag name according to a list */ + + /* TODO: "adjust SVG attributes" */ + } + + /* TODO: "adjust foreign attributes" */ + + /* + * Insert a foreign element for the token, with adjusted current node's + * namespace and false. 
+ */ + html_append_element_for_token(html, token, html->current_node->ns); + + if (token->tag.self_closing) { + if (token->tag.type == HTML_TAG_SCRIPT && + html->current_node->ns == HTML_NAMESPACE_SVG) { + token->tag.self_closing_acked = true; + goto svg_script; + } else { + html_pop_current_element(html); + token->tag.self_closing_acked = true; + } + } + + return HTML_TOKEN_PROCESSED; + } + + /* + * An end tag whose tag name is "script", if the current node is an SVG + * script element + */ + if (token->type == HTML_TOKEN_END_TAG && + token->tag.type == HTML_TAG_SCRIPT && + html->current_node->type == HTML_TAG_SCRIPT && + html->current_node->ns == HTML_NAMESPACE_SVG) { +svg_script: + html_pop_current_element(html); + + /* TODO: other things */ + + return HTML_TOKEN_PROCESSED; + } + + if (token->type == HTML_TOKEN_END_TAG) { + /* + * 1. Initialize node to be the current node (the bottommost node of + * the stack). + */ + node = html->current_node; + + /* + * 2. If node's tag name, converted to ASCII lowercase, is not the same + * as the tag name of the token, then this is a parse error. + */ + if (strcasecmp(token->tag.name, node->name) != 0) + html_parse_error(html); + + /* + * 3. Loop: If node is the topmost element in the stack of open + * elements, then return. (fragment case) + */ +loop: + if (node == html->open[0]) + return HTML_TOKEN_PROCESSED; + + /* + * 4. If node's tag name, converted to ASCII lowercase, is the same as + * the tag name of the token, pop elements from the stack of open + * elements until node has been popped from the stack, and then return. + */ + if (strcasecmp(token->tag.name, node->name) == 0) { + html_pop_nodes_until_past_tag(html, token->tag.type); + return HTML_TOKEN_PROCESSED; + } + + /* 5. Set node to the previous entry in the stack of open elements. */ + for (n = 1; n < html->open_count; n++) { + if (html->open[n] == node) { + node = html->open[n - 1]; + break; + } + } + + /* + * 6. 
If node is not an element in the HTML namespace, return to the + * step labeled loop. + */ + if (node->ns != HTML_NAMESPACE_HTML) + goto loop; + + /* + * 7. Otherwise, process the token according to the rules given in the + * section corresponding to the current insertion mode in HTML content. + */ + return HTML_TOKEN_REPROCESS; + } + + return HTML_TOKEN_PROCESSED; +} + +void +html_stop_parsing(struct html_page *html) +{ + while (html->current_node) + html_pop_current_element(html); +} + +/* + * helpers + */ + +bool +html_has_tag_open(struct html_page *html, html_tag_type tag) +{ + short n; + + for (n = 0; n < html->open_count; n++) { + if (html->open[n]->type == tag) + return true; + } + + return false; +} + +bool +html_is_element_open(struct html_page *html, struct html_element *el) +{ + short n; + + for (n = 0; n < html->open_count; n++) + if (html->open[n] == el) + return true; + + return false; +} + +bool +html_has_element_in_scope(struct html_page *html, struct html_element *element, + html_scope scope) +{ + return html_has_element_or_one_with_tag_open_in_scope(html, element, 0, + scope); +} + +bool +html_has_element_with_tag_open_in_scope(struct html_page *html, + html_tag_type tag, html_scope scope) +{ + return html_has_element_or_one_with_tag_open_in_scope(html, NULL, tag, + scope); +} + +bool +html_has_element_or_one_with_tag_open_in_scope(struct html_page *html, + struct html_element *element, html_tag_type tag, html_scope scope) +{ + struct html_element *oelement; + short n; + + for (n = html->open_count - 1; n >= 0; n--) { + oelement = html->open[n]; + + if (element) { + if (oelement == element) + return true; + } else { + if (oelement->type == tag) + return true; + } + + if (scope == HTML_SCOPE_DEFAULT || scope == HTML_SCOPE_LIST_ITEM || + scope == HTML_SCOPE_BUTTON) { + if (oelement->type == HTML_TAG_APPLET || + oelement->type == HTML_TAG_CAPTION || + oelement->type == HTML_TAG_HTML || + oelement->type == HTML_TAG_TABLE || + oelement->type == 
HTML_TAG_TD || + oelement->type == HTML_TAG_TH || + oelement->type == HTML_TAG_MARQUEE || + oelement->type == HTML_TAG_OBJECT || + oelement->type == HTML_TAG_TEMPLATE) { + /* TODO: MathML and SVG tags */ + return false; + } + } + + if (scope == HTML_SCOPE_LIST_ITEM) { + if (oelement->ns == HTML_NAMESPACE_HTML && + (oelement->type == HTML_TAG_OL || oelement->type == HTML_TAG_UL)) + return false; + } + + if (scope == HTML_SCOPE_BUTTON) { + if (oelement->ns == HTML_NAMESPACE_HTML && + oelement->type == HTML_TAG_BUTTON) + return false; + } + + if (scope == HTML_SCOPE_TABLE) { + if (oelement->ns == HTML_NAMESPACE_HTML && + (oelement->type == HTML_TAG_HTML || + oelement->type == HTML_TAG_TABLE || + oelement->type == HTML_TAG_TEMPLATE)) + return false; + } + + if (scope == HTML_SCOPE_SELECT) { + /* all but these two */ + if (oelement->ns == HTML_NAMESPACE_HTML && + (oelement->type != HTML_TAG_OPTGROUP && + oelement->type != HTML_TAG_OPTION)) + return false; + } + } + + return false; +} + +bool +html_element_serializes_as_void(struct html_page *html, + struct html_element *element) +{ + /* https://html.spec.whatwg.org/multipage/syntax.html#elements-2 */ + switch (element->type) { + case HTML_TAG_AREA: + case HTML_TAG_BASE: + case HTML_TAG_BR: + case HTML_TAG_COL: + case HTML_TAG_EMBED: + case HTML_TAG_HR: + case HTML_TAG_IMG: + case HTML_TAG_INPUT: + case HTML_TAG_LINK: + case HTML_TAG_META: + case HTML_TAG_SOURCE: + case HTML_TAG_TRACK: + case HTML_TAG_WBR: + return true; + /* + * https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments + */ + case HTML_TAG_BASEFONT: + case HTML_TAG_BGSOUND: + case HTML_TAG_FRAME: + case HTML_TAG_KEYGEN: + case HTML_TAG_PARAM: + return true; + default: + return false; + } +} + +bool +html_is_element_special(struct html_page *html, struct html_element *el) +{ + /* https://html.spec.whatwg.org/multipage/parsing.html#special */ + + switch (el->type) { + case HTML_TAG_ADDRESS: + case HTML_TAG_APPLET: + case 
HTML_TAG_AREA: + case HTML_TAG_ARTICLE: + case HTML_TAG_ASIDE: + case HTML_TAG_BASE: + case HTML_TAG_BASEFONT: + case HTML_TAG_BGSOUND: + case HTML_TAG_BLOCKQUOTE: + case HTML_TAG_BODY: + case HTML_TAG_BR: + case HTML_TAG_BUTTON: + case HTML_TAG_CAPTION: + case HTML_TAG_CENTER: + case HTML_TAG_COL: + case HTML_TAG_COLGROUP: + case HTML_TAG_DD: + case HTML_TAG_DETAILS: + case HTML_TAG_DIR: + case HTML_TAG_DIV: + case HTML_TAG_DL: + case HTML_TAG_DT: + case HTML_TAG_EMBED: + case HTML_TAG_FIELDSET: + case HTML_TAG_FIGCAPTION: + case HTML_TAG_FIGURE: + case HTML_TAG_FOOTER: + case HTML_TAG_FORM: + case HTML_TAG_FRAME: + case HTML_TAG_FRAMESET: + case HTML_TAG_H1: + case HTML_TAG_H2: + case HTML_TAG_H3: + case HTML_TAG_H4: + case HTML_TAG_H5: + case HTML_TAG_H6: + case HTML_TAG_HEAD: + case HTML_TAG_HEADER: + case HTML_TAG_HGROUP: + case HTML_TAG_HR: + case HTML_TAG_HTML: + case HTML_TAG_IFRAME: + case HTML_TAG_IMG: + case HTML_TAG_INPUT: + case HTML_TAG_KEYGEN: + case HTML_TAG_LI: + case HTML_TAG_LINK: + case HTML_TAG_LISTING: + case HTML_TAG_MAIN: + case HTML_TAG_MARQUEE: + case HTML_TAG_MENU: + case HTML_TAG_META: + case HTML_TAG_NAV: + case HTML_TAG_NOEMBED: + case HTML_TAG_NOFRAMES: + case HTML_TAG_NOSCRIPT: + case HTML_TAG_OBJECT: + case HTML_TAG_OL: + case HTML_TAG_P: + case HTML_TAG_PARAM: + case HTML_TAG_PLAINTEXT: + case HTML_TAG_PRE: + case HTML_TAG_SCRIPT: + case HTML_TAG_SEARCH: + case HTML_TAG_SECTION: + case HTML_TAG_SELECT: + case HTML_TAG_SOURCE: + case HTML_TAG_STYLE: + case HTML_TAG_SUMMARY: + case HTML_TAG_TABLE: + case HTML_TAG_TBODY: + case HTML_TAG_TD: + case HTML_TAG_TEMPLATE: + case HTML_TAG_TEXTAREA: + case HTML_TAG_TFOOT: + case HTML_TAG_TH: + case HTML_TAG_THEAD: + case HTML_TAG_TITLE: + case HTML_TAG_TR: + case HTML_TAG_TRACK: + case HTML_TAG_UL: + case HTML_TAG_WBR: + /* TODO: MathML and SVG */ + return true; + default: + return false; + } +} + +bool +html_is_element_formatting(struct html_page *html, struct html_element *el) +{ + switch 
(el->type) { + case HTML_TAG_A: + case HTML_TAG_B: + case HTML_TAG_BIG: + case HTML_TAG_CODE: + case HTML_TAG_EM: + case HTML_TAG_FONT: + case HTML_TAG_I: + case HTML_TAG_NOBR: + case HTML_TAG_S: + case HTML_TAG_SMALL: + case HTML_TAG_STRIKE: + case HTML_TAG_STRONG: + case HTML_TAG_TT: + case HTML_TAG_U: + return true; + default: + return false; + } +} + +char * +html_escape_string(struct html_page *html, char *str, size_t *len, + bool attribute_mode) +{ + size_t len_escaped; + short append; + short n; + + for (append = 0, len_escaped = 0; append <= 1; append++) { + /* + * https://html.spec.whatwg.org/multipage/parsing.html#escapingString + */ + + if (append) { + if (html->escaped_buf == NULL || + html->escaped_size < len_escaped + 1) { + html->escaped_size = len_escaped + 1; + HTML_DEBUG((": reallocing escaped to %ld", + html->escaped_size)); + if (html->escaped_buf) + xfree(&html->escaped_buf); + html->escaped_buf = xmalloc(html->escaped_size); + } + + if (html->escaped_buf == NULL) + panic("escaped_buf is null"); + + len_escaped = 0; + } + + for (n = 0; n < *len; n++) { + switch ((unsigned char)str[n]) { + case '&': + if (append) { + html->escaped_buf[len_escaped++] = '&'; + html->escaped_buf[len_escaped++] = 'a'; + html->escaped_buf[len_escaped++] = 'm'; + html->escaped_buf[len_escaped++] = 'p'; + html->escaped_buf[len_escaped++] = ';'; + } else + len_escaped += 5; + break; + case 0xa0: + if (append) { + html->escaped_buf[len_escaped++] = '&'; + html->escaped_buf[len_escaped++] = 'n'; + html->escaped_buf[len_escaped++] = 'b'; + html->escaped_buf[len_escaped++] = 's'; + html->escaped_buf[len_escaped++] = 'p'; + html->escaped_buf[len_escaped++] = ';'; + } else + len_escaped += 6; + break; + case '"': + if (attribute_mode) { + if (append) { + html->escaped_buf[len_escaped++] = '&'; + html->escaped_buf[len_escaped++] = 'q'; + html->escaped_buf[len_escaped++] = 'u'; + html->escaped_buf[len_escaped++] = 'o'; + html->escaped_buf[len_escaped++] = 't'; + 
html->escaped_buf[len_escaped++] = ';'; + } else + len_escaped += 6; + break; + } + /* fallthrough */ + case '<': + if (!attribute_mode) { + if (append) { + html->escaped_buf[len_escaped++] = '&'; + html->escaped_buf[len_escaped++] = 'l'; + html->escaped_buf[len_escaped++] = 't'; + html->escaped_buf[len_escaped++] = ';'; + } else + len_escaped += 4; + break; + } + /* fallthrough */ + case '>': + if (!attribute_mode) { + if (append) { + html->escaped_buf[len_escaped++] = '&'; + html->escaped_buf[len_escaped++] = 'g'; + html->escaped_buf[len_escaped++] = 't'; + html->escaped_buf[len_escaped++] = ';'; + } else + len_escaped += 4; + break; + } + /* fallthrough */ + default: + if (append) + html->escaped_buf[len_escaped++] = str[n]; + else + len_escaped++; + } + } + } + + html->escaped_buf[len_escaped] = '\0'; + HTML_DEBUG((": escaped '%s' to [%ld] '%s'", str, len_escaped, + html->escaped_buf)); + *len = len_escaped; + return html->escaped_buf; +} + +void +html_pop_current_element(struct html_page *html) +{ + short n; + + if (html->open_count <= 0) + panic("bogus open count %d", html->open_count); + + HTML_DEBUG((": rendering current <%s>", html->current_node->name)); + + html_render_current_node(html, true); + html_deref_element(html, html->current_node); + + HTML_DEBUG((": popping current <%s>", html->current_node->name)); + + html->open_count--; + if (html->open_count) + html->current_node = html->open[html->open_count - 1]; + else + html->current_node = NULL; + + HTML_DEBUG((": still open: ")); + for (n = 0; n <= html->open_count - 1; n++) + HTML_DEBUG(("<%s>", html->open[n]->name)); +} + +void +html_pop_nodes_until_past_tag(struct html_page *html, html_tag_type stop_after) +{ + short n; + bool done; + + HTML_DEBUG((": popping until past <%s>", html_tag_names[stop_after])); + + for (n = html->open_count - 1, done = false; n >= 0; n--) { + if (html->open[n]->type == stop_after) + done = true; + + html_pop_current_element(html); + + if (done) + return; + } + + /* 
closed a tag that was never open? */ + HTML_DEBUG(("popped tags all the way to root looking for %s", + html_tag_names[stop_after])); +} + +void +html_pop_nodes_until_past_element(struct html_page *html, + struct html_element *element) +{ + short n; + bool done; + + for (n = html->open_count - 1, done = false; n >= 0; n--) { + if (html->open[n] == element) + done = true; + + html_pop_current_element(html); + + if (done) + return; + } +} + +void +html_generate_implied_end_tags(struct html_page *html, char *except, + bool thoroughly) +{ + struct html_element *element; + + HTML_DEBUG((": html_generate_implied_end_tags")); + if (except) + HTML_DEBUG((" except <%s>", except)); + + while (html->current_node) { + element = html->current_node; + + if (except != NULL && strcmp(element->name, except) == 0) + return; + + if (element->type == HTML_TAG_DD || + element->type == HTML_TAG_DT || + element->type == HTML_TAG_LI || + element->type == HTML_TAG_OPTGROUP || + element->type == HTML_TAG_OPTION || + element->type == HTML_TAG_P || + element->type == HTML_TAG_RB || + element->type == HTML_TAG_RP || + element->type == HTML_TAG_RT || + element->type == HTML_TAG_RTC) { + html_pop_current_element(html); + continue; + } + + if (thoroughly && + (element->type == HTML_TAG_CAPTION || + element->type == HTML_TAG_COLGROUP || + element->type == HTML_TAG_TBODY || + element->type == HTML_TAG_TD || + element->type == HTML_TAG_TFOOT || + element->type == HTML_TAG_TH || + element->type == HTML_TAG_THEAD || + element->type == HTML_TAG_TR)) { + html_pop_current_element(html); + continue; + } + + return; + } +} + +bool +html_remove_active_formatting_element(struct html_page *html, + struct html_element *element) +{ + short n; + + for (n = 0; n < html->active_formatting_count; n++) { + if (html->active_formatting[n].element == element) { + /* shift out */ + for (; n < html->active_formatting_count - 1; n++) { + html->active_formatting[n] = html->active_formatting[n + 1]; + } + 
html->active_formatting_count--; + html_deref_element(html, element); + return true; + } + } + + return false; +} + +void +html_close_p(struct html_page *html) +{ + html_generate_implied_end_tags(html, "p", false); + + if (html->current_node->type != HTML_TAG_P) { + /* parse error */ + html_parse_error(html); + } + + html_pop_nodes_until_past_tag(html, HTML_TAG_P); +} + +bool +html_remove_open_element(struct html_page *html, struct html_element *element) +{ + short n; + + for (n = 0; n < html->open_count; n++) { + if (html->open[n] == element) { + for (; n < html->open_count - 1; n++) + html->open[n] = html->open[n + 1]; + html->open_count--; + html_deref_element(html, element); + return true; + } + } + + return false; +} + +bool +html_is_tag_in_active_formatting(struct html_page *html, html_tag_type tag) +{ + short n; + + for (n = 0; n < html->active_formatting_count - 1; n++) { + if (html->active_formatting[n].element && + html->active_formatting[n].element->type == tag) + return true; + } + + return false; +} + +bool +html_is_element_in_active_formatting(struct html_page *html, + struct html_element *element) +{ + short n; + + for (n = 0; n < html->active_formatting_count; n++) { + if (html->active_formatting[n].element == element) + return true; + } + + return false; +} + +void +html_reconstruct_active_formatting(struct html_page *html) +{ + struct html_formatting *entry; + struct html_element *new_element; + short n, entry_n; + html_token token; + + HTML_DEBUG((": reconstructing AF")); + + /* + * 1. If there are no entries in the list of active formatting elements, + * then there is nothing to reconstruct; stop this algorithm. + */ + if (html->active_formatting_count == 0) + return; + + /* + * 2. If the last (most recently added) entry in the list of active + * formatting elements is a marker, or if it is an element that is in the + * stack of open elements, then there is nothing to reconstruct; stop this + * algorithm. 
+ */ + if (html->active_formatting[html->active_formatting_count - 1].marker) + return; + if (html_is_element_open(html, + html->active_formatting[html->active_formatting_count - 1].element)) + return; + + /* + * 3. Let entry be the last (most recently added) element in the list of + * active formatting elements. + */ + entry_n = -1; + for (n = html->active_formatting_count - 1; n >= 0; n--) { + if (html->active_formatting[n].marker) + continue; + entry = &html->active_formatting[n]; + entry_n = n; + break; + } + if (entry_n == -1) + panic("html_reconstruct_active_formatting: no last element"); + + /* + * 4. Rewind: If there are no entries before entry in the list of active + * formatting elements, then jump to the step labeled create. + */ +rewind: + if (entry_n == 0) + goto create; + + /* + * 5; Let entry be the entry one earlier than entry in the list of active + * formatting elements. + */ + entry = &html->active_formatting[--entry_n]; + + /* + * 6. If entry is neither a marker nor an element that is also in the stack + * of open elements, go to the step labeled rewind. + */ + if (!(entry->marker || html_is_element_open(html, entry->element))) + goto rewind; + +advance: + /* + * 7. Advance: Let entry be the element one later than entry in the list of + * active formatting elements. + */ + entry = &html->active_formatting[++entry_n]; + +create: + /* + * 8. Create: Insert an HTML element for the token for which the element + * entry was created, to obtain new element. + */ + memset(&token, 0, sizeof(html_token)); + token.type = entry->token; + token.tag.type = entry->element->type; + memcpy(&token.tag.name, entry->element->name, sizeof(token.tag.name)); + token.tag.name_len = entry->element->name_len; + memcpy(&token.tag.attrs, entry->element->attrs, sizeof(token.tag.attrs)); + token.tag.attrs_count = entry->element->attrs_count; + new_element = html_append_element_for_token(html, &token, + HTML_NAMESPACE_HTML); + + /* + * 9. 
Replace the entry for entry in the list with an entry for new element. + */ + html_deref_element(html, entry->element); + entry->element = new_element; + new_element->refs++; + + HTML_DEBUG((": AF created new <%s>", new_element->name)); + + /* + * 10. If the entry for new element in the list of active formatting + * elements is not the last entry in the list, return to the step labeled + * advance. + */ + if (entry_n + 1 != html->active_formatting_count) + goto advance; +} + +void +html_push_active_formatting_element(struct html_page *html, + struct html_element *element, html_token_type token_type) +{ + /* + * https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements + */ + short last_marker = 0; + short found = 0, n, j; + struct html_element *found_matches[3]; + + /* find last marker, if any */ + for (n = html->active_formatting_count - 1; n >= 0; n--) { + if (html->active_formatting[n].marker) { + last_marker = n; + break; + } + } + + /* + * "This is the Noah's Ark clause. But with three per family instead of + * two." + * + * 1. If there are already three elements in the list of active formatting + * elements after the last marker, if any, or anywhere in the list if there + * are no markers, that have the same tag name, namespace, and attributes + * as element, then remove the earliest such element from the list of + * active formatting elements. 
+ */ + for (n = last_marker + 1; n < html->active_formatting_count - 1; n++) { + if (html->active_formatting[n].marker) + panic("shouldn't have a marker after last marker in active " + "formatting list"); + + if (html->active_formatting[n].element->type != element->type) + continue; + if (html->active_formatting[n].element->ns != element->ns) + continue; + + /* TODO: also compare attribute names and values */ + + found_matches[found++] = html->active_formatting[n].element; + + if (found < 3) + continue; + + /* remove found_matches[0] from the list */ + for (n = 0; n < html->active_formatting_count - 1; n++) { + if (html->active_formatting[n].element != found_matches[0]) + continue; + + HTML_DEBUG(("push_active_formatting_element shifting out tag " + "%s\r", found_matches[0]->name)); + + /* skip this one, move everything else down */ + for (j = n; j < html->active_formatting_count - 2; j++) { + html->active_formatting[j].token = + html->active_formatting[j + 1].token; + html->active_formatting[j].marker = + html->active_formatting[j + 1].marker; + } + + html->active_formatting_count--; + html_deref_element(html, found_matches[0]); + break; + } + } + + /* 2. Add element to the list of active formatting elements. 
*/ + html->active_formatting_count++; + html->active_formatting[html->active_formatting_count - 1].marker = false; + html->active_formatting[html->active_formatting_count - 1].token = + token_type; + html->active_formatting[html->active_formatting_count - 1].element = element; + element->refs++; +} + +void +html_push_active_formatting_marker(struct html_page *html, + html_token_type token_type) +{ + if (html->active_formatting_count >= nitems(html->active_formatting)) + panic("active formatting overflow"); + + html->active_formatting[html->active_formatting_count - 1].token = + token_type; + html->active_formatting[html->active_formatting_count - 1].element = NULL; + html->active_formatting[html->active_formatting_count - 1].marker = true; + html->active_formatting_count++; +} + +bool +html_run_adoption_agency(struct html_page *html, html_token *token) +{ + /* + * https://html.spec.whatwg.org/multipage/parsing.html#adoption-agency-algorithm + */ + char *subject; + short olc, ilc, n; + struct html_element *formatting_element, *before_fe, *after_fe, + *furthest_block, *common_ancestor, *node, *last_node, *before_node, + *element; + html_token ttoken; + bool found; + + HTML_DEBUG((": AAA for <%s>: AF tags", token->tag.name)); + for (n = 0; n < html->active_formatting_count; n++) { + HTML_DEBUG((" <%s>", html->active_formatting[n].element->name)); + } + HTML_DEBUG((": open nodes ")); + for (n = 0; n < html->open_count; n++) { + HTML_DEBUG(("<%s>", html->open[n]->name)); + } + + /* 1. Let subject be token's tag name. */ + subject = token->tag.name; + + /* + * 2. If the current node is an HTML element whose tag name is subject, and + * the current node is not in the list of active formatting elements, then + * pop the current node off the stack of open elements and return. + */ + if (strcmp(html->current_node->name, subject) == 0 && + !html_is_element_in_active_formatting(html, html->current_node)) { + html_pop_current_element(html); + return true; + } + + /* 3. 
Let outerLoopCounter be 0. */ + olc = 0; + + /* 4. While true: */ + for (;;) { + /* 1. If outerLoopCounter is greater than or equal to 8, then return. */ + if (olc >= 8) + return true; + + /* 2. Increment outerLoopCounter by 1. */ + olc++; + + /* + * 3. Let formattingElement be the last element in the list of active + * formatting elements that: + * + * - is between the end of the list and the last marker in the list, if + * any, or the start of the list otherwise, and + * - has the tag name /subject/. + */ + formatting_element = NULL; + for (n = html->active_formatting_count - 1; n >= 0; n--) { + if (html->active_formatting[n].marker || n == 0) { + if (html->active_formatting[n].marker) + n++; + for (; n < html->active_formatting_count; n++) { + if (strcmp(html->active_formatting[n].element->name, + subject) == 0) { + formatting_element = html->active_formatting[n].element; + break; + } + } + break; + } + } + + /* + * If there is no such element, then return and instead act as + * described in the "any other end tag" entry above. + * (we'll return false to indicate that) + */ + if (formatting_element == NULL) + return false; + + /* + * 4. If formattingElement is not in the stack of open elements, then + * this is a parse error; remove the element from the list, and return. + */ + found = false; + for (n = 0; n < html->open_count; n++) { + if (html->open[n] == formatting_element) { + found = true; + break; + } + } + + if (!found) { + html_parse_error(html); + html_remove_active_formatting_element(html, formatting_element); + return true; + } + + /* + * 5. If formattingElement is in the stack of open elements, but the + * element is not in scope, then this is a parse error; return. + */ + if (!html_has_element_in_scope(html, formatting_element, + HTML_SCOPE_DEFAULT)) { + html_parse_error(html); + return true; + } + + /* + * 6. If formattingElement is not the current node, this is a parse + * error. (But do not return.) 
+ */ + if (formatting_element != html->current_node) + html_parse_error(html); + + /* + * 7. Let furthestBlock be the topmost node in the stack of open + * elements that is lower in the stack than formattingElement, and is + * an element in the special category. There might not be one. + */ + furthest_block = NULL; + for (n = 0; n < html->active_formatting_count; n++) { + if (html->active_formatting[n].element != formatting_element) + continue; + + for (n = n + 1; n < html->active_formatting_count; n++) { + if (html_is_element_special(html, + html->active_formatting[n].element)) { + furthest_block = html->active_formatting[n].element; + break; + } + } + } + + /* + * 8. If there is no furthestBlock, then the UA must first pop all the + * nodes from the bottom of the stack of open elements, from the + * current node up to and including formattingElement, then remove + * formattingElement from the list of active formatting elements, and + * finally return. + */ + if (furthest_block == NULL) { + while (html->current_node != formatting_element) + html_pop_current_element(html); + if (html->current_node == formatting_element) + html_pop_current_element(html); + + html_remove_active_formatting_element(html, formatting_element); + return true; + } + + /* + * 9. Let commonAncestor be the element immediately above + * formattingElement in the stack of open elements. + */ + for (n = 0; n < html->active_formatting_count - 1; n++) { + if (html->active_formatting[n + 1].element == formatting_element) { + common_ancestor = html->active_formatting[n].element; + break; + } + } + + /* + * 10. Let a bookmark note the position of formattingElement in the + * list of active formatting elements relative to the elements on + * either side of it in the list. 
+ */ + for (n = 0; n < html->active_formatting_count; n++) { + if (html->active_formatting[n].element == formatting_element) { + before_fe = html->active_formatting[n - 1].element; + after_fe = html->active_formatting[n + 1].element; + break; + } + } + + /* 11. Let node and lastNode be furthestBlock. */ + node = furthest_block; + last_node = furthest_block; + + before_node = NULL; + for (n = 1; n < html->open_count; n++) { + if (html->open[n] == node) { + before_node = html->open[n - 1]; + break; + } + } + + /* 12. Let innerLoopCounter be 0. */ + ilc = 0; + + /* 13. While true: */ + for (;;) { + /* 1. Increment innerLoopCounter by 1. */ + ilc++; + + /* + * 2. Let /node/ be the element immediately above /node/ in the + * stack of open elements, or if node is no longer in the stack of + * open elements (e.g. because it got removed by this algorithm), + * the element that was immediately above node in the stack of open + * elements before node was removed. + */ + node = before_node; + + /* 3. If node is formattingElement, then break. */ + if (node == formatting_element) + break; + + /* + * 4. If innerLoopCounter is greater than 3 and node is in the list + * of active formatting elements, then remove node from the list of + * active formatting elements. + */ + if (ilc > 3) + html_remove_active_formatting_element(html, node); + + /* + * 5. If node is not in the list of active formatting elements, + * then remove node from the stack of open elements and continue. + */ + found = false; + for (n = 0; n < html->active_formatting_count; n++) { + if (html->active_formatting[n].element == node) { + found = true; + break; + } + } + if (!found) { + before_node = NULL; + for (n = 1; n < html->open_count; n++) { + if (html->open[n] == node) { + before_node = html->open[n - 1]; + break; + } + } + + html_remove_open_element(html, node); + continue; + } + + /* + * 6. 
Create an element for the token for which the element node + * was created, in the HTML namespace, with commonAncestor as the + * intended parent; replace the entry for node in the list of + * active formatting elements with an entry for the new element, + * replace the entry for node in the stack of open elements with an + * entry for the new element, and let node be the new element. + */ + memset(&ttoken, 0, sizeof(html_token)); + ttoken.type = HTML_TOKEN_START_TAG; + ttoken.tag.type = node->type; + element = html_create_element_for_token(html, &ttoken); + + for (n = 0; n < html->active_formatting_count; n++) { + if (html->active_formatting[n].element == node) { + html_deref_element(html, node); + html->active_formatting[n].element = element; + element->refs++; + break; + } + } + + for (n = 0; n < html->open_count; n++) { + if (html->open[n] == node) { + html_deref_element(html, node); + html->open[n] = element; + element->refs++; + break; + } + } + + node = element; + before_node = NULL; + for (n = 1; n < html->open_count; n++) { + if (html->open[n] == node) { + before_node = html->open[n - 1]; + break; + } + } + + /* + * 7. If /last node/ is furthestBlock, then move the aforementioned + * bookmark to be immediately after the new node in the list of + * active formatting elements. + */ + if (last_node == furthest_block) { + for (n = 0; n < html->active_formatting_count; n++) { + if (html->active_formatting[n].element != element) + continue; + + before_fe = html->active_formatting[n - 1].element; + after_fe = html->active_formatting[n + 1].element; + } + } + + /* 8. Append lastNode to node. */ + /* TODO */ + + /* 9. Set lastNode to node. */ + last_node = node; + } + + /* + * 14. Insert whatever lastNode ended up being in the previous step at + * the appropriate place for inserting a node, but using commonAncestor + * as the override target. + */ + /* TODO */ + + /* + * 15. 
Create an element for the token for which formattingElement was + * created, in the HTML namespace, with furthestBlock as the intended + * parent. + */ + /* TODO */ + + /* + * 16. Take all of the child nodes of furthestBlock and append them to + * the element created in the last step. + */ + /* TODO */ + + /* 17. Append that new element to furthestBlock. */ + /* TODO */ + + /* + * 18. Remove formattingElement from the list of active formatting + * elements, and insert the new element into the list of active + * formatting elements at the position of the aforementioned bookmark. + */ + /* TODO */ + + /* + * 19. Remove formattingElement from the stack of open elements, and + * insert the new element into the stack of open elements immediately + * below the position of furthestBlock in that stack. + */ + /* TODO */ + } +} + +/* + * emitters + */ + +static html_token emittok = { 0 }; + +void +html_emit_char_token(struct html_page *html, short cc) +{ + emittok.type = HTML_TOKEN_CHARACTER; + emittok.ch.c = cc; + html_process_token(html, &emittok); +} + +void +html_emit_eof_token(struct html_page *html) +{ + emittok.type = HTML_TOKEN_EOF; + html_process_token(html, &emittok); +} + +void +html_emit_comment(struct html_page *html, struct html_comment *comment) +{ + size_t len; + + emittok.type = HTML_TOKEN_COMMENT; + + len = comment->len; + if (len >= sizeof(emittok.comment.data)) + len = sizeof(emittok.comment.data) - 1; + emittok.comment.len = len; + + memcpy(emittok.comment.data, comment->data, len); + emittok.comment.data[len] = '\0'; + + html_process_token(html, &emittok); +}