/*
 * Copyright (c) 2024 joshua stein
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Tokenization
 * https://html.spec.whatwg.org/multipage/parsing.html#tokenization
 *
 * Handles characters output from html_parse() and turns them into tokens,
 * which are emitted to the tree builder.
 */

#include "html.h"

#ifdef HTML_ENABLE

void html_tokenize(struct html_page *html, short cc);
bool html_appropriate_end_tag_token(struct html_page *html, html_token *token);
html_tag_type html_find_tag_type(char *tag_name);
void html_lookahead_consume(struct html_page *html, short count);

/*
 * Drop up to `count` characters from the front of html->lookahead.
 *
 * Each dropped character is logged through HTML_DEBUG, the remaining
 * slots are moved down by one position, and html->lookahead_len is
 * decremented.  Consumption stops early once lookahead_len reaches
 * zero; a zero or negative `count` consumes nothing.
 *
 * The last slot of the buffer is not cleared by the shift, so it keeps
 * whatever value it held before.
 */
void
html_lookahead_consume(struct html_page *html, short count)
{
	short remaining, slot;

	for (remaining = count; remaining > 0; remaining--) {
		/* nothing left to discard */
		if (!html->lookahead_len)
			break;

		HTML_DEBUG((": consuming '%c' from lookahead",
		    html->lookahead[0]));

		/* slide every later slot down over its predecessor */
		slot = 0;
		while (slot < HTML_LOOKAHEAD_SIZE - 1) {
			html->lookahead[slot] = html->lookahead[slot + 1];
			slot++;
		}

		html->lookahead_len--;
	}
}

void
html_tokenize(struct html_page *html, short cc)
{
	html_state was_state;
	struct html_attr *attr;
	const html_entity *found_entity;
	short tcc, n, j, i;

	was_state = html->state;

	if (html->lookahead_len < HTML_LOOKAHEAD_SIZE && cc != EOF) {
		/* fill lookahead */
		html->lookahead[html->lookahead_len++] = cc;
		return;
	}

	if (html->lookahead_len) {
		/* take a character from the head of lookahead and shift down */
		tcc =
html->lookahead[0]; for (n = 0; n < HTML_LOOKAHEAD_SIZE - 1; n++) html->lookahead[n] = html->lookahead[n + 1]; if (cc == EOF) { if (html->lookahead_len) html->lookahead_len--; } else html->lookahead[HTML_LOOKAHEAD_SIZE - 1] = cc; cc = tcc; } #ifdef HTML_ENABLE_DEBUGGING HTML_DEBUG(("pos % 4ld:", html->input_pos++)); if (cc == '\n') HTML_DEBUG((" \\n")); else if (cc == '\r') HTML_DEBUG((" \\r")); else if (cc == '\t') HTML_DEBUG((" \\t")); else if (cc == '\f') HTML_DEBUG((" \\f")); else if (cc == '\0') HTML_DEBUG((" \\0")); else if (cc == ' ') HTML_DEBUG((" ")); else if (cc == EOF) HTML_DEBUG(("EOF")); else HTML_DEBUG((" %c", cc)); HTML_DEBUG((": state %s", html_state_names[html->state])); #endif was_state = html->state; reconsume: if (html->state != was_state) { HTML_DEBUG((": reconsume as %s", html_state_names[html->state])); was_state = html->state; } switch (html->state) { case HTML_STATE_DATA: switch (cc) { case '&': html->return_state = html->state; html->tmp_len = 0; html->state = HTML_STATE_CHARACTER_REFERENCE; break; case '<': html->state = HTML_STATE_TAG_OPEN; break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; html_emit_char_token(html, cc); break; case EOF: html_emit_eof_token(html); break; default: html_emit_char_token(html, cc); break; } break; case HTML_STATE_RCDATA: switch (cc) { case '&': html->return_state = html->state; html->tmp_len = 0; html->state = HTML_STATE_CHARACTER_REFERENCE; break; case '<': html->state = HTML_STATE_RCDATA_LESS_THAN_SIGN; break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER); break; case EOF: html_emit_eof_token(html); break; default: html_emit_char_token(html, cc); break; } break; case HTML_STATE_RAWTEXT: switch (cc) { case '<': html->state = HTML_STATE_RAWTEXT_LESS_THAN_SIGN; break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER); break; case EOF: 
html_emit_eof_token(html); break; default: html_emit_char_token(html, cc); break; } break; case HTML_STATE_SCRIPT_DATA: switch (cc) { case '<': html->state = HTML_STATE_SCRIPT_DATA_LESS_THAN_SIGN; break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; if (!html->ignore_script_data) html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER); break; case EOF: html_emit_eof_token(html); break; default: if (!html->ignore_script_data) html_emit_char_token(html, cc); break; } break; case HTML_STATE_PLAINTEXT: switch (cc) { case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER); break; case EOF: html_emit_eof_token(html); break; default: html_emit_char_token(html, cc); break; } break; case HTML_STATE_TAG_OPEN: switch (cc) { case '!': html->state = HTML_STATE_MARKUP_DECLARATION_OPEN; html->tmp_len = 0; break; case '/': html->state = HTML_STATE_END_TAG_OPEN; break; case '?': html->error = HTML_ERROR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME; html_emit_comment(html, &html->new_token.comment); html->state = HTML_STATE_BOGUS_COMMENT; goto reconsume; case EOF: html->error = HTML_ERROR_EOF_BEFORE_TAG_NAME; html_emit_char_token(html, '<'); html_emit_eof_token(html); break; default: if (IS_ALPHA(cc)) { html_prep_new_token(html, HTML_TOKEN_START_TAG); html->state = HTML_STATE_TAG_NAME; goto reconsume; } html->error = HTML_ERROR_INVALID_FIRST_CHARACTER_OF_TAG_NAME; html_emit_char_token(html, '<'); html->state = HTML_STATE_DATA; goto reconsume; } break; case HTML_STATE_END_TAG_OPEN: switch (cc) { case '>': html->error = HTML_ERROR_MISSING_END_TAG_NAME; html->state = HTML_STATE_DATA; break; case EOF: html->error = HTML_ERROR_EOF_BEFORE_TAG_NAME; html_emit_char_token(html, '<'); html_emit_char_token(html, '/'); html_emit_eof_token(html); break; default: if (IS_ALPHA(cc)) { html_prep_new_token(html, HTML_TOKEN_END_TAG); html->state = HTML_STATE_TAG_NAME; goto reconsume; } html->error = 
HTML_ERROR_INVALID_FIRST_CHARACTER_OF_TAG_NAME; html_prep_new_token(html, HTML_TOKEN_COMMENT); html->state = HTML_STATE_BOGUS_COMMENT; goto reconsume; } break; case HTML_STATE_TAG_NAME: switch (cc) { case '\t': case '\n': case '\f': case ' ': html->new_token.tag.type = html_find_tag_type(html->new_token.tag.name); html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME; break; case '/': html->new_token.tag.type = html_find_tag_type(html->new_token.tag.name); html->state = HTML_STATE_SELF_CLOSING_START_TAG; break; case '>': html->state = HTML_STATE_DATA; html->new_token.tag.type = html_find_tag_type(html->new_token.tag.name); html_emit_token(html, &html->new_token); break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; STR_APPEND(html->new_token.tag.name, html->new_token.tag.name_len, HTML_REPLACEMENT_CHARACTER); html->new_token.tag.type = 0; break; case EOF: html->error = HTML_ERROR_EOF_IN_TAG; html_emit_eof_token(html); break; default: if (IS_UPPER_ALPHA(cc)) cc += 0x20; STR_APPEND(html->new_token.tag.name, html->new_token.tag.name_len, cc); break; } break; case HTML_STATE_RCDATA_LESS_THAN_SIGN: switch (cc) { case '/': html->state = HTML_STATE_RCDATA_END_TAG_OPEN; html->tmp_len = 0; break; default: html->state = HTML_STATE_RCDATA; if (!html->ignore_comment_data) html_emit_char_token(html, '<'); goto reconsume; } break; case HTML_STATE_RCDATA_END_TAG_OPEN: if (IS_ALPHA(cc)) { html_prep_new_token(html, HTML_TOKEN_END_TAG); html->state = HTML_STATE_RCDATA_END_TAG_NAME; goto reconsume; } if (!html->ignore_comment_data) { html_emit_char_token(html, '<'); html_emit_char_token(html, '/'); } html->state = HTML_STATE_RCDATA; goto reconsume; case HTML_STATE_RCDATA_END_TAG_NAME: switch (cc) { case '\t': case '\n': case '\f': case ' ': if (!html_appropriate_end_tag_token(html, &html->new_token)) goto HTML_STATE_RCDATA_END_TAG_NAME_anything_else; html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME; break; case '/': if (!html_appropriate_end_tag_token(html, &html->new_token)) 
goto HTML_STATE_RCDATA_END_TAG_NAME_anything_else; html->state = HTML_STATE_SELF_CLOSING_START_TAG; break; case '>': if (!html_appropriate_end_tag_token(html, &html->new_token)) goto HTML_STATE_RCDATA_END_TAG_NAME_anything_else; html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; default: if (IS_UPPER_ALPHA(cc)) cc += 0x20; if (IS_LOWER_ALPHA(cc)) { STR_APPEND(html->new_token.tag.name, html->new_token.tag.name_len, cc); STR_APPEND(html->tmp, html->tmp_len, cc); break; } /* FALLTHROUGH */ HTML_STATE_RCDATA_END_TAG_NAME_anything_else: if (!html->ignore_comment_data) { html_emit_char_token(html, '<'); html_emit_char_token(html, '/'); for (n = 0; n < html->tmp_len; n++) html_emit_char_token(html, html->tmp[n]); } html->state = HTML_STATE_RCDATA; goto reconsume; } break; case HTML_STATE_RAWTEXT_LESS_THAN_SIGN: switch (cc) { case '/': html->tmp_len = 0; html->state = HTML_STATE_RAWTEXT_END_TAG_OPEN; break; default: if (!html->ignore_comment_data) { html_emit_char_token(html, '<'); } html->state = HTML_STATE_RAWTEXT; goto reconsume; } break; case HTML_STATE_RAWTEXT_END_TAG_OPEN: if (IS_ALPHA(cc)) { html_prep_new_token(html, HTML_TOKEN_END_TAG); html->state = HTML_STATE_RAWTEXT_END_TAG_NAME; goto reconsume; } if (!html->ignore_comment_data) { html_emit_char_token(html, '<'); html_emit_char_token(html, '/'); } html->state = HTML_STATE_RAWTEXT; goto reconsume; case HTML_STATE_RAWTEXT_END_TAG_NAME: switch (cc) { case '\t': case '\n': case '\f': case ' ': if (!html_appropriate_end_tag_token(html, &html->new_token)) goto HTML_STATE_RAWTEXT_END_TAG_NAME_anything_else; html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME; break; case '/': if (!html_appropriate_end_tag_token(html, &html->new_token)) goto HTML_STATE_RAWTEXT_END_TAG_NAME_anything_else; html->state = HTML_STATE_SELF_CLOSING_START_TAG; break; case '>': if (!html_appropriate_end_tag_token(html, &html->new_token)) goto HTML_STATE_RAWTEXT_END_TAG_NAME_anything_else; html->state = HTML_STATE_DATA; 
html_emit_token(html, &html->new_token); break; default: if (IS_UPPER_ALPHA(cc)) cc += 0x20; if (IS_LOWER_ALPHA(cc)) { STR_APPEND(html->new_token.tag.name, html->new_token.tag.name_len, cc); STR_APPEND(html->tmp, html->tmp_len, cc); break; } /* FALLTHROUGH */ HTML_STATE_RAWTEXT_END_TAG_NAME_anything_else: if (!html->ignore_comment_data) { html_emit_char_token(html, '<'); html_emit_char_token(html, '/'); for (n = 0; n < html->tmp_len; n++) html_emit_char_token(html, html->tmp[n]); } html->state = HTML_STATE_RAWTEXT; goto reconsume; } break; case HTML_STATE_SCRIPT_DATA_LESS_THAN_SIGN: switch (cc) { case '/': html->tmp_len = 0; html->state = HTML_STATE_SCRIPT_DATA_END_TAG_OPEN; break; case '!': html->state = HTML_STATE_SCRIPT_DATA_ESCAPE_START; if (!html->ignore_comment_data) { html_emit_char_token(html, '<'); html_emit_char_token(html, '!'); } break; default: if (!html->ignore_comment_data) { html_emit_char_token(html, '<'); } html->state = HTML_STATE_SCRIPT_DATA; goto reconsume; } break; case HTML_STATE_SCRIPT_DATA_END_TAG_OPEN: if (IS_ALPHA(cc)) { html_prep_new_token(html, HTML_TOKEN_END_TAG); html->state = HTML_STATE_SCRIPT_DATA_END_TAG_NAME; goto reconsume; } if (!html->ignore_script_data) { html_emit_char_token(html, '<'); html_emit_char_token(html, '/'); } html->state = HTML_STATE_SCRIPT_DATA; goto reconsume; case HTML_STATE_SCRIPT_DATA_END_TAG_NAME: switch (cc) { case '\t': case '\n': case '\f': case ' ': if (!html_appropriate_end_tag_token(html, &html->new_token)) goto HTML_STATE_SCRIPT_DATA_END_TAG_NAME_anything_else; html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME; break; case '/': if (!html_appropriate_end_tag_token(html, &html->new_token)) goto HTML_STATE_SCRIPT_DATA_END_TAG_NAME_anything_else; html->state = HTML_STATE_SELF_CLOSING_START_TAG; break; case '>': if (!html_appropriate_end_tag_token(html, &html->new_token)) goto HTML_STATE_SCRIPT_DATA_END_TAG_NAME_anything_else; html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; 
default: if (IS_UPPER_ALPHA(cc)) cc += 0x20; if (IS_LOWER_ALPHA(cc)) { STR_APPEND(html->new_token.tag.name, html->new_token.tag.name_len, cc); STR_APPEND(html->tmp, html->tmp_len, cc); break; } /* FALLTHROUGH */ HTML_STATE_SCRIPT_DATA_END_TAG_NAME_anything_else: if (!html->ignore_script_data) { html_emit_char_token(html, '<'); html_emit_char_token(html, '/'); for (n = 0; n < html->tmp_len; n++) html_emit_char_token(html, html->tmp[n]); } html->state = HTML_STATE_SCRIPT_DATA; goto reconsume; } break; case HTML_STATE_SCRIPT_DATA_ESCAPE_START: switch (cc) { case '-': html->state = HTML_STATE_SCRIPT_DATA_ESCAPE_START_DASH; if (!html->ignore_script_data) { html_emit_char_token(html, '-'); } break; default: html->state = HTML_STATE_SCRIPT_DATA; goto reconsume; } break; case HTML_STATE_SCRIPT_DATA_ESCAPE_START_DASH: switch (cc) { case '-': html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH; if (!html->ignore_script_data) { html_emit_char_token(html, '-'); } break; default: html->state = HTML_STATE_SCRIPT_DATA; goto reconsume; } break; case HTML_STATE_SCRIPT_DATA_ESCAPED: switch (cc) { case '-': html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_DASH; if (!html->ignore_script_data) { html_emit_char_token(html, '-'); } break; case '<': html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; if (!html->ignore_script_data) { html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER); } break; case EOF: html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT; html_emit_eof_token(html); break; default: if (!html->ignore_script_data) { html_emit_char_token(html, cc); } break; } break; case HTML_STATE_SCRIPT_DATA_ESCAPED_DASH: switch (cc) { case '-': html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH; if (!html->ignore_script_data) { html_emit_char_token(html, '-'); } break; case '<': html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; break; case '\0': html->error = 
HTML_ERROR_UNEXPECTED_NULL_CHARACTER; html->state = HTML_STATE_SCRIPT_DATA_ESCAPED; if (!html->ignore_script_data) { html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER); } break; case EOF: html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT; html_emit_eof_token(html); break; default: html->state = HTML_STATE_SCRIPT_DATA_ESCAPED; if (!html->ignore_script_data) { html_emit_char_token(html, cc); } break; } break; case HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH: switch (cc) { case '-': if (!html->ignore_script_data) { html_emit_char_token(html, '-'); } break; case '<': html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; break; case '>': html->state = HTML_STATE_SCRIPT_DATA; if (!html->ignore_script_data) { html_emit_char_token(html, '>'); } break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; html->state = HTML_STATE_SCRIPT_DATA_ESCAPED; if (!html->ignore_script_data) { html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER); } break; case EOF: html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT; html_emit_eof_token(html); break; default: html->state = HTML_STATE_SCRIPT_DATA_ESCAPED; if (!html->ignore_script_data) { html_emit_char_token(html, cc); } break; } break; case HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: switch (cc) { case '/': html->tmp_len = 0; html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN; break; default: if (IS_ALPHA(cc)) { html->tmp_len = 0; if (!html->ignore_script_data) { html_emit_char_token(html, '<'); } html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START; goto reconsume; } html->state = HTML_STATE_SCRIPT_DATA_ESCAPED; if (!html->ignore_script_data) { html_emit_char_token(html, '<'); } goto reconsume; } break; case HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN: if (IS_ALPHA(cc)) { html_prep_new_token(html, HTML_TOKEN_END_TAG); html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME; goto reconsume; } if (!html->ignore_script_data) { html_emit_char_token(html, '<'); 
html_emit_char_token(html, '/'); } html->state = HTML_STATE_SCRIPT_DATA_ESCAPED; goto reconsume; case HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME: switch (cc) { case '\t': case '\n': case '\f': case ' ': if (!html_appropriate_end_tag_token(html, &html->new_token)) goto HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME_anything_else; html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME; break; case '/': if (!html_appropriate_end_tag_token(html, &html->new_token)) goto HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME_anything_else; html->state = HTML_STATE_SELF_CLOSING_START_TAG; break; case '>': if (!html_appropriate_end_tag_token(html, &html->new_token)) goto HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME_anything_else; html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; default: if (IS_UPPER_ALPHA(cc)) cc += 0x20; if (IS_LOWER_ALPHA(cc)) { STR_APPEND(html->new_token.tag.name, html->new_token.tag.name_len, cc); STR_APPEND(html->tmp, html->tmp_len, cc); break; } /* FALLTHROUGH */ HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME_anything_else: if (!html->ignore_script_data) { html_emit_char_token(html, '<'); html_emit_char_token(html, '/'); for (n = 0; n < html->tmp_len; n++) html_emit_char_token(html, html->tmp[n]); } html->state = HTML_STATE_SCRIPT_DATA_ESCAPED; goto reconsume; } break; case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START: switch (cc) { case '\t': case '\n': case '\f': case ' ': case '/': case '>': if (html->tmp_len == 6 && memcmp(html->tmp, "script", 6) == 0) { html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED; } else { html->state = HTML_STATE_SCRIPT_DATA_ESCAPED; html_emit_char_token(html, cc); } break; default: if (IS_UPPER_ALPHA(cc)) cc += 0x20; if (IS_LOWER_ALPHA(cc)) { STR_APPEND(html->tmp, html->tmp_len, cc); break; } html->state = HTML_STATE_SCRIPT_DATA_ESCAPED; goto reconsume; } break; case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED: switch (cc) { case '-': html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH; html_emit_char_token(html, 
'-'); break; case '<': html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; html_emit_char_token(html, '<'); break; case '\0': html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT; html_emit_eof_token(html); break; default: html_emit_char_token(html, cc); break; } break; case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH: switch (cc) { case '-': html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH; html_emit_char_token(html, '-'); break; case '<': html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; html_emit_char_token(html, '<'); break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED; html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER); break; case EOF: html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT; html_emit_eof_token(html); break; default: html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED; html_emit_char_token(html, cc); break; } break; case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: switch (cc) { case '-': html_emit_char_token(html, '-'); break; case '<': html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; html_emit_char_token(html, '<'); break; case '>': html->state = HTML_STATE_SCRIPT_DATA; html_emit_char_token(html, '<'); break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED; html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER); break; case EOF: html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT; html_emit_eof_token(html); break; default: html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED; html_emit_char_token(html, cc); break; } break; case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: switch (cc) { case '/': html->tmp_len = 0; html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END; html_emit_char_token(html, '/'); break; default: html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED; goto reconsume; } break; case 
HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END: switch (cc) { case '\t': case '\n': case '\f': case ' ': case '/': case '>': if (html->tmp_len == 6 && memcmp(html->tmp, "script", 6) == 0) { html->state = HTML_STATE_SCRIPT_DATA_ESCAPED; } else { html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED; html_emit_char_token(html, cc); } break; default: if (IS_UPPER_ALPHA(cc)) cc += 0x20; if (IS_LOWER_ALPHA(cc)) { STR_APPEND(html->tmp, html->tmp_len, cc); html_emit_char_token(html, cc); break; } html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED; goto reconsume; } break; case HTML_STATE_BEFORE_ATTRIBUTE_NAME: switch (cc) { case '\t': case '\n': case '\f': case ' ': /* ignore */ break; case '/': case '>': case EOF: html->state = HTML_STATE_AFTER_ATTRIBUTE_NAME; goto reconsume; case '=': html->error = HTML_ERROR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME; attr = html_prep_new_attribute(html, &html->new_token.tag); STR_APPEND(attr->name, attr->name_len, cc); html->state = HTML_STATE_ATTRIBUTE_NAME; break; default: html_prep_new_attribute(html, &html->new_token.tag); html->state = HTML_STATE_ATTRIBUTE_NAME; goto reconsume; } break; case HTML_STATE_ATTRIBUTE_NAME: switch (cc) { case '\t': case '\n': case '\f': case ' ': case '/': case '>': case EOF: html->state = HTML_STATE_AFTER_ATTRIBUTE_NAME; goto reconsume; case '=': html->state = HTML_STATE_BEFORE_ATTRIBUTE_VALUE; break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; attr = &NEW_TOKEN_LAST_ATTR; STR_APPEND(attr->name, attr->name_len, HTML_REPLACEMENT_CHARACTER); break; case '"': case '\'': case '<': html->error = HTML_ERROR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME; goto HTML_STATE_ATTRIBUTE_NAME_anything_else; default: HTML_STATE_ATTRIBUTE_NAME_anything_else: if (IS_UPPER_ALPHA(cc)) cc += 0x20; attr = &NEW_TOKEN_LAST_ATTR; STR_APPEND(attr->name, attr->name_len, cc); /* TODO: check for duplicate attr names, discard this if match */ break; } break; case HTML_STATE_AFTER_ATTRIBUTE_NAME: switch (cc) { case '\t': 
case '\n': case '\f': case ' ': /* ignore */ break; case '/': html->state = HTML_STATE_SELF_CLOSING_START_TAG; break; case '=': html->state = HTML_STATE_BEFORE_ATTRIBUTE_VALUE; break; case '>': html_emit_token(html, &html->new_token); html->state = HTML_STATE_DATA; break; case EOF: html->error = HTML_ERROR_EOF_IN_TAG; html_emit_eof_token(html); break; default: html_prep_new_attribute(html, &html->new_token.tag); html->state = HTML_STATE_ATTRIBUTE_NAME; goto reconsume; } break; case HTML_STATE_BEFORE_ATTRIBUTE_VALUE: switch (cc) { case '\t': case '\n': case '\f': case ' ': /* ignore */ break; case '"': html->state = HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED; break; case '\'': html->state = HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED; break; case '>': html->error = HTML_ERROR_MISSING_ATTRIBUTE_VALUE; html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; default: html->state = HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED; goto reconsume; } break; case HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED: switch (cc) { case '"': html->state = HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED; break; case '&': html->return_state = HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED; html->tmp_len = 0; html->state = HTML_STATE_CHARACTER_REFERENCE; break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; attr = &NEW_TOKEN_LAST_ATTR; STR_APPEND(attr->val, attr->val_len, HTML_REPLACEMENT_CHARACTER); break; case EOF: html->error = HTML_ERROR_EOF_IN_TAG; html_emit_eof_token(html); break; default: attr = &NEW_TOKEN_LAST_ATTR; STR_APPEND(attr->val, attr->val_len, cc); break; } break; case HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED: switch (cc) { case '\'': html->state = HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED; break; case '&': html->return_state = HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED; html->tmp_len = 0; html->state = HTML_STATE_CHARACTER_REFERENCE; break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; attr = &NEW_TOKEN_LAST_ATTR; STR_APPEND(attr->val, attr->val_len, 
HTML_REPLACEMENT_CHARACTER); break; case EOF: html->error = HTML_ERROR_EOF_IN_TAG; html_emit_eof_token(html); break; default: attr = &NEW_TOKEN_LAST_ATTR; STR_APPEND(attr->val, attr->val_len, cc); break; } break; case HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED: switch (cc) { case '\t': case '\n': case '\f': case ' ': html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME; break; case '&': html->return_state = HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED; html->tmp_len = 0; html->state = HTML_STATE_CHARACTER_REFERENCE; break; case '>': html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; attr = &NEW_TOKEN_LAST_ATTR; STR_APPEND(attr->val, attr->val_len, HTML_REPLACEMENT_CHARACTER); break; case '"': case '\'': case '<': case '=': case '`': html->error = HTML_ERROR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE; goto HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED_anything_else; case EOF: html->error = HTML_ERROR_EOF_IN_TAG; html_emit_eof_token(html); break; default: HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED_anything_else: attr = &NEW_TOKEN_LAST_ATTR; STR_APPEND(attr->val, attr->val_len, cc); break; } break; case HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED: switch (cc) { case '\t': case '\n': case '\f': case ' ': html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME; break; case '/': html->state = HTML_STATE_SELF_CLOSING_START_TAG; break; case '>': html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; case EOF: html->error = HTML_ERROR_EOF_IN_TAG; html_emit_eof_token(html); break; default: html->error = HTML_ERROR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES; html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME; goto reconsume; } break; case HTML_STATE_SELF_CLOSING_START_TAG: switch (cc) { case '>': html->new_token.tag.self_closing = true; html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; case EOF: html->error = HTML_ERROR_EOF_IN_TAG; html_emit_eof_token(html); break; default: html->error = 
HTML_ERROR_UNEXPECTED_SOLIDUS_IN_TAG; html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME; goto reconsume; } break; case HTML_STATE_BOGUS_COMMENT: switch (cc) { case '>': html->state = HTML_STATE_DATA; html_emit_comment(html, &html->new_token.comment); break; case EOF: html_emit_comment(html, &html->new_token.comment); html_emit_eof_token(html); break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; if (!html->ignore_comment_data) { STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, HTML_REPLACEMENT_CHARACTER); } break; default: if (!html->ignore_comment_data) { STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, cc); } break; } break; case HTML_STATE_MARKUP_DECLARATION_OPEN: /* "If the next few characters are" */ /* https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state */ if (cc == '-' && html->lookahead[0] == '-') { html_lookahead_consume(html, 1); html_prep_new_token(html, HTML_TOKEN_COMMENT); html->state = HTML_STATE_COMMENT_START; break; } else if ((cc == 'd' || cc == 'D') && strncasecmp(html->lookahead, "octype", 6) == 0) { html_lookahead_consume(html, 6); html->state = HTML_STATE_DOCTYPE; html_prep_new_token(html, HTML_TOKEN_DOCTYPE); break; } else if (cc == '[' && memcmp(html->lookahead, "CDATA[", 6) == 0) { html_lookahead_consume(html, 6); if (html->current_node->ns != HTML_NAMESPACE_HTML) html->state = HTML_STATE_CDATA_SECTION; else html->error = HTML_ERROR_CDATA_IN_HTML_CONTENT; html_prep_new_token(html, HTML_TOKEN_COMMENT); if (!html->ignore_comment_data) html->new_token.comment.len = strlcpy(html->new_token.comment.data, "[CDATA[", sizeof(html->new_token.comment.data)); html->state = HTML_STATE_BOGUS_COMMENT; break; } else { html->error = HTML_ERROR_INCORRECTLY_OPENED_COMMENT; html_prep_new_token(html, HTML_TOKEN_COMMENT); html->state = HTML_STATE_BOGUS_COMMENT; goto reconsume; } break; case HTML_STATE_COMMENT_START: switch (cc) { case '-': html->state = 
HTML_STATE_COMMENT_START_DASH; break; case '>': html->error = HTML_ERROR_ABRUPT_CLOSING_OF_EMPTY_COMMENT; html->state = HTML_STATE_DATA; html_emit_comment(html, &html->new_token.comment); break; default: html->state = HTML_STATE_COMMENT; goto reconsume; } break; case HTML_STATE_COMMENT_START_DASH: switch (cc) { case '-': html->state = HTML_STATE_COMMENT_END_DASH; break; case '>': html->error = HTML_ERROR_ABRUPT_CLOSING_OF_EMPTY_COMMENT; html->state = HTML_STATE_DATA; html->new_token.type = HTML_TOKEN_COMMENT; html_emit_token(html, &html->new_token); break; case EOF: html->error = HTML_ERROR_EOF_IN_COMMENT; html->new_token.type = HTML_TOKEN_COMMENT; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: if (!html->ignore_comment_data) { STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, '-'); } html->state = HTML_STATE_COMMENT; goto reconsume; } break; case HTML_STATE_COMMENT: switch (cc) { case '<': if (!html->ignore_comment_data) { STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, cc); } html->state = HTML_STATE_COMMENT_LESS_THAN_SIGN; break; case '-': html->state = HTML_STATE_COMMENT_END_DASH; break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; if (!html->ignore_comment_data) { STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, HTML_REPLACEMENT_CHARACTER); } break; case EOF: html->error = HTML_ERROR_EOF_IN_COMMENT; html->new_token.type = HTML_TOKEN_COMMENT; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: if (!html->ignore_comment_data) { STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, cc); } break; } break; case HTML_STATE_COMMENT_LESS_THAN_SIGN: switch (cc) { case '!': if (!html->ignore_comment_data) { STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, cc); } html->state = HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG; break; case '<': if (!html->ignore_comment_data) { 
STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, cc); } break; default: html->state = HTML_STATE_COMMENT; goto reconsume; } break; case HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG: switch (cc) { case '-': html->state = HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH; break; default: html->state = HTML_STATE_COMMENT; goto reconsume; } break; case HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH: switch (cc) { case '-': html->state = HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH; break; default: html->state = HTML_STATE_COMMENT_END_DASH; goto reconsume; } break; case HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH: switch (cc) { case '>': case EOF: html->state = HTML_STATE_COMMENT_END; goto reconsume; default: html->error = HTML_ERROR_NESTED_COMMENT; html->state = HTML_STATE_COMMENT_END; goto reconsume; } break; case HTML_STATE_COMMENT_END_DASH: switch (cc) { case '-': html->state = HTML_STATE_COMMENT_END; break; case EOF: html->error = HTML_ERROR_EOF_IN_COMMENT; html->new_token.type = HTML_TOKEN_COMMENT; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: if (!html->ignore_comment_data) { STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, '-'); } html->state = HTML_STATE_COMMENT; goto reconsume; } break; case HTML_STATE_COMMENT_END: switch (cc) { case '>': html->state = HTML_STATE_DATA; html->new_token.type = HTML_TOKEN_COMMENT; html_emit_token(html, &html->new_token); break; case '!': html->state = HTML_STATE_COMMENT_END; break; case '-': if (!html->ignore_comment_data) { STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, '-'); } break; case EOF: html->error = HTML_ERROR_EOF_IN_COMMENT; html->new_token.type = HTML_TOKEN_COMMENT; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: if (!html->ignore_comment_data) { STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, '-'); STR_APPEND(html->new_token.comment.data, 
html->new_token.comment.len, '-'); } html->state = HTML_STATE_COMMENT; goto reconsume; } break; case HTML_STATE_COMMENT_END_BANG: switch (cc) { case '-': if (!html->ignore_comment_data) { STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, '-'); STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, '-'); STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, '!'); } html->state = HTML_STATE_COMMENT_END_DASH; break; case '>': html->error = HTML_ERROR_INCORRECTLY_CLOSED_COMMENT; html->state = HTML_STATE_DATA; html->new_token.type = HTML_TOKEN_COMMENT; html_emit_token(html, &html->new_token); break; case EOF: html->error = HTML_ERROR_EOF_IN_COMMENT; html->new_token.type = HTML_TOKEN_COMMENT; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: if (!html->ignore_comment_data) { STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, '-'); STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, '-'); STR_APPEND(html->new_token.comment.data, html->new_token.comment.len, '!'); } html->state = HTML_STATE_COMMENT; goto reconsume; } break; case HTML_STATE_DOCTYPE: switch (cc) { case '\t': case '\n': case '\f': case ' ': html->state = HTML_STATE_BEFORE_DOCTYPE_NAME; break; case '>': html->state = HTML_STATE_BEFORE_DOCTYPE_NAME; goto reconsume; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_prep_new_token(html, HTML_TOKEN_DOCTYPE); html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: html->error = HTML_ERROR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME; html->state = HTML_STATE_BEFORE_DOCTYPE_NAME; goto reconsume; } break; case HTML_STATE_BEFORE_DOCTYPE_NAME: switch (cc) { case '\t': case '\n': case '\f': case ' ': /* ignore */ break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; html_prep_new_token(html, HTML_TOKEN_DOCTYPE); STR_APPEND(html->new_token.tag.name, 
html->new_token.tag.name_len, '!'); html->state = HTML_STATE_DOCTYPE_NAME; break; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_prep_new_token(html, HTML_TOKEN_DOCTYPE); html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; case '>': html->error = HTML_ERROR_MISSING_DOCTYPE_NAME; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); html->state = HTML_STATE_DATA; break; default: if (IS_UPPER_ALPHA(cc)) cc += 0x20; html_prep_new_token(html, HTML_TOKEN_DOCTYPE); html->state = HTML_STATE_DOCTYPE_NAME; STR_APPEND(html->new_token.doctype.name, html->new_token.doctype.name_len, cc); break; } break; case HTML_STATE_DOCTYPE_NAME: switch (cc) { case '\t': case '\n': case '\f': case ' ': html->state = HTML_STATE_AFTER_DOCTYPE_NAME; html->tmp_len = 0; break; case '>': html_emit_token(html, &html->new_token); html->state = HTML_STATE_DATA; break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; STR_APPEND(html->new_token.doctype.name, html->new_token.doctype.name_len, HTML_REPLACEMENT_CHARACTER); break; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_prep_new_token(html, HTML_TOKEN_DOCTYPE); html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: if (IS_UPPER_ALPHA(cc)) cc += 0x20; STR_APPEND(html->new_token.doctype.name, html->new_token.doctype.name_len, cc); break; } break; case HTML_STATE_AFTER_DOCTYPE_NAME: switch (cc) { case '\t': case '\n': case '\f': case ' ': /* ignore */ break; case '>': html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: if ((cc == 'p' || cc == 'P') && strncasecmp(html->lookahead, "ublic", 5) == 0) { html->state = 
HTML_STATE_AFTER_DOCTYPE_PUBLIC_KEYWORD; html->lookahead_len = 0; } else if ((cc == 's' || cc == 'S') && strncasecmp(html->lookahead, "ystem", 5) == 0) { html->state = HTML_STATE_AFTER_DOCTYPE_SYSTEM_KEYWORD; html->lookahead_len = 0; } else { html->error = HTML_ERROR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME; html->new_token.doctype.force_quirks = true; html->state = HTML_STATE_BOGUS_DOCTYPE; goto reconsume; } break; } break; case HTML_STATE_AFTER_DOCTYPE_PUBLIC_KEYWORD: switch (cc) { case '\t': case '\n': case '\f': case ' ': html->state = HTML_STATE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER; break; case '"': html->error = HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD; memset(html->new_token.doctype.public_identifier, 0, sizeof(html->new_token.doctype.public_identifier)); html->new_token.doctype.public_identifier_len = 0; html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED; break; case '\'': html->error = HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD; memset(html->new_token.doctype.public_identifier, 0, sizeof(html->new_token.doctype.public_identifier)); html->new_token.doctype.public_identifier_len = 0; html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED; break; case '>': html->error = HTML_ERROR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER; html->new_token.doctype.force_quirks = true; html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: html->error = HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER; html->new_token.doctype.force_quirks = true; html->state = HTML_STATE_BOGUS_DOCTYPE; goto reconsume; } break; case HTML_STATE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: switch (cc) { case '\t': case '\n': case '\f': case ' ': /* ignore */ break; case '"': memset(html->new_token.doctype.public_identifier, 0, 
sizeof(html->new_token.doctype.public_identifier)); html->new_token.doctype.public_identifier_len = 0; html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED; break; case '\'': memset(html->new_token.doctype.public_identifier, 0, sizeof(html->new_token.doctype.public_identifier)); html->new_token.doctype.public_identifier_len = 0; html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED; break; case '>': html->error = HTML_ERROR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER; html->new_token.doctype.force_quirks = true; html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: html->error = HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER; html->new_token.doctype.force_quirks = true; html->state = HTML_STATE_BOGUS_DOCTYPE; goto reconsume; } break; case HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: switch (cc) { case '"': html->state = HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER; break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; STR_APPEND(html->new_token.doctype.public_identifier, html->new_token.doctype.public_identifier_len, HTML_REPLACEMENT_CHARACTER); break; case '>': html->error = HTML_ERROR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER; html->new_token.doctype.force_quirks = true; html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: STR_APPEND(html->new_token.doctype.public_identifier, html->new_token.doctype.public_identifier_len, cc); break; } break; case HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: switch (cc) { case '\'': html->state = HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER; break; case '\0': html->error = 
HTML_ERROR_UNEXPECTED_NULL_CHARACTER; STR_APPEND(html->new_token.doctype.public_identifier, html->new_token.doctype.public_identifier_len, HTML_REPLACEMENT_CHARACTER); break; case '>': html->error = HTML_ERROR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER; html->new_token.doctype.force_quirks = true; html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: STR_APPEND(html->new_token.doctype.public_identifier, html->new_token.doctype.public_identifier_len, cc); break; } break; case HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER: switch (cc) { case '\t': case '\n': case '\f': case ' ': html->state = HTML_STATE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS; break; case '>': html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; case '"': html->error = HTML_ERROR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS; memset(html->new_token.doctype.system_identifier, 0, sizeof(html->new_token.doctype.system_identifier)); html->new_token.doctype.system_identifier_len = 0; html->state = HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; break; case '\'': html->error = HTML_ERROR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS; memset(html->new_token.doctype.system_identifier, 0, sizeof(html->new_token.doctype.system_identifier)); html->new_token.doctype.system_identifier_len = 0; html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED; break; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: html->error = HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER; html->new_token.doctype.force_quirks = true; html->state = HTML_STATE_BOGUS_DOCTYPE; goto reconsume; } break; case 
HTML_STATE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: switch (cc) { case '\t': case '\n': case '\f': case ' ': /* ignore */ break; case '>': html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; case '"': memset(html->new_token.doctype.system_identifier, 0, sizeof(html->new_token.doctype.system_identifier)); html->new_token.doctype.system_identifier_len = 0; html->state = HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; break; case '\'': memset(html->new_token.doctype.system_identifier, 0, sizeof(html->new_token.doctype.system_identifier)); html->new_token.doctype.system_identifier_len = 0; html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED; break; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: html->error = HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER; html->new_token.doctype.force_quirks = true; html->state = HTML_STATE_BOGUS_DOCTYPE; goto reconsume; } break; case HTML_STATE_AFTER_DOCTYPE_SYSTEM_KEYWORD: switch (cc) { case '\t': case '\n': case '\f': case ' ': html->state = HTML_STATE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER; break; case '"': html->error = HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD; memset(html->new_token.doctype.system_identifier, 0, sizeof(html->new_token.doctype.system_identifier)); html->new_token.doctype.system_identifier_len = 0; html->state = HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; break; case '\'': html->error = HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD; memset(html->new_token.doctype.system_identifier, 0, sizeof(html->new_token.doctype.system_identifier)); html->new_token.doctype.system_identifier_len = 0; html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED; break; case '>': html->error = HTML_ERROR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER; html->new_token.doctype.force_quirks = true; html_emit_token(html, 
&html->new_token); break; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: html->error = HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER; html->new_token.doctype.force_quirks = true; html->state = HTML_STATE_BOGUS_DOCTYPE; goto reconsume; } break; case HTML_STATE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: switch (cc) { case '\t': case '\n': case '\f': case ' ': /* ignore */ break; case '"': memset(html->new_token.doctype.system_identifier, 0, sizeof(html->new_token.doctype.system_identifier)); html->new_token.doctype.system_identifier_len = 0; html->state = HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; break; case '\'': memset(html->new_token.doctype.system_identifier, 0, sizeof(html->new_token.doctype.system_identifier)); html->new_token.doctype.system_identifier_len = 0; html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED; break; case '>': html->error = HTML_ERROR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); break; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: html->error = HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER; html->new_token.doctype.force_quirks = true; html->state = HTML_STATE_BOGUS_DOCTYPE; goto reconsume; } break; case HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: switch (cc) { case '"': html->state = HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER; break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; STR_APPEND(html->new_token.doctype.system_identifier, html->new_token.doctype.system_identifier_len, HTML_REPLACEMENT_CHARACTER); break; case '>': html->error = HTML_ERROR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER; html->new_token.doctype.force_quirks = true; 
html_emit_token(html, &html->new_token); break; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: STR_APPEND(html->new_token.doctype.system_identifier, html->new_token.doctype.system_identifier_len, cc); break; } break; case HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: switch (cc) { case '\'': html->state = HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER; break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; STR_APPEND(html->new_token.doctype.system_identifier, html->new_token.doctype.system_identifier_len, HTML_REPLACEMENT_CHARACTER); break; case '>': html->error = HTML_ERROR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); break; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: STR_APPEND(html->new_token.doctype.system_identifier, html->new_token.doctype.system_identifier_len, cc); break; } break; case HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER: switch (cc) { case '\t': case '\n': case '\f': case ' ': /* ignore */ break; case '>': html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; case EOF: html->error = HTML_ERROR_EOF_IN_DOCTYPE; html->new_token.doctype.force_quirks = true; html_emit_token(html, &html->new_token); html_emit_eof_token(html); break; default: html->error = HTML_ERROR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER; html->state = HTML_STATE_BOGUS_DOCTYPE; goto reconsume; } break; case HTML_STATE_BOGUS_DOCTYPE: switch (cc) { case '>': html->state = HTML_STATE_DATA; html_emit_token(html, &html->new_token); break; case '\0': html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER; /* ignore */ break; case EOF: html_emit_token(html, &html->new_token); html_emit_eof_token(html); 
break; default: /* ignore */ break; } break; case HTML_STATE_CDATA_SECTION: switch (cc) { case ']': html->state = HTML_STATE_CDATA_SECTION_BRACKET; break; case EOF: html->error = HTML_ERROR_EOF_IN_CDATA; html_emit_eof_token(html); break; default: if (!html->ignore_comment_data) html_emit_char_token(html, cc); break; } break; case HTML_STATE_CDATA_SECTION_BRACKET: switch (cc) { case ']': html->state = HTML_STATE_CDATA_SECTION_END; break; default: if (!html->ignore_comment_data) html_emit_char_token(html, ']'); html->state = HTML_STATE_CDATA_SECTION; goto reconsume; } break; case HTML_STATE_CDATA_SECTION_END: switch (cc) { case ']': if (!html->ignore_comment_data) html_emit_char_token(html, ']'); break; case '>': html->state = HTML_STATE_DATA; break; default: if (!html->ignore_comment_data) { html_emit_char_token(html, ']'); html_emit_char_token(html, ']'); } html->state = HTML_STATE_CDATA_SECTION; goto reconsume; } break; case HTML_STATE_CHARACTER_REFERENCE: STR_APPEND(html->tmp, html->tmp_len, '&'); if (cc == '#') { STR_APPEND(html->tmp, html->tmp_len, cc); html->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE; break; } if (IS_ALPHANUMERIC(cc)) { html->state = HTML_STATE_NAMED_CHARACTER_REFERENCE; goto reconsume; } /* "flush code points consumed as a character reference" */ if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) { /* consumed as part of an attribute */ for (n = 0; n < html->tmp_len; n++) { attr = &NEW_TOKEN_LAST_ATTR; STR_APPEND(attr->val, attr->val_len, html->tmp[n]); } } else { /* TODO: check return state for comment ones if ignoring */ for (n = 0; n < html->tmp_len; n++) html_emit_char_token(html, html->tmp[n]); } html->tmp_len = 0; html->state = html->return_state; goto reconsume; case HTML_STATE_NAMED_CHARACTER_REFERENCE: found_entity = NULL; STR_APPEND(html->tmp, html->tmp_len, cc); for (n = 0; n < html->lookahead_len; n++) { STR_APPEND(html->tmp, html->tmp_len, html->lookahead[n]); if (html->lookahead[n] == ';') break; } HTML_DEBUG((": trying to match '%s'", 
html->tmp)); found_entity = NULL; for (j = 0; html_entities[j].entity != NULL; j++) { for (i = 0; ; i++) { if (html_entities[j].entity[i] == '\0') { /* * If we have an ; in our buffer, match the longer * version of this entity instead (& instead of * &) */ if (html_entities[j].entity[i - 1] != ';' && html->tmp[i] == ';') goto next_entity; found_entity = &html_entities[j]; HTML_DEBUG((": matched lookahead to entity '%s'", found_entity->entity)); html_lookahead_consume(html, i - 2); break; } if (i >= html->tmp_len || html_entities[j].entity[i] != html->tmp[i]) goto next_entity; } next_entity: continue; } if (found_entity != NULL) { if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE && html->tmp[html->tmp_len - 1] != ';' && (html->lookahead[0] == '=' || IS_ALPHANUMERIC(html->lookahead[0]))) { /* * "for historical reasons, flush code points consumed as a * character reference and switch to the return state." */ HTML_DEBUG((": doing historical flush thing")); attr = &NEW_TOKEN_LAST_ATTR; for (n = 0; n < html->tmp_len; n++) { STR_APPEND(attr->val, attr->val_len, html->tmp[n]); } html->tmp_len = 0; html->state = html->return_state; break; } /* otherwise... 
*/ if (html->tmp[html->tmp_len - 1] != ';') html->error = HTML_ERROR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE; html->tmp_len = 0; if ((j = (found_entity->codepoint >> 24) & 0xff)) html->tmp[html->tmp_len++] = j; if ((j = (found_entity->codepoint >> 16) & 0xff)) html->tmp[html->tmp_len++] = j; if ((j = (found_entity->codepoint >> 8) & 0xff)) html->tmp[html->tmp_len++] = j; if ((j = found_entity->codepoint & 0xff)) html->tmp[html->tmp_len++] = j; /* fall through */ } else { HTML_DEBUG((": no entity found for '%s'", html->tmp)); /* pretend we didn't copy anything into tmp after & and cc */ html->tmp_len = 2; html->tmp[html->tmp_len] = '\0'; } if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) { attr = &NEW_TOKEN_LAST_ATTR; for (n = 0; n < html->tmp_len; n++) { STR_APPEND(attr->val, attr->val_len, html->tmp[n]); } HTML_DEBUG((": attribute %s=\"%s\"", attr->name, attr->val)); } else { for (j = 0; j < html->tmp_len; j++) html_emit_char_token(html, html->tmp[j]); } html->tmp_len = 0; if (found_entity == NULL) html->state = HTML_STATE_AMBIGUOUS_AMPERSAND; else html->state = html->return_state; break; case HTML_STATE_AMBIGUOUS_AMPERSAND: if (IS_ALPHANUMERIC(cc)) { if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) { attr = &NEW_TOKEN_LAST_ATTR; STR_APPEND(attr->val, attr->val_len, cc); } else { html_emit_char_token(html, cc); } break; } if (cc == ';') { html->error = HTML_ERROR_UNKNOWN_NAMED_CHARACTER_REFERENCE; html->state = html->return_state; goto reconsume; } html->state = html->return_state; goto reconsume; case HTML_STATE_NUMERIC_CHARACTER_REFERENCE: html->char_ref_code = 0; switch (cc) { case 'x': case 'X': STR_APPEND(html->tmp, html->tmp_len, cc); html->state = HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE_START; break; default: html->state = HTML_STATE_DECIMAL_CHARACTER_REFERENCE_START; goto reconsume; } break; case HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE_START: if (IS_HEX_DIGIT(cc)) { html->state = HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE; goto reconsume; } html->error = 
HTML_ERROR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE; /* "flush code points consumed as a character reference" */ if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) { /* consumed as part of an attribute */ for (n = 0; n < html->tmp_len; n++) { attr = &NEW_TOKEN_LAST_ATTR; STR_APPEND(attr->val, attr->val_len, html->tmp[n]); } } else { for (n = 0; n < html->tmp_len; n++) html_emit_char_token(html, html->tmp[n]); } html->state = html->return_state; goto reconsume; case HTML_STATE_DECIMAL_CHARACTER_REFERENCE_START: if (IS_DIGIT(cc)) { html->state = HTML_STATE_DECIMAL_CHARACTER_REFERENCE; goto reconsume; } html->error = HTML_ERROR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE; /* "flush code points consumed as a character reference" */ if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) { /* consumed as part of an attribute */ for (n = 0; n < html->tmp_len; n++) { attr = &NEW_TOKEN_LAST_ATTR; STR_APPEND(attr->val, attr->val_len, html->tmp[n]); } } else { for (n = 0; n < html->tmp_len; n++) html_emit_char_token(html, html->tmp[n]); } html->state = html->return_state; goto reconsume; case HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE: if (IS_DIGIT(cc)) { html->char_ref_code *= 16; html->char_ref_code += (cc - 0x30); } else if (IS_UPPER_HEX_DIGIT(cc)) { html->char_ref_code *= 16; html->char_ref_code += (cc - 0x37); } else if (IS_LOWER_HEX_DIGIT(cc)) { html->char_ref_code *= 16; html->char_ref_code += (cc - 0x57); } else if (cc == ';') { html->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END; goto reconsume; } else { html->error = HTML_ERROR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE; html->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END; goto reconsume; } break; case HTML_STATE_DECIMAL_CHARACTER_REFERENCE: if (IS_DIGIT(cc)) { html->char_ref_code *= 10; html->char_ref_code += (cc - 0x30); } else if (cc == ';') { html->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END; goto reconsume; } else { html->error = HTML_ERROR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE; html->state = 
HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END; goto reconsume; } break; case HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END: /* this state does not consume a character */ if (html->char_ref_code == 0) { html->error = HTML_ERROR_NULL_CHARACTER_REFERENCE; html->char_ref_code = 0xfffd; } else if (html->char_ref_code > 0x10ffff) { html->error = HTML_ERROR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE; html->char_ref_code = 0xfffd; } else if (IS_SURROGATE(html->char_ref_code)) { html->error = HTML_ERROR_SURROGATE_CHARACTER_REFERENCE; html->char_ref_code = 0xfffd; } else if (IS_NONCHARACTER(html->char_ref_code)) { html->error = HTML_ERROR_NONCHARACTER_CHARACTER_REFERENCE; } else if (html->char_ref_code == 0x0d || (IS_CONTROL(html->char_ref_code) && !IS_WHITESPACE(html->char_ref_code))) { html->error = HTML_ERROR_CONTROL_CHARACTER_REFERENCE; /* TODO: lookup in table */ } html->tmp[0] = html->char_ref_code; html->tmp_len = 1; /* "flush code points consumed as a character reference" */ if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) { /* consumed as part of an attribute */ for (n = 0; n < html->tmp_len; n++) { attr = &NEW_TOKEN_LAST_ATTR; STR_APPEND(attr->val, attr->val_len, html->tmp[n]); } } else { for (n = 0; n < html->tmp_len; n++) html_emit_char_token(html, html->tmp[n]); } html->state = html->return_state; break; default: panic("bogus tokenize state %d", html->state); } if (html->state != was_state) HTML_DEBUG((": exited state %d", html_state_names[html->state])); if (html->error) { HTML_DEBUG((": error %s", html_error_strings[html->error])); html->error = 0; } HTML_DEBUG(("\r")); } void html_tokenize_finish(struct html_page *html) { if (html->lookahead_len) { HTML_DEBUG(("finish requested, tokenizing remaining %d lookahead\r", html->lookahead_len)); while (html->lookahead_len) html_tokenize(html, EOF); } html_tokenize(html, EOF); html_stop_parsing(html); } void html_prep_new_token(struct html_page *html, html_token_type token_type) { memset(&html->new_token, 0, sizeof(html_token)); 
html->new_token.type = token_type; } struct html_attr * html_prep_new_attribute(struct html_page *html, struct html_tag *tag) { if (tag->attrs_count >= nitems(tag->attrs)) panic("tag attr overflow"); tag->attrs_count++; tag->attrs[tag->attrs_count - 1].name_len = 0; tag->attrs[tag->attrs_count - 1].name[0] = '\0'; tag->attrs[tag->attrs_count - 1].val_len = 0; tag->attrs[tag->attrs_count - 1].val[0] = '\0'; return &tag->attrs[tag->attrs_count - 1]; } bool html_appropriate_end_tag_token(struct html_page *html, html_token *token) { /* https://html.spec.whatwg.org/multipage/parsing.html#tokenization * "an end tag token whose tag name matches the tag name of the last start * tag to have been emitted" */ if (html->open_count <= 0) return false; /* TODO: store last start tag to have been emitted and check that */ return (strcmp(html->current_node->name, html->new_token.tag.name) == 0); } html_tag_type html_find_tag_type(char *name) { long idx; idx = strcaseidx(name, html_tag_names); if (idx >= 0) return idx; HTML_DEBUG((": html_find_tag_type couldn't find %s", name)); return 0; } #endif /* HTML_ENABLE */