AmendHub

Download:

jcs

/

detritus

/

amendments

/

65

html_tokenize: Implement remaining character reference states

The NUMERIC_CHARACTER_REFERENCE_END state does not consume a character.
Although the spec doesn't say to reconsume when switching to it, do so
from every state that switches to it, so that we don't consume one
character too many.

jcs made amendment 65 about 1 year ago
--- html_tokenize.c Sun Dec 22 21:41:58 2024 +++ html_tokenize.c Mon Dec 23 19:48:17 2024 @@ -1955,7 +1955,7 @@ reconsume: goto reconsume; } - /* flush consumed */ + /* "flush code points consumed as a character reference" */ if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) { /* consumed as part of an attribute */ for (n = 0; n < html->tmp_len; n++) { @@ -2089,6 +2089,8 @@ next_entity: html->state = html->return_state; goto reconsume; case HTML_STATE_NUMERIC_CHARACTER_REFERENCE: + html->char_ref_code = 0; + switch (cc) { case 'x': case 'X': @@ -2108,20 +2110,114 @@ next_entity: html->error = HTML_ERROR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE; + + /* "flush code points consumed as a character reference" */ if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) { - attr = &NEW_TOKEN_LAST_ATTR; - STR_APPEND(attr->val, attr->val_len, cc); + /* consumed as part of an attribute */ + for (n = 0; n < html->tmp_len; n++) { + attr = &NEW_TOKEN_LAST_ATTR; + STR_APPEND(attr->val, attr->val_len, html->tmp[n]); + } } else { - html_emit_char_token(html, cc); + for (n = 0; n < html->tmp_len; n++) + html_emit_char_token(html, html->tmp[n]); } html->state = html->return_state; goto reconsume; case HTML_STATE_DECIMAL_CHARACTER_REFERENCE_START: + if (IS_DIGIT(cc)) { + html->state = HTML_STATE_DECIMAL_CHARACTER_REFERENCE; + goto reconsume; + } + + html->error = + HTML_ERROR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE; + + /* "flush code points consumed as a character reference" */ + if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) { + /* consumed as part of an attribute */ + for (n = 0; n < html->tmp_len; n++) { + attr = &NEW_TOKEN_LAST_ATTR; + STR_APPEND(attr->val, attr->val_len, html->tmp[n]); + } + } else { + for (n = 0; n < html->tmp_len; n++) + html_emit_char_token(html, html->tmp[n]); + } + html->state = html->return_state; + goto reconsume; case HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE: + if (IS_DIGIT(cc)) { + html->char_ref_code *= 16; + html->char_ref_code += (cc - 0x30); + } else if 
(IS_UPPER_HEX_DIGIT(cc)) { + html->char_ref_code *= 16; + html->char_ref_code += (cc - 0x37); + } else if (IS_LOWER_HEX_DIGIT(cc)) { + html->char_ref_code *= 16; + html->char_ref_code += (cc - 0x57); + } else if (cc == ';') { + html->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END; + goto reconsume; + } else { + html->error = + HTML_ERROR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE; + html->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END; + goto reconsume; + } + break; case HTML_STATE_DECIMAL_CHARACTER_REFERENCE: + if (IS_DIGIT(cc)) { + html->char_ref_code *= 10; + html->char_ref_code += (cc - 0x30); + } else if (cc == ';') { + html->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END; + goto reconsume; + } else { + html->error = + HTML_ERROR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE; + html->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END; + goto reconsume; + } + break; case HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END: - /* TODO */ - panic("state %d not supported", html->state); + /* this state does not consume a character */ + + if (html->char_ref_code == 0) { + html->error = HTML_ERROR_NULL_CHARACTER_REFERENCE; + html->char_ref_code = 0xfffd; + } else if (html->char_ref_code > 0x10ffff) { + html->error = + HTML_ERROR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE; + html->char_ref_code = 0xfffd; + } else if (IS_SURROGATE(html->char_ref_code)) { + html->error = HTML_ERROR_SURROGATE_CHARACTER_REFERENCE; + html->char_ref_code = 0xfffd; + } else if (IS_NONCHARACTER(html->char_ref_code)) { + html->error = HTML_ERROR_NONCHARACTER_CHARACTER_REFERENCE; + } else if (html->char_ref_code == 0x0d || + (IS_CONTROL(html->char_ref_code) && + !IS_WHITESPACE(html->char_ref_code))) { + html->error = HTML_ERROR_CONTROL_CHARACTER_REFERENCE; + /* TODO: lookup in table */ + } + + html->tmp[0] = html->char_ref_code; + html->tmp_len = 1; + + /* "flush code points consumed as a character reference" */ + if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) { + /* consumed as part of an attribute */ 
+ for (n = 0; n < html->tmp_len; n++) { + attr = &NEW_TOKEN_LAST_ATTR; + STR_APPEND(attr->val, attr->val_len, html->tmp[n]); + } + } else { + for (n = 0; n < html->tmp_len; n++) + html_emit_char_token(html, html->tmp[n]); + } + + html->state = html->return_state; break; default: panic("bogus tokenize state %d", html->state);