jcs
/detritus
/amendments
/57
html_tokenize: Import HTML tokenizer written to WHATWG spec docs
jcs made amendment 57 about 1 year ago
--- html_data.c Wed Dec 11 11:20:37 2024
+++ html_data.c Wed Dec 11 11:20:37 2024
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2024 joshua stein <jcs@jcs.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "html.h"
+
+#ifdef HTML_ENABLE_DEBUGGING
+const char *html_mode_names[] = { /* printable names for the WHATWG tree-construction insertion modes; built only when HTML_ENABLE_DEBUGGING is defined */
+ "NONE", /* index 0; table is positional -- order presumably mirrors the mode enum in html.h, TODO confirm */
+ "INITIAL",
+ "BEFORE_HTML",
+ "BEFORE_HEAD",
+ "IN_HEAD",
+ "IN_HEAD_NOSCRIPT",
+ "AFTER_HEAD",
+ "IN_BODY",
+ "TEXT",
+ "IN_TABLE",
+ "IN_TABLE_TEXT",
+ "IN_CAPTION",
+ "IN_COLUMN_GROUP",
+ "IN_TABLE_BODY",
+ "IN_ROW",
+ "IN_CELL",
+ "IN_SELECT",
+ "IN_SELECT_IN_TABLE",
+ "IN_TEMPLATE",
+ "AFTER_BODY",
+ "IN_FRAMESET",
+ "AFTER_FRAMESET",
+ "AFTER_AFTER_BODY",
+ "AFTER_AFTER_FRAMESET" /* index 23; 24 entries total, no terminating NULL */
+};
+
+const char *html_state_names[] = { /* printable names for the tokenizer states, listed in the order the WHATWG tokenization section defines them; debug builds only */
+ "NONE", /* index 0; table is positional -- order presumably mirrors the state enum in html.h, TODO confirm */
+ "DATA",
+ "RCDATA",
+ "RAWTEXT",
+ "SCRIPT_DATA",
+ "PLAINTEXT",
+ "TAG_OPEN",
+ "END_TAG_OPEN",
+ "TAG_NAME",
+ "RCDATA_LESS_THAN_SIGN",
+ "RCDATA_END_TAG_OPEN",
+ "RCDATA_END_TAG_NAME",
+ "RAWTEXT_LESS_THAN_SIGN",
+ "RAWTEXT_END_TAG_OPEN",
+ "RAWTEXT_END_TAG_NAME",
+ "SCRIPT_DATA_LESS_THAN_SIGN",
+ "SCRIPT_DATA_END_TAG_OPEN",
+ "SCRIPT_DATA_END_TAG_NAME",
+ "SCRIPT_DATA_ESCAPE_START",
+ "SCRIPT_DATA_ESCAPE_START_DASH",
+ "SCRIPT_DATA_ESCAPED",
+ "SCRIPT_DATA_ESCAPED_DASH",
+ "SCRIPT_DATA_ESCAPED_DASH_DASH",
+ "SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN",
+ "SCRIPT_DATA_ESCAPED_END_TAG_OPEN",
+ "SCRIPT_DATA_ESCAPED_END_TAG_NAME",
+ "SCRIPT_DATA_DOUBLE_ESCAPE_START",
+ "SCRIPT_DATA_DOUBLE_ESCAPED",
+ "SCRIPT_DATA_DOUBLE_ESCAPED_DASH",
+ "SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH",
+ "SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN",
+ "SCRIPT_DATA_DOUBLE_ESCAPE_END",
+ "BEFORE_ATTRIBUTE_NAME",
+ "ATTRIBUTE_NAME",
+ "AFTER_ATTRIBUTE_NAME",
+ "BEFORE_ATTRIBUTE_VALUE",
+ "ATTRIBUTE_VALUE_DOUBLE_QUOTED",
+ "ATTRIBUTE_VALUE_SINGLE_QUOTED",
+ "ATTRIBUTE_VALUE_UNQUOTED",
+ "AFTER_ATTRIBUTE_VALUE_QUOTED",
+ "SELF_CLOSING_START_TAG",
+ "BOGUS_COMMENT",
+ "MARKUP_DECLARATION_OPEN",
+ "COMMENT_START",
+ "COMMENT_START_DASH",
+ "COMMENT",
+ "COMMENT_LESS_THAN_SIGN",
+ "COMMENT_LESS_THAN_SIGN_BANG",
+ "COMMENT_LESS_THAN_SIGN_BANG_DASH",
+ "COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH",
+ "COMMENT_END_DASH",
+ "COMMENT_END",
+ "COMMENT_END_BANG",
+ "DOCTYPE",
+ "BEFORE_DOCTYPE_NAME",
+ "DOCTYPE_NAME",
+ "AFTER_DOCTYPE_NAME",
+ "AFTER_DOCTYPE_PUBLIC_KEYWORD",
+ "BEFORE_DOCTYPE_PUBLIC_IDENTIFIER",
+ "DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED",
+ "DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED",
+ "AFTER_DOCTYPE_PUBLIC_IDENTIFIER",
+ "BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS",
+ "AFTER_DOCTYPE_SYSTEM_KEYWORD",
+ "BEFORE_DOCTYPE_SYSTEM_IDENTIFIER",
+ "DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED",
+ "DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED",
+ "AFTER_DOCTYPE_SYSTEM_IDENTIFIER",
+ "BOGUS_DOCTYPE",
+ "CDATA_SECTION",
+ "CDATA_SECTION_BRACKET",
+ "CDATA_SECTION_END",
+ "CHARACTER_REFERENCE",
+ "NAMED_CHARACTER_REFERENCE",
+ "AMBIGUOUS_AMPERSAND",
+ "NUMERIC_CHARACTER_REFERENCE",
+ "HEXADECIMAL_CHARACTER_REFERENCE_START",
+ "DECIMAL_CHARACTER_REFERENCE_START",
+ "HEXADECIMAL_CHARACTER_REFERENCE",
+ "DECIMAL_CHARACTER_REFERENCE",
+ "NUMERIC_CHARACTER_REFERENCE_END" /* index 80; 81 entries total, no terminating NULL */
+};
+
+const char *html_error_strings[] = { /* printable names for the WHATWG parse-error codes (upper-cased, dashes as underscores); debug builds only */
+ "NONE", /* index 0; table is positional -- order presumably mirrors the error enum in html.h, TODO confirm */
+ "ABRUPT_CLOSING_OF_EMPTY_COMMENT",
+ "ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER",
+ "ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER",
+ "ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE",
+ "CDATA_IN_HTML_CONTENT",
+ "CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE",
+ "CONTROL_CHARACTER_IN_INPUT_STREAM",
+ "CONTROL_CHARACTER_REFERENCE",
+ "DUPLICATE_ATTRIBUTE",
+ "END_TAG_WITH_ATTRIBUTES",
+ "END_TAG_WITH_TRAILING_SOLIDUS",
+ "EOF_BEFORE_TAG_NAME",
+ "EOF_IN_CDATA",
+ "EOF_IN_COMMENT",
+ "EOF_IN_DOCTYPE",
+ "EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT",
+ "EOF_IN_TAG",
+ "INCORRECTLY_CLOSED_COMMENT",
+ "INCORRECTLY_OPENED_COMMENT",
+ "INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME",
+ "INVALID_FIRST_CHARACTER_OF_TAG_NAME",
+ "MISSING_ATTRIBUTE_VALUE",
+ "MISSING_DOCTYPE_NAME",
+ "MISSING_DOCTYPE_PUBLIC_IDENTIFIER",
+ "MISSING_DOCTYPE_SYSTEM_IDENTIFIER",
+ "MISSING_END_TAG_NAME",
+ "MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER",
+ "MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER",
+ "MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE",
+ "MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD",
+ "MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD",
+ "MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME",
+ "MISSING_WHITESPACE_BETWEEN_ATTRIBUTES",
+ "MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS",
+ "NESTED_COMMENT",
+ "NONCHARACTER_CHARACTER_REFERENCE",
+ "NONCHARACTER_IN_INPUT_STREAM",
+ "NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS",
+ "NULL_CHARACTER_REFERENCE",
+ "SURROGATE_CHARACTER_REFERENCE",
+ "SURROGATE_IN_INPUT_STREAM",
+ "UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER",
+ "UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME",
+ "UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE",
+ "UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME",
+ "UNEXPECTED_NULL_CHARACTER",
+ "UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME",
+ "UNEXPECTED_SOLIDUS_IN_TAG",
+ "UNKNOWN_NAMED_CHARACTER_REFERENCE" /* index 49; 50 entries total, no terminating NULL */
+};
+
+const char *html_token_names[] = { /* printable names for the token types the tokenizer emits; debug builds only */
+ "INVALID", /* index 0; table is positional -- order presumably mirrors the token-type enum in html.h, TODO confirm */
+
+ "DOCTYPE",
+ "START_TAG",
+ "END_TAG",
+ "COMMENT",
+ "CHARACTER",
+ "EOF" /* index 6; 7 entries total, no terminating NULL */
+};
+#endif
+
+/* Table of known tag names; it doesn't have to list every HTML tag, just the ones the spec docs reference. Unlike the debug tables, it is always compiled. */
+const char *html_tag_names[] = {
+ NULL, /* index 0 holds no tag name, so real tags start at index 1 */
+
+ "a", /* 1 */
+ "address",
+ "applet",
+ "area",
+ "article",
+ "aside",
+ "b",
+ "base",
+ "basefont",
+ "bgsound",
+ "big",
+ "blockquote",
+ "body",
+ "br",
+ "button",
+ "caption",
+ "center",
+ "cite",
+ "code",
+ "col",
+ "colgroup",
+ "dd",
+ "details",
+ "dfn",
+ "dialog",
+ "dir",
+ "div",
+ "dl",
+ "dt",
+ "em",
+ "embed",
+ "fieldset",
+ "figcaption",
+ "figure",
+ "font",
+ "footer",
+ "form",
+ "frame",
+ "frameset",
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ "head",
+ "header",
+ "hgroup",
+ "hr",
+ "html",
+ "i",
+ "iframe",
+ "image",
+ "img",
+ "input",
+ "ins",
+ "kbd",
+ "keygen",
+ "li",
+ "link",
+ "listing",
+ "main",
+ "marquee",
+ "math",
+ "menu",
+ "meta",
+ "nav",
+ "nobr",
+ "noembed",
+ "noframes",
+ "noscript",
+ "object",
+ "ol",
+ "optgroup",
+ "option",
+ "p",
+ "param",
+ "plaintext",
+ "pre",
+ "rb",
+ "rp",
+ "rt",
+ "rtc",
+ "ruby",
+ "s",
+ "samp",
+ "script",
+ "search",
+ "section",
+ "select",
+ "small",
+ "source",
+ "span",
+ "strike",
+ "strong",
+ "style",
+ "sub",
+ "sup",
+ "summary", /* NOTE(review): sorts before "sup", so the list is not strictly alphabetical; do not binary-search it, and do not reorder without checking any matching enum in html.h */
+ "svg",
+ "table",
+ "tbody",
+ "td",
+ "template",
+ "textarea",
+ "tfoot",
+ "th",
+ "thead",
+ "title",
+ "tr",
+ "track",
+ "tt",
+ "u",
+ "ul",
+ "var",
+ "wbr",
+ "xmp", /* index 117, the last real tag */
+
+ NULL /* index 118; presumably the end-of-list sentinel -- verify against the code that walks this table */
+};
--- html_entities.c Tue Dec 10 22:39:10 2024
+++ html_entities.c Tue Dec 10 22:39:10 2024
@@ -0,0 +1,2300 @@
+/*
+ * Copyright (c) 2024 joshua stein <jcs@jcs.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "html.h"
+
+/* https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references */
+const html_entity html_entities[] = {
+ /* super common ones to speed up searching */
+ { " ", 0x000000a0, },
+ { " ", 0x000000a0, },
+ { "<", 0x0000003c, },
+ { ">", 0x0000003e, },
+ { "&", 0x00000026, },
+ { "&", 0x00000026, },
+
+ /* list taken from https://www.w3.org/wiki/Common_HTML_entities_used_for_typography */
+ { "¢", 0x000000a2, },
+ { "£", 0x000000a3, },
+ { "§", 0x000000a7, },
+ { "©", 0x000000a9, },
+ { "«", 0x000000ab, },
+ { "»", 0x000000bb, },
+ { "®", 0x000000ae, },
+ { "°", 0x000000b0, },
+ { "±", 0x000000b1, },
+ { "¶", 0x000000b6, },
+ { "·", 0x000000b7, },
+ { "½", 0x000000bd, },
+ { "–", 0x00002013, },
+ { "—", 0x00002014, },
+ { "‘", 0x00002018, },
+ { "’", 0x00002019, },
+ { "‚", 0x0000201a, },
+ { "“", 0x0000201c, },
+ { "”", 0x0000201d, },
+ { "„", 0x0000201e, },
+ { "†", 0x00002020, },
+ { "‡", 0x00002021, },
+ { "•", 0x00002022, },
+ { "•", 0x00002022, },
+ { "…", 0x00002026, },
+ { "′", 0x00002032, },
+ { "″", 0x00002033, },
+ { "€", 0x000020ac, },
+ { "™", 0x00002122, },
+ { "≈", 0x00002248, },
+ { "≠", 0x00002260, },
+ { "≤", 0x00002264, },
+ { "≥", 0x00002265, },
+
+#if 0
+ /* not much point in supporting all these, we can't draw them anyway */
+ { "Æ", 0x000000c6, },
+ { "Æ", 0x000000c6, },
+ { "&", 0x00000026, },
+ { "&", 0x00000026, },
+ { "Á", 0x000000c1, },
+ { "Á", 0x000000c1, },
+ { "Ă", 0x00000102, },
+ { "Â", 0x000000c2, },
+ { "Â", 0x000000c2, },
+ { "А", 0x00000410, },
+ { "𝔄", 0x0001d504, },
+ { "À", 0x000000c0, },
+ { "À", 0x000000c0, },
+ { "Α", 0x00000391, },
+ { "Ā", 0x00000100, },
+ { "⩓", 0x00002a53, },
+ { "Ą", 0x00000104, },
+ { "𝔸", 0x0001d538, },
+ { "⁡", 0x00002061, },
+ { "Å", 0x000000c5, },
+ { "Å", 0x000000c5, },
+ { "𝒜", 0x0001d49c, },
+ { "≔", 0x00002254, },
+ { "Ã", 0x000000c3, },
+ { "Ã", 0x000000c3, },
+ { "Ä", 0x000000c4, },
+ { "Ä", 0x000000c4, },
+ { "∖", 0x00002216, },
+ { "⫧", 0x00002ae7, },
+ { "⌆", 0x00002306, },
+ { "Б", 0x00000411, },
+ { "∵", 0x00002235, },
+ { "ℬ", 0x0000212c, },
+ { "Β", 0x00000392, },
+ { "𝔅", 0x0001d505, },
+ { "𝔹", 0x0001d539, },
+ { "˘", 0x000002d8, },
+ { "ℬ", 0x0000212c, },
+ { "≎", 0x0000224e, },
+ { "Ч", 0x00000427, },
+ { "©", 0x000000a9, },
+ { "©", 0x000000a9, },
+ { "Ć", 0x00000106, },
+ { "⋒", 0x000022d2, },
+ { "ⅅ", 0x00002145, },
+ { "ℭ", 0x0000212d, },
+ { "Č", 0x0000010c, },
+ { "Ç", 0x000000c7, },
+ { "Ç", 0x000000c7, },
+ { "Ĉ", 0x00000108, },
+ { "∰", 0x00002230, },
+ { "Ċ", 0x0000010a, },
+ { "¸", 0x000000b8, },
+ { "·", 0x000000b7, },
+ { "ℭ", 0x0000212d, },
+ { "Χ", 0x000003a7, },
+ { "⊙", 0x00002299, },
+ { "⊖", 0x00002296, },
+ { "⊕", 0x00002295, },
+ { "⊗", 0x00002297, },
+ { "∲", 0x00002232, },
+ { "”", 0x0000201d, },
+ { "’", 0x00002019, },
+ { "∷", 0x00002237, },
+ { "⩴", 0x00002a74, },
+ { "≡", 0x00002261, },
+ { "∯", 0x0000222f, },
+ { "∮", 0x0000222e, },
+ { "ℂ", 0x00002102, },
+ { "∐", 0x00002210, },
+ { "∳", 0x00002233, },
+ { "⨯", 0x00002a2f, },
+ { "𝒞", 0x0001d49e, },
+ { "⋓", 0x000022d3, },
+ { "≍", 0x0000224d, },
+ { "ⅅ", 0x00002145, },
+ { "⤑", 0x00002911, },
+ { "Ђ", 0x00000402, },
+ { "Ѕ", 0x00000405, },
+ { "Џ", 0x0000040f, },
+ { "‡", 0x00002021, },
+ { "↡", 0x000021a1, },
+ { "⫤", 0x00002ae4, },
+ { "Ď", 0x0000010e, },
+ { "Д", 0x00000414, },
+ { "∇", 0x00002207, },
+ { "Δ", 0x00000394, },
+ { "𝔇", 0x0001d507, },
+ { "´", 0x000000b4, },
+ { "˙", 0x000002d9, },
+ { "˝", 0x000002dd, },
+ { "`", 0x00000060, },
+ { "˜", 0x000002dc, },
+ { "⋄", 0x000022c4, },
+ { "ⅆ", 0x00002146, },
+ { "𝔻", 0x0001d53b, },
+ { "¨", 0x000000a8, },
+ { "⃜", 0x000020dc, },
+ { "≐", 0x00002250, },
+ { "∯", 0x0000222f, },
+ { "¨", 0x000000a8, },
+ { "⇓", 0x000021d3, },
+ { "⇐", 0x000021d0, },
+ { "⇔", 0x000021d4, },
+ { "⫤", 0x00002ae4, },
+ { "⟸", 0x000027f8, },
+ { "⟺", 0x000027fa, },
+ { "⟹", 0x000027f9, },
+ { "⇒", 0x000021d2, },
+ { "⊨", 0x000022a8, },
+ { "⇑", 0x000021d1, },
+ { "⇕", 0x000021d5, },
+ { "∥", 0x00002225, },
+ { "↓", 0x00002193, },
+ { "⤓", 0x00002913, },
+ { "⇵", 0x000021f5, },
+ { "̑", 0x00000311, },
+ { "⥐", 0x00002950, },
+ { "⥞", 0x0000295e, },
+ { "↽", 0x000021bd, },
+ { "⥖", 0x00002956, },
+ { "⥟", 0x0000295f, },
+ { "⇁", 0x000021c1, },
+ { "⥗", 0x00002957, },
+ { "⊤", 0x000022a4, },
+ { "↧", 0x000021a7, },
+ { "⇓", 0x000021d3, },
+ { "𝒟", 0x0001d49f, },
+ { "Đ", 0x00000110, },
+ { "Ŋ", 0x0000014a, },
+ { "Ð", 0x000000d0, },
+ { "Ð", 0x000000d0, },
+ { "É", 0x000000c9, },
+ { "É", 0x000000c9, },
+ { "Ě", 0x0000011a, },
+ { "Ê", 0x000000ca, },
+ { "Ê", 0x000000ca, },
+ { "Э", 0x0000042d, },
+ { "Ė", 0x00000116, },
+ { "𝔈", 0x0001d508, },
+ { "È", 0x000000c8, },
+ { "È", 0x000000c8, },
+ { "∈", 0x00002208, },
+ { "Ē", 0x00000112, },
+ { "◻", 0x000025fb, },
+ { "▫", 0x000025ab, },
+ { "Ę", 0x00000118, },
+ { "𝔼", 0x0001d53c, },
+ { "Ε", 0x00000395, },
+ { "⩵", 0x00002a75, },
+ { "≂", 0x00002242, },
+ { "⇌", 0x000021cc, },
+ { "ℰ", 0x00002130, },
+ { "⩳", 0x00002a73, },
+ { "Η", 0x00000397, },
+ { "Ë", 0x000000cb, },
+ { "Ë", 0x000000cb, },
+ { "∃", 0x00002203, },
+ { "ⅇ", 0x00002147, },
+ { "Ф", 0x00000424, },
+ { "𝔉", 0x0001d509, },
+ { "◼", 0x000025fc, },
+ { "▪", 0x000025aa, },
+ { "𝔽", 0x0001d53d, },
+ { "∀", 0x00002200, },
+ { "ℱ", 0x00002131, },
+ { "ℱ", 0x00002131, },
+ { "Ѓ", 0x00000403, },
+ { ">", 0x0000003e, },
+ { ">", 0x0000003e, },
+ { "Γ", 0x00000393, },
+ { "Ϝ", 0x000003dc, },
+ { "Ğ", 0x0000011e, },
+ { "Ģ", 0x00000122, },
+ { "Ĝ", 0x0000011c, },
+ { "Г", 0x00000413, },
+ { "Ġ", 0x00000120, },
+ { "𝔊", 0x0001d50a, },
+ { "⋙", 0x000022d9, },
+ { "𝔾", 0x0001d53e, },
+ { "≥", 0x00002265, },
+ { "⋛", 0x000022db, },
+ { "≧", 0x00002267, },
+ { "⪢", 0x00002aa2, },
+ { "≷", 0x00002277, },
+ { "⩾", 0x00002a7e, },
+ { "≳", 0x00002273, },
+ { "𝒢", 0x0001d4a2, },
+ { "≫", 0x0000226b, },
+ { "Ъ", 0x0000042a, },
+ { "ˇ", 0x000002c7, },
+ { "^", 0x0000005e, },
+ { "Ĥ", 0x00000124, },
+ { "ℌ", 0x0000210c, },
+ { "ℋ", 0x0000210b, },
+ { "ℍ", 0x0000210d, },
+ { "─", 0x00002500, },
+ { "ℋ", 0x0000210b, },
+ { "Ħ", 0x00000126, },
+ { "≎", 0x0000224e, },
+ { "≏", 0x0000224f, },
+ { "Е", 0x00000415, },
+ { "IJ", 0x00000132, },
+ { "Ё", 0x00000401, },
+ { "Í", 0x000000cd, },
+ { "Í", 0x000000cd, },
+ { "Î", 0x000000ce, },
+ { "Î", 0x000000ce, },
+ { "И", 0x00000418, },
+ { "İ", 0x00000130, },
+ { "ℑ", 0x00002111, },
+ { "Ì", 0x000000cc, },
+ { "Ì", 0x000000cc, },
+ { "ℑ", 0x00002111, },
+ { "Ī", 0x0000012a, },
+ { "ⅈ", 0x00002148, },
+ { "⇒", 0x000021d2, },
+ { "∬", 0x0000222c, },
+ { "∫", 0x0000222b, },
+ { "⋂", 0x000022c2, },
+ { "⁣", 0x00002063, },
+ { "⁢", 0x00002062, },
+ { "Į", 0x0000012e, },
+ { "𝕀", 0x0001d540, },
+ { "Ι", 0x00000399, },
+ { "ℐ", 0x00002110, },
+ { "Ĩ", 0x00000128, },
+ { "І", 0x00000406, },
+ { "Ï", 0x000000cf, },
+ { "Ï", 0x000000cf, },
+ { "Ĵ", 0x00000134, },
+ { "Й", 0x00000419, },
+ { "𝔍", 0x0001d50d, },
+ { "𝕁", 0x0001d541, },
+ { "𝒥", 0x0001d4a5, },
+ { "Ј", 0x00000408, },
+ { "Є", 0x00000404, },
+ { "Х", 0x00000425, },
+ { "Ќ", 0x0000040c, },
+ { "Κ", 0x0000039a, },
+ { "Ķ", 0x00000136, },
+ { "К", 0x0000041a, },
+ { "𝔎", 0x0001d50e, },
+ { "𝕂", 0x0001d542, },
+ { "𝒦", 0x0001d4a6, },
+ { "Љ", 0x00000409, },
+ { "<", 0x0000003c, },
+ { "<", 0x0000003c, },
+ { "Ĺ", 0x00000139, },
+ { "Λ", 0x0000039b, },
+ { "⟪", 0x000027ea, },
+ { "ℒ", 0x00002112, },
+ { "↞", 0x0000219e, },
+ { "Ľ", 0x0000013d, },
+ { "Ļ", 0x0000013b, },
+ { "Л", 0x0000041b, },
+ { "⟨", 0x000027e8, },
+ { "←", 0x00002190, },
+ { "⇤", 0x000021e4, },
+ { "⇆", 0x000021c6, },
+ { "⌈", 0x00002308, },
+ { "⟦", 0x000027e6, },
+ { "⥡", 0x00002961, },
+ { "⇃", 0x000021c3, },
+ { "⥙", 0x00002959, },
+ { "⌊", 0x0000230a, },
+ { "↔", 0x00002194, },
+ { "⥎", 0x0000294e, },
+ { "⊣", 0x000022a3, },
+ { "↤", 0x000021a4, },
+ { "⥚", 0x0000295a, },
+ { "⊲", 0x000022b2, },
+ { "⧏", 0x000029cf, },
+ { "⊴", 0x000022b4, },
+ { "⥑", 0x00002951, },
+ { "⥠", 0x00002960, },
+ { "↿", 0x000021bf, },
+ { "⥘", 0x00002958, },
+ { "↼", 0x000021bc, },
+ { "⥒", 0x00002952, },
+ { "⇐", 0x000021d0, },
+ { "⇔", 0x000021d4, },
+ { "⋚", 0x000022da, },
+ { "≦", 0x00002266, },
+ { "≶", 0x00002276, },
+ { "⪡", 0x00002aa1, },
+ { "⩽", 0x00002a7d, },
+ { "≲", 0x00002272, },
+ { "𝔏", 0x0001d50f, },
+ { "⋘", 0x000022d8, },
+ { "⇚", 0x000021da, },
+ { "Ŀ", 0x0000013f, },
+ { "⟵", 0x000027f5, },
+ { "⟷", 0x000027f7, },
+ { "⟶", 0x000027f6, },
+ { "⟸", 0x000027f8, },
+ { "⟺", 0x000027fa, },
+ { "⟹", 0x000027f9, },
+ { "𝕃", 0x0001d543, },
+ { "↙", 0x00002199, },
+ { "↘", 0x00002198, },
+ { "ℒ", 0x00002112, },
+ { "↰", 0x000021b0, },
+ { "Ł", 0x00000141, },
+ { "≪", 0x0000226a, },
+ { "⤅", 0x00002905, },
+ { "М", 0x0000041c, },
+ { " ", 0x0000205f, },
+ { "ℳ", 0x00002133, },
+ { "𝔐", 0x0001d510, },
+ { "∓", 0x00002213, },
+ { "𝕄", 0x0001d544, },
+ { "ℳ", 0x00002133, },
+ { "Μ", 0x0000039c, },
+ { "Њ", 0x0000040a, },
+ { "Ń", 0x00000143, },
+ { "Ň", 0x00000147, },
+ { "Ņ", 0x00000145, },
+ { "Н", 0x0000041d, },
+ { "​", 0x0000200b, },
+ { "​", 0x0000200b, },
+ { "​", 0x0000200b, },
+ { "​", 0x0000200b, },
+ { "≫", 0x0000226b, },
+ { "≪", 0x0000226a, },
+ { "
", 0x0000000a, },
+ { "𝔑", 0x0001d511, },
+ { "⁠", 0x00002060, },
+ { " ", 0x000000a0, },
+ { "ℕ", 0x00002115, },
+ { "⫬", 0x00002aec, },
+ { "≢", 0x00002262, },
+ { "≭", 0x0000226d, },
+ { "∦", 0x00002226, },
+ { "∉", 0x00002209, },
+ { "≠", 0x00002260, },
+// { "≂̸", 0x00002242, 0x00000338, },
+ { "∄", 0x00002204, },
+ { "≯", 0x0000226f, },
+ { "≱", 0x00002271, },
+// { "≧̸", 8807, 0x00000338, },
+// { "≫̸", 8811, 0x00000338, },
+ { "≹", 0x00002279, },
+// { "⩾̸", 10878, 0x00000338, },
+ { "≵", 0x00002275, },
+// { "≎̸", 8782, 0x00000338, },
+// { "≏̸", 8783, 0x00000338, },
+ { "⋪", 0x000022ea, },
+// { "⧏̸", 10703, 0x00000338, },
+ { "⋬", 0x000022ec, },
+ { "≮", 0x0000226e, },
+ { "≰", 0x00002270, },
+ { "≸", 0x00002278, },
+// { "≪̸", 8810, 0x00000338, },
+// { "⩽̸", 10877, 0x00000338, },
+ { "≴", 0x00002274, },
+// { "⪢̸",10914, 0x00000338, },
+// { "⪡̸", 10913, 0x00000338, },
+ { "⊀", 0x00002280, },
+// { "⪯̸", 10927, 0x00000338, },
+ { "⋠", 0x000022e0, },
+ { "∌", 0x0000220c, },
+ { "⋫", 0x000022eb, },
+// { "⧐̸", 10704, 0x00000338, },
+ { "⋭", 0x000022ed, },
+// { "⊏̸", 8847, 0x00000338, },
+ { "⋢", 0x000022e2, },
+// { "⊐̸", 8848, 0x00000338, },
+ { "⋣", 0x000022e3, },
+// { "⊂⃒", 8834, 0x000020d2, },
+ { "⊈", 0x00002288, },
+ { "⊁", 0x00002281, },
+// { "⪰̸", 10928, 0x00000338, },
+ { "⋡", 0x000022e1, },
+// { "≿̸", 8831, 0x00000338, },
+// { "⊃⃒", 8835, 0x000020d2, },
+ { "⊉", 0x00002289, },
+ { "≁", 0x00002241, },
+ { "≄", 0x00002244, },
+ { "≇", 0x00002247, },
+ { "≉", 0x00002249, },
+ { "∤", 0x00002224, },
+ { "𝒩", 0x0001d4a9, },
+ { "Ñ", 0x000000d1, },
+ { "Ñ", 0x000000d1, },
+ { "Ν", 0x0000039d, },
+ { "Œ", 0x00000152, },
+ { "Ó", 0x000000d3, },
+ { "Ó", 0x000000d3, },
+ { "Ô", 0x000000d4, },
+ { "Ô", 0x000000d4, },
+ { "О", 0x0000041e, },
+ { "Ő", 0x00000150, },
+ { "𝔒", 0x0001d512, },
+ { "Ò", 0x000000d2, },
+ { "Ò", 0x000000d2, },
+ { "Ō", 0x0000014c, },
+ { "Ω", 0x000003a9, },
+ { "Ο", 0x0000039f, },
+ { "𝕆", 0x0001d546, },
+ { "“", 0x0000201c, },
+ { "‘", 0x00002018, },
+ { "⩔", 0x00002a54, },
+ { "𝒪", 0x0001d4aa, },
+ { "Ø", 0x000000d8, },
+ { "Ø", 0x000000d8, },
+ { "Õ", 0x000000d5, },
+ { "Õ", 0x000000d5, },
+ { "⨷", 0x00002a37, },
+ { "Ö", 0x000000d6, },
+ { "Ö", 0x000000d6, },
+ { "‾", 0x0000203e, },
+ { "⏞", 0x000023de, },
+ { "⎴", 0x000023b4, },
+ { "⏜", 0x000023dc, },
+ { "∂", 0x00002202, },
+ { "П", 0x0000041f, },
+ { "𝔓", 0x0001d513, },
+ { "Φ", 0x000003a6, },
+ { "Π", 0x000003a0, },
+ { "±", 0x000000b1, },
+ { "ℌ", 0x0000210c, },
+ { "ℙ", 0x00002119, },
+ { "⪻", 0x00002abb, },
+ { "≺", 0x0000227a, },
+ { "⪯", 0x00002aaf, },
+ { "≼", 0x0000227c, },
+ { "≾", 0x0000227e, },
+ { "″", 0x00002033, },
+ { "∏", 0x0000220f, },
+ { "∷", 0x00002237, },
+ { "∝", 0x0000221d, },
+ { "𝒫", 0x0001d4ab, },
+ { "Ψ", 0x000003a8, },
+ { """, 0x00000022, },
+ { """, 0x00000022, },
+ { "𝔔", 0x0001d514, },
+ { "ℚ", 0x0000211a, },
+ { "𝒬", 0x0001d4ac, },
+ { "⤐", 0x00002910, },
+ { "®", 0x000000ae, },
+ { "®", 0x000000ae, },
+ { "Ŕ", 0x00000154, },
+ { "⟫", 0x000027eb, },
+ { "↠", 0x000021a0, },
+ { "⤖", 0x00002916, },
+ { "Ř", 0x00000158, },
+ { "Ŗ", 0x00000156, },
+ { "Р", 0x00000420, },
+ { "ℜ", 0x0000211c, },
+ { "∋", 0x0000220b, },
+ { "⇋", 0x000021cb, },
+ { "⥯", 0x0000296f, },
+ { "ℜ", 0x0000211c, },
+ { "Ρ", 0x000003a1, },
+ { "⟩", 0x000027e9, },
+ { "→", 0x00002192, },
+ { "⇥", 0x000021e5, },
+ { "⇄", 0x000021c4, },
+ { "⌉", 0x00002309, },
+ { "⟧", 0x000027e7, },
+ { "⥝", 0x0000295d, },
+ { "⇂", 0x000021c2, },
+ { "⥕", 0x00002955, },
+ { "⌋", 0x0000230b, },
+ { "⊢", 0x000022a2, },
+ { "↦", 0x000021a6, },
+ { "⥛", 0x0000295b, },
+ { "⊳", 0x000022b3, },
+ { "⧐", 0x000029d0, },
+ { "⊵", 0x000022b5, },
+ { "⥏", 0x0000294f, },
+ { "⥜", 0x0000295c, },
+ { "↾", 0x000021be, },
+ { "⥔", 0x00002954, },
+ { "⇀", 0x000021c0, },
+ { "⥓", 0x00002953, },
+ { "⇒", 0x000021d2, },
+ { "ℝ", 0x0000211d, },
+ { "⥰", 0x00002970, },
+ { "⇛", 0x000021db, },
+ { "ℛ", 0x0000211b, },
+ { "↱", 0x000021b1, },
+ { "⧴", 0x000029f4, },
+ { "Щ", 0x00000429, },
+ { "Ш", 0x00000428, },
+ { "Ь", 0x0000042c, },
+ { "Ś", 0x0000015a, },
+ { "⪼", 0x00002abc, },
+ { "Š", 0x00000160, },
+ { "Ş", 0x0000015e, },
+ { "Ŝ", 0x0000015c, },
+ { "С", 0x00000421, },
+ { "𝔖", 0x0001d516, },
+ { "↓", 0x00002193, },
+ { "←", 0x00002190, },
+ { "→", 0x00002192, },
+ { "↑", 0x00002191, },
+ { "Σ", 0x000003a3, },
+ { "∘", 0x00002218, },
+ { "𝕊", 0x0001d54a, },
+ { "√", 0x0000221a, },
+ { "□", 0x000025a1, },
+ { "⊓", 0x00002293, },
+ { "⊏", 0x0000228f, },
+ { "⊑", 0x00002291, },
+ { "⊐", 0x00002290, },
+ { "⊒", 0x00002292, },
+ { "⊔", 0x00002294, },
+ { "𝒮", 0x0001d4ae, },
+ { "⋆", 0x000022c6, },
+ { "⋐", 0x000022d0, },
+ { "⋐", 0x000022d0, },
+ { "⊆", 0x00002286, },
+ { "≻", 0x0000227b, },
+ { "⪰", 0x00002ab0, },
+ { "≽", 0x0000227d, },
+ { "≿", 0x0000227f, },
+ { "∋", 0x0000220b, },
+ { "∑", 0x00002211, },
+ { "⋑", 0x000022d1, },
+ { "⊃", 0x00002283, },
+ { "⊇", 0x00002287, },
+ { "⋑", 0x000022d1, },
+ { "Þ", 0x000000de, },
+ { "Þ", 0x000000de, },
+ { "™", 0x00002122, },
+ { "Ћ", 0x0000040b, },
+ { "Ц", 0x00000426, },
+ { "	", 0x00000009, },
+ { "Τ", 0x000003a4, },
+ { "Ť", 0x00000164, },
+ { "Ţ", 0x00000162, },
+ { "Т", 0x00000422, },
+ { "𝔗", 0x0001d517, },
+ { "∴", 0x00002234, },
+ { "Θ", 0x00000398, },
+// { "  ", 8287, 0x0000200a, },
+ { " ", 0x00002009, },
+ { "∼", 0x0000223c, },
+ { "≃", 0x00002243, },
+ { "≅", 0x00002245, },
+ { "≈", 0x00002248, },
+ { "𝕋", 0x0001d54b, },
+ { "⃛", 0x000020db, },
+ { "𝒯", 0x0001d4af, },
+ { "Ŧ", 0x00000166, },
+ { "Ú", 0x000000da, },
+ { "Ú", 0x000000da, },
+ { "↟", 0x0000219f, },
+ { "⥉", 0x00002949, },
+ { "Ў", 0x0000040e, },
+ { "Ŭ", 0x0000016c, },
+ { "Û", 0x000000db, },
+ { "Û", 0x000000db, },
+ { "У", 0x00000423, },
+ { "Ű", 0x00000170, },
+ { "𝔘", 0x0001d518, },
+ { "Ù", 0x000000d9, },
+ { "Ù", 0x000000d9, },
+ { "Ū", 0x0000016a, },
+ { "_", 0x0000005f, },
+ { "⏟", 0x000023df, },
+ { "⎵", 0x000023b5, },
+ { "⏝", 0x000023dd, },
+ { "⋃", 0x000022c3, },
+ { "⊎", 0x0000228e, },
+ { "Ų", 0x00000172, },
+ { "𝕌", 0x0001d54c, },
+ { "↑", 0x00002191, },
+ { "⤒", 0x00002912, },
+ { "⇅", 0x000021c5, },
+ { "↕", 0x00002195, },
+ { "⥮", 0x0000296e, },
+ { "⊥", 0x000022a5, },
+ { "↥", 0x000021a5, },
+ { "⇑", 0x000021d1, },
+ { "⇕", 0x000021d5, },
+ { "↖", 0x00002196, },
+ { "↗", 0x00002197, },
+ { "ϒ", 0x000003d2, },
+ { "Υ", 0x000003a5, },
+ { "Ů", 0x0000016e, },
+ { "𝒰", 0x0001d4b0, },
+ { "Ũ", 0x00000168, },
+ { "Ü", 0x000000dc, },
+ { "Ü", 0x000000dc, },
+ { "⊫", 0x000022ab, },
+ { "⫫", 0x00002aeb, },
+ { "В", 0x00000412, },
+ { "⊩", 0x000022a9, },
+ { "⫦", 0x00002ae6, },
+ { "⋁", 0x000022c1, },
+ { "‖", 0x00002016, },
+ { "‖", 0x00002016, },
+ { "∣", 0x00002223, },
+ { "|", 0x0000007c, },
+ { "❘", 0x00002758, },
+ { "≀", 0x00002240, },
+ { " ", 0x0000200a, },
+ { "𝔙", 0x0001d519, },
+ { "𝕍", 0x0001d54d, },
+ { "𝒱", 0x0001d4b1, },
+ { "⊪", 0x000022aa, },
+ { "Ŵ", 0x00000174, },
+ { "⋀", 0x000022c0, },
+ { "𝔚", 0x0001d51a, },
+ { "𝕎", 0x0001d54e, },
+ { "𝒲", 0x0001d4b2, },
+ { "𝔛", 0x0001d51b, },
+ { "Ξ", 0x0000039e, },
+ { "𝕏", 0x0001d54f, },
+ { "𝒳", 0x0001d4b3, },
+ { "Я", 0x0000042f, },
+ { "Ї", 0x00000407, },
+ { "Ю", 0x0000042e, },
+ { "Ý", 0x000000dd, },
+ { "Ý", 0x000000dd, },
+ { "Ŷ", 0x00000176, },
+ { "Ы", 0x0000042b, },
+ { "𝔜", 0x0001d51c, },
+ { "𝕐", 0x0001d550, },
+ { "𝒴", 0x0001d4b4, },
+ { "Ÿ", 0x00000178, },
+ { "Ж", 0x00000416, },
+ { "Ź", 0x00000179, },
+ { "Ž", 0x0000017d, },
+ { "З", 0x00000417, },
+ { "Ż", 0x0000017b, },
+ { "​", 0x0000200b, },
+ { "Ζ", 0x00000396, },
+ { "ℨ", 0x00002128, },
+ { "ℤ", 0x00002124, },
+ { "𝒵", 0x0001d4b5, },
+ { "á", 0x000000e1, },
+ { "á", 0x000000e1, },
+ { "ă", 0x00000103, },
+ { "∾", 0x0000223e, },
+// { "∾̳", 8766, 0x00000333, },
+ { "∿", 0x0000223f, },
+ { "â", 0x000000e2, },
+ { "â", 0x000000e2, },
+ { "´", 0x000000b4, },
+ { "´", 0x000000b4, },
+ { "а", 0x00000430, },
+ { "æ", 0x000000e6, },
+ { "æ", 0x000000e6, },
+ { "⁡", 0x00002061, },
+ { "𝔞", 0x0001d51e, },
+ { "à", 0x000000e0, },
+ { "à", 0x000000e0, },
+ { "ℵ", 0x00002135, },
+ { "ℵ", 0x00002135, },
+ { "α", 0x000003b1, },
+ { "ā", 0x00000101, },
+ { "⨿", 0x00002a3f, },
+ { "&", 0x00000026, },
+ { "&", 0x00000026, },
+ { "∧", 0x00002227, },
+ { "⩕", 0x00002a55, },
+ { "⩜", 0x00002a5c, },
+ { "⩘", 0x00002a58, },
+ { "⩚", 0x00002a5a, },
+ { "∠", 0x00002220, },
+ { "⦤", 0x000029a4, },
+ { "∠", 0x00002220, },
+ { "∡", 0x00002221, },
+ { "⦨", 0x000029a8, },
+ { "⦩", 0x000029a9, },
+ { "⦪", 0x000029aa, },
+ { "⦫", 0x000029ab, },
+ { "⦬", 0x000029ac, },
+ { "⦭", 0x000029ad, },
+ { "⦮", 0x000029ae, },
+ { "⦯", 0x000029af, },
+ { "∟", 0x0000221f, },
+ { "⊾", 0x000022be, },
+ { "⦝", 0x0000299d, },
+ { "∢", 0x00002222, },
+ { "Å", 0x000000c5, },
+ { "⍼", 0x0000237c, },
+ { "ą", 0x00000105, },
+ { "𝕒", 0x0001d552, },
+ { "≈", 0x00002248, },
+ { "⩰", 0x00002a70, },
+ { "⩯", 0x00002a6f, },
+ { "≊", 0x0000224a, },
+ { "≋", 0x0000224b, },
+ { "'", 0x00000027, },
+ { "≈", 0x00002248, },
+ { "≊", 0x0000224a, },
+ { "å", 0x000000e5, },
+ { "å", 0x000000e5, },
+ { "𝒶", 0x0001d4b6, },
+ { "*", 0x0000002a, },
+ { "≈", 0x00002248, },
+ { "≍", 0x0000224d, },
+ { "ã", 0x000000e3, },
+ { "ã", 0x000000e3, },
+ { "ä", 0x000000e4, },
+ { "ä", 0x000000e4, },
+ { "∳", 0x00002233, },
+ { "⨑", 0x00002a11, },
+ { "⫭", 0x00002aed, },
+ { "≌", 0x0000224c, },
+ { "϶", 0x000003f6, },
+ { "‵", 0x00002035, },
+ { "∽", 0x0000223d, },
+ { "⋍", 0x000022cd, },
+ { "⊽", 0x000022bd, },
+ { "⌅", 0x00002305, },
+ { "⌅", 0x00002305, },
+ { "⎵", 0x000023b5, },
+ { "⎶", 0x000023b6, },
+ { "≌", 0x0000224c, },
+ { "б", 0x00000431, },
+ { "„", 0x0000201e, },
+ { "∵", 0x00002235, },
+ { "∵", 0x00002235, },
+ { "⦰", 0x000029b0, },
+ { "϶", 0x000003f6, },
+ { "ℬ", 0x0000212c, },
+ { "β", 0x000003b2, },
+ { "ℶ", 0x00002136, },
+ { "≬", 0x0000226c, },
+ { "𝔟", 0x0001d51f, },
+ { "⋂", 0x000022c2, },
+ { "◯", 0x000025ef, },
+ { "⋃", 0x000022c3, },
+ { "⨀", 0x00002a00, },
+ { "⨁", 0x00002a01, },
+ { "⨂", 0x00002a02, },
+ { "⨆", 0x00002a06, },
+ { "★", 0x00002605, },
+ { "▽", 0x000025bd, },
+ { "△", 0x000025b3, },
+ { "⨄", 0x00002a04, },
+ { "⋁", 0x000022c1, },
+ { "⋀", 0x000022c0, },
+ { "⤍", 0x0000290d, },
+ { "⧫", 0x000029eb, },
+ { "▪", 0x000025aa, },
+ { "▴", 0x000025b4, },
+ { "▾", 0x000025be, },
+ { "◂", 0x000025c2, },
+ { "▸", 0x000025b8, },
+ { "␣", 0x00002423, },
+ { "▒", 0x00002592, },
+ { "░", 0x00002591, },
+ { "▓", 0x00002593, },
+ { "█", 0x00002588, },
+// { "=⃥", 61, 0x000020e5, },
+// { "≡⃥", 8801, 0x000020e5, },
+ { "⌐", 0x00002310, },
+ { "𝕓", 0x0001d553, },
+ { "⊥", 0x000022a5, },
+ { "⊥", 0x000022a5, },
+ { "⋈", 0x000022c8, },
+ { "╗", 0x00002557, },
+ { "╔", 0x00002554, },
+ { "╖", 0x00002556, },
+ { "╓", 0x00002553, },
+ { "═", 0x00002550, },
+ { "╦", 0x00002566, },
+ { "╩", 0x00002569, },
+ { "╤", 0x00002564, },
+ { "╧", 0x00002567, },
+ { "╝", 0x0000255d, },
+ { "╚", 0x0000255a, },
+ { "╜", 0x0000255c, },
+ { "╙", 0x00002559, },
+ { "║", 0x00002551, },
+ { "╬", 0x0000256c, },
+ { "╣", 0x00002563, },
+ { "╠", 0x00002560, },
+ { "╫", 0x0000256b, },
+ { "╢", 0x00002562, },
+ { "╟", 0x0000255f, },
+ { "⧉", 0x000029c9, },
+ { "╕", 0x00002555, },
+ { "╒", 0x00002552, },
+ { "┐", 0x00002510, },
+ { "┌", 0x0000250c, },
+ { "─", 0x00002500, },
+ { "╥", 0x00002565, },
+ { "╨", 0x00002568, },
+ { "┬", 0x0000252c, },
+ { "┴", 0x00002534, },
+ { "⊟", 0x0000229f, },
+ { "⊞", 0x0000229e, },
+ { "⊠", 0x000022a0, },
+ { "╛", 0x0000255b, },
+ { "╘", 0x00002558, },
+ { "┘", 0x00002518, },
+ { "└", 0x00002514, },
+ { "│", 0x00002502, },
+ { "╪", 0x0000256a, },
+ { "╡", 0x00002561, },
+ { "╞", 0x0000255e, },
+ { "┼", 0x0000253c, },
+ { "┤", 0x00002524, },
+ { "├", 0x0000251c, },
+ { "‵", 0x00002035, },
+ { "˘", 0x000002d8, },
+ { "¦", 0x000000a6, },
+ { "¦", 0x000000a6, },
+ { "𝒷", 0x0001d4b7, },
+ { "⁏", 0x0000204f, },
+ { "∽", 0x0000223d, },
+ { "⋍", 0x000022cd, },
+ { "\", 0x0000005c, },
+ { "⧅", 0x000029c5, },
+ { "⟈", 0x000027c8, },
+ { "•", 0x00002022, },
+ { "•", 0x00002022, },
+ { "≎", 0x0000224e, },
+ { "⪮", 0x00002aae, },
+ { "≏", 0x0000224f, },
+ { "≏", 0x0000224f, },
+ { "ć", 0x00000107, },
+ { "∩", 0x00002229, },
+ { "⩄", 0x00002a44, },
+ { "⩉", 0x00002a49, },
+ { "⩋", 0x00002a4b, },
+ { "⩇", 0x00002a47, },
+ { "⩀", 0x00002a40, },
+// { "∩︀", 8745, 0x0000fe00, },
+ { "⁁", 0x00002041, },
+ { "ˇ", 0x000002c7, },
+ { "⩍", 0x00002a4d, },
+ { "č", 0x0000010d, },
+ { "ç", 0x000000e7, },
+ { "ç", 0x000000e7, },
+ { "ĉ", 0x00000109, },
+ { "⩌", 0x00002a4c, },
+ { "⩐", 0x00002a50, },
+ { "ċ", 0x0000010b, },
+ { "¸", 0x000000b8, },
+ { "¸", 0x000000b8, },
+ { "⦲", 0x000029b2, },
+ { "¢", 0x000000a2, },
+ { "¢", 0x000000a2, },
+ { "·", 0x000000b7, },
+ { "𝔠", 0x0001d520, },
+ { "ч", 0x00000447, },
+ { "✓", 0x00002713, },
+ { "✓", 0x00002713, },
+ { "χ", 0x000003c7, },
+ { "○", 0x000025cb, },
+ { "⧃", 0x000029c3, },
+ { "ˆ", 0x000002c6, },
+ { "≗", 0x00002257, },
+ { "↺", 0x000021ba, },
+ { "↻", 0x000021bb, },
+ { "®", 0x000000ae, },
+ { "Ⓢ", 0x000024c8, },
+ { "⊛", 0x0000229b, },
+ { "⊚", 0x0000229a, },
+ { "⊝", 0x0000229d, },
+ { "≗", 0x00002257, },
+ { "⨐", 0x00002a10, },
+ { "⫯", 0x00002aef, },
+ { "⧂", 0x000029c2, },
+ { "♣", 0x00002663, },
+ { "♣", 0x00002663, },
+ { ":", 0x0000003a, },
+ { "≔", 0x00002254, },
+ { "≔", 0x00002254, },
+ { ",", 0x0000002c, },
+ { "@", 0x00000040, },
+ { "∁", 0x00002201, },
+ { "∘", 0x00002218, },
+ { "∁", 0x00002201, },
+ { "ℂ", 0x00002102, },
+ { "≅", 0x00002245, },
+ { "⩭", 0x00002a6d, },
+ { "∮", 0x0000222e, },
+ { "𝕔", 0x0001d554, },
+ { "∐", 0x00002210, },
+ { "©", 0x000000a9, },
+ { "©", 0x000000a9, },
+ { "℗", 0x00002117, },
+ { "↵", 0x000021b5, },
+ { "✗", 0x00002717, },
+ { "𝒸", 0x0001d4b8, },
+ { "⫏", 0x00002acf, },
+ { "⫑", 0x00002ad1, },
+ { "⫐", 0x00002ad0, },
+ { "⫒", 0x00002ad2, },
+ { "⋯", 0x000022ef, },
+ { "⤸", 0x00002938, },
+ { "⤵", 0x00002935, },
+ { "⋞", 0x000022de, },
+ { "⋟", 0x000022df, },
+ { "↶", 0x000021b6, },
+ { "⤽", 0x0000293d, },
+ { "∪", 0x0000222a, },
+ { "⩈", 0x00002a48, },
+ { "⩆", 0x00002a46, },
+ { "⩊", 0x00002a4a, },
+ { "⊍", 0x0000228d, },
+ { "⩅", 0x00002a45, },
+// { "∪︀", 8746, 0x0000fe00, },
+ { "↷", 0x000021b7, },
+ { "⤼", 0x0000293c, },
+ { "⋞", 0x000022de, },
+ { "⋟", 0x000022df, },
+ { "⋎", 0x000022ce, },
+ { "⋏", 0x000022cf, },
+ { "¤", 0x000000a4, },
+ { "¤", 0x000000a4, },
+ { "↶", 0x000021b6, },
+ { "↷", 0x000021b7, },
+ { "⋎", 0x000022ce, },
+ { "⋏", 0x000022cf, },
+ { "∲", 0x00002232, },
+ { "∱", 0x00002231, },
+ { "⌭", 0x0000232d, },
+ { "⇓", 0x000021d3, },
+ { "⥥", 0x00002965, },
+ { "†", 0x00002020, },
+ { "ℸ", 0x00002138, },
+ { "↓", 0x00002193, },
+ { "‐", 0x00002010, },
+ { "⊣", 0x000022a3, },
+ { "⤏", 0x0000290f, },
+ { "˝", 0x000002dd, },
+ { "ď", 0x0000010f, },
+ { "д", 0x00000434, },
+ { "ⅆ", 0x00002146, },
+ { "‡", 0x00002021, },
+ { "⇊", 0x000021ca, },
+ { "⩷", 0x00002a77, },
+ { "°", 0x000000b0, },
+ { "°", 0x000000b0, },
+ { "δ", 0x000003b4, },
+ { "⦱", 0x000029b1, },
+ { "⥿", 0x0000297f, },
+ { "𝔡", 0x0001d521, },
+ { "⇃", 0x000021c3, },
+ { "⇂", 0x000021c2, },
+ { "⋄", 0x000022c4, },
+ { "⋄", 0x000022c4, },
+ { "♦", 0x00002666, },
+ { "♦", 0x00002666, },
+ { "¨", 0x000000a8, },
+ { "ϝ", 0x000003dd, },
+ { "⋲", 0x000022f2, },
+ { "÷", 0x000000f7, },
+ { "÷", 0x000000f7, },
+ { "÷", 0x000000f7, },
+ { "⋇", 0x000022c7, },
+ { "⋇", 0x000022c7, },
+ { "ђ", 0x00000452, },
+ { "⌞", 0x0000231e, },
+ { "⌍", 0x0000230d, },
+ { "$", 0x00000024, },
+ { "𝕕", 0x0001d555, },
+ { "˙", 0x000002d9, },
+ { "≐", 0x00002250, },
+ { "≑", 0x00002251, },
+ { "∸", 0x00002238, },
+ { "∔", 0x00002214, },
+ { "⊡", 0x000022a1, },
+ { "⌆", 0x00002306, },
+ { "↓", 0x00002193, },
+ { "⇊", 0x000021ca, },
+ { "⇃", 0x000021c3, },
+ { "⇂", 0x000021c2, },
+ { "⤐", 0x00002910, },
+ { "⌟", 0x0000231f, },
+ { "⌌", 0x0000230c, },
+ { "𝒹", 0x0001d4b9, },
+ { "ѕ", 0x00000455, },
+ { "⧶", 0x000029f6, },
+ { "đ", 0x00000111, },
+ { "⋱", 0x000022f1, },
+ { "▿", 0x000025bf, },
+ { "▾", 0x000025be, },
+ { "⇵", 0x000021f5, },
+ { "⥯", 0x0000296f, },
+ { "⦦", 0x000029a6, },
+ { "џ", 0x0000045f, },
+ { "⟿", 0x000027ff, },
+ { "⩷", 0x00002a77, },
+ { "≑", 0x00002251, },
+ { "é", 0x000000e9, },
+ { "é", 0x000000e9, },
+ { "⩮", 0x00002a6e, },
+ { "ě", 0x0000011b, },
+ { "≖", 0x00002256, },
+ { "ê", 0x000000ea, },
+ { "ê", 0x000000ea, },
+ { "≕", 0x00002255, },
+ { "э", 0x0000044d, },
+ { "ė", 0x00000117, },
+ { "ⅇ", 0x00002147, },
+ { "≒", 0x00002252, },
+ { "𝔢", 0x0001d522, },
+ { "⪚", 0x00002a9a, },
+ { "è", 0x000000e8, },
+ { "è", 0x000000e8, },
+ { "⪖", 0x00002a96, },
+ { "⪘", 0x00002a98, },
+ { "⪙", 0x00002a99, },
+ { "⏧", 0x000023e7, },
+ { "ℓ", 0x00002113, },
+ { "⪕", 0x00002a95, },
+ { "⪗", 0x00002a97, },
+ { "ē", 0x00000113, },
+ { "∅", 0x00002205, },
+ { "∅", 0x00002205, },
+ { "∅", 0x00002205, },
+ { " ", 0x00002004, },
+ { " ", 0x00002005, },
+ { " ", 0x00002003, },
+ { "ŋ", 0x0000014b, },
+ { " ", 0x00002002, },
+ { "ę", 0x00000119, },
+ { "𝕖", 0x0001d556, },
+ { "⋕", 0x000022d5, },
+ { "⧣", 0x000029e3, },
+ { "⩱", 0x00002a71, },
+ { "ε", 0x000003b5, },
+ { "ε", 0x000003b5, },
+ { "ϵ", 0x000003f5, },
+ { "≖", 0x00002256, },
+ { "≕", 0x00002255, },
+ { "≂", 0x00002242, },
+ { "⪖", 0x00002a96, },
+ { "⪕", 0x00002a95, },
+ { "=", 0x0000003d, },
+ { "≟", 0x0000225f, },
+ { "≡", 0x00002261, },
+ { "⩸", 0x00002a78, },
+ { "⧥", 0x000029e5, },
+ { "≓", 0x00002253, },
+ { "⥱", 0x00002971, },
+ { "ℯ", 0x0000212f, },
+ { "≐", 0x00002250, },
+ { "≂", 0x00002242, },
+ { "η", 0x000003b7, },
+ { "ð", 0x000000f0, },
+ { "ð", 0x000000f0, },
+ { "ë", 0x000000eb, },
+ { "ë", 0x000000eb, },
+ { "€", 0x000020ac, },
+ { "!", 0x00000021, },
+ { "∃", 0x00002203, },
+ { "ℰ", 0x00002130, },
+ { "ⅇ", 0x00002147, },
+ { "≒", 0x00002252, },
+ { "ф", 0x00000444, },
+ { "♀", 0x00002640, },
+ { "ffi", 0x0000fb03, },
+ { "ff", 0x0000fb00, },
+ { "ffl", 0x0000fb04, },
+ { "𝔣", 0x0001d523, },
+ { "fi", 0x0000fb01, },
+// { "fj", 102, 0x0000006a, },
+ { "♭", 0x0000266d, },
+ { "fl", 0x0000fb02, },
+ { "▱", 0x000025b1, },
+ { "ƒ", 0x00000192, },
+ { "𝕗", 0x0001d557, },
+ { "∀", 0x00002200, },
+ { "⋔", 0x000022d4, },
+ { "⫙", 0x00002ad9, },
+ { "⨍", 0x00002a0d, },
+ { "½", 0x000000bd, },
+ { "½", 0x000000bd, },
+ { "⅓", 0x00002153, },
+ { "¼", 0x000000bc, },
+ { "¼", 0x000000bc, },
+ { "⅕", 0x00002155, },
+ { "⅙", 0x00002159, },
+ { "⅛", 0x0000215b, },
+ { "⅔", 0x00002154, },
+ { "⅖", 0x00002156, },
+ { "¾", 0x000000be, },
+ { "¾", 0x000000be, },
+ { "⅗", 0x00002157, },
+ { "⅜", 0x0000215c, },
+ { "⅘", 0x00002158, },
+ { "⅚", 0x0000215a, },
+ { "⅝", 0x0000215d, },
+ { "⅞", 0x0000215e, },
+ { "⁄", 0x00002044, },
+ { "⌢", 0x00002322, },
+ { "𝒻", 0x0001d4bb, },
+ { "≧", 0x00002267, },
+ { "⪌", 0x00002a8c, },
+ { "ǵ", 0x000001f5, },
+ { "γ", 0x000003b3, },
+ { "ϝ", 0x000003dd, },
+ { "⪆", 0x00002a86, },
+ { "ğ", 0x0000011f, },
+ { "ĝ", 0x0000011d, },
+ { "г", 0x00000433, },
+ { "ġ", 0x00000121, },
+ { "≥", 0x00002265, },
+ { "⋛", 0x000022db, },
+ { "≥", 0x00002265, },
+ { "≧", 0x00002267, },
+ { "⩾", 0x00002a7e, },
+ { "⩾", 0x00002a7e, },
+ { "⪩", 0x00002aa9, },
+ { "⪀", 0x00002a80, },
+ { "⪂", 0x00002a82, },
+ { "⪄", 0x00002a84, },
+// { "⋛︀", 8923, 0x0000fe00, },
+ { "⪔", 0x00002a94, },
+ { "𝔤", 0x0001d524, },
+ { "≫", 0x0000226b, },
+ { "⋙", 0x000022d9, },
+ { "ℷ", 0x00002137, },
+ { "ѓ", 0x00000453, },
+ { "≷", 0x00002277, },
+ { "⪒", 0x00002a92, },
+ { "⪥", 0x00002aa5, },
+ { "⪤", 0x00002aa4, },
+ { "≩", 0x00002269, },
+ { "⪊", 0x00002a8a, },
+ { "⪊", 0x00002a8a, },
+ { "⪈", 0x00002a88, },
+ { "⪈", 0x00002a88, },
+ { "≩", 0x00002269, },
+ { "⋧", 0x000022e7, },
+ { "𝕘", 0x0001d558, },
+ { "`", 0x00000060, },
+ { "ℊ", 0x0000210a, },
+ { "≳", 0x00002273, },
+ { "⪎", 0x00002a8e, },
+ { "⪐", 0x00002a90, },
+ { ">", 0x0000003e, },
+ { ">", 0x0000003e, },
+ { "⪧", 0x00002aa7, },
+ { "⩺", 0x00002a7a, },
+ { "⋗", 0x000022d7, },
+ { "⦕", 0x00002995, },
+ { "⩼", 0x00002a7c, },
+ { "⪆", 0x00002a86, },
+ { "⥸", 0x00002978, },
+ { "⋗", 0x000022d7, },
+ { "⋛", 0x000022db, },
+ { "⪌", 0x00002a8c, },
+ { "≷", 0x00002277, },
+ { "≳", 0x00002273, },
+// { "≩︀", 8809, 0x0000fe00, },
+// { "≩︀", 8809, 0x0000fe00, },
+ { "⇔", 0x000021d4, },
+ { " ", 0x0000200a, },
+ { "½", 0x000000bd, },
+ { "ℋ", 0x0000210b, },
+ { "ъ", 0x0000044a, },
+ { "↔", 0x00002194, },
+ { "⥈", 0x00002948, },
+ { "↭", 0x000021ad, },
+ { "ℏ", 0x0000210f, },
+ { "ĥ", 0x00000125, },
+ { "♥", 0x00002665, },
+ { "♥", 0x00002665, },
+ { "…", 0x00002026, },
+ { "⊹", 0x000022b9, },
+ { "𝔥", 0x0001d525, },
+ { "⤥", 0x00002925, },
+ { "⤦", 0x00002926, },
+ { "⇿", 0x000021ff, },
+ { "∻", 0x0000223b, },
+ { "↩", 0x000021a9, },
+ { "↪", 0x000021aa, },
+ { "𝕙", 0x0001d559, },
+ { "―", 0x00002015, },
+ { "𝒽", 0x0001d4bd, },
+ { "ℏ", 0x0000210f, },
+ { "ħ", 0x00000127, },
+ { "⁃", 0x00002043, },
+ { "‐", 0x00002010, },
+ { "í", 0x000000ed, },
+ { "í", 0x000000ed, },
+ { "⁣", 0x00002063, },
+ { "î", 0x000000ee, },
+ { "î", 0x000000ee, },
+ { "и", 0x00000438, },
+ { "е", 0x00000435, },
+ { "¡", 0x000000a1, },
+ { "¡", 0x000000a1, },
+ { "⇔", 0x000021d4, },
+ { "𝔦", 0x0001d526, },
+ { "ì", 0x000000ec, },
+ { "ì", 0x000000ec, },
+ { "ⅈ", 0x00002148, },
+ { "⨌", 0x00002a0c, },
+ { "∭", 0x0000222d, },
+ { "⧜", 0x000029dc, },
+ { "℩", 0x00002129, },
+ { "ij", 0x00000133, },
+ { "ī", 0x0000012b, },
+ { "ℑ", 0x00002111, },
+ { "ℐ", 0x00002110, },
+ { "ℑ", 0x00002111, },
+ { "ı", 0x00000131, },
+ { "⊷", 0x000022b7, },
+ { "Ƶ", 0x000001b5, },
+ { "∈", 0x00002208, },
+ { "℅", 0x00002105, },
+ { "∞", 0x0000221e, },
+ { "⧝", 0x000029dd, },
+ { "ı", 0x00000131, },
+ { "∫", 0x0000222b, },
+ { "⊺", 0x000022ba, },
+ { "ℤ", 0x00002124, },
+ { "⊺", 0x000022ba, },
+ { "⨗", 0x00002a17, },
+ { "⨼", 0x00002a3c, },
+ { "ё", 0x00000451, },
+ { "į", 0x0000012f, },
+ { "𝕚", 0x0001d55a, },
+ { "ι", 0x000003b9, },
+ { "⨼", 0x00002a3c, },
+ { "¿", 0x000000bf, },
+ { "¿", 0x000000bf, },
+ { "𝒾", 0x0001d4be, },
+ { "∈", 0x00002208, },
+ { "⋹", 0x000022f9, },
+ { "⋵", 0x000022f5, },
+ { "⋴", 0x000022f4, },
+ { "⋳", 0x000022f3, },
+ { "∈", 0x00002208, },
+ { "⁢", 0x00002062, },
+ { "ĩ", 0x00000129, },
+ { "і", 0x00000456, },
+ { "ï", 0x000000ef, },
+ { "ï", 0x000000ef, },
+ { "ĵ", 0x00000135, },
+ { "й", 0x00000439, },
+ { "𝔧", 0x0001d527, },
+ { "ȷ", 0x00000237, },
+ { "𝕛", 0x0001d55b, },
+ { "𝒿", 0x0001d4bf, },
+ { "ј", 0x00000458, },
+ { "є", 0x00000454, },
+ { "κ", 0x000003ba, },
+ { "ϰ", 0x000003f0, },
+ { "ķ", 0x00000137, },
+ { "к", 0x0000043a, },
+ { "𝔨", 0x0001d528, },
+ { "ĸ", 0x00000138, },
+ { "х", 0x00000445, },
+ { "ќ", 0x0000045c, },
+ { "𝕜", 0x0001d55c, },
+ { "𝓀", 0x0001d4c0, },
+ { "⇚", 0x000021da, },
+ { "⇐", 0x000021d0, },
+ { "⤛", 0x0000291b, },
+ { "⤎", 0x0000290e, },
+ { "≦", 0x00002266, },
+ { "⪋", 0x00002a8b, },
+ { "⥢", 0x00002962, },
+ { "ĺ", 0x0000013a, },
+ { "⦴", 0x000029b4, },
+ { "ℒ", 0x00002112, },
+ { "λ", 0x000003bb, },
+ { "⟨", 0x000027e8, },
+ { "⦑", 0x00002991, },
+ { "⟨", 0x000027e8, },
+ { "⪅", 0x00002a85, },
+ { "«", 0x000000ab, },
+ { "«", 0x000000ab, },
+ { "←", 0x00002190, },
+ { "⇤", 0x000021e4, },
+ { "⤟", 0x0000291f, },
+ { "⤝", 0x0000291d, },
+ { "↩", 0x000021a9, },
+ { "↫", 0x000021ab, },
+ { "⤹", 0x00002939, },
+ { "⥳", 0x00002973, },
+ { "↢", 0x000021a2, },
+ { "⪫", 0x00002aab, },
+ { "⤙", 0x00002919, },
+ { "⪭", 0x00002aad, },
+// { "⪭︀", 10925, 0x0000fe00, },
+ { "⤌", 0x0000290c, },
+ { "❲", 0x00002772, },
+ { "{", 0x0000007b, },
+ { "[", 0x0000005b, },
+ { "⦋", 0x0000298b, },
+ { "⦏", 0x0000298f, },
+ { "⦍", 0x0000298d, },
+ { "ľ", 0x0000013e, },
+ { "ļ", 0x0000013c, },
+ { "⌈", 0x00002308, },
+ { "{", 0x0000007b, },
+ { "л", 0x0000043b, },
+ { "⤶", 0x00002936, },
+ { "“", 0x0000201c, },
+ { "„", 0x0000201e, },
+ { "⥧", 0x00002967, },
+ { "⥋", 0x0000294b, },
+ { "↲", 0x000021b2, },
+ { "≤", 0x00002264, },
+ { "←", 0x00002190, },
+ { "↢", 0x000021a2, },
+ { "↽", 0x000021bd, },
+ { "↼", 0x000021bc, },
+ { "⇇", 0x000021c7, },
+ { "↔", 0x00002194, },
+ { "⇆", 0x000021c6, },
+ { "⇋", 0x000021cb, },
+ { "↭", 0x000021ad, },
+ { "⋋", 0x000022cb, },
+ { "⋚", 0x000022da, },
+ { "≤", 0x00002264, },
+ { "≦", 0x00002266, },
+ { "⩽", 0x00002a7d, },
+ { "⩽", 0x00002a7d, },
+ { "⪨", 0x00002aa8, },
+ { "⩿", 0x00002a7f, },
+ { "⪁", 0x00002a81, },
+ { "⪃", 0x00002a83, },
+// { "⋚︀", 8922, 0x0000fe00, },
+ { "⪓", 0x00002a93, },
+ { "⪅", 0x00002a85, },
+ { "⋖", 0x000022d6, },
+ { "⋚", 0x000022da, },
+ { "⪋", 0x00002a8b, },
+ { "≶", 0x00002276, },
+ { "≲", 0x00002272, },
+ { "⥼", 0x0000297c, },
+ { "⌊", 0x0000230a, },
+ { "𝔩", 0x0001d529, },
+ { "≶", 0x00002276, },
+ { "⪑", 0x00002a91, },
+ { "↽", 0x000021bd, },
+ { "↼", 0x000021bc, },
+ { "⥪", 0x0000296a, },
+ { "▄", 0x00002584, },
+ { "љ", 0x00000459, },
+ { "≪", 0x0000226a, },
+ { "⇇", 0x000021c7, },
+ { "⌞", 0x0000231e, },
+ { "⥫", 0x0000296b, },
+ { "◺", 0x000025fa, },
+ { "ŀ", 0x00000140, },
+ { "⎰", 0x000023b0, },
+ { "⎰", 0x000023b0, },
+ { "≨", 0x00002268, },
+ { "⪉", 0x00002a89, },
+ { "⪉", 0x00002a89, },
+ { "⪇", 0x00002a87, },
+ { "⪇", 0x00002a87, },
+ { "≨", 0x00002268, },
+ { "⋦", 0x000022e6, },
+ { "⟬", 0x000027ec, },
+ { "⇽", 0x000021fd, },
+ { "⟦", 0x000027e6, },
+ { "⟵", 0x000027f5, },
+ { "⟷", 0x000027f7, },
+ { "⟼", 0x000027fc, },
+ { "⟶", 0x000027f6, },
+ { "↫", 0x000021ab, },
+ { "↬", 0x000021ac, },
+ { "⦅", 0x00002985, },
+ { "𝕝", 0x0001d55d, },
+ { "⨭", 0x00002a2d, },
+ { "⨴", 0x00002a34, },
+ { "∗", 0x00002217, },
+ { "_", 0x0000005f, },
+ { "◊", 0x000025ca, },
+ { "◊", 0x000025ca, },
+ { "⧫", 0x000029eb, },
+ { "(", 0x00000028, },
+ { "⦓", 0x00002993, },
+ { "⇆", 0x000021c6, },
+ { "⌟", 0x0000231f, },
+ { "⇋", 0x000021cb, },
+ { "⥭", 0x0000296d, },
+ { "‎", 0x0000200e, },
+ { "⊿", 0x000022bf, },
+ { "‹", 0x00002039, },
+ { "𝓁", 0x0001d4c1, },
+ { "↰", 0x000021b0, },
+ { "≲", 0x00002272, },
+ { "⪍", 0x00002a8d, },
+ { "⪏", 0x00002a8f, },
+ { "[", 0x0000005b, },
+ { "‘", 0x00002018, },
+ { "‚", 0x0000201a, },
+ { "ł", 0x00000142, },
+ { "<", 0x0000003c, },
+ { "<", 0x0000003c, },
+ { "⪦", 0x00002aa6, },
+ { "⩹", 0x00002a79, },
+ { "⋖", 0x000022d6, },
+ { "⋋", 0x000022cb, },
+ { "⋉", 0x000022c9, },
+ { "⥶", 0x00002976, },
+ { "⩻", 0x00002a7b, },
+ { "⦖", 0x00002996, },
+ { "◃", 0x000025c3, },
+ { "⊴", 0x000022b4, },
+ { "◂", 0x000025c2, },
+ { "⥊", 0x0000294a, },
+ { "⥦", 0x00002966, },
+// { "≨︀", 8808, 0x0000fe00, },
+// { "≨︀", 8808, 0x0000fe00, },
+ { "∺", 0x0000223a, },
+ { "¯", 0x000000af, },
+ { "¯", 0x000000af, },
+ { "♂", 0x00002642, },
+ { "✠", 0x00002720, },
+ { "✠", 0x00002720, },
+ { "↦", 0x000021a6, },
+ { "↦", 0x000021a6, },
+ { "↧", 0x000021a7, },
+ { "↤", 0x000021a4, },
+ { "↥", 0x000021a5, },
+ { "▮", 0x000025ae, },
+ { "⨩", 0x00002a29, },
+ { "м", 0x0000043c, },
+ { "—", 0x00002014, },
+ { "∡", 0x00002221, },
+ { "𝔪", 0x0001d52a, },
+ { "℧", 0x00002127, },
+ { "µ", 0x000000b5, },
+ { "µ", 0x000000b5, },
+ { "∣", 0x00002223, },
+ { "*", 0x0000002a, },
+ { "⫰", 0x00002af0, },
+ { "·", 0x000000b7, },
+ { "·", 0x000000b7, },
+ { "−", 0x00002212, },
+ { "⊟", 0x0000229f, },
+ { "∸", 0x00002238, },
+ { "⨪", 0x00002a2a, },
+ { "⫛", 0x00002adb, },
+ { "…", 0x00002026, },
+ { "∓", 0x00002213, },
+ { "⊧", 0x000022a7, },
+ { "𝕞", 0x0001d55e, },
+ { "∓", 0x00002213, },
+ { "𝓂", 0x0001d4c2, },
+ { "∾", 0x0000223e, },
+ { "μ", 0x000003bc, },
+ { "⊸", 0x000022b8, },
+ { "⊸", 0x000022b8, },
+// { "⋙̸", 8921, 0x00000338, },
+// { "≫⃒", 8811, 0x000020d2, },
+// { "≫̸", 8811, 0x00000338, },
+ { "⇍", 0x000021cd, },
+ { "⇎", 0x000021ce, },
+// { "⋘̸", 8920, 0x00000338, },
+// { "≪⃒", 8810, 0x000020d2, },
+// { "≪̸", 8810, 0x00000338, },
+ { "⇏", 0x000021cf, },
+ { "⊯", 0x000022af, },
+ { "⊮", 0x000022ae, },
+ { "∇", 0x00002207, },
+ { "ń", 0x00000144, },
+// { "∠⃒", 8736, 0x000020d2, },
+ { "≉", 0x00002249, },
+// { "⩰̸", 10864, 0x00000338, },
+// { "≋̸", 8779, 0x00000338, },
+ { "ʼn", 0x00000149, },
+ { "≉", 0x00002249, },
+ { "♮", 0x0000266e, },
+ { "♮", 0x0000266e, },
+ { "ℕ", 0x00002115, },
+ { " ", 0x000000a0, },
+ { " ", 0x000000a0, },
+// { "≎̸", 8782, 0x00000338, },
+// { "≏̸", 8783, 0x00000338, },
+ { "⩃", 0x00002a43, },
+ { "ň", 0x00000148, },
+ { "ņ", 0x00000146, },
+ { "≇", 0x00002247, },
+// { "⩭̸", 10861, 0x00000338, },
+ { "⩂", 0x00002a42, },
+ { "н", 0x0000043d, },
+ { "–", 0x00002013, },
+ { "≠", 0x00002260, },
+ { "⇗", 0x000021d7, },
+ { "⤤", 0x00002924, },
+ { "↗", 0x00002197, },
+ { "↗", 0x00002197, },
+// { "≐̸", 8784, 0x00000338, },
+ { "≢", 0x00002262, },
+ { "⤨", 0x00002928, },
+// { "≂̸", 8770, 0x00000338, },
+ { "∄", 0x00002204, },
+ { "∄", 0x00002204, },
+ { "𝔫", 0x0001d52b, },
+// { "≧̸", 8807, 0x00000338, },
+ { "≱", 0x00002271, },
+ { "≱", 0x00002271, },
+// { "≧̸", 8807, 0x00000338, },
+// { "⩾̸", 10878, 0x00000338, },
+// { "⩾̸", 10878, 0x00000338, },
+ { "≵", 0x00002275, },
+ { "≯", 0x0000226f, },
+ { "≯", 0x0000226f, },
+ { "⇎", 0x000021ce, },
+ { "↮", 0x000021ae, },
+ { "⫲", 0x00002af2, },
+ { "∋", 0x0000220b, },
+ { "⋼", 0x000022fc, },
+ { "⋺", 0x000022fa, },
+ { "∋", 0x0000220b, },
+ { "њ", 0x0000045a, },
+ { "⇍", 0x000021cd, },
+// { "≦̸", 8806, 0x00000338, },
+ { "↚", 0x0000219a, },
+ { "‥", 0x00002025, },
+ { "≰", 0x00002270, },
+ { "↚", 0x0000219a, },
+ { "↮", 0x000021ae, },
+ { "≰", 0x00002270, },
+// { "≦̸", 8806, 0x00000338, },
+// { "⩽̸", 10877, 0x00000338, },
+// { "⩽̸", 10877, 0x00000338, },
+ { "≮", 0x0000226e, },
+ { "≴", 0x00002274, },
+ { "≮", 0x0000226e, },
+ { "⋪", 0x000022ea, },
+ { "⋬", 0x000022ec, },
+ { "∤", 0x00002224, },
+ { "𝕟", 0x0001d55f, },
+ { "¬", 0x000000ac, },
+ { "¬", 0x000000ac, },
+ { "∉", 0x00002209, },
+// { "⋹̸", 8953, 0x00000338, },
+// { "⋵̸", 8949, 0x00000338, },
+ { "∉", 0x00002209, },
+ { "⋷", 0x000022f7, },
+ { "⋶", 0x000022f6, },
+ { "∌", 0x0000220c, },
+ { "∌", 0x0000220c, },
+ { "⋾", 0x000022fe, },
+ { "⋽", 0x000022fd, },
+ { "∦", 0x00002226, },
+ { "∦", 0x00002226, },
+// { "⫽⃥", 11005, 0x000020e5, },
+// { "∂̸", 8706, 0x00000338, },
+ { "⨔", 0x00002a14, },
+ { "⊀", 0x00002280, },
+ { "⋠", 0x000022e0, },
+// { "⪯̸", 10927, 0x00000338, },
+ { "⊀", 0x00002280, },
+// { "⪯̸", 10927, 0x00000338, },
+ { "⇏", 0x000021cf, },
+ { "↛", 0x0000219b, },
+// { "⤳̸", 10547, 0x00000338, },
+// { "↝̸", 8605, 0x00000338, },
+ { "↛", 0x0000219b, },
+ { "⋫", 0x000022eb, },
+ { "⋭", 0x000022ed, },
+ { "⊁", 0x00002281, },
+ { "⋡", 0x000022e1, },
+// { "⪰̸", 10928, 0x00000338, },
+ { "𝓃", 0x0001d4c3, },
+ { "∤", 0x00002224, },
+ { "∦", 0x00002226, },
+ { "≁", 0x00002241, },
+ { "≄", 0x00002244, },
+ { "≄", 0x00002244, },
+ { "∤", 0x00002224, },
+ { "∦", 0x00002226, },
+ { "⋢", 0x000022e2, },
+ { "⋣", 0x000022e3, },
+ { "⊄", 0x00002284, },
+// { "⫅̸", 10949, 0x00000338, },
+ { "⊈", 0x00002288, },
+// { "⊂⃒", 8834, 0x000020d2, },
+ { "⊈", 0x00002288, },
+// { "⫅̸", 10949, 0x00000338, },
+ { "⊁", 0x00002281, },
+// { "⪰̸", 10928, 0x00000338, },
+ { "⊅", 0x00002285, },
+// { "⫆̸", 10950, 0x00000338, },
+ { "⊉", 0x00002289, },
+// { "⊃⃒", 8835, 0x000020d2, },
+ { "⊉", 0x00002289, },
+// { "⫆̸", 10950, 0x00000338, },
+ { "≹", 0x00002279, },
+ { "ñ", 0x000000f1, },
+ { "ñ", 0x000000f1, },
+ { "≸", 0x00002278, },
+ { "⋪", 0x000022ea, },
+ { "⋬", 0x000022ec, },
+ { "⋫", 0x000022eb, },
+ { "⋭", 0x000022ed, },
+ { "ν", 0x000003bd, },
+ { "#", 0x00000023, },
+ { "№", 0x00002116, },
+ { " ", 0x00002007, },
+ { "⊭", 0x000022ad, },
+ { "⤄", 0x00002904, },
+// { "≍⃒", 8781, 0x000020d2, },
+ { "⊬", 0x000022ac, },
+// { "≥⃒", 8805, 0x000020d2, },
+// { ">⃒", 62, 0x000020d2, },
+ { "⧞", 0x000029de, },
+ { "⤂", 0x00002902, },
+// { "≤⃒", 8804, 0x000020d2, },
+// { "<⃒", 60, 0x000020d2, },
+// { "⊴⃒", 8884, 0x000020d2, },
+ { "⤃", 0x00002903, },
+// { "⊵⃒", 8885, 0x000020d2, },
+// { "∼⃒", 8764, 0x000020d2, },
+ { "⇖", 0x000021d6, },
+ { "⤣", 0x00002923, },
+ { "↖", 0x00002196, },
+ { "↖", 0x00002196, },
+ { "⤧", 0x00002927, },
+ { "Ⓢ", 0x000024c8, },
+ { "ó", 0x000000f3, },
+ { "ó", 0x000000f3, },
+ { "⊛", 0x0000229b, },
+ { "⊚", 0x0000229a, },
+ { "ô", 0x000000f4, },
+ { "ô", 0x000000f4, },
+ { "о", 0x0000043e, },
+ { "⊝", 0x0000229d, },
+ { "ő", 0x00000151, },
+ { "⨸", 0x00002a38, },
+ { "⊙", 0x00002299, },
+ { "⦼", 0x000029bc, },
+ { "œ", 0x00000153, },
+ { "⦿", 0x000029bf, },
+ { "𝔬", 0x0001d52c, },
+ { "˛", 0x000002db, },
+ { "ò", 0x000000f2, },
+ { "ò", 0x000000f2, },
+ { "⧁", 0x000029c1, },
+ { "⦵", 0x000029b5, },
+ { "Ω", 0x000003a9, },
+ { "∮", 0x0000222e, },
+ { "↺", 0x000021ba, },
+ { "⦾", 0x000029be, },
+ { "⦻", 0x000029bb, },
+ { "‾", 0x0000203e, },
+ { "⧀", 0x000029c0, },
+ { "ō", 0x0000014d, },
+ { "ω", 0x000003c9, },
+ { "ο", 0x000003bf, },
+ { "⦶", 0x000029b6, },
+ { "⊖", 0x00002296, },
+ { "𝕠", 0x0001d560, },
+ { "⦷", 0x000029b7, },
+ { "⦹", 0x000029b9, },
+ { "⊕", 0x00002295, },
+ { "∨", 0x00002228, },
+ { "↻", 0x000021bb, },
+ { "⩝", 0x00002a5d, },
+ { "ℴ", 0x00002134, },
+ { "ℴ", 0x00002134, },
+ { "ª", 0x000000aa, },
+ { "ª", 0x000000aa, },
+ { "º", 0x000000ba, },
+ { "º", 0x000000ba, },
+ { "⊶", 0x000022b6, },
+ { "⩖", 0x00002a56, },
+ { "⩗", 0x00002a57, },
+ { "⩛", 0x00002a5b, },
+ { "ℴ", 0x00002134, },
+ { "ø", 0x000000f8, },
+ { "ø", 0x000000f8, },
+ { "⊘", 0x00002298, },
+ { "õ", 0x000000f5, },
+ { "õ", 0x000000f5, },
+ { "⊗", 0x00002297, },
+ { "⨶", 0x00002a36, },
+ { "ö", 0x000000f6, },
+ { "ö", 0x000000f6, },
+ { "⌽", 0x0000233d, },
+ { "∥", 0x00002225, },
+ { "¶", 0x000000b6, },
+ { "¶", 0x000000b6, },
+ { "∥", 0x00002225, },
+ { "⫳", 0x00002af3, },
+ { "⫽", 0x00002afd, },
+ { "∂", 0x00002202, },
+ { "п", 0x0000043f, },
+ { "%", 0x00000025, },
+ { ".", 0x0000002e, },
+ { "‰", 0x00002030, },
+ { "⊥", 0x000022a5, },
+ { "‱", 0x00002031, },
+ { "𝔭", 0x0001d52d, },
+ { "φ", 0x000003c6, },
+ { "ϕ", 0x000003d5, },
+ { "ℳ", 0x00002133, },
+ { "☎", 0x0000260e, },
+ { "π", 0x000003c0, },
+ { "⋔", 0x000022d4, },
+ { "ϖ", 0x000003d6, },
+ { "ℏ", 0x0000210f, },
+ { "ℎ", 0x0000210e, },
+ { "ℏ", 0x0000210f, },
+ { "+", 0x0000002b, },
+ { "⨣", 0x00002a23, },
+ { "⊞", 0x0000229e, },
+ { "⨢", 0x00002a22, },
+ { "∔", 0x00002214, },
+ { "⨥", 0x00002a25, },
+ { "⩲", 0x00002a72, },
+ { "±", 0x000000b1, },
+ { "±", 0x000000b1, },
+ { "⨦", 0x00002a26, },
+ { "⨧", 0x00002a27, },
+ { "±", 0x000000b1, },
+ { "⨕", 0x00002a15, },
+ { "𝕡", 0x0001d561, },
+ { "£", 0x000000a3, },
+ { "£", 0x000000a3, },
+ { "≺", 0x0000227a, },
+ { "⪳", 0x00002ab3, },
+ { "⪷", 0x00002ab7, },
+ { "≼", 0x0000227c, },
+ { "⪯", 0x00002aaf, },
+ { "≺", 0x0000227a, },
+ { "⪷", 0x00002ab7, },
+ { "≼", 0x0000227c, },
+ { "⪯", 0x00002aaf, },
+ { "⪹", 0x00002ab9, },
+ { "⪵", 0x00002ab5, },
+ { "⋨", 0x000022e8, },
+ { "≾", 0x0000227e, },
+ { "′", 0x00002032, },
+ { "ℙ", 0x00002119, },
+ { "⪵", 0x00002ab5, },
+ { "⪹", 0x00002ab9, },
+ { "⋨", 0x000022e8, },
+ { "∏", 0x0000220f, },
+ { "⌮", 0x0000232e, },
+ { "⌒", 0x00002312, },
+ { "⌓", 0x00002313, },
+ { "∝", 0x0000221d, },
+ { "∝", 0x0000221d, },
+ { "≾", 0x0000227e, },
+ { "⊰", 0x000022b0, },
+ { "𝓅", 0x0001d4c5, },
+ { "ψ", 0x000003c8, },
+ { " ", 0x00002008, },
+ { "𝔮", 0x0001d52e, },
+ { "⨌", 0x00002a0c, },
+ { "𝕢", 0x0001d562, },
+ { "⁗", 0x00002057, },
+ { "𝓆", 0x0001d4c6, },
+ { "ℍ", 0x0000210d, },
+ { "⨖", 0x00002a16, },
+ { "?", 0x0000003f, },
+ { "≟", 0x0000225f, },
+ { """, 0x00000022, },
+ { """, 0x00000022, },
+ { "⇛", 0x000021db, },
+ { "⇒", 0x000021d2, },
+ { "⤜", 0x0000291c, },
+ { "⤏", 0x0000290f, },
+ { "⥤", 0x00002964, },
+// { "∽̱", 8765, 0x00000331, },
+ { "ŕ", 0x00000155, },
+ { "√", 0x0000221a, },
+ { "⦳", 0x000029b3, },
+ { "⟩", 0x000027e9, },
+ { "⦒", 0x00002992, },
+ { "⦥", 0x000029a5, },
+ { "⟩", 0x000027e9, },
+ { "»", 0x000000bb, },
+ { "»", 0x000000bb, },
+ { "→", 0x00002192, },
+ { "⥵", 0x00002975, },
+ { "⇥", 0x000021e5, },
+ { "⤠", 0x00002920, },
+ { "⤳", 0x00002933, },
+ { "⤞", 0x0000291e, },
+ { "↪", 0x000021aa, },
+ { "↬", 0x000021ac, },
+ { "⥅", 0x00002945, },
+ { "⥴", 0x00002974, },
+ { "↣", 0x000021a3, },
+ { "↝", 0x0000219d, },
+ { "⤚", 0x0000291a, },
+ { "∶", 0x00002236, },
+ { "ℚ", 0x0000211a, },
+ { "⤍", 0x0000290d, },
+ { "❳", 0x00002773, },
+ { "}", 0x0000007d, },
+ { "]", 0x0000005d, },
+ { "⦌", 0x0000298c, },
+ { "⦎", 0x0000298e, },
+ { "⦐", 0x00002990, },
+ { "ř", 0x00000159, },
+ { "ŗ", 0x00000157, },
+ { "⌉", 0x00002309, },
+ { "}", 0x0000007d, },
+ { "р", 0x00000440, },
+ { "⤷", 0x00002937, },
+ { "⥩", 0x00002969, },
+ { "”", 0x0000201d, },
+ { "”", 0x0000201d, },
+ { "↳", 0x000021b3, },
+ { "ℜ", 0x0000211c, },
+ { "ℛ", 0x0000211b, },
+ { "ℜ", 0x0000211c, },
+ { "ℝ", 0x0000211d, },
+ { "▭", 0x000025ad, },
+ { "®", 0x000000ae, },
+ { "®", 0x000000ae, },
+ { "⥽", 0x0000297d, },
+ { "⌋", 0x0000230b, },
+ { "𝔯", 0x0001d52f, },
+ { "⇁", 0x000021c1, },
+ { "⇀", 0x000021c0, },
+ { "⥬", 0x0000296c, },
+ { "ρ", 0x000003c1, },
+ { "ϱ", 0x000003f1, },
+ { "→", 0x00002192, },
+ { "↣", 0x000021a3, },
+ { "⇁", 0x000021c1, },
+ { "⇀", 0x000021c0, },
+ { "⇄", 0x000021c4, },
+ { "⇌", 0x000021cc, },
+ { "⇉", 0x000021c9, },
+ { "↝", 0x0000219d, },
+ { "⋌", 0x000022cc, },
+ { "˚", 0x000002da, },
+ { "≓", 0x00002253, },
+ { "⇄", 0x000021c4, },
+ { "⇌", 0x000021cc, },
+ { "‏", 0x0000200f, },
+ { "⎱", 0x000023b1, },
+ { "⎱", 0x000023b1, },
+ { "⫮", 0x00002aee, },
+ { "⟭", 0x000027ed, },
+ { "⇾", 0x000021fe, },
+ { "⟧", 0x000027e7, },
+ { "⦆", 0x00002986, },
+ { "𝕣", 0x0001d563, },
+ { "⨮", 0x00002a2e, },
+ { "⨵", 0x00002a35, },
+ { ")", 0x00000029, },
+ { "⦔", 0x00002994, },
+ { "⨒", 0x00002a12, },
+ { "⇉", 0x000021c9, },
+ { "›", 0x0000203a, },
+ { "𝓇", 0x0001d4c7, },
+ { "↱", 0x000021b1, },
+ { "]", 0x0000005d, },
+ { "’", 0x00002019, },
+ { "’", 0x00002019, },
+ { "⋌", 0x000022cc, },
+ { "⋊", 0x000022ca, },
+ { "▹", 0x000025b9, },
+ { "⊵", 0x000022b5, },
+ { "▸", 0x000025b8, },
+ { "⧎", 0x000029ce, },
+ { "⥨", 0x00002968, },
+ { "℞", 0x0000211e, },
+ { "ś", 0x0000015b, },
+ { "‚", 0x0000201a, },
+ { "≻", 0x0000227b, },
+ { "⪴", 0x00002ab4, },
+ { "⪸", 0x00002ab8, },
+ { "š", 0x00000161, },
+ { "≽", 0x0000227d, },
+ { "⪰", 0x00002ab0, },
+ { "ş", 0x0000015f, },
+ { "ŝ", 0x0000015d, },
+ { "⪶", 0x00002ab6, },
+ { "⪺", 0x00002aba, },
+ { "⋩", 0x000022e9, },
+ { "⨓", 0x00002a13, },
+ { "≿", 0x0000227f, },
+ { "с", 0x00000441, },
+ { "⋅", 0x000022c5, },
+ { "⊡", 0x000022a1, },
+ { "⩦", 0x00002a66, },
+ { "⇘", 0x000021d8, },
+ { "⤥", 0x00002925, },
+ { "↘", 0x00002198, },
+ { "↘", 0x00002198, },
+ { "§", 0x000000a7, },
+ { "§", 0x000000a7, },
+ { ";", 0x0000003b, },
+ { "⤩", 0x00002929, },
+ { "∖", 0x00002216, },
+ { "∖", 0x00002216, },
+ { "✶", 0x00002736, },
+ { "𝔰", 0x0001d530, },
+ { "⌢", 0x00002322, },
+ { "♯", 0x0000266f, },
+ { "щ", 0x00000449, },
+ { "ш", 0x00000448, },
+ { "∣", 0x00002223, },
+ { "∥", 0x00002225, },
+ { "­", 0x000000ad, },
+ { "­", 0x000000ad, },
+ { "σ", 0x000003c3, },
+ { "ς", 0x000003c2, },
+ { "ς", 0x000003c2, },
+ { "∼", 0x0000223c, },
+ { "⩪", 0x00002a6a, },
+ { "≃", 0x00002243, },
+ { "≃", 0x00002243, },
+ { "⪞", 0x00002a9e, },
+ { "⪠", 0x00002aa0, },
+ { "⪝", 0x00002a9d, },
+ { "⪟", 0x00002a9f, },
+ { "≆", 0x00002246, },
+ { "⨤", 0x00002a24, },
+ { "⥲", 0x00002972, },
+ { "←", 0x00002190, },
+ { "∖", 0x00002216, },
+ { "⨳", 0x00002a33, },
+ { "⧤", 0x000029e4, },
+ { "∣", 0x00002223, },
+ { "⌣", 0x00002323, },
+ { "⪪", 0x00002aaa, },
+ { "⪬", 0x00002aac, },
+// { "⪬︀", 10924, 0x0000fe00, },
+ { "ь", 0x0000044c, },
+ { "/", 0x0000002f, },
+ { "⧄", 0x000029c4, },
+ { "⌿", 0x0000233f, },
+ { "𝕤", 0x0001d564, },
+ { "♠", 0x00002660, },
+ { "♠", 0x00002660, },
+ { "∥", 0x00002225, },
+ { "⊓", 0x00002293, },
+// { "⊓︀", 8851, 0x0000fe00, },
+ { "⊔", 0x00002294, },
+// { "⊔︀", 8852, 0x0000fe00, },
+ { "⊏", 0x0000228f, },
+ { "⊑", 0x00002291, },
+ { "⊏", 0x0000228f, },
+ { "⊑", 0x00002291, },
+ { "⊐", 0x00002290, },
+ { "⊒", 0x00002292, },
+ { "⊐", 0x00002290, },
+ { "⊒", 0x00002292, },
+ { "□", 0x000025a1, },
+ { "□", 0x000025a1, },
+ { "▪", 0x000025aa, },
+ { "▪", 0x000025aa, },
+ { "→", 0x00002192, },
+ { "𝓈", 0x0001d4c8, },
+ { "∖", 0x00002216, },
+ { "⌣", 0x00002323, },
+ { "⋆", 0x000022c6, },
+ { "☆", 0x00002606, },
+ { "★", 0x00002605, },
+ { "ϵ", 0x000003f5, },
+ { "ϕ", 0x000003d5, },
+ { "¯", 0x000000af, },
+ { "⊂", 0x00002282, },
+ { "⫅", 0x00002ac5, },
+ { "⪽", 0x00002abd, },
+ { "⊆", 0x00002286, },
+ { "⫃", 0x00002ac3, },
+ { "⫁", 0x00002ac1, },
+ { "⫋", 0x00002acb, },
+ { "⊊", 0x0000228a, },
+ { "⪿", 0x00002abf, },
+ { "⥹", 0x00002979, },
+ { "⊂", 0x00002282, },
+ { "⊆", 0x00002286, },
+ { "⫅", 0x00002ac5, },
+ { "⊊", 0x0000228a, },
+ { "⫋", 0x00002acb, },
+ { "⫇", 0x00002ac7, },
+ { "⫕", 0x00002ad5, },
+ { "⫓", 0x00002ad3, },
+ { "≻", 0x0000227b, },
+ { "⪸", 0x00002ab8, },
+ { "≽", 0x0000227d, },
+ { "⪰", 0x00002ab0, },
+ { "⪺", 0x00002aba, },
+ { "⪶", 0x00002ab6, },
+ { "⋩", 0x000022e9, },
+ { "≿", 0x0000227f, },
+ { "∑", 0x00002211, },
+ { "♪", 0x0000266a, },
+ { "¹", 0x000000b9, },
+ { "¹", 0x000000b9, },
+ { "²", 0x000000b2, },
+ { "²", 0x000000b2, },
+ { "³", 0x000000b3, },
+ { "³", 0x000000b3, },
+ { "⊃", 0x00002283, },
+ { "⫆", 0x00002ac6, },
+ { "⪾", 0x00002abe, },
+ { "⫘", 0x00002ad8, },
+ { "⊇", 0x00002287, },
+ { "⫄", 0x00002ac4, },
+ { "⟉", 0x000027c9, },
+ { "⫗", 0x00002ad7, },
+ { "⥻", 0x0000297b, },
+ { "⫂", 0x00002ac2, },
+ { "⫌", 0x00002acc, },
+ { "⊋", 0x0000228b, },
+ { "⫀", 0x00002ac0, },
+ { "⊃", 0x00002283, },
+ { "⊇", 0x00002287, },
+ { "⫆", 0x00002ac6, },
+ { "⊋", 0x0000228b, },
+ { "⫌", 0x00002acc, },
+ { "⫈", 0x00002ac8, },
+ { "⫔", 0x00002ad4, },
+ { "⫖", 0x00002ad6, },
+ { "⇙", 0x000021d9, },
+ { "⤦", 0x00002926, },
+ { "↙", 0x00002199, },
+ { "↙", 0x00002199, },
+ { "⤪", 0x0000292a, },
+ { "ß", 0x000000df, },
+ { "ß", 0x000000df, },
+ { "⌖", 0x00002316, },
+ { "τ", 0x000003c4, },
+ { "⎴", 0x000023b4, },
+ { "ť", 0x00000165, },
+ { "ţ", 0x00000163, },
+ { "т", 0x00000442, },
+ { "⃛", 0x000020db, },
+ { "⌕", 0x00002315, },
+ { "𝔱", 0x0001d531, },
+ { "∴", 0x00002234, },
+ { "∴", 0x00002234, },
+ { "θ", 0x000003b8, },
+ { "ϑ", 0x000003d1, },
+ { "ϑ", 0x000003d1, },
+ { "≈", 0x00002248, },
+ { "∼", 0x0000223c, },
+ { " ", 0x00002009, },
+ { "≈", 0x00002248, },
+ { "∼", 0x0000223c, },
+ { "þ", 0x000000fe, },
+ { "þ", 0x000000fe, },
+ { "˜", 0x000002dc, },
+ { "×", 0x000000d7, },
+ { "×", 0x000000d7, },
+ { "⊠", 0x000022a0, },
+ { "⨱", 0x00002a31, },
+ { "⨰", 0x00002a30, },
+ { "∭", 0x0000222d, },
+ { "⤨", 0x00002928, },
+ { "⊤", 0x000022a4, },
+ { "⌶", 0x00002336, },
+ { "⫱", 0x00002af1, },
+ { "𝕥", 0x0001d565, },
+ { "⫚", 0x00002ada, },
+ { "⤩", 0x00002929, },
+ { "‴", 0x00002034, },
+ { "™", 0x00002122, },
+ { "▵", 0x000025b5, },
+ { "▿", 0x000025bf, },
+ { "◃", 0x000025c3, },
+ { "⊴", 0x000022b4, },
+ { "≜", 0x0000225c, },
+ { "▹", 0x000025b9, },
+ { "⊵", 0x000022b5, },
+ { "◬", 0x000025ec, },
+ { "≜", 0x0000225c, },
+ { "⨺", 0x00002a3a, },
+ { "⨹", 0x00002a39, },
+ { "⧍", 0x000029cd, },
+ { "⨻", 0x00002a3b, },
+ { "⏢", 0x000023e2, },
+ { "𝓉", 0x0001d4c9, },
+ { "ц", 0x00000446, },
+ { "ћ", 0x0000045b, },
+ { "ŧ", 0x00000167, },
+ { "≬", 0x0000226c, },
+ { "↞", 0x0000219e, },
+ { "↠", 0x000021a0, },
+ { "⇑", 0x000021d1, },
+ { "⥣", 0x00002963, },
+ { "ú", 0x000000fa, },
+ { "ú", 0x000000fa, },
+ { "↑", 0x00002191, },
+ { "ў", 0x0000045e, },
+ { "ŭ", 0x0000016d, },
+ { "û", 0x000000fb, },
+ { "û", 0x000000fb, },
+ { "у", 0x00000443, },
+ { "⇅", 0x000021c5, },
+ { "ű", 0x00000171, },
+ { "⥮", 0x0000296e, },
+ { "⥾", 0x0000297e, },
+ { "𝔲", 0x0001d532, },
+ { "ù", 0x000000f9, },
+ { "ù", 0x000000f9, },
+ { "↿", 0x000021bf, },
+ { "↾", 0x000021be, },
+ { "▀", 0x00002580, },
+ { "⌜", 0x0000231c, },
+ { "⌜", 0x0000231c, },
+ { "⌏", 0x0000230f, },
+ { "◸", 0x000025f8, },
+ { "ū", 0x0000016b, },
+ { "¨", 0x000000a8, },
+ { "¨", 0x000000a8, },
+ { "ų", 0x00000173, },
+ { "𝕦", 0x0001d566, },
+ { "↑", 0x00002191, },
+ { "↕", 0x00002195, },
+ { "↿", 0x000021bf, },
+ { "↾", 0x000021be, },
+ { "⊎", 0x0000228e, },
+ { "υ", 0x000003c5, },
+ { "ϒ", 0x000003d2, },
+ { "υ", 0x000003c5, },
+ { "⇈", 0x000021c8, },
+ { "⌝", 0x0000231d, },
+ { "⌝", 0x0000231d, },
+ { "⌎", 0x0000230e, },
+ { "ů", 0x0000016f, },
+ { "◹", 0x000025f9, },
+ { "𝓊", 0x0001d4ca, },
+ { "⋰", 0x000022f0, },
+ { "ũ", 0x00000169, },
+ { "▵", 0x000025b5, },
+ { "▴", 0x000025b4, },
+ { "⇈", 0x000021c8, },
+ { "ü", 0x000000fc, },
+ { "ü", 0x000000fc, },
+ { "⦧", 0x000029a7, },
+ { "⇕", 0x000021d5, },
+ { "⫨", 0x00002ae8, },
+ { "⫩", 0x00002ae9, },
+ { "⊨", 0x000022a8, },
+ { "⦜", 0x0000299c, },
+ { "ϵ", 0x000003f5, },
+ { "ϰ", 0x000003f0, },
+ { "∅", 0x00002205, },
+ { "ϕ", 0x000003d5, },
+ { "ϖ", 0x000003d6, },
+ { "∝", 0x0000221d, },
+ { "↕", 0x00002195, },
+ { "ϱ", 0x000003f1, },
+ { "ς", 0x000003c2, },
+// { "⊊︀", 8842, 0x0000fe00, },
+// { "⫋︀", 10955, 0x0000fe00, },
+// { "⊋︀", 8843, 0x0000fe00, },
+// { "⫌︀", 10956, 0x0000fe00, },
+ { "ϑ", 0x000003d1, },
+ { "⊲", 0x000022b2, },
+ { "⊳", 0x000022b3, },
+ { "в", 0x00000432, },
+ { "⊢", 0x000022a2, },
+ { "∨", 0x00002228, },
+ { "⊻", 0x000022bb, },
+ { "≚", 0x0000225a, },
+ { "⋮", 0x000022ee, },
+ { "|", 0x0000007c, },
+ { "|", 0x0000007c, },
+ { "𝔳", 0x0001d533, },
+ { "⊲", 0x000022b2, },
+// { "⊂⃒", 8834, 0x000020d2, },
+// { "⊃⃒", 8835, 0x000020d2, },
+ { "𝕧", 0x0001d567, },
+ { "∝", 0x0000221d, },
+ { "⊳", 0x000022b3, },
+ { "𝓋", 0x0001d4cb, },
+// { "⫋︀", 10955, 0x0000fe00, },
+// { "⊊︀", 8842, 0x0000fe00, },
+// { "⫌︀", 10956, 0x0000fe00, },
+// { "⊋︀", 8843, 0x0000fe00, },
+ { "⦚", 0x0000299a, },
+ { "ŵ", 0x00000175, },
+ { "⩟", 0x00002a5f, },
+ { "∧", 0x00002227, },
+ { "≙", 0x00002259, },
+ { "℘", 0x00002118, },
+ { "𝔴", 0x0001d534, },
+ { "𝕨", 0x0001d568, },
+ { "℘", 0x00002118, },
+ { "≀", 0x00002240, },
+ { "≀", 0x00002240, },
+ { "𝓌", 0x0001d4cc, },
+ { "⋂", 0x000022c2, },
+ { "◯", 0x000025ef, },
+ { "⋃", 0x000022c3, },
+ { "▽", 0x000025bd, },
+ { "𝔵", 0x0001d535, },
+ { "⟺", 0x000027fa, },
+ { "⟷", 0x000027f7, },
+ { "ξ", 0x000003be, },
+ { "⟸", 0x000027f8, },
+ { "⟵", 0x000027f5, },
+ { "⟼", 0x000027fc, },
+ { "⋻", 0x000022fb, },
+ { "⨀", 0x00002a00, },
+ { "𝕩", 0x0001d569, },
+ { "⨁", 0x00002a01, },
+ { "⨂", 0x00002a02, },
+ { "⟹", 0x000027f9, },
+ { "⟶", 0x000027f6, },
+ { "𝓍", 0x0001d4cd, },
+ { "⨆", 0x00002a06, },
+ { "⨄", 0x00002a04, },
+ { "△", 0x000025b3, },
+ { "⋁", 0x000022c1, },
+ { "⋀", 0x000022c0, },
+ { "ý", 0x000000fd, },
+ { "ý", 0x000000fd, },
+ { "я", 0x0000044f, },
+ { "ŷ", 0x00000177, },
+ { "ы", 0x0000044b, },
+ { "¥", 0x000000a5, },
+ { "¥", 0x000000a5, },
+ { "𝔶", 0x0001d536, },
+ { "ї", 0x00000457, },
+ { "𝕪", 0x0001d56a, },
+ { "𝓎", 0x0001d4ce, },
+ { "ю", 0x0000044e, },
+ { "ÿ", 0x000000ff, },
+ { "ÿ", 0x000000ff, },
+ { "ź", 0x0000017a, },
+ { "ž", 0x0000017e, },
+ { "з", 0x00000437, },
+ { "ż", 0x0000017c, },
+ { "ℨ", 0x00002128, },
+ { "ζ", 0x000003b6, },
+ { "𝔷", 0x0001d537, },
+ { "ж", 0x00000436, },
+ { "⇝", 0x000021dd, },
+ { "𝕫", 0x0001d56b, },
+ { "𝓏", 0x0001d4cf, },
+ { "‍", 0x0000200d, },
+ { "‌", 0x0000200c, },
+#endif
+
+ { NULL, 0 },
+};
--- html_tokenize.c Wed Dec 11 11:21:18 2024
+++ html_tokenize.c Wed Dec 11 11:21:18 2024
@@ -0,0 +1,2201 @@
+/*
+ * Copyright (c) 2024 joshua stein <jcs@jcs.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * Tokenization
+ * https://html.spec.whatwg.org/multipage/parsing.html#tokenization
+ *
+ * Handles characters output from html_parse() and turns them into tokens,
+ * which are emitted to the tree builder.
+ */
+
+#include "html.h"
+
+void html_tokenize(struct html_page *html, short cc); /* tokenizer state machine; takes one input character (or EOF) per call */
+bool html_appropriate_end_tag_token(struct html_page *html, html_token *token); /* consulted by the RCDATA/RAWTEXT/script-data end-tag-name states */
+html_tag_type html_find_tag_type(char *tag_name); /* result is stored in new_token.tag.type once a tag name is complete */
+void html_lookahead_consume(struct html_page *html, short count); /* drops characters from the head of html->lookahead */
+
+/*
+ * Throw away up to `count' characters from the front of html->lookahead.
+ *
+ * Each discarded character shifts the rest of the buffer down by one slot
+ * and decrements html->lookahead_len.  The loop ends early as soon as
+ * lookahead_len reaches zero, and a non-positive count discards nothing.
+ */
+void
+html_lookahead_consume(struct html_page *html, short count)
+{
+	short remaining, slot;
+
+	remaining = count;
+	while (remaining > 0 && html->lookahead_len != 0) {
+		HTML_DEBUG((": consuming '%c' from lookahead", html->lookahead[0]));
+
+		/*
+		 * Slide every slot down by one.  The final slot is read but
+		 * never written, so it keeps whatever value it already held.
+		 */
+		slot = 0;
+		while (slot < HTML_LOOKAHEAD_SIZE - 1) {
+			html->lookahead[slot] = html->lookahead[slot + 1];
+			slot++;
+		}
+
+		html->lookahead_len--;
+		remaining--;
+	}
+}
+
+void
+html_tokenize(struct html_page *html, short cc)
+{
+ html_state was_state;
+ struct html_attr *attr;
+ const html_entity *found_entity;
+ short tcc, n, j, i;
+
+ was_state = html->state;
+
+ if (html->lookahead_len < HTML_LOOKAHEAD_SIZE && cc != EOF) {
+ /* fill lookahead */
+ html->lookahead[html->lookahead_len++] = cc;
+ return;
+ }
+
+ if (html->lookahead_len) {
+ /* take a character from the head of lookahead and shift down */
+ tcc = html->lookahead[0];
+ for (n = 0; n < HTML_LOOKAHEAD_SIZE - 1; n++)
+ html->lookahead[n] = html->lookahead[n + 1];
+ if (cc == EOF) {
+ if (html->lookahead_len)
+ html->lookahead_len--;
+ } else
+ html->lookahead[HTML_LOOKAHEAD_SIZE - 1] = cc;
+ cc = tcc;
+ }
+
+#ifdef HTML_ENABLE_DEBUGGING
+ HTML_DEBUG(("pos % 4ld:", html->input_pos++));
+
+ if (cc == '\n')
+ HTML_DEBUG((" \\n"));
+ else if (cc == '\r')
+ HTML_DEBUG((" \\r"));
+ else if (cc == '\t')
+ HTML_DEBUG((" \\t"));
+ else if (cc == '\f')
+ HTML_DEBUG((" \\f"));
+ else if (cc == '\0')
+ HTML_DEBUG((" \\0"));
+ else if (cc == ' ')
+ HTML_DEBUG((" "));
+ else if (cc == EOF)
+ HTML_DEBUG(("EOF"));
+ else
+ HTML_DEBUG((" %c", cc));
+
+ HTML_DEBUG((": state %s", html_state_names[html->state]));
+#endif
+
+ was_state = html->state;
+
+reconsume:
+ if (html->state != was_state) {
+ HTML_DEBUG((": reconsume as %s", html_state_names[html->state]));
+ was_state = html->state;
+ }
+
+ switch (html->state) {
+ case HTML_STATE_DATA:
+ switch (cc) {
+ case '&':
+ html->return_state = html->state;
+ html->tmp_len = 0;
+ html->state = HTML_STATE_CHARACTER_REFERENCE;
+ break;
+ case '<':
+ html->state = HTML_STATE_TAG_OPEN;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ html_emit_char_token(html, cc);
+ break;
+ case EOF:
+ html_emit_eof_token(html);
+ break;
+ default:
+ html_emit_char_token(html, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_RCDATA:
+ switch (cc) {
+ case '&':
+ html->return_state = html->state;
+ html->tmp_len = 0;
+ html->state = HTML_STATE_CHARACTER_REFERENCE;
+ break;
+ case '<':
+ html->state = HTML_STATE_RCDATA_LESS_THAN_SIGN;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
+ break;
+ case EOF:
+ html_emit_eof_token(html);
+ break;
+ default:
+ html_emit_char_token(html, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_RAWTEXT:
+ switch (cc) {
+ case '<':
+ html->state = HTML_STATE_RAWTEXT_LESS_THAN_SIGN;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
+ break;
+ case EOF:
+ html_emit_eof_token(html);
+ break;
+ default:
+ html_emit_char_token(html, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA:
+ switch (cc) {
+ case '<':
+ html->state = HTML_STATE_SCRIPT_DATA_LESS_THAN_SIGN;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ if (!html->ignore_script_data)
+ html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
+ break;
+ case EOF:
+ html_emit_eof_token(html);
+ break;
+ default:
+ if (!html->ignore_script_data)
+ html_emit_char_token(html, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_PLAINTEXT:
+ switch (cc) {
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
+ break;
+ case EOF:
+ html_emit_eof_token(html);
+ break;
+ default:
+ html_emit_char_token(html, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_TAG_OPEN:
+ switch (cc) {
+ case '!':
+ html->state = HTML_STATE_MARKUP_DECLARATION_OPEN;
+ html->tmp_len = 0;
+ break;
+ case '/':
+ html->state = HTML_STATE_END_TAG_OPEN;
+ break;
+ case '?':
+ html->error =
+ HTML_ERROR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME;
+ html_emit_comment(html, &html->new_token.comment);
+ html->state = HTML_STATE_BOGUS_COMMENT;
+ goto reconsume;
+ case EOF:
+ html->error = HTML_ERROR_EOF_BEFORE_TAG_NAME;
+ html_emit_char_token(html, '<');
+ html_emit_eof_token(html);
+ break;
+ default:
+ if (IS_ALPHA(cc)) {
+ html_prep_new_token(html, HTML_TOKEN_START_TAG);
+ html->state = HTML_STATE_TAG_NAME;
+ goto reconsume;
+ }
+ html->error = HTML_ERROR_INVALID_FIRST_CHARACTER_OF_TAG_NAME;
+ html_emit_char_token(html, '<');
+ html->state = HTML_STATE_DATA;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_END_TAG_OPEN:
+ switch (cc) {
+ case '>':
+ html->error = HTML_ERROR_MISSING_END_TAG_NAME;
+ html->state = HTML_STATE_DATA;
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_BEFORE_TAG_NAME;
+ html_emit_char_token(html, '<');
+ html_emit_char_token(html, '/');
+ html_emit_eof_token(html);
+ break;
+ default:
+ if (IS_ALPHA(cc)) {
+ html_prep_new_token(html, HTML_TOKEN_END_TAG);
+ html->state = HTML_STATE_TAG_NAME;
+ goto reconsume;
+ }
+ html->error = HTML_ERROR_INVALID_FIRST_CHARACTER_OF_TAG_NAME;
+ html_prep_new_token(html, HTML_TOKEN_COMMENT);
+ html->state = HTML_STATE_BOGUS_COMMENT;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_TAG_NAME:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ html->new_token.tag.type =
+ html_find_tag_type(html->new_token.tag.name);
+ html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
+ break;
+ case '/':
+ html->new_token.tag.type =
+ html_find_tag_type(html->new_token.tag.name);
+ html->state = HTML_STATE_SELF_CLOSING_START_TAG;
+ break;
+ case '>':
+ html->state = HTML_STATE_DATA;
+ html->new_token.tag.type =
+ html_find_tag_type(html->new_token.tag.name);
+ html_emit_token(html, &html->new_token);
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ STR_APPEND(html->new_token.tag.name,
+ html->new_token.tag.name_len, HTML_REPLACEMENT_CHARACTER);
+ html->new_token.tag.type = 0;
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_TAG;
+ html_emit_eof_token(html);
+ break;
+ default:
+ if (IS_UPPER_ALPHA(cc))
+ cc += 0x20;
+ STR_APPEND(html->new_token.tag.name,
+ html->new_token.tag.name_len, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_RCDATA_LESS_THAN_SIGN:
+ switch (cc) {
+ case '/':
+ html->state = HTML_STATE_RCDATA_END_TAG_OPEN;
+ html->tmp_len = 0;
+ break;
+ default:
+ html->state = HTML_STATE_RCDATA;
+ if (!html->ignore_comment_data)
+ html_emit_char_token(html, '<');
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_RCDATA_END_TAG_OPEN:
+ if (IS_ALPHA(cc)) {
+ html_prep_new_token(html, HTML_TOKEN_END_TAG);
+ html->state = HTML_STATE_RCDATA_END_TAG_NAME;
+ goto reconsume;
+ }
+ if (!html->ignore_comment_data) {
+ html_emit_char_token(html, '<');
+ html_emit_char_token(html, '/');
+ }
+ html->state = HTML_STATE_RCDATA;
+ goto reconsume;
+ case HTML_STATE_RCDATA_END_TAG_NAME:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ if (!html_appropriate_end_tag_token(html, &html->new_token))
+ goto HTML_STATE_RCDATA_END_TAG_NAME_anything_else;
+ html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
+ break;
+ case '/':
+ if (!html_appropriate_end_tag_token(html, &html->new_token))
+ goto HTML_STATE_RCDATA_END_TAG_NAME_anything_else;
+ html->state = HTML_STATE_SELF_CLOSING_START_TAG;
+ break;
+ case '>':
+ if (!html_appropriate_end_tag_token(html, &html->new_token))
+ goto HTML_STATE_RCDATA_END_TAG_NAME_anything_else;
+ html->state = HTML_STATE_DATA;
+ break;
+ default:
+ if (IS_UPPER_ALPHA(cc))
+ cc += 0x20;
+ if (IS_LOWER_ALPHA(cc)) {
+ STR_APPEND(html->new_token.tag.name,
+ html->new_token.tag.name_len, cc);
+ STR_APPEND(html->tmp, html->tmp_len, cc);
+ break;
+ }
+ /* FALLTHROUGH */
+ HTML_STATE_RCDATA_END_TAG_NAME_anything_else:
+ if (!html->ignore_comment_data) {
+ html_emit_char_token(html, '<');
+ html_emit_char_token(html, '/');
+ for (n = 0; n < html->tmp_len; n++)
+ html_emit_char_token(html, html->tmp[n]);
+ }
+ html->state = HTML_STATE_RCDATA;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_RAWTEXT_LESS_THAN_SIGN:
+ switch (cc) {
+ case '/':
+ html->tmp_len = 0;
+ html->state = HTML_STATE_RAWTEXT_END_TAG_OPEN;
+ break;
+ default:
+ if (!html->ignore_comment_data) {
+ html_emit_char_token(html, '<');
+ }
+ html->state = HTML_STATE_RAWTEXT;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_RAWTEXT_END_TAG_OPEN:
+ if (IS_ALPHA(cc)) {
+ html_prep_new_token(html, HTML_TOKEN_END_TAG);
+ html->state = HTML_STATE_RAWTEXT_END_TAG_NAME;
+ goto reconsume;
+ }
+ if (!html->ignore_comment_data) {
+ html_emit_char_token(html, '<');
+ html_emit_char_token(html, '/');
+ }
+ html->state = HTML_STATE_RAWTEXT;
+ goto reconsume;
+ case HTML_STATE_RAWTEXT_END_TAG_NAME:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ if (!html_appropriate_end_tag_token(html, &html->new_token))
+ goto HTML_STATE_RAWTEXT_END_TAG_NAME_anything_else;
+ html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
+ break;
+ case '/':
+ if (!html_appropriate_end_tag_token(html, &html->new_token))
+ goto HTML_STATE_RAWTEXT_END_TAG_NAME_anything_else;
+ html->state = HTML_STATE_SELF_CLOSING_START_TAG;
+ break;
+ case '>':
+ if (!html_appropriate_end_tag_token(html, &html->new_token))
+ goto HTML_STATE_RAWTEXT_END_TAG_NAME_anything_else;
+ html->state = HTML_STATE_DATA;
+ break;
+ default:
+ if (IS_UPPER_ALPHA(cc))
+ cc += 0x20;
+ if (IS_LOWER_ALPHA(cc)) {
+ STR_APPEND(html->new_token.tag.name,
+ html->new_token.tag.name_len, cc);
+ STR_APPEND(html->tmp, html->tmp_len, cc);
+ break;
+ }
+ /* FALLTHROUGH */
+ HTML_STATE_RAWTEXT_END_TAG_NAME_anything_else:
+ if (!html->ignore_comment_data) {
+ html_emit_char_token(html, '<');
+ html_emit_char_token(html, '/');
+ for (n = 0; n < html->tmp_len; n++)
+ html_emit_char_token(html, html->tmp[n]);
+ }
+ html->state = HTML_STATE_RAWTEXT;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_LESS_THAN_SIGN:
+ switch (cc) {
+ case '/':
+ html->tmp_len = 0;
+ html->state = HTML_STATE_SCRIPT_DATA_END_TAG_OPEN;
+ break;
+ case '!':
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPE_START;
+ if (!html->ignore_comment_data) {
+ html_emit_char_token(html, '<');
+ html_emit_char_token(html, '!');
+ }
+ break;
+ default:
+ if (!html->ignore_comment_data) {
+ html_emit_char_token(html, '<');
+ }
+ html->state = HTML_STATE_SCRIPT_DATA;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_END_TAG_OPEN:
+ if (IS_ALPHA(cc)) {
+ html_prep_new_token(html, HTML_TOKEN_END_TAG);
+ html->state = HTML_STATE_SCRIPT_DATA_END_TAG_NAME;
+ goto reconsume;
+ }
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, '<');
+ html_emit_char_token(html, '/');
+ }
+ html->state = HTML_STATE_SCRIPT_DATA;
+ goto reconsume;
+ case HTML_STATE_SCRIPT_DATA_END_TAG_NAME:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ if (!html_appropriate_end_tag_token(html, &html->new_token))
+ goto HTML_STATE_SCRIPT_DATA_END_TAG_NAME_anything_else;
+ html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
+ break;
+ case '/':
+ if (!html_appropriate_end_tag_token(html, &html->new_token))
+ goto HTML_STATE_SCRIPT_DATA_END_TAG_NAME_anything_else;
+ html->state = HTML_STATE_SELF_CLOSING_START_TAG;
+ break;
+ case '>':
+ if (!html_appropriate_end_tag_token(html, &html->new_token))
+ goto HTML_STATE_SCRIPT_DATA_END_TAG_NAME_anything_else;
+ html->state = HTML_STATE_DATA;
+ break;
+ default:
+ if (IS_UPPER_ALPHA(cc))
+ cc += 0x20;
+ if (IS_LOWER_ALPHA(cc)) {
+ STR_APPEND(html->new_token.tag.name,
+ html->new_token.tag.name_len, cc);
+ STR_APPEND(html->tmp, html->tmp_len, cc);
+ break;
+ }
+ /* FALLTHROUGH */
+ HTML_STATE_SCRIPT_DATA_END_TAG_NAME_anything_else:
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, '<');
+ html_emit_char_token(html, '/');
+ for (n = 0; n < html->tmp_len; n++)
+ html_emit_char_token(html, html->tmp[n]);
+ }
+ html->state = HTML_STATE_SCRIPT_DATA;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_ESCAPE_START:
+ switch (cc) {
+ case '-':
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPE_START_DASH;
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, '-');
+ }
+ break;
+ default:
+ html->state = HTML_STATE_SCRIPT_DATA;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_ESCAPE_START_DASH:
+ switch (cc) {
+ case '-':
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH;
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, '-');
+ }
+ break;
+ default:
+ html->state = HTML_STATE_SCRIPT_DATA;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_ESCAPED:
+ switch (cc) {
+ case '-':
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_DASH;
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, '-');
+ }
+ break;
+ case '<':
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
+ }
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT;
+ html_emit_eof_token(html);
+ break;
+ default:
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, cc);
+ }
+ break;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_ESCAPED_DASH:
+ switch (cc) {
+ case '-':
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH;
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, '-');
+ }
+ break;
+ case '<':
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
+ }
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT;
+ html_emit_eof_token(html);
+ break;
+ default:
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, cc);
+ }
+ break;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH:
+ switch (cc) {
+ case '-':
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, '-');
+ }
+ break;
+ case '<':
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
+ break;
+ case '>':
+ html->state = HTML_STATE_SCRIPT_DATA;
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, '>');
+ }
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
+ }
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT;
+ html_emit_eof_token(html);
+ break;
+ default:
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, cc);
+ }
+ break;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
+ switch (cc) {
+ case '/':
+ html->tmp_len = 0;
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN;
+ break;
+ default:
+ if (IS_ALPHA(cc)) {
+ html->tmp_len = 0;
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, '<');
+ }
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START;
+ goto reconsume;
+ }
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, '<');
+ }
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN:
+ if (IS_ALPHA(cc)) {
+ html_prep_new_token(html, HTML_TOKEN_END_TAG);
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME;
+ goto reconsume;
+ }
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, '<');
+ html_emit_char_token(html, '/');
+ }
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
+ goto reconsume;
+ case HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ if (!html_appropriate_end_tag_token(html, &html->new_token))
+ goto HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME_anything_else;
+ html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
+ break;
+ case '/':
+ if (!html_appropriate_end_tag_token(html, &html->new_token))
+ goto HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME_anything_else;
+ html->state = HTML_STATE_SELF_CLOSING_START_TAG;
+ break;
+ case '>':
+ if (!html_appropriate_end_tag_token(html, &html->new_token))
+ goto HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME_anything_else;
+ html->state = HTML_STATE_DATA;
+ break;
+ default:
+ if (IS_UPPER_ALPHA(cc))
+ cc += 0x20;
+ if (IS_LOWER_ALPHA(cc)) {
+ STR_APPEND(html->new_token.tag.name,
+ html->new_token.tag.name_len, cc);
+ STR_APPEND(html->tmp, html->tmp_len, cc);
+ break;
+ }
+ /* FALLTHROUGH */
+ HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME_anything_else:
+ if (!html->ignore_script_data) {
+ html_emit_char_token(html, '<');
+ html_emit_char_token(html, '/');
+ for (n = 0; n < html->tmp_len; n++)
+ html_emit_char_token(html, html->tmp[n]);
+ }
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ case '/':
+ case '>':
+ if (html->tmp_len == 6 &&
+ memcmp(html->tmp, "script", 6) == 0) {
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
+ } else {
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
+ html_emit_char_token(html, cc);
+ }
+ break;
+ default:
+ if (IS_UPPER_ALPHA(cc))
+ cc += 0x20;
+ if (IS_LOWER_ALPHA(cc)) {
+ STR_APPEND(html->tmp, html->tmp_len, cc);
+ break;
+ }
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED:
+ switch (cc) {
+ case '-':
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH;
+ html_emit_char_token(html, '-');
+ break;
+ case '<':
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
+ html_emit_char_token(html, '<');
+ break;
+ case '\0':
+ html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT;
+ html_emit_eof_token(html);
+ break;
+ default:
+ html_emit_char_token(html, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
+ switch (cc) {
+ case '-':
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH;
+ html_emit_char_token(html, '-');
+ break;
+ case '<':
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
+ html_emit_char_token(html, '<');
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
+ html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT;
+ html_emit_eof_token(html);
+ break;
+ default:
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
+ html_emit_char_token(html, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
+ switch (cc) {
+ case '-':
+ html_emit_char_token(html, '-');
+ break;
+ case '<':
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
+ html_emit_char_token(html, '<');
+ break;
+ case '>':
+ html->state = HTML_STATE_SCRIPT_DATA;
+ html_emit_char_token(html, '<');
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
+ html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT;
+ html_emit_eof_token(html);
+ break;
+ default:
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
+ html_emit_char_token(html, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
+ switch (cc) {
+ case '/':
+ html->tmp_len = 0;
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END;
+ html_emit_char_token(html, '/');
+ break;
+ default:
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ case '/':
+ case '>':
+ if (html->tmp_len == 6 &&
+ memcmp(html->tmp, "script", 6) == 0) {
+ html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
+ } else {
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
+ html_emit_char_token(html, cc);
+ }
+ break;
+ default:
+ if (IS_UPPER_ALPHA(cc))
+ cc += 0x20;
+ if (IS_LOWER_ALPHA(cc)) {
+ STR_APPEND(html->tmp, html->tmp_len, cc);
+ html_emit_char_token(html, cc);
+ break;
+ }
+ html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_BEFORE_ATTRIBUTE_NAME:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ /* ignore */
+ break;
+ case '/':
+ case '>':
+ case EOF:
+ html->state = HTML_STATE_AFTER_ATTRIBUTE_NAME;
+ goto reconsume;
+ case '=':
+ html->error =
+ HTML_ERROR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME;
+ attr = html_prep_new_attribute(html, &html->new_token.tag);
+ STR_APPEND(attr->name, attr->name_len, cc);
+ html->state = HTML_STATE_ATTRIBUTE_NAME;
+ break;
+ default:
+ html_prep_new_attribute(html, &html->new_token.tag);
+ html->state = HTML_STATE_ATTRIBUTE_NAME;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_ATTRIBUTE_NAME:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ case '/':
+ case '>':
+ case EOF:
+ html->state = HTML_STATE_AFTER_ATTRIBUTE_NAME;
+ goto reconsume;
+ case '=':
+ html->state = HTML_STATE_BEFORE_ATTRIBUTE_VALUE;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ attr = &NEW_TOKEN_LAST_ATTR;
+ STR_APPEND(attr->name, attr->name_len,
+ HTML_REPLACEMENT_CHARACTER);
+ break;
+ case '"':
+ case '\'':
+ case '<':
+ html->error = HTML_ERROR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME;
+ goto HTML_STATE_ATTRIBUTE_NAME_anything_else;
+ default:
+ HTML_STATE_ATTRIBUTE_NAME_anything_else:
+ if (IS_UPPER_ALPHA(cc))
+ cc += 0x20;
+ attr = &NEW_TOKEN_LAST_ATTR;
+ STR_APPEND(attr->name, attr->name_len, cc);
+ /* TODO: check for duplicate attr names, discard this if match */
+ break;
+ }
+ break;
+ case HTML_STATE_AFTER_ATTRIBUTE_NAME:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ /* ignore */
+ break;
+ case '/':
+ html->state = HTML_STATE_SELF_CLOSING_START_TAG;
+ break;
+ case '=':
+ html->state = HTML_STATE_BEFORE_ATTRIBUTE_VALUE;
+ break;
+ case '>':
+ html_emit_token(html, &html->new_token);
+ html->state = HTML_STATE_DATA;
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_TAG;
+ html_emit_eof_token(html);
+ break;
+ default:
+ html_prep_new_attribute(html, &html->new_token.tag);
+ html->state = HTML_STATE_ATTRIBUTE_NAME;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_BEFORE_ATTRIBUTE_VALUE:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ /* ignore */
+ break;
+ case '"':
+ html->state = HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED;
+ break;
+ case '\'':
+ html->state = HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED;
+ break;
+ case '>':
+ html->error = HTML_ERROR_MISSING_ATTRIBUTE_VALUE;
+ html->state = HTML_STATE_DATA;
+ html_emit_token(html, &html->new_token);
+ break;
+ default:
+ html->state = HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED:
+ switch (cc) {
+ case '"':
+ html->state = HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED;
+ break;
+ case '&':
+ html->return_state = HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED;
+ html->tmp_len = 0;
+ html->state = HTML_STATE_CHARACTER_REFERENCE;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ attr = &NEW_TOKEN_LAST_ATTR;
+ STR_APPEND(attr->val, attr->val_len, HTML_REPLACEMENT_CHARACTER);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_TAG;
+ html_emit_eof_token(html);
+ break;
+ default:
+ attr = &NEW_TOKEN_LAST_ATTR;
+ STR_APPEND(attr->val, attr->val_len, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED:
+ switch (cc) {
+ case '\'':
+ html->state = HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED;
+ break;
+ case '&':
+ html->return_state = HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED;
+ html->tmp_len = 0;
+ html->state = HTML_STATE_CHARACTER_REFERENCE;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ attr = &NEW_TOKEN_LAST_ATTR;
+ STR_APPEND(attr->val, attr->val_len, HTML_REPLACEMENT_CHARACTER);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_TAG;
+ html_emit_eof_token(html);
+ break;
+ default:
+ attr = &NEW_TOKEN_LAST_ATTR;
+ STR_APPEND(attr->val, attr->val_len, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
+ break;
+ case '&':
+ html->return_state = HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED;
+ html->tmp_len = 0;
+ html->state = HTML_STATE_CHARACTER_REFERENCE;
+ break;
+ case '>':
+ html->state = HTML_STATE_DATA;
+ html_emit_token(html, &html->new_token);
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ attr = &NEW_TOKEN_LAST_ATTR;
+ STR_APPEND(attr->val, attr->val_len, HTML_REPLACEMENT_CHARACTER);
+ break;
+ case '"':
+ case '\'':
+ case '<':
+ case '=':
+ case '`':
+ html->error =
+ HTML_ERROR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE;
+ goto HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED_anything_else;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_TAG;
+ html_emit_eof_token(html);
+ break;
+ default:
+ HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED_anything_else:
+ attr = &NEW_TOKEN_LAST_ATTR;
+ STR_APPEND(attr->val, attr->val_len, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
+ break;
+ case '/':
+ html->state = HTML_STATE_SELF_CLOSING_START_TAG;
+ break;
+ case '>':
+ html->state = HTML_STATE_DATA;
+ html_emit_token(html, &html->new_token);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_TAG;
+ html_emit_eof_token(html);
+ break;
+ default:
+ html->error = HTML_ERROR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES;
+ html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_SELF_CLOSING_START_TAG:
+ switch (cc) {
+ case '>':
+ html->new_token.tag.self_closing = true;
+ html->state = HTML_STATE_DATA;
+ html_emit_token(html, &html->new_token);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_TAG;
+ html_emit_eof_token(html);
+ break;
+ default:
+ html->error = HTML_ERROR_UNEXPECTED_SOLIDUS_IN_TAG;
+ html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_BOGUS_COMMENT:
+ switch (cc) {
+ case '>':
+ html->state = HTML_STATE_DATA;
+ html_emit_comment(html, &html->new_token.comment);
+ break;
+ case EOF:
+ html_emit_comment(html, &html->new_token.comment);
+ html_emit_eof_token(html);
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ if (!html->ignore_comment_data) {
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, HTML_REPLACEMENT_CHARACTER);
+ }
+ break;
+ default:
+ if (!html->ignore_comment_data) {
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, cc);
+ }
+ break;
+ }
+ break;
+ case HTML_STATE_MARKUP_DECLARATION_OPEN:
+ /* "If the next few characters are" */
+ /* https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state */
+
+ if (cc == '-' && html->lookahead[0] == '-') {
+ html_lookahead_consume(html, 1);
+ html_prep_new_token(html, HTML_TOKEN_COMMENT);
+ html->state = HTML_STATE_COMMENT_START;
+ break;
+ } else if ((cc == 'd' || cc == 'D') &&
+ strncasecmp(html->lookahead, "octype", 6) == 0) {
+ html_lookahead_consume(html, 6);
+ html->state = HTML_STATE_DOCTYPE;
+ html_prep_new_token(html, HTML_TOKEN_DOCTYPE);
+ break;
+ } else if (cc == '[' && memcmp(html->lookahead, "CDATA[", 6) == 0) {
+ html_lookahead_consume(html, 6);
+ if (html->current_node->ns != HTML_NAMESPACE_HTML)
+ html->state = HTML_STATE_CDATA_SECTION;
+ else
+ html->error = HTML_ERROR_CDATA_IN_HTML_CONTENT;
+
+ html_prep_new_token(html, HTML_TOKEN_COMMENT);
+ if (!html->ignore_comment_data)
+ html->new_token.comment.len =
+ strlcpy(html->new_token.comment.data,
+ "[CDATA[", sizeof(html->new_token.comment.data));
+ html->state = HTML_STATE_BOGUS_COMMENT;
+ break;
+ } else {
+ html->error = HTML_ERROR_INCORRECTLY_OPENED_COMMENT;
+ html_prep_new_token(html, HTML_TOKEN_COMMENT);
+ html->state = HTML_STATE_BOGUS_COMMENT;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_COMMENT_START:
+ switch (cc) {
+ case '-':
+ html->state = HTML_STATE_COMMENT_START_DASH;
+ break;
+ case '>':
+ html->error = HTML_ERROR_ABRUPT_CLOSING_OF_EMPTY_COMMENT;
+ html->state = HTML_STATE_DATA;
+ html_emit_comment(html, &html->new_token.comment);
+ break;
+ default:
+ html->state = HTML_STATE_COMMENT;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_COMMENT_START_DASH:
+ switch (cc) {
+ case '-':
+ html->state = HTML_STATE_COMMENT_END_DASH;
+ break;
+ case '>':
+ html->error = HTML_ERROR_ABRUPT_CLOSING_OF_EMPTY_COMMENT;
+ html->state = HTML_STATE_DATA;
+ html->new_token.type = HTML_TOKEN_COMMENT;
+ html_emit_token(html, &html->new_token);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_COMMENT;
+ html->new_token.type = HTML_TOKEN_COMMENT;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ if (!html->ignore_comment_data) {
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, '-');
+ }
+ html->state = HTML_STATE_COMMENT;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_COMMENT:
+ switch (cc) {
+ case '<':
+ if (!html->ignore_comment_data) {
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, cc);
+ }
+ html->state = HTML_STATE_COMMENT_LESS_THAN_SIGN;
+ break;
+ case '-':
+ html->state = HTML_STATE_COMMENT_END_DASH;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ if (!html->ignore_comment_data) {
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, HTML_REPLACEMENT_CHARACTER);
+ }
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_COMMENT;
+ html->new_token.type = HTML_TOKEN_COMMENT;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ if (!html->ignore_comment_data) {
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, cc);
+ }
+ break;
+ }
+ break;
+ case HTML_STATE_COMMENT_LESS_THAN_SIGN:
+ switch (cc) {
+ case '!':
+ if (!html->ignore_comment_data) {
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, cc);
+ }
+ html->state = HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG;
+ break;
+ case '<':
+ if (!html->ignore_comment_data) {
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, cc);
+ }
+ break;
+ default:
+ html->state = HTML_STATE_COMMENT;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG:
+ switch (cc) {
+ case '-':
+ html->state = HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH;
+ break;
+ default:
+ html->state = HTML_STATE_COMMENT;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH:
+ switch (cc) {
+ case '-':
+ html->state = HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH;
+ break;
+ default:
+ html->state = HTML_STATE_COMMENT_END_DASH;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH:
+ switch (cc) {
+ case '>':
+ case EOF:
+ html->state = HTML_STATE_COMMENT_END;
+ goto reconsume;
+ default:
+ html->error = HTML_ERROR_NESTED_COMMENT;
+ html->state = HTML_STATE_COMMENT_END;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_COMMENT_END_DASH:
+ switch (cc) {
+ case '-':
+ html->state = HTML_STATE_COMMENT_END;
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_COMMENT;
+ html->new_token.type = HTML_TOKEN_COMMENT;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ if (!html->ignore_comment_data) {
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, '-');
+ }
+ html->state = HTML_STATE_COMMENT;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_COMMENT_END:
+ switch (cc) {
+ case '>':
+ html->state = HTML_STATE_DATA;
+ html->new_token.type = HTML_TOKEN_COMMENT;
+ html_emit_token(html, &html->new_token);
+ break;
+ case '!':
+ html->state = HTML_STATE_COMMENT_END;
+ break;
+ case '-':
+ if (!html->ignore_comment_data) {
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, '-');
+ }
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_COMMENT;
+ html->new_token.type = HTML_TOKEN_COMMENT;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ if (!html->ignore_comment_data) {
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, '-');
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, '-');
+ }
+ html->state = HTML_STATE_COMMENT;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_COMMENT_END_BANG:
+ switch (cc) {
+ case '-':
+ if (!html->ignore_comment_data) {
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, '-');
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, '-');
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, '!');
+ }
+ html->state = HTML_STATE_COMMENT_END_DASH;
+ break;
+ case '>':
+ html->error = HTML_ERROR_INCORRECTLY_CLOSED_COMMENT;
+ html->state = HTML_STATE_DATA;
+ html->new_token.type = HTML_TOKEN_COMMENT;
+ html_emit_token(html, &html->new_token);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_COMMENT;
+ html->new_token.type = HTML_TOKEN_COMMENT;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ if (!html->ignore_comment_data) {
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, '-');
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, '-');
+ STR_APPEND(html->new_token.comment.data,
+ html->new_token.comment.len, '!');
+ }
+ html->state = HTML_STATE_COMMENT;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_DOCTYPE:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ html->state = HTML_STATE_BEFORE_DOCTYPE_NAME;
+ break;
+ case '>':
+ html->state = HTML_STATE_BEFORE_DOCTYPE_NAME;
+ goto reconsume;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_prep_new_token(html, HTML_TOKEN_DOCTYPE);
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ html->error = HTML_ERROR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME;
+ html->state = HTML_STATE_BEFORE_DOCTYPE_NAME;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_BEFORE_DOCTYPE_NAME:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ /* ignore */
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ html_prep_new_token(html, HTML_TOKEN_DOCTYPE);
+ STR_APPEND(html->new_token.tag.name,
+ html->new_token.tag.name_len, '!');
+ html->state = HTML_STATE_DOCTYPE_NAME;
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_prep_new_token(html, HTML_TOKEN_DOCTYPE);
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ case '>':
+ html->error = HTML_ERROR_MISSING_DOCTYPE_NAME;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ html->state = HTML_STATE_DATA;
+ break;
+ default:
+ if (IS_UPPER_ALPHA(cc))
+ cc += 0x20;
+ html_prep_new_token(html, HTML_TOKEN_DOCTYPE);
+ html->state = HTML_STATE_DOCTYPE_NAME;
+ STR_APPEND(html->new_token.doctype.name,
+ html->new_token.doctype.name_len, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_DOCTYPE_NAME:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ html->state = HTML_STATE_AFTER_DOCTYPE_NAME;
+ html->tmp_len = 0;
+ break;
+ case '>':
+ html_emit_token(html, &html->new_token);
+ html->state = HTML_STATE_DATA;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ STR_APPEND(html->new_token.doctype.name,
+ html->new_token.doctype.name_len, HTML_REPLACEMENT_CHARACTER);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_prep_new_token(html, HTML_TOKEN_DOCTYPE);
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ if (IS_UPPER_ALPHA(cc))
+ cc += 0x20;
+ STR_APPEND(html->new_token.doctype.name,
+ html->new_token.doctype.name_len, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_AFTER_DOCTYPE_NAME:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ /* ignore */
+ break;
+ case '>':
+ html->state = HTML_STATE_DATA;
+ html_emit_token(html, &html->new_token);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ if ((cc == 'p' || cc == 'P') &&
+ strncasecmp(html->lookahead, "ublic", 5) == 0) {
+ html->state = HTML_STATE_AFTER_DOCTYPE_PUBLIC_KEYWORD;
+ html->lookahead_len = 0;
+ } else if ((cc == 's' || cc == 'S') &&
+ strncasecmp(html->lookahead, "ystem", 5) == 0) {
+ html->state = HTML_STATE_AFTER_DOCTYPE_SYSTEM_KEYWORD;
+ html->lookahead_len = 0;
+ } else {
+ html->error =
+ HTML_ERROR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME;
+ html->new_token.doctype.force_quirks = true;
+ html->state = HTML_STATE_BOGUS_DOCTYPE;
+ goto reconsume;
+ }
+ break;
+ }
+ break;
+ case HTML_STATE_AFTER_DOCTYPE_PUBLIC_KEYWORD:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ html->state = HTML_STATE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER;
+ break;
+ case '"':
+ html->error =
+ HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD;
+ memset(html->new_token.doctype.public_identifier, 0,
+ sizeof(html->new_token.doctype.public_identifier));
+ html->new_token.doctype.public_identifier_len = 0;
+ html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED;
+ break;
+ case '\'':
+ html->error =
+ HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD;
+ memset(html->new_token.doctype.public_identifier, 0,
+ sizeof(html->new_token.doctype.public_identifier));
+ html->new_token.doctype.public_identifier_len = 0;
+ html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
+ break;
+ case '>':
+ html->error = HTML_ERROR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER;
+ html->new_token.doctype.force_quirks = true;
+ html->state = HTML_STATE_DATA;
+ html_emit_token(html, &html->new_token);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ html->error =
+ HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER;
+ html->new_token.doctype.force_quirks = true;
+ html->state = HTML_STATE_BOGUS_DOCTYPE;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ /* ignore */
+ break;
+ case '"':
+ memset(html->new_token.doctype.public_identifier, 0,
+ sizeof(html->new_token.doctype.public_identifier));
+ html->new_token.doctype.public_identifier_len = 0;
+ html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED;
+ break;
+ case '\'':
+ memset(html->new_token.doctype.public_identifier, 0,
+ sizeof(html->new_token.doctype.public_identifier));
+ html->new_token.doctype.public_identifier_len = 0;
+ html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
+ break;
+ case '>':
+ html->error = HTML_ERROR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER;
+ html->new_token.doctype.force_quirks = true;
+ html->state = HTML_STATE_DATA;
+ html_emit_token(html, &html->new_token);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ html->error =
+ HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER;
+ html->new_token.doctype.force_quirks = true;
+ html->state = HTML_STATE_BOGUS_DOCTYPE;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
+ switch (cc) {
+ case '"':
+ html->state = HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ STR_APPEND(html->new_token.doctype.public_identifier,
+ html->new_token.doctype.public_identifier_len,
+ HTML_REPLACEMENT_CHARACTER);
+ break;
+ case '>':
+ html->error = HTML_ERROR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER;
+ html->new_token.doctype.force_quirks = true;
+ html->state = HTML_STATE_DATA;
+ html_emit_token(html, &html->new_token);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ STR_APPEND(html->new_token.doctype.public_identifier,
+ html->new_token.doctype.public_identifier_len, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
+ switch (cc) {
+ case '\'':
+ html->state = HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ STR_APPEND(html->new_token.doctype.public_identifier,
+ html->new_token.doctype.public_identifier_len,
+ HTML_REPLACEMENT_CHARACTER);
+ break;
+ case '>':
+ html->error = HTML_ERROR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER;
+ html->new_token.doctype.force_quirks = true;
+ html->state = HTML_STATE_DATA;
+ html_emit_token(html, &html->new_token);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ STR_APPEND(html->new_token.doctype.public_identifier,
+ html->new_token.doctype.public_identifier_len, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ html->state =
+ HTML_STATE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS;
+ break;
+ case '>':
+ html->state = HTML_STATE_DATA;
+ html_emit_token(html, &html->new_token);
+ break;
+ case '"':
+ html->error =
+ HTML_ERROR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS;
+ memset(html->new_token.doctype.system_identifier, 0,
+ sizeof(html->new_token.doctype.system_identifier));
+ html->new_token.doctype.system_identifier_len = 0;
+ html->state = HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
+ break;
+ case '\'':
+ html->error =
+ HTML_ERROR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS;
+ memset(html->new_token.doctype.system_identifier, 0,
+ sizeof(html->new_token.doctype.system_identifier));
+ html->new_token.doctype.system_identifier_len = 0;
+ html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ html->error =
+ HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
+ html->new_token.doctype.force_quirks = true;
+ html->state = HTML_STATE_BOGUS_DOCTYPE;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ /* ignore */
+ break;
+ case '>':
+ html->state = HTML_STATE_DATA;
+ html_emit_token(html, &html->new_token);
+ break;
+ case '"':
+ memset(html->new_token.doctype.system_identifier, 0,
+ sizeof(html->new_token.doctype.system_identifier));
+ html->new_token.doctype.system_identifier_len = 0;
+ html->state = HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
+ break;
+ case '\'':
+ memset(html->new_token.doctype.system_identifier, 0,
+ sizeof(html->new_token.doctype.system_identifier));
+ html->new_token.doctype.system_identifier_len = 0;
+ html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ html->error =
+ HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
+ html->new_token.doctype.force_quirks = true;
+ html->state = HTML_STATE_BOGUS_DOCTYPE;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_AFTER_DOCTYPE_SYSTEM_KEYWORD:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ html->state = HTML_STATE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
+ break;
+ case '"':
+ html->error =
+ HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD;
+ memset(html->new_token.doctype.system_identifier, 0,
+ sizeof(html->new_token.doctype.system_identifier));
+ html->new_token.doctype.system_identifier_len = 0;
+ html->state = HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
+ break;
+ case '\'':
+ html->error =
+ HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD;
+ memset(html->new_token.doctype.system_identifier, 0,
+ sizeof(html->new_token.doctype.system_identifier));
+ html->new_token.doctype.system_identifier_len = 0;
+ html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
+ break;
+ case '>':
+ html->error = HTML_ERROR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ html->error =
+ HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
+ html->new_token.doctype.force_quirks = true;
+ html->state = HTML_STATE_BOGUS_DOCTYPE;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ /* ignore */
+ break;
+ case '"':
+ memset(html->new_token.doctype.system_identifier, 0,
+ sizeof(html->new_token.doctype.system_identifier));
+ html->new_token.doctype.system_identifier_len = 0;
+ html->state = HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
+ break;
+ case '\'':
+ memset(html->new_token.doctype.system_identifier, 0,
+ sizeof(html->new_token.doctype.system_identifier));
+ html->new_token.doctype.system_identifier_len = 0;
+ html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
+ break;
+ case '>':
+ html->error = HTML_ERROR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ html->error =
+ HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
+ html->new_token.doctype.force_quirks = true;
+ html->state = HTML_STATE_BOGUS_DOCTYPE;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
+ switch (cc) {
+ case '"':
+ html->state = HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ STR_APPEND(html->new_token.doctype.system_identifier,
+ html->new_token.doctype.system_identifier_len,
+ HTML_REPLACEMENT_CHARACTER);
+ break;
+ case '>':
+ html->error = HTML_ERROR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ STR_APPEND(html->new_token.doctype.system_identifier,
+ html->new_token.doctype.system_identifier_len, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
+ switch (cc) {
+ case '\'':
+ html->state = HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER;
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ STR_APPEND(html->new_token.doctype.system_identifier,
+ html->new_token.doctype.system_identifier_len,
+ HTML_REPLACEMENT_CHARACTER);
+ break;
+ case '>':
+ html->error = HTML_ERROR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ STR_APPEND(html->new_token.doctype.system_identifier,
+ html->new_token.doctype.system_identifier_len, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
+ switch (cc) {
+ case '\t':
+ case '\n':
+ case '\f':
+ case ' ':
+ /* ignore */
+ break;
+ case '>':
+ html->state = HTML_STATE_DATA;
+ html_emit_token(html, &html->new_token);
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_DOCTYPE;
+ html->new_token.doctype.force_quirks = true;
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ html->error =
+ HTML_ERROR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER;
+ html->state = HTML_STATE_BOGUS_DOCTYPE;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_BOGUS_DOCTYPE:
+ switch (cc) {
+ case '>':
+ html->state = HTML_STATE_DATA;
+ html_emit_token(html, &html->new_token);
+ break;
+ case '\0':
+ html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
+ /* ignore */
+ break;
+ case EOF:
+ html_emit_token(html, &html->new_token);
+ html_emit_eof_token(html);
+ break;
+ default:
+ /* ignore */
+ break;
+ }
+ break;
+ case HTML_STATE_CDATA_SECTION:
+ switch (cc) {
+ case ']':
+ html->state = HTML_STATE_CDATA_SECTION_BRACKET;
+ break;
+ case EOF:
+ html->error = HTML_ERROR_EOF_IN_CDATA;
+ html_emit_eof_token(html);
+ break;
+ default:
+ if (!html->ignore_comment_data)
+ html_emit_char_token(html, cc);
+ break;
+ }
+ break;
+ case HTML_STATE_CDATA_SECTION_BRACKET:
+ switch (cc) {
+ case ']':
+ html->state = HTML_STATE_CDATA_SECTION_END;
+ break;
+ default:
+ if (!html->ignore_comment_data)
+ html_emit_char_token(html, ']');
+ html->state = HTML_STATE_CDATA_SECTION;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_CDATA_SECTION_END:
+ switch (cc) {
+ case ']':
+ if (!html->ignore_comment_data)
+ html_emit_char_token(html, ']');
+ break;
+ case '>':
+ html->state = HTML_STATE_DATA;
+ break;
+ default:
+ if (!html->ignore_comment_data) {
+ html_emit_char_token(html, ']');
+ html_emit_char_token(html, ']');
+ }
+ html->state = HTML_STATE_CDATA_SECTION;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_CHARACTER_REFERENCE:
+ STR_APPEND(html->tmp, html->tmp_len, '&');
+
+ if (cc == '#') {
+ STR_APPEND(html->tmp, html->tmp_len, cc);
+ html->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE;
+ break;
+ }
+ if (IS_ALPHANUMERIC(cc)) {
+ html->state = HTML_STATE_NAMED_CHARACTER_REFERENCE;
+ goto reconsume;
+ }
+
+ /* flush consumed */
+ if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) {
+ /* consumed as part of an attribute */
+ for (n = 0; n < html->tmp_len; n++) {
+ attr = &NEW_TOKEN_LAST_ATTR;
+ STR_APPEND(attr->val, attr->val_len, html->tmp[n]);
+ }
+ } else {
+ /* TODO: check return state for comment ones if ignoring */
+
+ for (n = 0; n < html->tmp_len; n++)
+ html_emit_char_token(html, html->tmp[n]);
+ }
+ html->tmp_len = 0;
+ html->state = html->return_state;
+ goto reconsume;
+ case HTML_STATE_NAMED_CHARACTER_REFERENCE:
+ found_entity = NULL;
+
+ STR_APPEND(html->tmp, html->tmp_len, cc);
+
+ for (n = 0; n < html->lookahead_len; n++) {
+ STR_APPEND(html->tmp, html->tmp_len, html->lookahead[n]);
+ if (html->lookahead[n] == ';')
+ break;
+ }
+
+ HTML_DEBUG((": trying to match '%s'", html->tmp));
+
+ found_entity = NULL;
+ for (j = 0; html_entities[j].entity != NULL; j++) {
+ for (i = 0; ; i++) {
+ if (html_entities[j].entity[i] == '\0') {
+ /*
+ * If we have an ; in our buffer, match the longer
+ * version of this entity instead (& instead of
+ * &)
+ */
+ if (html_entities[j].entity[i - 1] != ';' &&
+ html->tmp[i] == ';')
+ goto next_entity;
+ found_entity = &html_entities[j];
+ HTML_DEBUG((": matched lookahead to entity '%s'",
+ found_entity->entity));
+ html_lookahead_consume(html, i - 2);
+ break;
+ }
+ if (i >= html->tmp_len ||
+ html_entities[j].entity[i] != html->tmp[i])
+ goto next_entity;
+ }
+next_entity:
+ continue;
+ }
+
+ if (found_entity != NULL) {
+ if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE &&
+ html->tmp[html->tmp_len - 1] != ';' &&
+ (html->lookahead[0] == '=' ||
+ IS_ALPHANUMERIC(html->lookahead[0]))) {
+ /*
+ * "for historical reasons, flush code points consumed as a
+ * character reference and switch to the return state."
+ */
+ HTML_DEBUG((": doing historical flush thing"));
+ attr = &NEW_TOKEN_LAST_ATTR;
+ for (n = 0; n < html->tmp_len; n++) {
+ STR_APPEND(attr->val, attr->val_len, html->tmp[n]);
+ }
+ html->tmp_len = 0;
+ html->state = html->return_state;
+ break;
+ }
+
+ /* otherwise... */
+ if (html->tmp[html->tmp_len - 1] != ';')
+ html->error =
+ HTML_ERROR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE;
+
+ html->tmp_len = 0;
+
+ if ((j = (found_entity->codepoint >> 24) & 0xff))
+ html->tmp[html->tmp_len++] = j;
+ if ((j = (found_entity->codepoint >> 16) & 0xff))
+ html->tmp[html->tmp_len++] = j;
+ if ((j = (found_entity->codepoint >> 8) & 0xff))
+ html->tmp[html->tmp_len++] = j;
+ if ((j = found_entity->codepoint & 0xff))
+ html->tmp[html->tmp_len++] = j;
+
+ /* fall through */
+ } else {
+ HTML_DEBUG((": no entity found for '%s'", html->tmp));
+
+ /* pretend we didn't copy anything into tmp after & and cc */
+ html->tmp_len = 2;
+ html->tmp[html->tmp_len] = '\0';
+ }
+
+ if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) {
+ attr = &NEW_TOKEN_LAST_ATTR;
+ for (n = 0; n < html->tmp_len; n++) {
+ STR_APPEND(attr->val, attr->val_len, html->tmp[n]);
+ }
+ HTML_DEBUG((": attribute %s=\"%s\"", attr->name, attr->val));
+ } else {
+ for (j = 0; j < html->tmp_len; j++)
+ html_emit_char_token(html, html->tmp[j]);
+ }
+
+ html->tmp_len = 0;
+ if (found_entity == NULL)
+ html->state = HTML_STATE_AMBIGUOUS_AMPERSAND;
+ else
+ html->state = html->return_state;
+ break;
+ case HTML_STATE_AMBIGUOUS_AMPERSAND:
+ if (IS_ALPHANUMERIC(cc)) {
+ if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) {
+ attr = &NEW_TOKEN_LAST_ATTR;
+ STR_APPEND(attr->val, attr->val_len, cc);
+ } else {
+ html_emit_char_token(html, cc);
+ }
+ break;
+ }
+ if (cc == ';') {
+ html->error = HTML_ERROR_UNKNOWN_NAMED_CHARACTER_REFERENCE;
+ html->state = html->return_state;
+ goto reconsume;
+ }
+ html->state = html->return_state;
+ goto reconsume;
+ case HTML_STATE_NUMERIC_CHARACTER_REFERENCE:
+ switch (cc) {
+ case 'x':
+ case 'X':
+ STR_APPEND(html->tmp, html->tmp_len, cc);
+ html->state = HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE_START;
+ break;
+ default:
+ html->state = HTML_STATE_DECIMAL_CHARACTER_REFERENCE_START;
+ goto reconsume;
+ }
+ break;
+ case HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE_START:
+ if (IS_HEX_DIGIT(cc)) {
+ html->state = HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE;
+ goto reconsume;
+ }
+
+ html->error =
+ HTML_ERROR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE;
+ if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) {
+ attr = &NEW_TOKEN_LAST_ATTR;
+ STR_APPEND(attr->val, attr->val_len, cc);
+ } else {
+ html_emit_char_token(html, cc);
+ }
+ html->state = html->return_state;
+ goto reconsume;
+ case HTML_STATE_DECIMAL_CHARACTER_REFERENCE_START:
+ case HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE:
+ case HTML_STATE_DECIMAL_CHARACTER_REFERENCE:
+ case HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END:
+ /* TODO */
+ panic("state %d not supported", html->state);
+ break;
+ default:
+ panic("bogus tokenize state %d", html->state);
+ }
+
+ if (html->state != was_state)
+ HTML_DEBUG((": exited state %d", html_state_names[html->state]));
+
+ if (html->error) {
+ HTML_DEBUG((": error %s", html_error_strings[html->error]));
+ html->error = 0;
+ }
+ HTML_DEBUG(("\r"));
+}
+
+/*
+ * Finish tokenizing at end of input: keep passing EOF to html_tokenize()
+ * for as long as html->lookahead_len is non-zero, pass EOF one more time
+ * once nothing is buffered, then call html_stop_parsing().
+ */
+void
+html_tokenize_finish(struct html_page *html)
+{
+	if (html->lookahead_len != 0) {
+		HTML_DEBUG(("finish requested, tokenizing remaining %d lookahead\r",
+		    html->lookahead_len));
+	}
+
+	/*
+	 * NOTE(review): this loop only ends if each html_tokenize() call
+	 * eventually shrinks lookahead_len -- confirm against html_tokenize().
+	 */
+	while (html->lookahead_len != 0)
+		html_tokenize(html, EOF);
+
+	/* the EOF pass that always happens, whether or not anything was buffered */
+	html_tokenize(html, EOF);
+	html_stop_parsing(html);
+}
+
+/*
+ * Reset html->new_token to all zero bytes and then set its type to
+ * token_type.  Because the whole token is wiped, any field a caller
+ * assigned before this call is lost; set fields after calling it.
+ */
+void
+html_prep_new_token(struct html_page *html, html_token_type token_type)
+{
+	/* size the wipe from the object itself so it follows its declaration */
+	memset(&html->new_token, 0, sizeof(html->new_token));
+	html->new_token.type = token_type;
+}
+
+/*
+ * Claim the next unused entry of tag->attrs, reset its name and value to
+ * empty strings with zero lengths, and return a pointer to it.
+ *
+ * Calls panic("tag attr overflow") when tag->attrs_count has already
+ * reached nitems(tag->attrs).  The html argument is not used here.
+ */
+struct html_attr *
+html_prep_new_attribute(struct html_page *html, struct html_tag *tag)
+{
+	struct html_attr *slot;
+
+	if (tag->attrs_count >= nitems(tag->attrs))
+		panic("tag attr overflow");
+
+	/* take the entry at the old count, then bump the count past it */
+	slot = &tag->attrs[tag->attrs_count++];
+
+	slot->name_len = 0;
+	slot->name[0] = '\0';
+	slot->val_len = 0;
+	slot->val[0] = '\0';
+
+	return slot;
+}
+
+/*
+ * Report whether a tag token counts as an "appropriate end tag token".
+ *
+ * https://html.spec.whatwg.org/multipage/parsing.html#tokenization
+ * "an end tag token whose tag name matches the tag name of the last start
+ * tag to have been emitted"
+ *
+ * token is the tag token whose name is tested; when it is NULL, the token
+ * under construction (html->new_token) is tested instead.  Returns false
+ * when html->open_count is zero or negative, otherwise true only if the
+ * names compare equal with strcmp().
+ */
+bool
+html_appropriate_end_tag_token(struct html_page *html, html_token *token)
+{
+	if (html->open_count <= 0)
+		return false;
+
+	/* honor the caller's token; fall back to new_token if none was given */
+	if (token == NULL)
+		token = &html->new_token;
+
+	/*
+	 * TODO: fix
+	 * NOTE(review): the comparison uses html->current_node's name, while
+	 * the spec text quoted above refers to the last start tag emitted.
+	 */
+	return (strcmp(html->current_node->name, token->tag.name) == 0);
+}
+
+/*
+ * Look up name in html_tag_names, ignoring case.  The scan starts at
+ * index 1 and stops at the table's NULL entry.
+ *
+ * Returns the index of the first matching entry, or 0 (after an
+ * HTML_DEBUG message) when no entry matches.
+ */
+html_tag_type
+html_find_tag_type(char *name)
+{
+	short idx = 1;
+
+	/* index 0 is never compared -- presumably a placeholder; TODO confirm */
+	while (html_tag_names[idx] != NULL) {
+		if (strcasecmp(html_tag_names[idx], name) == 0)
+			return idx;
+		idx++;
+	}
+
+	HTML_DEBUG((": html_find_tag_type couldn't find %s", name));
+	return 0;
+}