/*
 * Copyright (c) 2024 joshua stein <jcs@jcs.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
| 16 |
|
| 17 |
#include <stdlib.h> |
| 18 |
#include <stdarg.h> |
| 19 |
#include <stdio.h> |
| 20 |
#include <string.h> |
| 21 |
#include "stdint.h" |
| 22 |
|
| 23 |
#include "util.h" |
| 24 |
|
| 25 |
//#define HTML_ENABLE |
| 26 |
|
| 27 |
#ifdef HTML_ENABLE |
| 28 |
/*
 * Forward declarations.  Without them the struct tags first appear
 * inside a parameter list, where they only have prototype scope and
 * would not be compatible with the struct definitions and the repeated
 * prototypes that follow later in this header.
 */
struct html_page;
struct html_element;

/*
 * Output and notification hooks.  Each takes the opaque cookie and the
 * page being parsed.
 * NOTE(review): these are not listed under any of the parser's own
 * source files in this header, so they are presumably supplied by the
 * embedding application -- confirm.
 */
void html_output(void *cookie, struct html_page *html, char *str,
    size_t len);
void html_output_margin(void *cookie, struct html_page *html);
void html_output_field(void *cookie, struct html_page *html,
    struct html_element *el);
void html_debug(const char *fmt, ...);
void html_have_title(void *cookie, struct html_page *html, char *str,
    size_t len);
| 36 |
|
| 37 |
/*
 * HTML_DEBUG takes a doubly-parenthesized printf-style argument list,
 * e.g. HTML_DEBUG(("state %d", n)).  With HTML_ENABLE_DEBUGGING defined
 * it forwards the arguments to html_debug(); otherwise it does nothing.
 * Both variants are a single do/while (0) statement so that
 *     if (x) HTML_DEBUG(("...")); else ...
 * parses the same way in either configuration.
 */
//#define HTML_ENABLE_DEBUGGING
#ifdef HTML_ENABLE_DEBUGGING
extern struct html_page *the_html;
# define HTML_DEBUG(x) do { html_debug x; } while (0)
#else
# define HTML_DEBUG(x) do { } while (0)
#endif
| 44 |
|
| 45 |
/*
 * tunables
 */

/* capacity of the open-element stack and the active formatting list */
#define HTML_STACK_DEPTH		128

/* this should in theory be the max size of an html_entity but that's huge */
#define HTML_LOOKAHEAD_SIZE		10

#define HTML_OUTPUT_BUF_SIZE		64
#define HTML_TAG_TEXT_CHUNK_SIZE	512
| 56 |
|
| 57 |
/*
 * helpers
 *
 * Character classification macros.  Every macro evaluates its argument
 * more than once, so do not pass an expression with side effects.
 */

#define IS_WHITESPACE(c)	((c) == '\t' || (c) == '\n' || (c) == '\f' || \
				 (c) == '\r' || (c) == ' ')
#define IS_LOWER_ALPHA(c)	((c) >= 'a' && (c) <= 'z')
#define IS_UPPER_ALPHA(c)	((c) >= 'A' && (c) <= 'Z')
#define IS_ALPHA(c)		(IS_LOWER_ALPHA((c)) || IS_UPPER_ALPHA((c)))
#define IS_DIGIT(c)		(((c) >= '0' && (c) <= '9'))
#define IS_ALPHANUMERIC(c)	(IS_ALPHA((c)) || IS_DIGIT((c)))

/* letters a-f / A-F only; decimal digits are matched by IS_DIGIT */
#define IS_LOWER_HEX_DIGIT(c)	((c) >= 'a' && (c) <= 'f')
#define IS_UPPER_HEX_DIGIT(c)	((c) >= 'A' && (c) <= 'F')

/* any hexadecimal digit: 0-9, a-f or A-F */
#define IS_HEX_DIGIT(c)		(IS_DIGIT(c) || IS_LOWER_HEX_DIGIT(c) || \
				 IS_UPPER_HEX_DIGIT(c))
| 71 |
/*
 * UTF-16 surrogate code points: leading (high) surrogates are
 * U+D800..U+DBFF, trailing (low) surrogates are U+DC00..U+DFFF.
 * The argument is evaluated more than once.
 */
#define IS_LEADING_SURROGATE(c)		((c) >= 0xd800 && (c) <= 0xdbff)
#define IS_TRAILING_SURROGATE(c)	((c) >= 0xdc00 && (c) <= 0xdfff)
#define IS_SURROGATE(c)			(IS_LEADING_SURROGATE(c) || \
					 IS_TRAILING_SURROGATE(c))
| 74 |
/*
 * True for the code points tested below: U+FDD0..U+FDEF and the last two
 * code points (xFFFE, xFFFF) of each plane from 0 through 0x10.  The
 * argument is evaluated many times.
 */
#define IS_NONCHARACTER(c) (\
	((c) >= 0xfdd0 && (c) <= 0xfdef) || \
	(c) == 0xfffe || (c) == 0xffff || \
	(c) == 0x1fffe || (c) == 0x1ffff || \
	(c) == 0x2fffe || (c) == 0x2ffff || \
	(c) == 0x3fffe || (c) == 0x3ffff || \
	(c) == 0x4fffe || (c) == 0x4ffff || \
	(c) == 0x5fffe || (c) == 0x5ffff || \
	(c) == 0x6fffe || (c) == 0x6ffff || \
	(c) == 0x7fffe || (c) == 0x7ffff || \
	(c) == 0x8fffe || (c) == 0x8ffff || \
	(c) == 0x9fffe || (c) == 0x9ffff || \
	(c) == 0xafffe || (c) == 0xaffff || \
	(c) == 0xbfffe || (c) == 0xbffff || \
	(c) == 0xcfffe || (c) == 0xcffff || \
	(c) == 0xdfffe || (c) == 0xdffff || \
	(c) == 0xefffe || (c) == 0xeffff || \
	(c) == 0xffffe || (c) == 0xfffff || \
	(c) == 0x10fffe || (c) == 0x10ffff)
| 93 |
/* C0 controls are 0x00..0x1f; IS_CONTROL additionally accepts 0x7f..0x9f */
#define IS_C0_CONTROL(c)	((c) >= 0 && (c) <= 0x1f)
#define IS_CONTROL(c)		(IS_C0_CONTROL((c)) || ((c) >= 0x7f && (c) <= 0x9f))

/*
 * NOTE(review): HTML_TAG_LAST_BLOCK is not an enumerator of html_tag_type
 * as declared in this header, so IS_BLOCK only compiles where that name
 * is defined elsewhere -- confirm whether this macro is still used.
 */
#define IS_BLOCK(tag)		((tag) < HTML_TAG_LAST_BLOCK)
| 96 |
|
| 97 |
/*
 * The most recently added attribute of the tag token under construction.
 * Requires a variable named `html` in scope and
 * html->new_token.tag.attrs_count > 0; with a count of zero this indexes
 * attrs[-1].
 */
#define NEW_TOKEN_LAST_ATTR \
	(html->new_token.tag.attrs[html->new_token.tag.attrs_count - 1])
| 98 |
|
| 99 |
/*
 * Append ch to the NUL-terminated string in field and advance len.
 *
 * Only works on fixed-size char arrays, because the capacity is taken
 * from sizeof(field).  One byte is always kept for the terminator, so at
 * most sizeof(field) - 1 characters are stored; once the array is full,
 * further characters are silently dropped and len stops advancing.
 *
 * field and len are evaluated more than once.  The macro expands to a
 * single do/while (0) statement, so the call site must end with a
 * semicolon; in return it is safe inside an unbraced if/else.
 */
#define STR_APPEND(field, len, ch) \
	do { \
		if ((len) < sizeof(field) - 1) { \
			(field)[(len)++] = (ch); \
			(field)[(len)] = '\0'; \
		} \
	} while (0)
| 105 |
|
| 106 |
/*
 * True when html->return_state is one of the three attribute-value
 * tokenizer states.  Requires a variable named `html` in scope.
 */
#define CONSUMED_AS_PART_OF_AN_ATTRIBUTE \
	(html->return_state == HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED || \
	 html->return_state == HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED || \
	 html->return_state == HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED)
| 110 |
|
| 111 |
/*
 * Number of elements in a true array (not a pointer).  Defined only if
 * the platform headers have not already provided nitems.
 */
#ifndef nitems
#define nitems(what)	(sizeof((what)) / sizeof((what)[0]))
#endif
| 114 |
|
| 115 |
/* insertion mode */
/* https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode */
extern const char *html_mode_names[];
typedef enum {
	HTML_MODE_NONE = 0,
	HTML_MODE_INITIAL,
	HTML_MODE_BEFORE_HTML,
	HTML_MODE_BEFORE_HEAD,
	HTML_MODE_IN_HEAD,
	HTML_MODE_IN_HEAD_NOSCRIPT,
	HTML_MODE_AFTER_HEAD,
	HTML_MODE_IN_BODY,
	HTML_MODE_TEXT,
	HTML_MODE_IN_TABLE,
	HTML_MODE_IN_TABLE_TEXT,
	HTML_MODE_IN_CAPTION,
	HTML_MODE_IN_COLUMN_GROUP,
	HTML_MODE_IN_TABLE_BODY,
	HTML_MODE_IN_ROW,
	HTML_MODE_IN_CELL,
	HTML_MODE_IN_SELECT,
	HTML_MODE_IN_SELECT_IN_TABLE,
	HTML_MODE_IN_TEMPLATE,
	HTML_MODE_AFTER_BODY,
	HTML_MODE_IN_FRAMESET,
	HTML_MODE_AFTER_FRAMESET,
	HTML_MODE_AFTER_AFTER_BODY,
	HTML_MODE_AFTER_AFTER_FRAMESET
} html_mode;
| 144 |
|
| 145 |
/* tokenization state */
/* https://html.spec.whatwg.org/multipage/parsing.html#tokenization */
extern const char *html_state_names[];
typedef enum {
	HTML_STATE_NONE = 0,
	HTML_STATE_DATA,
	HTML_STATE_RCDATA,
	HTML_STATE_RAWTEXT,
	HTML_STATE_SCRIPT_DATA,
	HTML_STATE_PLAINTEXT,
	HTML_STATE_TAG_OPEN,
	HTML_STATE_END_TAG_OPEN,
	HTML_STATE_TAG_NAME,
	HTML_STATE_RCDATA_LESS_THAN_SIGN,
	HTML_STATE_RCDATA_END_TAG_OPEN,
	HTML_STATE_RCDATA_END_TAG_NAME,
	HTML_STATE_RAWTEXT_LESS_THAN_SIGN,
	HTML_STATE_RAWTEXT_END_TAG_OPEN,
	HTML_STATE_RAWTEXT_END_TAG_NAME,
	HTML_STATE_SCRIPT_DATA_LESS_THAN_SIGN,
	HTML_STATE_SCRIPT_DATA_END_TAG_OPEN,
	HTML_STATE_SCRIPT_DATA_END_TAG_NAME,
	HTML_STATE_SCRIPT_DATA_ESCAPE_START,
	HTML_STATE_SCRIPT_DATA_ESCAPE_START_DASH,
	HTML_STATE_SCRIPT_DATA_ESCAPED,
	HTML_STATE_SCRIPT_DATA_ESCAPED_DASH,
	HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH,
	HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN,
	HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN,
	HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME,
	HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START,
	HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED,
	HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH,
	HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH,
	HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN,
	HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END,
	HTML_STATE_BEFORE_ATTRIBUTE_NAME,
	HTML_STATE_ATTRIBUTE_NAME,
	HTML_STATE_AFTER_ATTRIBUTE_NAME,
	HTML_STATE_BEFORE_ATTRIBUTE_VALUE,
	HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED,
	HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED,
	HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED,
	HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED,
	HTML_STATE_SELF_CLOSING_START_TAG,
	HTML_STATE_BOGUS_COMMENT,
	HTML_STATE_MARKUP_DECLARATION_OPEN,
	HTML_STATE_COMMENT_START,
	HTML_STATE_COMMENT_START_DASH,
	HTML_STATE_COMMENT,
	HTML_STATE_COMMENT_LESS_THAN_SIGN,
	HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG,
	HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH,
	HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH,
	HTML_STATE_COMMENT_END_DASH,
	HTML_STATE_COMMENT_END,
	HTML_STATE_COMMENT_END_BANG,
	HTML_STATE_DOCTYPE,
	HTML_STATE_BEFORE_DOCTYPE_NAME,
	HTML_STATE_DOCTYPE_NAME,
	HTML_STATE_AFTER_DOCTYPE_NAME,
	HTML_STATE_AFTER_DOCTYPE_PUBLIC_KEYWORD,
	HTML_STATE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
	HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED,
	HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED,
	HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER,
	HTML_STATE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
	HTML_STATE_AFTER_DOCTYPE_SYSTEM_KEYWORD,
	HTML_STATE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
	HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED,
	HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED,
	HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
	HTML_STATE_BOGUS_DOCTYPE,
	HTML_STATE_CDATA_SECTION,
	HTML_STATE_CDATA_SECTION_BRACKET,
	HTML_STATE_CDATA_SECTION_END,
	HTML_STATE_CHARACTER_REFERENCE,
	HTML_STATE_NAMED_CHARACTER_REFERENCE,
	HTML_STATE_AMBIGUOUS_AMPERSAND,
	HTML_STATE_NUMERIC_CHARACTER_REFERENCE,
	HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE_START,
	HTML_STATE_DECIMAL_CHARACTER_REFERENCE_START,
	HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE,
	HTML_STATE_DECIMAL_CHARACTER_REFERENCE,
	HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END
} html_state;
| 231 |
|
| 232 |
/* tokenization output */
/* https://html.spec.whatwg.org/multipage/parsing.html#tokenization */
#ifdef HTML_ENABLE_DEBUGGING
extern const char *html_token_names[];
#endif
/* numbering starts at 1, so 0 never names a token type */
typedef enum {
	HTML_TOKEN_DOCTYPE = 1,
	HTML_TOKEN_START_TAG,
	HTML_TOKEN_END_TAG,
	HTML_TOKEN_COMMENT,
	HTML_TOKEN_CHARACTER,
	HTML_TOKEN_EOF
} html_token_type;
| 245 |
|
| 246 |
/* html_process_token return states */
typedef enum {
	HTML_TOKEN_REPROCESS = 1,
	HTML_TOKEN_PROCESSED
} html_token_act;
| 251 |
|
| 252 |
/* parse errors */
/* https://html.spec.whatwg.org/multipage/parsing.html#parse-errors */
#ifdef HTML_ENABLE_DEBUGGING
extern const char *html_error_strings[];
#endif
typedef enum {
	HTML_ERROR_NONE,
	HTML_ERROR_ABRUPT_CLOSING_OF_EMPTY_COMMENT,
	HTML_ERROR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER,
	HTML_ERROR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER,
	HTML_ERROR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
	HTML_ERROR_CDATA_IN_HTML_CONTENT,
	HTML_ERROR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
	HTML_ERROR_CONTROL_CHARACTER_IN_INPUT_STREAM,
	HTML_ERROR_CONTROL_CHARACTER_REFERENCE,
	HTML_ERROR_DUPLICATE_ATTRIBUTE,
	HTML_ERROR_END_TAG_WITH_ATTRIBUTES,
	HTML_ERROR_END_TAG_WITH_TRAILING_SOLIDUS,
	HTML_ERROR_EOF_BEFORE_TAG_NAME,
	HTML_ERROR_EOF_IN_CDATA,
	HTML_ERROR_EOF_IN_COMMENT,
	HTML_ERROR_EOF_IN_DOCTYPE,
	HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT,
	HTML_ERROR_EOF_IN_TAG,
	HTML_ERROR_INCORRECTLY_CLOSED_COMMENT,
	HTML_ERROR_INCORRECTLY_OPENED_COMMENT,
	HTML_ERROR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME,
	HTML_ERROR_INVALID_FIRST_CHARACTER_OF_TAG_NAME,
	HTML_ERROR_MISSING_ATTRIBUTE_VALUE,
	HTML_ERROR_MISSING_DOCTYPE_NAME,
	HTML_ERROR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER,
	HTML_ERROR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER,
	HTML_ERROR_MISSING_END_TAG_NAME,
	HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
	HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
	HTML_ERROR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
	HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD,
	HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD,
	HTML_ERROR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME,
	HTML_ERROR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES,
	HTML_ERROR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
	HTML_ERROR_NESTED_COMMENT,
	HTML_ERROR_NONCHARACTER_CHARACTER_REFERENCE,
	HTML_ERROR_NONCHARACTER_IN_INPUT_STREAM,
	HTML_ERROR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS,
	HTML_ERROR_NULL_CHARACTER_REFERENCE,
	HTML_ERROR_SURROGATE_CHARACTER_REFERENCE,
	HTML_ERROR_SURROGATE_IN_INPUT_STREAM,
	HTML_ERROR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
	HTML_ERROR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME,
	HTML_ERROR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE,
	HTML_ERROR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME,
	HTML_ERROR_UNEXPECTED_NULL_CHARACTER,
	HTML_ERROR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME,
	HTML_ERROR_UNEXPECTED_SOLIDUS_IN_TAG,
	HTML_ERROR_UNKNOWN_NAMED_CHARACTER_REFERENCE
} html_error;
| 309 |
|
| 310 |
/* keep this in same order as html_tag_names[] */
extern const char *html_tag_names[];
typedef enum {
	HTML_TAG__NONE = 0,
	HTML_TAG_A,
	HTML_TAG_ADDRESS,
	HTML_TAG_APPLET,
	HTML_TAG_AREA,
	HTML_TAG_ARTICLE,
	HTML_TAG_ASIDE,
	HTML_TAG_B,
	HTML_TAG_BASE,
	HTML_TAG_BASEFONT,
	HTML_TAG_BGSOUND,
	HTML_TAG_BIG,
	HTML_TAG_BLOCKQUOTE,
	HTML_TAG_BODY,
	HTML_TAG_BR,
	HTML_TAG_BUTTON,
	HTML_TAG_CAPTION,
	HTML_TAG_CENTER,
	HTML_TAG_CITE,
	HTML_TAG_CODE,
	HTML_TAG_COL,
	HTML_TAG_COLGROUP,
	HTML_TAG_DD,
	HTML_TAG_DETAILS,
	HTML_TAG_DFN,
	HTML_TAG_DIALOG,
	HTML_TAG_DIR,
	HTML_TAG_DIV,
	HTML_TAG_DL,
	HTML_TAG_DT,
	HTML_TAG_EM,
	HTML_TAG_EMBED,
	HTML_TAG_FIELDSET,
	HTML_TAG_FIGCAPTION,
	HTML_TAG_FIGURE,
	HTML_TAG_FONT,
	HTML_TAG_FOOTER,
	HTML_TAG_FORM,
	HTML_TAG_FRAME,
	HTML_TAG_FRAMESET,
	HTML_TAG_H1,
	HTML_TAG_H2,
	HTML_TAG_H3,
	HTML_TAG_H4,
	HTML_TAG_H5,
	HTML_TAG_H6,
	HTML_TAG_HEAD,
	HTML_TAG_HEADER,
	HTML_TAG_HGROUP,
	HTML_TAG_HR,
	HTML_TAG_HTML,
	HTML_TAG_I,
	HTML_TAG_IFRAME,
	HTML_TAG_IMAGE,
	HTML_TAG_IMG,
	HTML_TAG_INPUT,
	HTML_TAG_INS,
	HTML_TAG_KBD,
	HTML_TAG_KEYGEN,
	HTML_TAG_LI,
	HTML_TAG_LINK,
	HTML_TAG_LISTING,
	HTML_TAG_MAIN,
	HTML_TAG_MARQUEE,
	HTML_TAG_MATH,
	HTML_TAG_MENU,
	HTML_TAG_META,
	HTML_TAG_NAV,
	HTML_TAG_NOBR,
	HTML_TAG_NOEMBED,
	HTML_TAG_NOFRAMES,
	HTML_TAG_NOSCRIPT,
	HTML_TAG_OBJECT,
	HTML_TAG_OL,
	HTML_TAG_OPTGROUP,
	HTML_TAG_OPTION,
	HTML_TAG_P,
	HTML_TAG_PARAM,
	HTML_TAG_PLAINTEXT,
	HTML_TAG_PRE,
	HTML_TAG_RB,
	HTML_TAG_RP,
	HTML_TAG_RT,
	HTML_TAG_RTC,
	HTML_TAG_RUBY,
	HTML_TAG_S,
	HTML_TAG_SAMP,
	HTML_TAG_SCRIPT,
	HTML_TAG_SEARCH,
	HTML_TAG_SECTION,
	HTML_TAG_SELECT,
	HTML_TAG_SMALL,
	HTML_TAG_SOURCE,
	HTML_TAG_SPAN,
	HTML_TAG_STRIKE,
	HTML_TAG_STRONG,
	HTML_TAG_STYLE,
	HTML_TAG_SUB,
	HTML_TAG_SUP,
	HTML_TAG_SUMMARY,
	HTML_TAG_SVG,
	HTML_TAG_TABLE,
	HTML_TAG_TBODY,
	HTML_TAG_TD,
	HTML_TAG_TEMPLATE,
	HTML_TAG_TEXTAREA,
	HTML_TAG_TFOOT,
	HTML_TAG_TH,
	HTML_TAG_THEAD,
	HTML_TAG_TITLE,
	HTML_TAG_TR,
	HTML_TAG_TRACK,
	HTML_TAG_TT,
	HTML_TAG_U,
	HTML_TAG_UL,
	HTML_TAG_VAR,
	HTML_TAG_WBR,
	HTML_TAG_XMP,

	/* one past the last real tag id; not a tag itself */
	HTML_TAG_MAX_ID
} html_tag_type;
| 434 |
|
| 435 |
/*
 * Element scope kinds.
 * NOTE(review): presumably the "has an element in ... scope" variants of
 * the HTML parsing spec -- confirm against the callers.
 */
typedef enum {
	HTML_SCOPE_DEFAULT,
	HTML_SCOPE_LIST_ITEM,
	HTML_SCOPE_BUTTON,
	HTML_SCOPE_TABLE,
	HTML_SCOPE_SELECT
} html_scope;
| 442 |
|
| 443 |
/* namespace recorded on tag tokens and elements (the `ns` members) */
typedef enum {
	HTML_NAMESPACE_HTML,
	HTML_NAMESPACE_MATHML,
	HTML_NAMESPACE_SVG,
	HTML_NAMESPACE_XLINK,
	HTML_NAMESPACE_XML,
	HTML_NAMESPACE_XMLNS
} html_namespace;
| 451 |
|
| 452 |
/* one row of the html_entities[] table: an entity string and a code point */
typedef struct {
	const char *entity;
	uint32_t codepoint;
} html_entity;

extern const html_entity html_entities[];
| 458 |
|
| 459 |
/*
 * One attribute, stored in fixed-size buffers with explicit lengths
 * kept alongside them.
 */
struct html_attr {
	char name[24];
	short name_len;
	char val[128];
	short val_len;
};
| 465 |
|
| 466 |
struct html_tag { |
| 467 |
/* this must be first */ |
| 468 |
html_tag_type token_type; |
| 469 |
|
| 470 |
html_tag_type type; |
| 471 |
html_namespace ns; |
| 472 |
char name[16]; |
| 473 |
short name_len; |
| 474 |
struct html_attr attrs[16]; |
| 475 |
short attrs_count; |
| 476 |
bool emitted; |
| 477 |
bool self_closing; |
| 478 |
bool self_closing_acked; |
| 479 |
}; |
| 480 |
|
| 481 |
struct html_element { |
| 482 |
html_tag_type type; |
| 483 |
|
| 484 |
html_namespace ns; |
| 485 |
char name[16]; |
| 486 |
short name_len; |
| 487 |
struct html_attr attrs[8]; |
| 488 |
short attrs_count; |
| 489 |
|
| 490 |
char *text; |
| 491 |
size_t text_len; |
| 492 |
size_t text_off; |
| 493 |
size_t text_size; |
| 494 |
bool has_height; |
| 495 |
short margin_top; |
| 496 |
short margin_bottom; |
| 497 |
short ol_count; |
| 498 |
short renders; |
| 499 |
|
| 500 |
TEHandle input_te; |
| 501 |
|
| 502 |
short refs; |
| 503 |
struct html_element *next_need_free; |
| 504 |
}; |
| 505 |
|
| 506 |
struct html_comment { |
| 507 |
/* this must be first */ |
| 508 |
html_token_type token_type; |
| 509 |
|
| 510 |
char data[8]; |
| 511 |
short len; |
| 512 |
}; |
| 513 |
|
| 514 |
struct html_char { |
| 515 |
/* this must be first */ |
| 516 |
html_token_type token_type; |
| 517 |
|
| 518 |
char c; |
| 519 |
}; |
| 520 |
|
| 521 |
struct html_doctype { |
| 522 |
/* this must be first */ |
| 523 |
html_token_type _pad; |
| 524 |
|
| 525 |
char name[16]; |
| 526 |
short name_len; |
| 527 |
char public_identifier[16]; |
| 528 |
short public_identifier_len; |
| 529 |
char system_identifier[16]; |
| 530 |
short system_identifier_len; |
| 531 |
bool system_identifier_found; |
| 532 |
bool force_quirks; |
| 533 |
}; |
| 534 |
|
| 535 |
/* |
| 536 |
* THINK C doesn't support anonymous unions so we can't have a |
| 537 |
* struct html_token with tag/doctype/comment at the root |
| 538 |
*/ |
| 539 |
union html_token { |
| 540 |
/* every other type has html_token_type as its first member */ |
| 541 |
html_token_type type; |
| 542 |
|
| 543 |
struct html_tag tag; |
| 544 |
struct html_doctype doctype; |
| 545 |
struct html_comment comment; |
| 546 |
struct html_char ch; |
| 547 |
}; |
| 548 |
typedef union html_token html_token; |
| 549 |
|
| 550 |
struct html_formatting { |
| 551 |
bool marker; |
| 552 |
struct html_element *element; |
| 553 |
html_token_type token; |
| 554 |
}; |
| 555 |
|
| 556 |
struct html_page { |
| 557 |
void *cookie; |
| 558 |
|
| 559 |
size_t input_pos; |
| 560 |
bool eof; |
| 561 |
|
| 562 |
/* insertion mode */ |
| 563 |
html_mode mode; |
| 564 |
html_mode original_mode; |
| 565 |
|
| 566 |
html_state state; |
| 567 |
html_state return_state; |
| 568 |
|
| 569 |
html_error error; |
| 570 |
|
| 571 |
char *escaped_buf; |
| 572 |
size_t escaped_size; |
| 573 |
|
| 574 |
long char_ref_code; |
| 575 |
|
| 576 |
bool parse_last_cr; |
| 577 |
bool frameset_ok; |
| 578 |
bool parser_cannot_change_mode; |
| 579 |
bool foster_parenting; |
| 580 |
bool quirks_mode; |
| 581 |
|
| 582 |
/* rendering */ |
| 583 |
bool render_in_body; |
| 584 |
short render_list_depth; |
| 585 |
char last_output; |
| 586 |
bool last_margin_top; |
| 587 |
bool last_margin_bottom; |
| 588 |
|
| 589 |
/* configurables */ |
| 590 |
bool ignore_script_data; |
| 591 |
bool ignore_comment_data; |
| 592 |
bool scripting; |
| 593 |
bool styling; |
| 594 |
|
| 595 |
/* if the next character token should be skipped if it's \n */ |
| 596 |
bool skip_newline_char_token; |
| 597 |
|
| 598 |
/* "stack of open elements" */ |
| 599 |
struct html_element *open[HTML_STACK_DEPTH]; |
| 600 |
short open_count; |
| 601 |
struct html_element *current_node; |
| 602 |
struct html_element *need_free_list; |
| 603 |
struct html_element *need_free_tail; |
| 604 |
|
| 605 |
/* https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements */ |
| 606 |
struct html_formatting active_formatting[HTML_STACK_DEPTH]; |
| 607 |
short active_formatting_count; |
| 608 |
|
| 609 |
/* https://html.spec.whatwg.org/multipage/parsing.html#the-element-pointers */ |
| 610 |
struct html_element *head; |
| 611 |
struct html_element *form; |
| 612 |
|
| 613 |
union html_token new_token; |
| 614 |
|
| 615 |
/* we'll queue some characters up before actually parsing */ |
| 616 |
char lookahead[HTML_LOOKAHEAD_SIZE]; |
| 617 |
unsigned char lookahead_len; |
| 618 |
|
| 619 |
/* some tokens need a temporary buffer to store text */ |
| 620 |
char tmp[128]; |
| 621 |
unsigned char tmp_len; |
| 622 |
}; |
| 623 |
|
| 624 |
/* single-byte value used as the replacement character */
#define HTML_REPLACEMENT_CHARACTER	0xff
| 625 |
|
| 626 |
/*
 * Output and notification hooks taking the opaque cookie and the page.
 * These repeat prototypes that also appear near the top of this header;
 * the redeclarations are compatible and harmless.
 */
void html_output(void *cookie, struct html_page *html, char *str,
    size_t len);
void html_debug(const char *fmt, ...);
void html_have_title(void *cookie, struct html_page *html, char *str,
    size_t len);
| 631 |
|
| 632 |
/* html.c */ |
| 633 |
struct html_page * html_init_page(void *cookie); |
| 634 |
void html_page_finish(struct html_page **htmlp); |
| 635 |
void html_xfree(struct html_page **htmlp); |
| 636 |
bool html_parse(struct html_page *html, char *str, size_t len); |
| 637 |
void html_insert_character(struct html_page *html, short cc); |
| 638 |
bool html_is_block_tag(struct html_page *html, html_tag_type tag); |
| 639 |
long html_get_attribute_value(struct html_page *html, |
| 640 |
struct html_element *element, char *name, char **ret); |
| 641 |
void html_render_current_node(struct html_page *html, bool popping); |
| 642 |
void html_parse_error(struct html_page *html); |
| 643 |
void html_debug(const char *fmt, ...); |
| 644 |
#if 0 |
| 645 |
void html_emit_token(struct html_page *html, html_token *token); |
| 646 |
#else |
| 647 |
#define html_emit_token(a, b) html_process_token(a, b) |
| 648 |
#endif |
| 649 |
void html_buffer_output(struct html_page *html, char *str, size_t len); |
| 650 |
void html_flush_output_buffer(struct html_page *html); |
| 651 |
|
| 652 |
/* html_tokenize.c */ |
| 653 |
void html_tokenize(struct html_page *html, short cc); |
| 654 |
void html_prep_new_token(struct html_page *html, html_token_type token_type); |
| 655 |
struct html_attr * html_prep_new_attribute(struct html_page *html, |
| 656 |
struct html_tag *tag); |
| 657 |
void html_tokenize_finish(struct html_page *html); |
| 658 |
html_token_act html_process_token_in_foreign_content(struct html_page *html, |
| 659 |
html_token *token); |
| 660 |
|
| 661 |
/* html_tree.c */ |
| 662 |
void html_process_token(struct html_page *html, html_token *token); |
| 663 |
void html_append_comment(struct html_page *html, struct html_comment *comment); |
| 664 |
void html_stop_parsing(struct html_page *html); |
| 665 |
char * html_escape_string(struct html_page *html, char *str, size_t *len, |
| 666 |
bool attribute_mode); |
| 667 |
void html_emit_char_token(struct html_page *html, short cc); |
| 668 |
void html_emit_eof_token(struct html_page *html); |
| 669 |
void html_emit_comment(struct html_page *html, struct html_comment *comment); |
| 670 |
|
| 671 |
#endif /* HTML_ENABLE */ |