AmendHub

Download:

jcs

/

detritus

/

amendments

/

59

html: Import (basic) HTML render parsing the output of tree builder


jcs made amendment 59 about 1 year ago
--- html.c Thu Dec 12 12:08:30 2024 +++ html.c Thu Dec 12 12:08:30 2024 @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2024 joshua stein <jcs@jcs.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * Glue for tying browser_page to the tokenizer, and then on the other end + * handling tags and text output by the tree builder. 
+ */ + +#include <stdarg.h> +#include <stdio.h> +#include <string.h> + +#include "html.h" + +#ifdef HTML_ENABLE_DEBUGGING +struct html_page *the_html = NULL; +#endif + +struct html_page * +html_init_page(void *cookie) +{ + struct html_page *html; + + /* sanity check */ + if (strcmp(html_tag_names[HTML_TAG_XMP], "xmp") != 0) + panic("html_tag_names is out of sync with HTML_TAGs"); + + html = xmalloczero(sizeof(struct html_page)); + if (html == NULL) + return NULL; + html->cookie = cookie; + html->mode = HTML_MODE_INITIAL; + html->state = HTML_STATE_DATA; + html->frameset_ok = true; + + html->new_token.doctype.public_identifier_len = -1; + html->new_token.doctype.system_identifier_len = -1; + +#ifdef HTML_ENABLE_DEBUGGING + the_html = html; +#endif + return html; +} + +bool +html_parse(struct html_page *html, char *str, size_t len) +{ + size_t n; + register char cc; + + for (n = 0; n < len; n++) { + cc = str[n]; + + /* https://infra.spec.whatwg.org/#normalize-newlines */ + if (html->parse_last_cr) { + html->parse_last_cr = false; + if (cc != '\n') { + cc = '\n'; + n--; + } + } + if (cc == '\r') { + html->parse_last_cr = true; + continue; + } + + html_tokenize(html, cc); + + if (html->eof) { + HTML_DEBUG(("\rEOF\r")); + break; + } + } + + if (html->eof) + return false; + + return true; +} + +void +html_page_finish(struct html_page **htmlp) +{ + struct html_page *html = *htmlp; + + html_tokenize_finish(html); + html_xfree(htmlp); +} + +void +html_xfree(struct html_page **htmlp) +{ + struct html_page *html = *htmlp; + + if (html->escaped_buf) + xfree(&html->escaped_buf); + + xfree(&html); +} + +void +html_parse_error(struct html_page *html) +{ + HTML_DEBUG((": [[PARSE ERROR at %d]]", html->input_pos)); +} + +#if 0 +void +html_emit_token(struct html_page *html, html_token *token) +{ + /* + * html_tokenize handles each byte of html and runs it through the state + * machine, possibly emitting a token to us here. 
+ * + * https://html.spec.whatwg.org/multipage/parsing.html#tree-construction + */ + + /* + * At some point we might collect tags and proces them all at once, or + * maybe just keep a buffer of a few, before handing them off to the tree + * constructor. + * + * But for now, just feed them all to the tree constructor as soon as we get + * them. + */ + + html_process_token(html, token); +} +#endif + +void +html_insert_character(struct html_page *html, short cc) +{ + register unsigned char c = cc; + + if (html->current_node == NULL) { + Debugger(); + return; + } + + if (html->current_node->text == NULL) { + html->current_node->text_size = HTML_TAG_TEXT_CHUNK_SIZE; + html->current_node->text = xmalloc(HTML_TAG_TEXT_CHUNK_SIZE); + } else if (html->current_node->text_len >= + html->current_node->text_size) { + html->current_node->text_size += HTML_TAG_TEXT_CHUNK_SIZE; + html->current_node->text = xrealloc(html->current_node->text, + html->current_node->text_size); + } + if (html->current_node->text == NULL) + panic("OOM"); + + if (html->current_node->type == HTML_TAG_TEXTAREA || + html->current_node->type == HTML_TAG_PRE) { + /* TODO: still remove leading newlines */ + goto append; + } + + if (c == '\t' || c == '\n' || c == '\f' || c == '\r') + c = ' '; + + if (c == ' ') { + /* collapse multiple whitespaces */ + if (html->current_node->text_len && html->current_node->text[ + html->current_node->text_len - 1] == ' ') + return; + } + +append: + html->current_node->text[html->current_node->text_len++] = c; +} + +bool +html_is_block_tag(struct html_page *html, html_tag_type tag) +{ + /* https://html.spec.whatwg.org/multipage/sections.html#sections */ + switch (tag) { + case HTML_TAG_ADDRESS: + case HTML_TAG_ARTICLE: + case HTML_TAG_ASIDE: + case HTML_TAG_BLOCKQUOTE: + case HTML_TAG_BODY: + case HTML_TAG_DD: + case HTML_TAG_DIV: + case HTML_TAG_DL: + case HTML_TAG_DT: + case HTML_TAG_FIGCAPTION: + case HTML_TAG_FIGURE: + case HTML_TAG_FOOTER: + case HTML_TAG_H1: + case 
HTML_TAG_H2: + case HTML_TAG_H3: + case HTML_TAG_H4: + case HTML_TAG_H5: + case HTML_TAG_H6: + case HTML_TAG_HEADER: + case HTML_TAG_HGROUP: + case HTML_TAG_HR: + case HTML_TAG_LI: + case HTML_TAG_MAIN: + case HTML_TAG_MENU: + case HTML_TAG_NAV: + case HTML_TAG_OL: + case HTML_TAG_P: + case HTML_TAG_PRE: + case HTML_TAG_SEARCH: + case HTML_TAG_SECTION: + case HTML_TAG_UL: + return true; + default: + return false; + } +} + +long +html_get_attribute_value(struct html_page *html, struct html_element *element, + char *name, char **ret) +{ + short n, namelen; + + namelen = strlen(name); + + for (n = 0; n < element->attrs_count; n++) { + if (element->attrs[n].name_len != namelen) + continue; + + if (strcasecmp(element->attrs[n].name, name) == 0) { + *ret = (char *)&element->attrs[n].val; + return element->attrs[n].val_len; + } + } + + *ret = NULL; + return 0; +} + +void +html_render_current_node(struct html_page *html, bool popping) +{ + struct html_element *el = html->current_node; + struct html_element *list_parent; + short n, len; + char ol_li[10]; + char *val; + bool have_height = false; + bool found; + + el->renders++; + + /* trim trailing whitespace */ + if (popping) { + while (el->text_len && el->text[el->text_len - 1] == ' ') + el->text_len--; + } + + /* ignore non-title tags before <body> */ + if (!html->render_in_body) { + for (n = 0; n < html->open_count; n++) { + if (html->open[n]->type == HTML_TAG_BODY) { + html->render_in_body = true; + break; + } + + if (n == html->open_count - 1) { + if (el->type == HTML_TAG_TITLE) + html_have_title(html->cookie, html, el->text, + el->text_len); + return; + } + } + } + + if (el->renders == 1) { + /* block elements should start on a new line */ + if (html->last_output != '\r' && html->last_output != '\0' && + html_is_block_tag(html, el->type)) { + HTML_DEBUG(("[block-separate:%s\\r]", html_tag_names[el->type])); + html_output_new_line(html->cookie, html); + } + + /* if the element has a top margin, add more space */ + if 
(el->margin_top) { + /* unless the last element had a bottom margin */ + if (html->last_margin_bottom || html->last_output == '\0') { + HTML_DEBUG(("[margin-top-but-merging:%s]", + html_tag_names[el->type])); + } else { + HTML_DEBUG(("[margin-top:%s\\r]", html_tag_names[el->type])); + html_output_new_line(html->cookie, html); + } + html->last_margin_bottom = 0; + } + + html->last_margin_top = el->margin_top; + + switch (el->type) { + case HTML_TAG_OL: + case HTML_TAG_UL: + html->render_list_depth++; + break; + case HTML_TAG_INPUT: + have_height = true; + + html_output(html->cookie, html, "[ input type=", 13); + + len = html_get_attribute_value(html, el, "type", &val); + if (val) + html_output(html->cookie, html, val, len); + else + html_output(html->cookie, html, "(none)", 6); + html_output(html->cookie, html, " ]", 2); + break; + case HTML_TAG_IMG: + have_height = true; + html_output(html->cookie, html, "[ img: ", 7); + /* show img alt text */ + len = html_get_attribute_value(html, el, "alt", &val); + if (!val || !len) + /* try img title */ + len = html_get_attribute_value(html, el, "title", &val); + if (val && len) + html_output(html->cookie, html, val, len); + else { + /* last resort, show img src filename */ + len = html_get_attribute_value(html, el, "src", &val); + if (val && len) { + for (n = len; n >= 0; n--) { + if (val[n] == '/') { + html_output(html->cookie, html, val + n + 1, + len - n - 1); + break; + } + } + } + } + html_output(html->cookie, html, " ]", 2); + break; + } + } + + /* remove leading whitespace */ + if (el->text_len && + (html->last_output == ' ' || html->last_output == '\r' || + html->last_output == '\0')) { + while (el->text_len && el->text[el->text_off] == ' ') { + el->text_off++; + el->text_len--; + } + } + + if (html->render_list_depth) { + if (el->type == HTML_TAG_LI && el->renders == 1) { + for (n = 1; n < html->render_list_depth; n++) + html_output(html->cookie, html, "\t", 1); + + list_parent = NULL; + for (n = html->open_count - 
1; n >= 0; n--) { + if (html->open[n]->type == HTML_TAG_OL || + html->open[n]->type == HTML_TAG_UL) { + list_parent = html->open[n]; + break; + } + } + + if (list_parent && list_parent->type == HTML_TAG_UL) { + if (html->render_list_depth == 1) + html_output(html->cookie, html, " •\t", 5); + else if (html->render_list_depth == 2) + html_output(html->cookie, html, " o\t", 5); + else + html_output(html->cookie, html, " ◊\t", 5); + } else if (list_parent && list_parent->type == HTML_TAG_OL) { + list_parent->ol_count++; + len = snprintf(ol_li, sizeof(ol_li), "% 4d.\t", + list_parent->ol_count); + html_output(html->cookie, html, ol_li, len); + } + + html->last_output = ' '; + have_height = true; + } else if (el->text_len) { + /* in a list but not a direct child of <li>, what are we in? */ + for (n = html->open_count - 1; n >= 0; n--) { + if (html->open[n]->type == HTML_TAG_OL || + html->open[n]->type == HTML_TAG_UL) { + /* text in root of list not in an li, ident it */ + for (n = 0; n < html->render_list_depth; n++) + html_output(html->cookie, html, "\t", 1); + break; + } + + if (html->open[n]->type == HTML_TAG_LI) { + if (html->last_output == '\r') { + /* text after a <br> inside an <li>, re-indent */ + for (n = 0; n < html->render_list_depth; n++) + html_output(html->cookie, html, "\t", 1); + } + break; + } + } + + html->last_output = ' '; + } + } + + /* print inner text */ + if (el->text_len) { + html_output(html->cookie, html, el->text + el->text_off, + el->text_len); + have_height = true; + } + + /* brrrr */ + if (el->type == HTML_TAG_BR) { + HTML_DEBUG(("[br\\r]")); + html_output_new_line(html->cookie, html); + have_height = true; + } + + /* mark this block (or its nearest parent block) as having height */ + if (have_height) { + if (html_is_block_tag(html, el->type)) + el->has_height = true; + else { + /* find parent block */ + for (n = html->open_count - 1; n >= 0; n--) { + if (html_is_block_tag(html, html->open[n]->type)) { + html->open[n]->has_height = true; + 
break; + } + } + } + } + + if (popping) { + /* block elements that had text (or br) get a separating newline */ + if (el->has_height && + !(el->type == HTML_TAG_OL || el->type == HTML_TAG_UL) && + !(el->type == HTML_TAG_LI && html->last_output == '\r')) { + HTML_DEBUG(("[end-block:/%s\\r]", html_tag_names[el->type])); + html_output_new_line(html->cookie, html); + } + + if (el->margin_bottom) { + /* unless the last element had a bottom margin */ + if (!html->last_margin_bottom) { + HTML_DEBUG(("[margin-bottom\\r]")); + html_output_new_line(html->cookie, html); + html->last_margin_bottom = el->margin_bottom; + } + } + + if (el->has_height) { + HTML_DEBUG(("[new-last-margin-bottom:%d]", el->margin_bottom)); + html->last_margin_bottom = el->margin_bottom; + } + + if (el->type == HTML_TAG_OL || el->type == HTML_TAG_UL) + html->render_list_depth--; + } + + el->text_off = 0; + el->text_len = 0; +} + +#ifdef HTML_ENABLE_DEBUGGING +/* + * printf-style debug output: formats into a fixed 512-byte static buffer + * and writes the result to the page in the_html through html_output(). + * Output that does not fit in the buffer is truncated. + */ +void +html_debug(const char *fmt, ...) +{ + static char buf[512]; + size_t len; + + va_list args; + va_start(args, fmt); + len = vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + /* + * vsnprintf reports the untruncated length; at most sizeof(buf) - 1 + * characters (plus the NUL) were actually stored, so never emit more + */ + if (len >= sizeof(buf)) + len = sizeof(buf) - 1; + + html_output(the_html->cookie, the_html, buf, len); +} +#endif \ No newline at end of file --- html.h Wed Dec 11 11:24:31 2024 +++ html.h Wed Dec 11 11:24:31 2024 @@ -0,0 +1,648 @@ +/* + * Copyright (c) 2024 joshua stein <jcs@jcs.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <stdlib.h> +#include <stdarg.h> +#include <stdio.h> +#include <string.h> +#include "stdint.h" + +/* external functions the caller needs to provide */ +#if 0 +extern void panic(const char *format, ...); +extern void * xmalloc(size_t); +extern void xfree(void *ptrptr); +extern void * xmalloczero(size_t); +extern void * xrealloc(void *src, size_t size); +extern size_t strlcpy(char *dst, const char *src, size_t dsize); +extern size_t strlcat(char *dst, const char *src, size_t dsize); +extern short snprintf(char *s, size_t size, const char *fmt, ...); +extern short strcasecmp(const char *s1, const char *s2); +extern short strncasecmp(const char *s1, const char *s2, size_t n); +#else +#include "util.h" +#endif +void html_output(void *cookie, struct html_page *html, char *str, + size_t len); +void html_output_new_line(void *cookie, struct html_page *html); +void html_debug(const char *fmt, ...); +void html_have_title(void *cookie, struct html_page *html, char *str, + size_t len); + +//#define HTML_ENABLE_DEBUGGING +#ifdef HTML_ENABLE_DEBUGGING +extern struct html_page *the_html; +# define HTML_DEBUG(x) do { html_debug x; } while (0) +#else +# define HTML_DEBUG(x) {} +#endif + +/* + * tunables + */ + +#define HTML_STACK_DEPTH 128 + +/* this should in theory be the max size of an html_entity but that's huge */ +#define HTML_LOOKAHEAD_SIZE 10 + +#define HTML_OUTPUT_BUF_SIZE 64 +#define HTML_TAG_TEXT_CHUNK_SIZE 512 + +/* + * helpers + */ + +#define IS_WHITESPACE(c) ((c) == '\t' || (c) == '\n' || (c) == '\f' || \ + (c) == '\r' || (c) == ' ') +#define IS_LOWER_ALPHA(c) ((c) >= 'a' && (c) <= 'z') +#define IS_UPPER_ALPHA(c) ((c) >= 
'A' && (c) <= 'Z') +#define IS_ALPHA(c) (IS_LOWER_ALPHA((c)) || IS_UPPER_ALPHA((c))) +#define IS_NUMERIC(c) (((c) >= '0' && (c) <= '9')) +#define IS_ALPHANUMERIC(c) (IS_ALPHA((c)) || IS_NUMERIC((c))) +#define IS_HEX_DIGIT(c) (IS_NUMERIC((c)) || ((c) >= 'a' && (c) <= 'f') || \ + ((c) >= 'A' && (c) <= 'F')) + +#define IS_BLOCK(tag) ((tag) < HTML_TAG_LAST_BLOCK) + +#define NEW_TOKEN_LAST_ATTR (html->new_token.tag.attrs[html->new_token.tag.attrs_count - 1]) + +/* + * Append ch to field and keep it NUL-terminated. Only works on fixed-size + * char arrays (it relies on sizeof). The bound stops one short of the + * array size so the trailing NUL always lands inside the array; when the + * array is full, or len is negative, nothing is appended. + */ +#define STR_APPEND(field, len, ch) \ + if ((len) < sizeof(field) - 1) { \ + (field)[(len)++] = (ch); \ + (field)[(len)] = '\0'; \ + } + +#define CONSUMED_AS_PART_OF_AN_ATTRIBUTE \ + (html->return_state == HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED || \ + html->return_state == HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED || \ + html->return_state == HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED) + +#ifndef nitems +#define nitems(what) (sizeof((what)) / sizeof((what)[0])) +#endif + +/* insertion mode */ +/* https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode */ +extern const char *html_mode_names[]; +typedef enum { + HTML_MODE_NONE = 0, + HTML_MODE_INITIAL, + HTML_MODE_BEFORE_HTML, + HTML_MODE_BEFORE_HEAD, + HTML_MODE_IN_HEAD, + HTML_MODE_IN_HEAD_NOSCRIPT, + HTML_MODE_AFTER_HEAD, + HTML_MODE_IN_BODY, + HTML_MODE_TEXT, + HTML_MODE_IN_TABLE, + HTML_MODE_IN_TABLE_TEXT, + HTML_MODE_IN_CAPTION, + HTML_MODE_IN_COLUMN_GROUP, + HTML_MODE_IN_TABLE_BODY, + HTML_MODE_IN_ROW, + HTML_MODE_IN_CELL, + HTML_MODE_IN_SELECT, + HTML_MODE_IN_SELECT_IN_TABLE, + HTML_MODE_IN_TEMPLATE, + HTML_MODE_AFTER_BODY, + HTML_MODE_IN_FRAMESET, + HTML_MODE_AFTER_FRAMESET, + HTML_MODE_AFTER_AFTER_BODY, + HTML_MODE_AFTER_AFTER_FRAMESET +} html_mode; + +/* tokenization state */ +/* https://html.spec.whatwg.org/multipage/parsing.html#tokenization */ +extern const char *html_state_names[]; +typedef enum { + HTML_STATE_NONE = 0, + HTML_STATE_DATA, + HTML_STATE_RCDATA, + HTML_STATE_RAWTEXT, + 
HTML_STATE_SCRIPT_DATA, + HTML_STATE_PLAINTEXT, + HTML_STATE_TAG_OPEN, + HTML_STATE_END_TAG_OPEN, + HTML_STATE_TAG_NAME, + HTML_STATE_RCDATA_LESS_THAN_SIGN, + HTML_STATE_RCDATA_END_TAG_OPEN, + HTML_STATE_RCDATA_END_TAG_NAME, + HTML_STATE_RAWTEXT_LESS_THAN_SIGN, + HTML_STATE_RAWTEXT_END_TAG_OPEN, + HTML_STATE_RAWTEXT_END_TAG_NAME, + HTML_STATE_SCRIPT_DATA_LESS_THAN_SIGN, + HTML_STATE_SCRIPT_DATA_END_TAG_OPEN, + HTML_STATE_SCRIPT_DATA_END_TAG_NAME, + HTML_STATE_SCRIPT_DATA_ESCAPE_START, + HTML_STATE_SCRIPT_DATA_ESCAPE_START_DASH, + HTML_STATE_SCRIPT_DATA_ESCAPED, + HTML_STATE_SCRIPT_DATA_ESCAPED_DASH, + HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH, + HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, + HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN, + HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME, + HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START, + HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED, + HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH, + HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, + HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, + HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END, + HTML_STATE_BEFORE_ATTRIBUTE_NAME, + HTML_STATE_ATTRIBUTE_NAME, + HTML_STATE_AFTER_ATTRIBUTE_NAME, + HTML_STATE_BEFORE_ATTRIBUTE_VALUE, + HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED, + HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED, + HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED, + HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED, + HTML_STATE_SELF_CLOSING_START_TAG, + HTML_STATE_BOGUS_COMMENT, + HTML_STATE_MARKUP_DECLARATION_OPEN, + HTML_STATE_COMMENT_START, + HTML_STATE_COMMENT_START_DASH, + HTML_STATE_COMMENT, + HTML_STATE_COMMENT_LESS_THAN_SIGN, + HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG, + HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH, + HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH, + HTML_STATE_COMMENT_END_DASH, + HTML_STATE_COMMENT_END, + HTML_STATE_COMMENT_END_BANG, + HTML_STATE_DOCTYPE, + HTML_STATE_BEFORE_DOCTYPE_NAME, + HTML_STATE_DOCTYPE_NAME, + HTML_STATE_AFTER_DOCTYPE_NAME, + HTML_STATE_AFTER_DOCTYPE_PUBLIC_KEYWORD, + 
HTML_STATE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, + HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, + HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, + HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER, + HTML_STATE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, + HTML_STATE_AFTER_DOCTYPE_SYSTEM_KEYWORD, + HTML_STATE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, + HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, + HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, + HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER, + HTML_STATE_BOGUS_DOCTYPE, + HTML_STATE_CDATA_SECTION, + HTML_STATE_CDATA_SECTION_BRACKET, + HTML_STATE_CDATA_SECTION_END, + HTML_STATE_CHARACTER_REFERENCE, + HTML_STATE_NAMED_CHARACTER_REFERENCE, + HTML_STATE_AMBIGUOUS_AMPERSAND, + HTML_STATE_NUMERIC_CHARACTER_REFERENCE, + HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE_START, + HTML_STATE_DECIMAL_CHARACTER_REFERENCE_START, + HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE, + HTML_STATE_DECIMAL_CHARACTER_REFERENCE, + HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END +} html_state; + +/* tokenization output */ +/* https://html.spec.whatwg.org/multipage/parsing.html#tokenization */ +#ifdef HTML_ENABLE_DEBUGGING +extern const char *html_token_names[]; +#endif +typedef enum { + HTML_TOKEN_DOCTYPE = 1, + HTML_TOKEN_START_TAG, + HTML_TOKEN_END_TAG, + HTML_TOKEN_COMMENT, + HTML_TOKEN_CHARACTER, + HTML_TOKEN_EOF +} html_token_type; + +/* html_process_token return states */ +typedef enum { + HTML_TOKEN_REPROCESS = 1, + HTML_TOKEN_PROCESSED +} html_token_act; + +/* parse errors */ +/* https://html.spec.whatwg.org/multipage/parsing.html#parse-errors */ +#ifdef HTML_ENABLE_DEBUGGING +extern const char *html_error_strings[]; +#endif +typedef enum { + HTML_ERROR_NONE, + HTML_ERROR_ABRUPT_CLOSING_OF_EMPTY_COMMENT, + HTML_ERROR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER, + HTML_ERROR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER, + HTML_ERROR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE, + HTML_ERROR_CDATA_IN_HTML_CONTENT, + 
HTML_ERROR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE, + HTML_ERROR_CONTROL_CHARACTER_IN_INPUT_STREAM, + HTML_ERROR_CONTROL_CHARACTER_REFERENCE, + HTML_ERROR_DUPLICATE_ATTRIBUTE, + HTML_ERROR_END_TAG_WITH_ATTRIBUTES, + HTML_ERROR_END_TAG_WITH_TRAILING_SOLIDUS, + HTML_ERROR_EOF_BEFORE_TAG_NAME, + HTML_ERROR_EOF_IN_CDATA, + HTML_ERROR_EOF_IN_COMMENT, + HTML_ERROR_EOF_IN_DOCTYPE, + HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT, + HTML_ERROR_EOF_IN_TAG, + HTML_ERROR_INCORRECTLY_CLOSED_COMMENT, + HTML_ERROR_INCORRECTLY_OPENED_COMMENT, + HTML_ERROR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME, + HTML_ERROR_INVALID_FIRST_CHARACTER_OF_TAG_NAME, + HTML_ERROR_MISSING_ATTRIBUTE_VALUE, + HTML_ERROR_MISSING_DOCTYPE_NAME, + HTML_ERROR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER, + HTML_ERROR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER, + HTML_ERROR_MISSING_END_TAG_NAME, + HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, + HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, + HTML_ERROR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE, + HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD, + HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD, + HTML_ERROR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME, + HTML_ERROR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES, + HTML_ERROR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, + HTML_ERROR_NESTED_COMMENT, + HTML_ERROR_NONCHARACTER_CHARACTER_REFERENCE, + HTML_ERROR_NONCHARACTER_IN_INPUT_STREAM, + HTML_ERROR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS, + HTML_ERROR_NULL_CHARACTER_REFERENCE, + HTML_ERROR_SURROGATE_CHARACTER_REFERENCE, + HTML_ERROR_SURROGATE_IN_INPUT_STREAM, + HTML_ERROR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER, + HTML_ERROR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME, + HTML_ERROR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE, + HTML_ERROR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME, + HTML_ERROR_UNEXPECTED_NULL_CHARACTER, + HTML_ERROR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME, + 
HTML_ERROR_UNEXPECTED_SOLIDUS_IN_TAG, + HTML_ERROR_UNKNOWN_NAMED_CHARACTER_REFERENCE +} html_error; + +/* keep this in same order as html_tag_names[] */ +extern const char *html_tag_names[]; +typedef enum { + HTML_TAG_A = 1, + HTML_TAG_ADDRESS, + HTML_TAG_APPLET, + HTML_TAG_AREA, + HTML_TAG_ARTICLE, + HTML_TAG_ASIDE, + HTML_TAG_B, + HTML_TAG_BASE, + HTML_TAG_BASEFONT, + HTML_TAG_BGSOUND, + HTML_TAG_BIG, + HTML_TAG_BLOCKQUOTE, + HTML_TAG_BODY, + HTML_TAG_BR, + HTML_TAG_BUTTON, + HTML_TAG_CAPTION, + HTML_TAG_CENTER, + HTML_TAG_CITE, + HTML_TAG_CODE, + HTML_TAG_COL, + HTML_TAG_COLGROUP, + HTML_TAG_DD, + HTML_TAG_DETAILS, + HTML_TAG_DFN, + HTML_TAG_DIALOG, + HTML_TAG_DIR, + HTML_TAG_DIV, + HTML_TAG_DL, + HTML_TAG_DT, + HTML_TAG_EM, + HTML_TAG_EMBED, + HTML_TAG_FIELDSET, + HTML_TAG_FIGCAPTION, + HTML_TAG_FIGURE, + HTML_TAG_FONT, + HTML_TAG_FOOTER, + HTML_TAG_FORM, + HTML_TAG_FRAME, + HTML_TAG_FRAMESET, + HTML_TAG_H1, + HTML_TAG_H2, + HTML_TAG_H3, + HTML_TAG_H4, + HTML_TAG_H5, + HTML_TAG_H6, + HTML_TAG_HEAD, + HTML_TAG_HEADER, + HTML_TAG_HGROUP, + HTML_TAG_HR, + HTML_TAG_HTML, + HTML_TAG_I, + HTML_TAG_IFRAME, + HTML_TAG_IMAGE, + HTML_TAG_IMG, + HTML_TAG_INPUT, + HTML_TAG_INS, + HTML_TAG_KBD, + HTML_TAG_KEYGEN, + HTML_TAG_LI, + HTML_TAG_LINK, + HTML_TAG_LISTING, + HTML_TAG_MAIN, + HTML_TAG_MARQUEE, + HTML_TAG_MATH, + HTML_TAG_MENU, + HTML_TAG_META, + HTML_TAG_NAV, + HTML_TAG_NOBR, + HTML_TAG_NOEMBED, + HTML_TAG_NOFRAMES, + HTML_TAG_NOSCRIPT, + HTML_TAG_OBJECT, + HTML_TAG_OL, + HTML_TAG_OPTGROUP, + HTML_TAG_OPTION, + HTML_TAG_P, + HTML_TAG_PARAM, + HTML_TAG_PLAINTEXT, + HTML_TAG_PRE, + HTML_TAG_RB, + HTML_TAG_RP, + HTML_TAG_RT, + HTML_TAG_RTC, + HTML_TAG_RUBY, + HTML_TAG_S, + HTML_TAG_SAMP, + HTML_TAG_SCRIPT, + HTML_TAG_SEARCH, + HTML_TAG_SECTION, + HTML_TAG_SELECT, + HTML_TAG_SMALL, + HTML_TAG_SOURCE, + HTML_TAG_SPAN, + HTML_TAG_STRIKE, + HTML_TAG_STRONG, + HTML_TAG_STYLE, + HTML_TAG_SUB, + HTML_TAG_SUP, + HTML_TAG_SUMMARY, + HTML_TAG_SVG, + HTML_TAG_TABLE, + 
HTML_TAG_TBODY, + HTML_TAG_TD, + HTML_TAG_TEMPLATE, + HTML_TAG_TEXTAREA, + HTML_TAG_TFOOT, + HTML_TAG_TH, + HTML_TAG_THEAD, + HTML_TAG_TITLE, + HTML_TAG_TR, + HTML_TAG_TRACK, + HTML_TAG_TT, + HTML_TAG_U, + HTML_TAG_UL, + HTML_TAG_VAR, + HTML_TAG_WBR, + HTML_TAG_XMP, + + HTML_TAG_MAX_ID +} html_tag_type; + +typedef enum { + HTML_SCOPE_DEFAULT, + HTML_SCOPE_LIST_ITEM, + HTML_SCOPE_BUTTON, + HTML_SCOPE_TABLE, + HTML_SCOPE_SELECT +} html_scope; + +typedef enum { + HTML_NAMESPACE_HTML, + HTML_NAMESPACE_MATHML, + HTML_NAMESPACE_SVG, + HTML_NAMESPACE_XLINK, + HTML_NAMESPACE_XML, + HTML_NAMESPACE_XMLNS +} html_namespace; + +typedef struct { + const char *entity; + uint32_t codepoint; +} html_entity; + +extern const html_entity html_entities[]; + +struct html_attr { + char name[24]; + short name_len; + char val[128]; + short val_len; +}; + +struct html_tag { + /* + * this must be first, and must be an html_token_type like the first + * member of every other token struct, since it is read back through + * the leading html_token_type member of union html_token + */ + html_token_type token_type; + + html_tag_type type; + html_namespace ns; + char name[16]; + short name_len; + /* TODO: make this dynamic so it's not so many KB on the stack */ + struct html_attr attrs[8]; + short attrs_count; + bool emitted; + bool self_closing; + bool self_closing_acked; +}; + +struct html_element { + html_tag_type type; + + html_namespace ns; + char name[16]; + short name_len; + struct html_attr attrs[8]; + short attrs_count; + + char *text; + size_t text_len; + size_t text_off; + size_t text_size; + bool has_height; + short margin_top; + short margin_bottom; + short ol_count; + short renders; + + short refs; + struct html_element *next_need_free; +}; + +struct html_comment { + /* this must be first */ + html_token_type token_type; + + char data[8]; + short len; +}; + +struct html_char { + /* this must be first */ + html_token_type token_type; + + char c; +}; + +struct html_doctype { + /* this must be first */ + html_token_type _pad; + + char name[32]; + short name_len; + char public_identifier[32]; + short public_identifier_len; + char system_identifier[32]; + short 
/*
 * All tokenizer, tree-builder and rendering state for one HTML document.
 * html_init_page() allocates it zero-filled and html_xfree() releases it.
 */
struct html_page {
	/*
	 * opaque caller pointer stored by html_init_page(); presumably handed
	 * back as the first argument of html_output()/html_have_title()
	 */
	void *cookie;

	/* position printed by html_parse_error()'s debug message */
	size_t input_pos;
	/* once set, html_parse() stops consuming input and returns false */
	bool eof;

	/* insertion mode */
	/* html_init_page() starts mode at HTML_MODE_INITIAL */
	html_mode mode;
	html_mode original_mode;

	/* html_init_page() starts state at HTML_STATE_DATA */
	html_state state;
	html_state return_state;

	html_error error;

	/* html_xfree() frees escaped_buf when it is non-NULL */
	char *escaped_buf;
	/* presumably the allocated size of escaped_buf - TODO confirm */
	size_t escaped_size;

	/*
	 * previous input byte was '\r'; html_parse() uses this to normalize
	 * CR and CRLF into a single '\n'
	 */
	bool parse_last_cr;
	/* html_init_page() sets this to true */
	bool frameset_ok;
	bool parser_cannot_change_mode;
	bool foster_parenting;
	bool quirks_mode;

	/* rendering */
	bool render_in_body;
	short render_list_depth;
	/* html_output() stores the final byte of each chunk it appends */
	char last_output;
	bool last_margin_top;
	bool last_margin_bottom;

	/* configurables */
	/* http.c's html_parse_page() sets both ignore_* flags to true */
	bool ignore_script_data;
	bool ignore_comment_data;
	bool scripting;

	/* if the next character token should be skipped if it's \n */
	bool skip_newline_char_token;

	/* "stack of open elements" */
	/* html_output() scans open[0..open_count) to derive the text style */
	struct html_element *open[HTML_STACK_DEPTH];
	short open_count;
	struct html_element *current_node;
	struct html_element *need_free_list;
	struct html_element *need_free_tail;

	/* https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements */
	struct html_formatting active_formatting[HTML_STACK_DEPTH];
	short active_formatting_count;

	/* https://html.spec.whatwg.org/multipage/parsing.html#the-element-pointers */
	struct html_element *head;
	struct html_element *form;

	/*
	 * html_init_page() presets new_token.doctype's public/system
	 * identifier lengths to -1
	 */
	union html_token new_token;

	/* we'll queue some characters up before actually parsing */
	char lookahead[HTML_LOOKAHEAD_SIZE];
	unsigned char lookahead_len;

	/* some tokens need a temporary buffer to store text */
	char tmp[128];
	unsigned char tmp_len;
};
attribute_mode); +void html_emit_char_token(struct html_page *html, short cc); +void html_emit_eof_token(struct html_page *html); +void html_emit_comment(struct html_page *html, struct html_comment *comment); --- http.c Thu Nov 21 16:27:58 2024 +++ http.c Thu Dec 12 21:38:17 2024 @@ -19,6 +19,7 @@ #include <string.h> #include "detritus.h" +#include "html.h" #define HTTP_REQUEST_BUF_SIZE 512 @@ -28,8 +29,9 @@ enum { PARSE_STATE_DOWNLOAD }; -extern bool html_print(struct page *page); -extern void html_free(struct page *page); +struct http_page { + struct html_page *html; +}; bool http_accept_uri(struct URI *uri); bool http_request_init(page_handle pageh); @@ -37,6 +39,7 @@ bool http_process(page_handle pageh); void http_reset(page_handle pageh); void http_free(page_handle pageh); +bool html_parse_page(page_handle pageh); static void print_plaintext(struct page *page); struct page_handler http_handler = { @@ -146,6 +149,7 @@ http_process(page_handle pageh) strncasecmp(page->content_type, "text/html", 9) == 0) { page->parse_state = PARSE_STATE_BODY; browser_commit_to_loading_page(page->browser); + TVTabStop(page->browser->output_tv, 28); } else { page->parse_state = PARSE_STATE_DOWNLOAD; @@ -172,13 +176,48 @@ http_process(page_handle pageh) return true; if (strncasecmp(page->content_type, "text/html", 9) == 0) { - html_print(page); + html_parse_page(pageh); return PAGE_CAN_READ_MORE(page); } return page_print_plaintext(pageh); } +bool +html_parse_page(page_handle pageh) +{ + struct page *page = *pageh; + struct html_page *html; + size_t len; + + if (page->handler_cookie == NULL) { + html = html_init_page(pageh); + if (html == NULL) { + warn("Out of memory"); + return false; + } + html->ignore_script_data = true; + html->ignore_comment_data = true; + page->handler_cookie = html; + } else + html = (struct html_page *)page->handler_cookie; + + len = page->content_len - page->content_pos; + if (len) { + html_parse(html, page->content + page->content_pos, len); + 
TVUpdateScrollbar(page->browser->output_tv, + page->browser->output_tv_scroller); + page->content_pos += len; + return true; + } + + if (PAGE_CAN_READ_MORE(page)) + return true; + + html_page_finish(&html); + return false; +} + void http_reset(page_handle pageh) { @@ -187,6 +226,9 @@ http_reset(page_handle pageh) /* restart at body */ page->parse_state = PARSE_STATE_BODY; page->content_pos = page->header_len; + + if (page->handler_cookie != NULL) + html_xfree((struct html_page **)&page->handler_cookie); } void @@ -194,6 +236,154 @@ http_free(page_handle pageh) { struct page *page = *pageh; - if (page->handler_cookie) - html_free(page); -} + if (page->handler_cookie != NULL) + html_xfree((struct html_page **)&page->handler_cookie); +} + +void +html_output(void *cookie, struct html_page *html, char *str, size_t len) +{ + struct page *page = *((page_handle)cookie); + struct TVStyle style = { 0 }; + short n; + + style.font = geneva; + style.size = 10; + style.style = 0; + + for (n = 0; n < html->open_count; n++) { + switch (html->open[n]->type) { + case HTML_TAG_A: + style.style |= underline; + break; + case HTML_TAG_ADDRESS: + style.style |= italic; + break; + case HTML_TAG_B: + style.style |= bold | condense; + break; + case HTML_TAG_CITE: + style.style |= italic; + break; + case HTML_TAG_CODE: + style.font = monaco; + style.size = 9; + break; + case HTML_TAG_DFN: + style.style |= italic; + break; + case HTML_TAG_EM: + style.style |= italic; + break; + case HTML_TAG_H1: + /* 2em */ + style.size = 16; + style.style |= bold | condense; + break; + case HTML_TAG_H2: + /* 1.5em */ + style.size = 14; + style.style |= bold | condense; + break; + case HTML_TAG_H3: + /* 1.17em */ + style.size = 12; + style.style |= bold | condense; + break; + case HTML_TAG_H4: + /* 1em */ + style.size = 10; + style.style |= bold | condense; + break; + case HTML_TAG_H5: + /* 0.83em */ + style.size = 8; + style.style |= bold | condense; + break; + case HTML_TAG_H6: + /* 0.67em */ + style.size = 
8; + style.style |= bold | condense; + break; + case HTML_TAG_I: + style.style |= italic; + break; + case HTML_TAG_INS: + style.style |= underline; + break; + case HTML_TAG_KBD: + style.font = monaco; + style.size = 9; + break; + case HTML_TAG_PRE: + style.font = monaco; + style.size = 9; + break; + case HTML_TAG_S: + /* TODO: line-through */ + break; + case HTML_TAG_SAMP: + style.font = monaco; + style.size = 9; + break; + case HTML_TAG_SMALL: + style.size -= 2; + break; + case HTML_TAG_STRIKE: + /* TODO: line-through */ + break; + case HTML_TAG_STRONG: + style.style |= bold | condense; + break; + case HTML_TAG_SUP: + style.size -= 2; + break; + case HTML_TAG_TH: + style.style |= bold | condense; + break; + case HTML_TAG_U: + style.style |= underline; + break; + case HTML_TAG_VAR: + style.style |= italic; + break; + } + } + + if (style.size < 8) + style.size = 8; + + if (!TVAppend(page->browser->output_tv, &style, str, len)) + panic("out of memory in TVAppend"); + + html->last_output = str[len - 1]; +} + +void +html_output_new_line(void *cookie, struct html_page *html) +{ + struct page *page = *((page_handle)cookie); + struct TVStyle style = { 0 }; + + style.font = geneva; + style.size = 10; + style.style = 0; + + if (!TVAppend(page->browser->output_tv, &style, "\r", 1)) + panic("out of memory in TVAppend"); + + html->last_output = '\r'; +} + +void +html_have_title(void *cookie, struct html_page *html, char *str, size_t len) +{ + Str255 pstr; + struct page *page = *((page_handle)cookie); + short plen; + + plen = MIN(len, 255); + memcpy((char *)pstr + 1, str, len); + pstr[0] = (unsigned char)plen; + SetWTitle(page->browser->win, pstr); +} \ No newline at end of file