/* * Copyright (c) 2024 joshua stein * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* * Glue for tying browser_page to the tokenizer, and then on the other end * handling tags and text output by the tree builder. */ #include #include #include #include "html.h" #ifdef HTML_ENABLE #ifdef HTML_ENABLE_DEBUGGING struct html_page *the_html = NULL; #endif struct html_page * html_init_page(void *cookie) { struct html_page *html; /* sanity check */ if (strcmp(html_tag_names[HTML_TAG_XMP], "xmp") != 0) panic("html_tag_names is out of sync with HTML_TAGs"); html = xmalloczero(sizeof(struct html_page)); if (html == NULL) return NULL; html->cookie = cookie; html->mode = HTML_MODE_INITIAL; html->state = HTML_STATE_DATA; html->frameset_ok = true; html->new_token.doctype.public_identifier_len = -1; html->new_token.doctype.system_identifier_len = -1; #ifdef HTML_ENABLE_DEBUGGING the_html = html; #endif return html; } bool html_parse(struct html_page *html, char *str, size_t len) { size_t n; register char cc; for (n = 0; n < len; n++) { cc = str[n]; /* https://infra.spec.whatwg.org/#normalize-newlines */ if (html->parse_last_cr) { html->parse_last_cr = false; if (cc != '\n') { cc = '\n'; n--; } } if (cc == '\r') { html->parse_last_cr = true; continue; } html_tokenize(html, cc); if (html->eof) { HTML_DEBUG(("\rEOF\r")); break; } } if (html->eof) return false; return true; } void html_page_finish(struct html_page **htmlp) { struct html_page *html = *htmlp; html_tokenize_finish(html); html_xfree(htmlp); } void html_xfree(struct html_page **htmlp) { struct html_page *html = *htmlp; if (html->escaped_buf) xfree(&html->escaped_buf); xfree(&html); } void html_parse_error(struct html_page *html) { HTML_DEBUG((": [[PARSE ERROR at %d]]", html->input_pos)); } #if 0 void html_emit_token(struct html_page *html, html_token *token) { /* * html_tokenize handles each byte of html and runs it through the state * machine, possibly emitting a token to us here. * * https://html.spec.whatwg.org/multipage/parsing.html#tree-construction */ /* * At some point we might collect tags and proces them all at once, or * maybe just keep a buffer of a few, before handing them off to the tree * constructor. * * But for now, just feed them all to the tree constructor as soon as we get * them. */ html_process_token(html, token); } #endif void html_insert_character(struct html_page *html, short cc) { register unsigned char c = cc; if (html->current_node == NULL) { Debugger(); return; } if (html->current_node->text == NULL) { html->current_node->text_size = HTML_TAG_TEXT_CHUNK_SIZE; html->current_node->text = xmalloc(HTML_TAG_TEXT_CHUNK_SIZE); } else if (html->current_node->text_len >= html->current_node->text_size) { html->current_node->text_size += HTML_TAG_TEXT_CHUNK_SIZE; html->current_node->text = xrealloc(html->current_node->text, html->current_node->text_size); } if (html->current_node->text == NULL) panic("OOM"); if (html->current_node->type == HTML_TAG_TEXTAREA || html->current_node->type == HTML_TAG_PRE) { /* TODO: still remove leading newlines */ goto append; } if (c == '\t' || c == '\n' || c == '\f' || c == '\r') c = ' '; if (c == ' ') { /* collapse multiple whitespaces */ if (html->current_node->text_len && html->current_node->text[ html->current_node->text_len - 1] == ' ') return; } append: html->current_node->text[html->current_node->text_len++] = c; } bool html_is_block_tag(struct html_page *html, html_tag_type tag) { /* https://html.spec.whatwg.org/multipage/sections.html#sections */ switch (tag) { case HTML_TAG_ADDRESS: case HTML_TAG_ARTICLE: case HTML_TAG_ASIDE: case HTML_TAG_BLOCKQUOTE: case HTML_TAG_BODY: case HTML_TAG_DD: case HTML_TAG_DIV: case HTML_TAG_DL: case HTML_TAG_DT: case HTML_TAG_FIGCAPTION: case HTML_TAG_FIGURE: case HTML_TAG_FOOTER: case HTML_TAG_H1: case HTML_TAG_H2: case HTML_TAG_H3: case HTML_TAG_H4: case HTML_TAG_H5: case HTML_TAG_H6: case HTML_TAG_HEADER: case HTML_TAG_HGROUP: case HTML_TAG_HR: case HTML_TAG_LI: case HTML_TAG_MAIN: case HTML_TAG_MENU: case HTML_TAG_NAV: case HTML_TAG_OL: case HTML_TAG_P: case HTML_TAG_PRE: case HTML_TAG_SEARCH: case HTML_TAG_SECTION: case HTML_TAG_UL: return true; case HTML_TAG_CENTER: return true; default: return false; } } long html_get_attribute_value(struct html_page *html, struct html_element *element, char *name, char **ret) { short n, namelen; namelen = strlen(name); for (n = 0; n < element->attrs_count; n++) { if (element->attrs[n].name_len != namelen) continue; if (strcasecmp(element->attrs[n].name, name) == 0) { *ret = (char *)&element->attrs[n].val; return element->attrs[n].val_len; } } *ret = NULL; return 0; } void html_render_current_node(struct html_page *html, bool popping) { struct html_element *el = html->current_node; struct html_element *list_parent; short n, len; char ol_li[10]; char *val; bool have_height = false; bool found; el->renders++; /* trim trailing whitespace */ if (popping) { while (el->text_len && el->text[el->text_len - 1] == ' ') el->text_len--; } /* ignore non-title tags before */ if (!html->render_in_body) { for (n = 0; n < html->open_count; n++) { if (html->open[n]->type == HTML_TAG_BODY) { html->render_in_body = true; break; } if (n == html->open_count - 1) { if (el->type == HTML_TAG_TITLE) html_have_title(html->cookie, html, el->text, el->text_len); return; } } } if (el->renders == 1) { /* block elements should start on a new line */ if (html->last_output != '\r' && html->last_output != '\0' && html_is_block_tag(html, el->type)) { HTML_DEBUG(("[block-separate:%s\\r]", html_tag_names[el->type])); html_output(html->cookie, html, "\r", 1); } /* if the element has a top margin, add more space */ if (el->margin_top) { /* unless the last element had a bottom margin */ if (html->last_margin_bottom || html->last_output == '\0') { HTML_DEBUG(("[margin-top-but-merging:%s]", html_tag_names[el->type])); } else { HTML_DEBUG(("[margin-top:%s\\r]", html_tag_names[el->type])); html_output_margin(html->cookie, html); } html->last_margin_bottom = 0; } html->last_margin_top = el->margin_top; switch (el->type) { case HTML_TAG_OL: case HTML_TAG_UL: html->render_list_depth++; break; case HTML_TAG_INPUT: have_height = true; html_output_field(html->cookie, html, el); break; case HTML_TAG_IMG: have_height = true; html_output(html->cookie, html, "[ img: ", 7); /* show img alt text */ len = html_get_attribute_value(html, el, "alt", &val); if (!val || !len) /* try img title */ len = html_get_attribute_value(html, el, "title", &val); if (val && len) html_output(html->cookie, html, val, len); else { /* last resort, show img src filename */ len = html_get_attribute_value(html, el, "src", &val); if (val && len) { for (n = len; n >= 0; n--) { if (val[n] == '/') { html_output(html->cookie, html, val + n + 1, len - n - 1); break; } } } } html_output(html->cookie, html, " ]", 2); break; } } /* remove leading whitespace */ if (el->text_len && (html->last_output == ' ' || html->last_output == '\r' || html->last_output == '\0')) { while (el->text_len && el->text[el->text_off] == ' ') { el->text_off++; el->text_len--; } } if (html->render_list_depth) { if (el->type == HTML_TAG_LI && el->renders == 1) { for (n = 1; n < html->render_list_depth; n++) html_output(html->cookie, html, "\t", 1); list_parent = NULL; for (n = html->open_count - 1; n >= 0; n--) { if (html->open[n]->type == HTML_TAG_OL || html->open[n]->type == HTML_TAG_UL) { list_parent = html->open[n]; break; } } if (list_parent && list_parent->type == HTML_TAG_UL) { if (html->render_list_depth == 1) html_output(html->cookie, html, " ¥\t", 5); else if (html->render_list_depth == 2) html_output(html->cookie, html, " o\t", 5); else html_output(html->cookie, html, " ×\t", 5); } else if (list_parent && list_parent->type == HTML_TAG_OL) { list_parent->ol_count++; len = snprintf(ol_li, sizeof(ol_li), "% 4d.\t", list_parent->ol_count); html_output(html->cookie, html, ol_li, len); } html->last_output = ' '; have_height = true; } else if (el->text_len) { /* in a list but not a direct child of
  • , what are we in? */ for (n = html->open_count - 1; n >= 0; n--) { if (html->open[n]->type == HTML_TAG_OL || html->open[n]->type == HTML_TAG_UL) { /* text in root of list not in an li, ident it */ for (n = 0; n < html->render_list_depth; n++) html_output(html->cookie, html, "\t", 1); break; } if (html->open[n]->type == HTML_TAG_LI) { if (html->last_output == '\r') { /* text after a
    inside an
  • , re-indent */ for (n = 0; n < html->render_list_depth; n++) html_output(html->cookie, html, "\t", 1); } break; } } html->last_output = ' '; } } /* print inner text */ if (el->text_len) { html_output(html->cookie, html, el->text + el->text_off, el->text_len); have_height = true; } /* brrrr */ if (el->type == HTML_TAG_BR) { HTML_DEBUG(("[br\\r]")); html_output(html->cookie, html, "\r", 1); have_height = true; } /* mark this block (or its nearest parent block) as having height */ if (have_height) { if (html_is_block_tag(html, el->type)) el->has_height = true; else { /* find parent block */ for (n = html->open_count - 1; n >= 0; n--) { if (html_is_block_tag(html, html->open[n]->type)) { html->open[n]->has_height = true; break; } } } } if (popping) { /* block elements that had text (or br) get a separating newline */ if (el->has_height && !(el->type == HTML_TAG_OL || el->type == HTML_TAG_UL) && !(el->type == HTML_TAG_LI && html->last_output == '\r')) { HTML_DEBUG(("[end-block:/%s\\r]", html_tag_names[el->type])); html_output(html->cookie, html, "\r", 1); } if (el->margin_bottom) { /* unless the last element had a bottom margin */ if (!html->last_margin_bottom) { HTML_DEBUG(("[margin-bottom\\r]")); html_output_margin(html->cookie, html); html->last_margin_bottom = el->margin_bottom; } } if (el->has_height) { HTML_DEBUG(("[new-last-margin-bottom:%d]", el->margin_bottom)); html->last_margin_bottom = el->margin_bottom; } if (el->type == HTML_TAG_OL || el->type == HTML_TAG_UL) html->render_list_depth--; } el->text_off = 0; el->text_len = 0; } #ifdef HTML_ENABLE_DEBUGGING void html_debug(const char *fmt, ...) { static char buf[512]; size_t len; va_list args; va_start(args, fmt); len = vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); if (len > sizeof(buf)) len = sizeof(buf); html_output(the_html->cookie, the_html, buf, len); } #endif #endif /* HTML_ENABLE */