AmendHub

Download

jcs

/

detritus

/

html_tokenize.c

 

(View History)

jcs   html: Put all of this behind HTML_ENABLE Latest amendment: 68 on 2025-03-04

1 /*
2 * Copyright (c) 2024 joshua stein <jcs@jcs.org>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 /*
18 * Tokenization
19 * https://html.spec.whatwg.org/multipage/parsing.html#tokenization
20 *
21 * Handles characters output from html_parse() and turns them into tokens,
22 * which are emitted to the tree builder.
23 */
24
25 #include "html.h"
26
27 #ifdef HTML_ENABLE
28
29 void html_tokenize(struct html_page *html, short cc);
30 bool html_appropriate_end_tag_token(struct html_page *html, html_token *token);
31 html_tag_type html_find_tag_type(char *tag_name);
32 void html_lookahead_consume(struct html_page *html, short count);
33
34 void
35 html_lookahead_consume(struct html_page *html, short count)
36 {
37 short n, j;
38
39 for (n = 0; n < count && html->lookahead_len; n++) {
40 HTML_DEBUG((": consuming '%c' from lookahead", html->lookahead[0]));
41 for (j = 0; j < HTML_LOOKAHEAD_SIZE - 1; j++)
42 html->lookahead[j] = html->lookahead[j + 1];
43 html->lookahead_len--;
44 }
45 }
46
47 void
48 html_tokenize(struct html_page *html, short cc)
49 {
50 html_state was_state;
51 struct html_attr *attr;
52 const html_entity *found_entity;
53 short tcc, n, j, i;
54
55 was_state = html->state;
56
57 if (html->lookahead_len < HTML_LOOKAHEAD_SIZE && cc != EOF) {
58 /* fill lookahead */
59 html->lookahead[html->lookahead_len++] = cc;
60 return;
61 }
62
63 if (html->lookahead_len) {
64 /* take a character from the head of lookahead and shift down */
65 tcc = html->lookahead[0];
66 for (n = 0; n < HTML_LOOKAHEAD_SIZE - 1; n++)
67 html->lookahead[n] = html->lookahead[n + 1];
68 if (cc == EOF) {
69 if (html->lookahead_len)
70 html->lookahead_len--;
71 } else
72 html->lookahead[HTML_LOOKAHEAD_SIZE - 1] = cc;
73 cc = tcc;
74 }
75
76 #ifdef HTML_ENABLE_DEBUGGING
77 HTML_DEBUG(("pos % 4ld:", html->input_pos++));
78
79 if (cc == '\n')
80 HTML_DEBUG((" \\n"));
81 else if (cc == '\r')
82 HTML_DEBUG((" \\r"));
83 else if (cc == '\t')
84 HTML_DEBUG((" \\t"));
85 else if (cc == '\f')
86 HTML_DEBUG((" \\f"));
87 else if (cc == '\0')
88 HTML_DEBUG((" \\0"));
89 else if (cc == ' ')
90 HTML_DEBUG((" "));
91 else if (cc == EOF)
92 HTML_DEBUG(("EOF"));
93 else
94 HTML_DEBUG((" %c", cc));
95
96 HTML_DEBUG((": state %s", html_state_names[html->state]));
97 #endif
98
99 was_state = html->state;
100
101 reconsume:
102 if (html->state != was_state) {
103 HTML_DEBUG((": reconsume as %s", html_state_names[html->state]));
104 was_state = html->state;
105 }
106
107 switch (html->state) {
108 case HTML_STATE_DATA:
109 switch (cc) {
110 case '&':
111 html->return_state = html->state;
112 html->tmp_len = 0;
113 html->state = HTML_STATE_CHARACTER_REFERENCE;
114 break;
115 case '<':
116 html->state = HTML_STATE_TAG_OPEN;
117 break;
118 case '\0':
119 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
120 html_emit_char_token(html, cc);
121 break;
122 case EOF:
123 html_emit_eof_token(html);
124 break;
125 default:
126 html_emit_char_token(html, cc);
127 break;
128 }
129 break;
130 case HTML_STATE_RCDATA:
131 switch (cc) {
132 case '&':
133 html->return_state = html->state;
134 html->tmp_len = 0;
135 html->state = HTML_STATE_CHARACTER_REFERENCE;
136 break;
137 case '<':
138 html->state = HTML_STATE_RCDATA_LESS_THAN_SIGN;
139 break;
140 case '\0':
141 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
142 html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
143 break;
144 case EOF:
145 html_emit_eof_token(html);
146 break;
147 default:
148 html_emit_char_token(html, cc);
149 break;
150 }
151 break;
152 case HTML_STATE_RAWTEXT:
153 switch (cc) {
154 case '<':
155 html->state = HTML_STATE_RAWTEXT_LESS_THAN_SIGN;
156 break;
157 case '\0':
158 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
159 html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
160 break;
161 case EOF:
162 html_emit_eof_token(html);
163 break;
164 default:
165 html_emit_char_token(html, cc);
166 break;
167 }
168 break;
169 case HTML_STATE_SCRIPT_DATA:
170 switch (cc) {
171 case '<':
172 html->state = HTML_STATE_SCRIPT_DATA_LESS_THAN_SIGN;
173 break;
174 case '\0':
175 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
176 if (!html->ignore_script_data)
177 html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
178 break;
179 case EOF:
180 html_emit_eof_token(html);
181 break;
182 default:
183 if (!html->ignore_script_data)
184 html_emit_char_token(html, cc);
185 break;
186 }
187 break;
188 case HTML_STATE_PLAINTEXT:
189 switch (cc) {
190 case '\0':
191 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
192 html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
193 break;
194 case EOF:
195 html_emit_eof_token(html);
196 break;
197 default:
198 html_emit_char_token(html, cc);
199 break;
200 }
201 break;
202 case HTML_STATE_TAG_OPEN:
203 switch (cc) {
204 case '!':
205 html->state = HTML_STATE_MARKUP_DECLARATION_OPEN;
206 html->tmp_len = 0;
207 break;
208 case '/':
209 html->state = HTML_STATE_END_TAG_OPEN;
210 break;
211 case '?':
212 html->error =
213 HTML_ERROR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME;
214 html_emit_comment(html, &html->new_token.comment);
215 html->state = HTML_STATE_BOGUS_COMMENT;
216 goto reconsume;
217 case EOF:
218 html->error = HTML_ERROR_EOF_BEFORE_TAG_NAME;
219 html_emit_char_token(html, '<');
220 html_emit_eof_token(html);
221 break;
222 default:
223 if (IS_ALPHA(cc)) {
224 html_prep_new_token(html, HTML_TOKEN_START_TAG);
225 html->state = HTML_STATE_TAG_NAME;
226 goto reconsume;
227 }
228 html->error = HTML_ERROR_INVALID_FIRST_CHARACTER_OF_TAG_NAME;
229 html_emit_char_token(html, '<');
230 html->state = HTML_STATE_DATA;
231 goto reconsume;
232 }
233 break;
234 case HTML_STATE_END_TAG_OPEN:
235 switch (cc) {
236 case '>':
237 html->error = HTML_ERROR_MISSING_END_TAG_NAME;
238 html->state = HTML_STATE_DATA;
239 break;
240 case EOF:
241 html->error = HTML_ERROR_EOF_BEFORE_TAG_NAME;
242 html_emit_char_token(html, '<');
243 html_emit_char_token(html, '/');
244 html_emit_eof_token(html);
245 break;
246 default:
247 if (IS_ALPHA(cc)) {
248 html_prep_new_token(html, HTML_TOKEN_END_TAG);
249 html->state = HTML_STATE_TAG_NAME;
250 goto reconsume;
251 }
252 html->error = HTML_ERROR_INVALID_FIRST_CHARACTER_OF_TAG_NAME;
253 html_prep_new_token(html, HTML_TOKEN_COMMENT);
254 html->state = HTML_STATE_BOGUS_COMMENT;
255 goto reconsume;
256 }
257 break;
258 case HTML_STATE_TAG_NAME:
259 switch (cc) {
260 case '\t':
261 case '\n':
262 case '\f':
263 case ' ':
264 html->new_token.tag.type =
265 html_find_tag_type(html->new_token.tag.name);
266 html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
267 break;
268 case '/':
269 html->new_token.tag.type =
270 html_find_tag_type(html->new_token.tag.name);
271 html->state = HTML_STATE_SELF_CLOSING_START_TAG;
272 break;
273 case '>':
274 html->state = HTML_STATE_DATA;
275 html->new_token.tag.type =
276 html_find_tag_type(html->new_token.tag.name);
277 html_emit_token(html, &html->new_token);
278 break;
279 case '\0':
280 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
281 STR_APPEND(html->new_token.tag.name,
282 html->new_token.tag.name_len, HTML_REPLACEMENT_CHARACTER);
283 html->new_token.tag.type = 0;
284 break;
285 case EOF:
286 html->error = HTML_ERROR_EOF_IN_TAG;
287 html_emit_eof_token(html);
288 break;
289 default:
290 if (IS_UPPER_ALPHA(cc))
291 cc += 0x20;
292 STR_APPEND(html->new_token.tag.name,
293 html->new_token.tag.name_len, cc);
294 break;
295 }
296 break;
297 case HTML_STATE_RCDATA_LESS_THAN_SIGN:
298 switch (cc) {
299 case '/':
300 html->state = HTML_STATE_RCDATA_END_TAG_OPEN;
301 html->tmp_len = 0;
302 break;
303 default:
304 html->state = HTML_STATE_RCDATA;
305 if (!html->ignore_comment_data)
306 html_emit_char_token(html, '<');
307 goto reconsume;
308 }
309 break;
310 case HTML_STATE_RCDATA_END_TAG_OPEN:
311 if (IS_ALPHA(cc)) {
312 html_prep_new_token(html, HTML_TOKEN_END_TAG);
313 html->state = HTML_STATE_RCDATA_END_TAG_NAME;
314 goto reconsume;
315 }
316 if (!html->ignore_comment_data) {
317 html_emit_char_token(html, '<');
318 html_emit_char_token(html, '/');
319 }
320 html->state = HTML_STATE_RCDATA;
321 goto reconsume;
322 case HTML_STATE_RCDATA_END_TAG_NAME:
323 switch (cc) {
324 case '\t':
325 case '\n':
326 case '\f':
327 case ' ':
328 if (!html_appropriate_end_tag_token(html, &html->new_token))
329 goto HTML_STATE_RCDATA_END_TAG_NAME_anything_else;
330 html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
331 break;
332 case '/':
333 if (!html_appropriate_end_tag_token(html, &html->new_token))
334 goto HTML_STATE_RCDATA_END_TAG_NAME_anything_else;
335 html->state = HTML_STATE_SELF_CLOSING_START_TAG;
336 break;
337 case '>':
338 if (!html_appropriate_end_tag_token(html, &html->new_token))
339 goto HTML_STATE_RCDATA_END_TAG_NAME_anything_else;
340 html->state = HTML_STATE_DATA;
341 html_emit_token(html, &html->new_token);
342 break;
343 default:
344 if (IS_UPPER_ALPHA(cc))
345 cc += 0x20;
346 if (IS_LOWER_ALPHA(cc)) {
347 STR_APPEND(html->new_token.tag.name,
348 html->new_token.tag.name_len, cc);
349 STR_APPEND(html->tmp, html->tmp_len, cc);
350 break;
351 }
352 /* FALLTHROUGH */
353 HTML_STATE_RCDATA_END_TAG_NAME_anything_else:
354 if (!html->ignore_comment_data) {
355 html_emit_char_token(html, '<');
356 html_emit_char_token(html, '/');
357 for (n = 0; n < html->tmp_len; n++)
358 html_emit_char_token(html, html->tmp[n]);
359 }
360 html->state = HTML_STATE_RCDATA;
361 goto reconsume;
362 }
363 break;
364 case HTML_STATE_RAWTEXT_LESS_THAN_SIGN:
365 switch (cc) {
366 case '/':
367 html->tmp_len = 0;
368 html->state = HTML_STATE_RAWTEXT_END_TAG_OPEN;
369 break;
370 default:
371 if (!html->ignore_comment_data) {
372 html_emit_char_token(html, '<');
373 }
374 html->state = HTML_STATE_RAWTEXT;
375 goto reconsume;
376 }
377 break;
378 case HTML_STATE_RAWTEXT_END_TAG_OPEN:
379 if (IS_ALPHA(cc)) {
380 html_prep_new_token(html, HTML_TOKEN_END_TAG);
381 html->state = HTML_STATE_RAWTEXT_END_TAG_NAME;
382 goto reconsume;
383 }
384 if (!html->ignore_comment_data) {
385 html_emit_char_token(html, '<');
386 html_emit_char_token(html, '/');
387 }
388 html->state = HTML_STATE_RAWTEXT;
389 goto reconsume;
390 case HTML_STATE_RAWTEXT_END_TAG_NAME:
391 switch (cc) {
392 case '\t':
393 case '\n':
394 case '\f':
395 case ' ':
396 if (!html_appropriate_end_tag_token(html, &html->new_token))
397 goto HTML_STATE_RAWTEXT_END_TAG_NAME_anything_else;
398 html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
399 break;
400 case '/':
401 if (!html_appropriate_end_tag_token(html, &html->new_token))
402 goto HTML_STATE_RAWTEXT_END_TAG_NAME_anything_else;
403 html->state = HTML_STATE_SELF_CLOSING_START_TAG;
404 break;
405 case '>':
406 if (!html_appropriate_end_tag_token(html, &html->new_token))
407 goto HTML_STATE_RAWTEXT_END_TAG_NAME_anything_else;
408 html->state = HTML_STATE_DATA;
409 html_emit_token(html, &html->new_token);
410 break;
411 default:
412 if (IS_UPPER_ALPHA(cc))
413 cc += 0x20;
414 if (IS_LOWER_ALPHA(cc)) {
415 STR_APPEND(html->new_token.tag.name,
416 html->new_token.tag.name_len, cc);
417 STR_APPEND(html->tmp, html->tmp_len, cc);
418 break;
419 }
420 /* FALLTHROUGH */
421 HTML_STATE_RAWTEXT_END_TAG_NAME_anything_else:
422 if (!html->ignore_comment_data) {
423 html_emit_char_token(html, '<');
424 html_emit_char_token(html, '/');
425 for (n = 0; n < html->tmp_len; n++)
426 html_emit_char_token(html, html->tmp[n]);
427 }
428 html->state = HTML_STATE_RAWTEXT;
429 goto reconsume;
430 }
431 break;
432 case HTML_STATE_SCRIPT_DATA_LESS_THAN_SIGN:
433 switch (cc) {
434 case '/':
435 html->tmp_len = 0;
436 html->state = HTML_STATE_SCRIPT_DATA_END_TAG_OPEN;
437 break;
438 case '!':
439 html->state = HTML_STATE_SCRIPT_DATA_ESCAPE_START;
440 if (!html->ignore_comment_data) {
441 html_emit_char_token(html, '<');
442 html_emit_char_token(html, '!');
443 }
444 break;
445 default:
446 if (!html->ignore_comment_data) {
447 html_emit_char_token(html, '<');
448 }
449 html->state = HTML_STATE_SCRIPT_DATA;
450 goto reconsume;
451 }
452 break;
453 case HTML_STATE_SCRIPT_DATA_END_TAG_OPEN:
454 if (IS_ALPHA(cc)) {
455 html_prep_new_token(html, HTML_TOKEN_END_TAG);
456 html->state = HTML_STATE_SCRIPT_DATA_END_TAG_NAME;
457 goto reconsume;
458 }
459 if (!html->ignore_script_data) {
460 html_emit_char_token(html, '<');
461 html_emit_char_token(html, '/');
462 }
463 html->state = HTML_STATE_SCRIPT_DATA;
464 goto reconsume;
465 case HTML_STATE_SCRIPT_DATA_END_TAG_NAME:
466 switch (cc) {
467 case '\t':
468 case '\n':
469 case '\f':
470 case ' ':
471 if (!html_appropriate_end_tag_token(html, &html->new_token))
472 goto HTML_STATE_SCRIPT_DATA_END_TAG_NAME_anything_else;
473 html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
474 break;
475 case '/':
476 if (!html_appropriate_end_tag_token(html, &html->new_token))
477 goto HTML_STATE_SCRIPT_DATA_END_TAG_NAME_anything_else;
478 html->state = HTML_STATE_SELF_CLOSING_START_TAG;
479 break;
480 case '>':
481 if (!html_appropriate_end_tag_token(html, &html->new_token))
482 goto HTML_STATE_SCRIPT_DATA_END_TAG_NAME_anything_else;
483 html->state = HTML_STATE_DATA;
484 html_emit_token(html, &html->new_token);
485 break;
486 default:
487 if (IS_UPPER_ALPHA(cc))
488 cc += 0x20;
489 if (IS_LOWER_ALPHA(cc)) {
490 STR_APPEND(html->new_token.tag.name,
491 html->new_token.tag.name_len, cc);
492 STR_APPEND(html->tmp, html->tmp_len, cc);
493 break;
494 }
495 /* FALLTHROUGH */
496 HTML_STATE_SCRIPT_DATA_END_TAG_NAME_anything_else:
497 if (!html->ignore_script_data) {
498 html_emit_char_token(html, '<');
499 html_emit_char_token(html, '/');
500 for (n = 0; n < html->tmp_len; n++)
501 html_emit_char_token(html, html->tmp[n]);
502 }
503 html->state = HTML_STATE_SCRIPT_DATA;
504 goto reconsume;
505 }
506 break;
507 case HTML_STATE_SCRIPT_DATA_ESCAPE_START:
508 switch (cc) {
509 case '-':
510 html->state = HTML_STATE_SCRIPT_DATA_ESCAPE_START_DASH;
511 if (!html->ignore_script_data) {
512 html_emit_char_token(html, '-');
513 }
514 break;
515 default:
516 html->state = HTML_STATE_SCRIPT_DATA;
517 goto reconsume;
518 }
519 break;
520 case HTML_STATE_SCRIPT_DATA_ESCAPE_START_DASH:
521 switch (cc) {
522 case '-':
523 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH;
524 if (!html->ignore_script_data) {
525 html_emit_char_token(html, '-');
526 }
527 break;
528 default:
529 html->state = HTML_STATE_SCRIPT_DATA;
530 goto reconsume;
531 }
532 break;
533 case HTML_STATE_SCRIPT_DATA_ESCAPED:
534 switch (cc) {
535 case '-':
536 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_DASH;
537 if (!html->ignore_script_data) {
538 html_emit_char_token(html, '-');
539 }
540 break;
541 case '<':
542 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
543 break;
544 case '\0':
545 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
546 if (!html->ignore_script_data) {
547 html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
548 }
549 break;
550 case EOF:
551 html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT;
552 html_emit_eof_token(html);
553 break;
554 default:
555 if (!html->ignore_script_data) {
556 html_emit_char_token(html, cc);
557 }
558 break;
559 }
560 break;
561 case HTML_STATE_SCRIPT_DATA_ESCAPED_DASH:
562 switch (cc) {
563 case '-':
564 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH;
565 if (!html->ignore_script_data) {
566 html_emit_char_token(html, '-');
567 }
568 break;
569 case '<':
570 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
571 break;
572 case '\0':
573 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
574 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
575 if (!html->ignore_script_data) {
576 html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
577 }
578 break;
579 case EOF:
580 html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT;
581 html_emit_eof_token(html);
582 break;
583 default:
584 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
585 if (!html->ignore_script_data) {
586 html_emit_char_token(html, cc);
587 }
588 break;
589 }
590 break;
591 case HTML_STATE_SCRIPT_DATA_ESCAPED_DASH_DASH:
592 switch (cc) {
593 case '-':
594 if (!html->ignore_script_data) {
595 html_emit_char_token(html, '-');
596 }
597 break;
598 case '<':
599 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
600 break;
601 case '>':
602 html->state = HTML_STATE_SCRIPT_DATA;
603 if (!html->ignore_script_data) {
604 html_emit_char_token(html, '>');
605 }
606 break;
607 case '\0':
608 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
609 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
610 if (!html->ignore_script_data) {
611 html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
612 }
613 break;
614 case EOF:
615 html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT;
616 html_emit_eof_token(html);
617 break;
618 default:
619 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
620 if (!html->ignore_script_data) {
621 html_emit_char_token(html, cc);
622 }
623 break;
624 }
625 break;
626 case HTML_STATE_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
627 switch (cc) {
628 case '/':
629 html->tmp_len = 0;
630 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN;
631 break;
632 default:
633 if (IS_ALPHA(cc)) {
634 html->tmp_len = 0;
635 if (!html->ignore_script_data) {
636 html_emit_char_token(html, '<');
637 }
638 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START;
639 goto reconsume;
640 }
641 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
642 if (!html->ignore_script_data) {
643 html_emit_char_token(html, '<');
644 }
645 goto reconsume;
646 }
647 break;
648 case HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_OPEN:
649 if (IS_ALPHA(cc)) {
650 html_prep_new_token(html, HTML_TOKEN_END_TAG);
651 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME;
652 goto reconsume;
653 }
654 if (!html->ignore_script_data) {
655 html_emit_char_token(html, '<');
656 html_emit_char_token(html, '/');
657 }
658 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
659 goto reconsume;
660 case HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME:
661 switch (cc) {
662 case '\t':
663 case '\n':
664 case '\f':
665 case ' ':
666 if (!html_appropriate_end_tag_token(html, &html->new_token))
667 goto HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME_anything_else;
668 html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
669 break;
670 case '/':
671 if (!html_appropriate_end_tag_token(html, &html->new_token))
672 goto HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME_anything_else;
673 html->state = HTML_STATE_SELF_CLOSING_START_TAG;
674 break;
675 case '>':
676 if (!html_appropriate_end_tag_token(html, &html->new_token))
677 goto HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME_anything_else;
678 html->state = HTML_STATE_DATA;
679 html_emit_token(html, &html->new_token);
680 break;
681 default:
682 if (IS_UPPER_ALPHA(cc))
683 cc += 0x20;
684 if (IS_LOWER_ALPHA(cc)) {
685 STR_APPEND(html->new_token.tag.name,
686 html->new_token.tag.name_len, cc);
687 STR_APPEND(html->tmp, html->tmp_len, cc);
688 break;
689 }
690 /* FALLTHROUGH */
691 HTML_STATE_SCRIPT_DATA_ESCAPED_END_TAG_NAME_anything_else:
692 if (!html->ignore_script_data) {
693 html_emit_char_token(html, '<');
694 html_emit_char_token(html, '/');
695 for (n = 0; n < html->tmp_len; n++)
696 html_emit_char_token(html, html->tmp[n]);
697 }
698 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
699 goto reconsume;
700 }
701 break;
702 case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_START:
703 switch (cc) {
704 case '\t':
705 case '\n':
706 case '\f':
707 case ' ':
708 case '/':
709 case '>':
710 if (html->tmp_len == 6 &&
711 memcmp(html->tmp, "script", 6) == 0) {
712 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
713 } else {
714 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
715 html_emit_char_token(html, cc);
716 }
717 break;
718 default:
719 if (IS_UPPER_ALPHA(cc))
720 cc += 0x20;
721 if (IS_LOWER_ALPHA(cc)) {
722 STR_APPEND(html->tmp, html->tmp_len, cc);
723 break;
724 }
725 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
726 goto reconsume;
727 }
728 break;
729 case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED:
730 switch (cc) {
731 case '-':
732 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH;
733 html_emit_char_token(html, '-');
734 break;
735 case '<':
736 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
737 html_emit_char_token(html, '<');
738 break;
739 case '\0':
740 html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT;
741 html_emit_eof_token(html);
742 break;
743 default:
744 html_emit_char_token(html, cc);
745 break;
746 }
747 break;
748 case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
749 switch (cc) {
750 case '-':
751 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH;
752 html_emit_char_token(html, '-');
753 break;
754 case '<':
755 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
756 html_emit_char_token(html, '<');
757 break;
758 case '\0':
759 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
760 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
761 html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
762 break;
763 case EOF:
764 html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT;
765 html_emit_eof_token(html);
766 break;
767 default:
768 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
769 html_emit_char_token(html, cc);
770 break;
771 }
772 break;
773 case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
774 switch (cc) {
775 case '-':
776 html_emit_char_token(html, '-');
777 break;
778 case '<':
779 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN;
780 html_emit_char_token(html, '<');
781 break;
782 case '>':
783 html->state = HTML_STATE_SCRIPT_DATA;
784 html_emit_char_token(html, '<');
785 break;
786 case '\0':
787 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
788 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
789 html_emit_char_token(html, HTML_REPLACEMENT_CHARACTER);
790 break;
791 case EOF:
792 html->error = HTML_ERROR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT;
793 html_emit_eof_token(html);
794 break;
795 default:
796 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
797 html_emit_char_token(html, cc);
798 break;
799 }
800 break;
801 case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
802 switch (cc) {
803 case '/':
804 html->tmp_len = 0;
805 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END;
806 html_emit_char_token(html, '/');
807 break;
808 default:
809 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
810 goto reconsume;
811 }
812 break;
813 case HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPE_END:
814 switch (cc) {
815 case '\t':
816 case '\n':
817 case '\f':
818 case ' ':
819 case '/':
820 case '>':
821 if (html->tmp_len == 6 &&
822 memcmp(html->tmp, "script", 6) == 0) {
823 html->state = HTML_STATE_SCRIPT_DATA_ESCAPED;
824 } else {
825 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
826 html_emit_char_token(html, cc);
827 }
828 break;
829 default:
830 if (IS_UPPER_ALPHA(cc))
831 cc += 0x20;
832 if (IS_LOWER_ALPHA(cc)) {
833 STR_APPEND(html->tmp, html->tmp_len, cc);
834 html_emit_char_token(html, cc);
835 break;
836 }
837 html->state = HTML_STATE_SCRIPT_DATA_DOUBLE_ESCAPED;
838 goto reconsume;
839 }
840 break;
841 case HTML_STATE_BEFORE_ATTRIBUTE_NAME:
842 switch (cc) {
843 case '\t':
844 case '\n':
845 case '\f':
846 case ' ':
847 /* ignore */
848 break;
849 case '/':
850 case '>':
851 case EOF:
852 html->state = HTML_STATE_AFTER_ATTRIBUTE_NAME;
853 goto reconsume;
854 case '=':
855 html->error =
856 HTML_ERROR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME;
857 attr = html_prep_new_attribute(html, &html->new_token.tag);
858 STR_APPEND(attr->name, attr->name_len, cc);
859 html->state = HTML_STATE_ATTRIBUTE_NAME;
860 break;
861 default:
862 html_prep_new_attribute(html, &html->new_token.tag);
863 html->state = HTML_STATE_ATTRIBUTE_NAME;
864 goto reconsume;
865 }
866 break;
867 case HTML_STATE_ATTRIBUTE_NAME:
868 switch (cc) {
869 case '\t':
870 case '\n':
871 case '\f':
872 case ' ':
873 case '/':
874 case '>':
875 case EOF:
876 html->state = HTML_STATE_AFTER_ATTRIBUTE_NAME;
877 goto reconsume;
878 case '=':
879 html->state = HTML_STATE_BEFORE_ATTRIBUTE_VALUE;
880 break;
881 case '\0':
882 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
883 attr = &NEW_TOKEN_LAST_ATTR;
884 STR_APPEND(attr->name, attr->name_len,
885 HTML_REPLACEMENT_CHARACTER);
886 break;
887 case '"':
888 case '\'':
889 case '<':
890 html->error = HTML_ERROR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME;
891 goto HTML_STATE_ATTRIBUTE_NAME_anything_else;
892 default:
893 HTML_STATE_ATTRIBUTE_NAME_anything_else:
894 if (IS_UPPER_ALPHA(cc))
895 cc += 0x20;
896 attr = &NEW_TOKEN_LAST_ATTR;
897 STR_APPEND(attr->name, attr->name_len, cc);
898 /* TODO: check for duplicate attr names, discard this if match */
899 break;
900 }
901 break;
902 case HTML_STATE_AFTER_ATTRIBUTE_NAME:
903 switch (cc) {
904 case '\t':
905 case '\n':
906 case '\f':
907 case ' ':
908 /* ignore */
909 break;
910 case '/':
911 html->state = HTML_STATE_SELF_CLOSING_START_TAG;
912 break;
913 case '=':
914 html->state = HTML_STATE_BEFORE_ATTRIBUTE_VALUE;
915 break;
916 case '>':
917 html_emit_token(html, &html->new_token);
918 html->state = HTML_STATE_DATA;
919 break;
920 case EOF:
921 html->error = HTML_ERROR_EOF_IN_TAG;
922 html_emit_eof_token(html);
923 break;
924 default:
925 html_prep_new_attribute(html, &html->new_token.tag);
926 html->state = HTML_STATE_ATTRIBUTE_NAME;
927 goto reconsume;
928 }
929 break;
930 case HTML_STATE_BEFORE_ATTRIBUTE_VALUE:
931 switch (cc) {
932 case '\t':
933 case '\n':
934 case '\f':
935 case ' ':
936 /* ignore */
937 break;
938 case '"':
939 html->state = HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED;
940 break;
941 case '\'':
942 html->state = HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED;
943 break;
944 case '>':
945 html->error = HTML_ERROR_MISSING_ATTRIBUTE_VALUE;
946 html->state = HTML_STATE_DATA;
947 html_emit_token(html, &html->new_token);
948 break;
949 default:
950 html->state = HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED;
951 goto reconsume;
952 }
953 break;
954 case HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED:
955 switch (cc) {
956 case '"':
957 html->state = HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED;
958 break;
959 case '&':
960 html->return_state = HTML_STATE_ATTRIBUTE_VALUE_DOUBLE_QUOTED;
961 html->tmp_len = 0;
962 html->state = HTML_STATE_CHARACTER_REFERENCE;
963 break;
964 case '\0':
965 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
966 attr = &NEW_TOKEN_LAST_ATTR;
967 STR_APPEND(attr->val, attr->val_len, HTML_REPLACEMENT_CHARACTER);
968 break;
969 case EOF:
970 html->error = HTML_ERROR_EOF_IN_TAG;
971 html_emit_eof_token(html);
972 break;
973 default:
974 attr = &NEW_TOKEN_LAST_ATTR;
975 STR_APPEND(attr->val, attr->val_len, cc);
976 break;
977 }
978 break;
979 case HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED:
980 switch (cc) {
981 case '\'':
982 html->state = HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED;
983 break;
984 case '&':
985 html->return_state = HTML_STATE_ATTRIBUTE_VALUE_SINGLE_QUOTED;
986 html->tmp_len = 0;
987 html->state = HTML_STATE_CHARACTER_REFERENCE;
988 break;
989 case '\0':
990 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
991 attr = &NEW_TOKEN_LAST_ATTR;
992 STR_APPEND(attr->val, attr->val_len, HTML_REPLACEMENT_CHARACTER);
993 break;
994 case EOF:
995 html->error = HTML_ERROR_EOF_IN_TAG;
996 html_emit_eof_token(html);
997 break;
998 default:
999 attr = &NEW_TOKEN_LAST_ATTR;
1000 STR_APPEND(attr->val, attr->val_len, cc);
1001 break;
1002 }
1003 break;
1004 case HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED:
1005 switch (cc) {
1006 case '\t':
1007 case '\n':
1008 case '\f':
1009 case ' ':
1010 html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
1011 break;
1012 case '&':
1013 html->return_state = HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED;
1014 html->tmp_len = 0;
1015 html->state = HTML_STATE_CHARACTER_REFERENCE;
1016 break;
1017 case '>':
1018 html->state = HTML_STATE_DATA;
1019 html_emit_token(html, &html->new_token);
1020 break;
1021 case '\0':
1022 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
1023 attr = &NEW_TOKEN_LAST_ATTR;
1024 STR_APPEND(attr->val, attr->val_len, HTML_REPLACEMENT_CHARACTER);
1025 break;
1026 case '"':
1027 case '\'':
1028 case '<':
1029 case '=':
1030 case '`':
1031 html->error =
1032 HTML_ERROR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE;
1033 goto HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED_anything_else;
1034 case EOF:
1035 html->error = HTML_ERROR_EOF_IN_TAG;
1036 html_emit_eof_token(html);
1037 break;
1038 default:
1039 HTML_STATE_ATTRIBUTE_VALUE_UNQUOTED_anything_else:
1040 attr = &NEW_TOKEN_LAST_ATTR;
1041 STR_APPEND(attr->val, attr->val_len, cc);
1042 break;
1043 }
1044 break;
1045 case HTML_STATE_AFTER_ATTRIBUTE_VALUE_QUOTED:
1046 switch (cc) {
1047 case '\t':
1048 case '\n':
1049 case '\f':
1050 case ' ':
1051 html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
1052 break;
1053 case '/':
1054 html->state = HTML_STATE_SELF_CLOSING_START_TAG;
1055 break;
1056 case '>':
1057 html->state = HTML_STATE_DATA;
1058 html_emit_token(html, &html->new_token);
1059 break;
1060 case EOF:
1061 html->error = HTML_ERROR_EOF_IN_TAG;
1062 html_emit_eof_token(html);
1063 break;
1064 default:
1065 html->error = HTML_ERROR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES;
1066 html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
1067 goto reconsume;
1068 }
1069 break;
1070 case HTML_STATE_SELF_CLOSING_START_TAG:
1071 switch (cc) {
1072 case '>':
1073 html->new_token.tag.self_closing = true;
1074 html->state = HTML_STATE_DATA;
1075 html_emit_token(html, &html->new_token);
1076 break;
1077 case EOF:
1078 html->error = HTML_ERROR_EOF_IN_TAG;
1079 html_emit_eof_token(html);
1080 break;
1081 default:
1082 html->error = HTML_ERROR_UNEXPECTED_SOLIDUS_IN_TAG;
1083 html->state = HTML_STATE_BEFORE_ATTRIBUTE_NAME;
1084 goto reconsume;
1085 }
1086 break;
1087 case HTML_STATE_BOGUS_COMMENT:
1088 switch (cc) {
1089 case '>':
1090 html->state = HTML_STATE_DATA;
1091 html_emit_comment(html, &html->new_token.comment);
1092 break;
1093 case EOF:
1094 html_emit_comment(html, &html->new_token.comment);
1095 html_emit_eof_token(html);
1096 break;
1097 case '\0':
1098 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
1099 if (!html->ignore_comment_data) {
1100 STR_APPEND(html->new_token.comment.data,
1101 html->new_token.comment.len, HTML_REPLACEMENT_CHARACTER);
1102 }
1103 break;
1104 default:
1105 if (!html->ignore_comment_data) {
1106 STR_APPEND(html->new_token.comment.data,
1107 html->new_token.comment.len, cc);
1108 }
1109 break;
1110 }
1111 break;
1112 case HTML_STATE_MARKUP_DECLARATION_OPEN:
1113 /* "If the next few characters are" */
1114 /* https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state */
1115
1116 if (cc == '-' && html->lookahead[0] == '-') {
1117 html_lookahead_consume(html, 1);
1118 html_prep_new_token(html, HTML_TOKEN_COMMENT);
1119 html->state = HTML_STATE_COMMENT_START;
1120 break;
1121 } else if ((cc == 'd' || cc == 'D') &&
1122 strncasecmp(html->lookahead, "octype", 6) == 0) {
1123 html_lookahead_consume(html, 6);
1124 html->state = HTML_STATE_DOCTYPE;
1125 html_prep_new_token(html, HTML_TOKEN_DOCTYPE);
1126 break;
1127 } else if (cc == '[' && memcmp(html->lookahead, "CDATA[", 6) == 0) {
1128 html_lookahead_consume(html, 6);
1129 if (html->current_node->ns != HTML_NAMESPACE_HTML)
1130 html->state = HTML_STATE_CDATA_SECTION;
1131 else
1132 html->error = HTML_ERROR_CDATA_IN_HTML_CONTENT;
1133
1134 html_prep_new_token(html, HTML_TOKEN_COMMENT);
1135 if (!html->ignore_comment_data)
1136 html->new_token.comment.len =
1137 strlcpy(html->new_token.comment.data,
1138 "[CDATA[", sizeof(html->new_token.comment.data));
1139 html->state = HTML_STATE_BOGUS_COMMENT;
1140 break;
1141 } else {
1142 html->error = HTML_ERROR_INCORRECTLY_OPENED_COMMENT;
1143 html_prep_new_token(html, HTML_TOKEN_COMMENT);
1144 html->state = HTML_STATE_BOGUS_COMMENT;
1145 goto reconsume;
1146 }
1147 break;
1148 case HTML_STATE_COMMENT_START:
1149 switch (cc) {
1150 case '-':
1151 html->state = HTML_STATE_COMMENT_START_DASH;
1152 break;
1153 case '>':
1154 html->error = HTML_ERROR_ABRUPT_CLOSING_OF_EMPTY_COMMENT;
1155 html->state = HTML_STATE_DATA;
1156 html_emit_comment(html, &html->new_token.comment);
1157 break;
1158 default:
1159 html->state = HTML_STATE_COMMENT;
1160 goto reconsume;
1161 }
1162 break;
1163 case HTML_STATE_COMMENT_START_DASH:
1164 switch (cc) {
1165 case '-':
1166 html->state = HTML_STATE_COMMENT_END_DASH;
1167 break;
1168 case '>':
1169 html->error = HTML_ERROR_ABRUPT_CLOSING_OF_EMPTY_COMMENT;
1170 html->state = HTML_STATE_DATA;
1171 html->new_token.type = HTML_TOKEN_COMMENT;
1172 html_emit_token(html, &html->new_token);
1173 break;
1174 case EOF:
1175 html->error = HTML_ERROR_EOF_IN_COMMENT;
1176 html->new_token.type = HTML_TOKEN_COMMENT;
1177 html_emit_token(html, &html->new_token);
1178 html_emit_eof_token(html);
1179 break;
1180 default:
1181 if (!html->ignore_comment_data) {
1182 STR_APPEND(html->new_token.comment.data,
1183 html->new_token.comment.len, '-');
1184 }
1185 html->state = HTML_STATE_COMMENT;
1186 goto reconsume;
1187 }
1188 break;
1189 case HTML_STATE_COMMENT:
1190 switch (cc) {
1191 case '<':
1192 if (!html->ignore_comment_data) {
1193 STR_APPEND(html->new_token.comment.data,
1194 html->new_token.comment.len, cc);
1195 }
1196 html->state = HTML_STATE_COMMENT_LESS_THAN_SIGN;
1197 break;
1198 case '-':
1199 html->state = HTML_STATE_COMMENT_END_DASH;
1200 break;
1201 case '\0':
1202 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
1203 if (!html->ignore_comment_data) {
1204 STR_APPEND(html->new_token.comment.data,
1205 html->new_token.comment.len, HTML_REPLACEMENT_CHARACTER);
1206 }
1207 break;
1208 case EOF:
1209 html->error = HTML_ERROR_EOF_IN_COMMENT;
1210 html->new_token.type = HTML_TOKEN_COMMENT;
1211 html_emit_token(html, &html->new_token);
1212 html_emit_eof_token(html);
1213 break;
1214 default:
1215 if (!html->ignore_comment_data) {
1216 STR_APPEND(html->new_token.comment.data,
1217 html->new_token.comment.len, cc);
1218 }
1219 break;
1220 }
1221 break;
1222 case HTML_STATE_COMMENT_LESS_THAN_SIGN:
1223 switch (cc) {
1224 case '!':
1225 if (!html->ignore_comment_data) {
1226 STR_APPEND(html->new_token.comment.data,
1227 html->new_token.comment.len, cc);
1228 }
1229 html->state = HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG;
1230 break;
1231 case '<':
1232 if (!html->ignore_comment_data) {
1233 STR_APPEND(html->new_token.comment.data,
1234 html->new_token.comment.len, cc);
1235 }
1236 break;
1237 default:
1238 html->state = HTML_STATE_COMMENT;
1239 goto reconsume;
1240 }
1241 break;
1242 case HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG:
1243 switch (cc) {
1244 case '-':
1245 html->state = HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH;
1246 break;
1247 default:
1248 html->state = HTML_STATE_COMMENT;
1249 goto reconsume;
1250 }
1251 break;
1252 case HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH:
1253 switch (cc) {
1254 case '-':
1255 html->state = HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH;
1256 break;
1257 default:
1258 html->state = HTML_STATE_COMMENT_END_DASH;
1259 goto reconsume;
1260 }
1261 break;
1262 case HTML_STATE_COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH:
1263 switch (cc) {
1264 case '>':
1265 case EOF:
1266 html->state = HTML_STATE_COMMENT_END;
1267 goto reconsume;
1268 default:
1269 html->error = HTML_ERROR_NESTED_COMMENT;
1270 html->state = HTML_STATE_COMMENT_END;
1271 goto reconsume;
1272 }
1273 break;
1274 case HTML_STATE_COMMENT_END_DASH:
1275 switch (cc) {
1276 case '-':
1277 html->state = HTML_STATE_COMMENT_END;
1278 break;
1279 case EOF:
1280 html->error = HTML_ERROR_EOF_IN_COMMENT;
1281 html->new_token.type = HTML_TOKEN_COMMENT;
1282 html_emit_token(html, &html->new_token);
1283 html_emit_eof_token(html);
1284 break;
1285 default:
1286 if (!html->ignore_comment_data) {
1287 STR_APPEND(html->new_token.comment.data,
1288 html->new_token.comment.len, '-');
1289 }
1290 html->state = HTML_STATE_COMMENT;
1291 goto reconsume;
1292 }
1293 break;
1294 case HTML_STATE_COMMENT_END:
1295 switch (cc) {
1296 case '>':
1297 html->state = HTML_STATE_DATA;
1298 html->new_token.type = HTML_TOKEN_COMMENT;
1299 html_emit_token(html, &html->new_token);
1300 break;
1301 case '!':
1302 html->state = HTML_STATE_COMMENT_END;
1303 break;
1304 case '-':
1305 if (!html->ignore_comment_data) {
1306 STR_APPEND(html->new_token.comment.data,
1307 html->new_token.comment.len, '-');
1308 }
1309 break;
1310 case EOF:
1311 html->error = HTML_ERROR_EOF_IN_COMMENT;
1312 html->new_token.type = HTML_TOKEN_COMMENT;
1313 html_emit_token(html, &html->new_token);
1314 html_emit_eof_token(html);
1315 break;
1316 default:
1317 if (!html->ignore_comment_data) {
1318 STR_APPEND(html->new_token.comment.data,
1319 html->new_token.comment.len, '-');
1320 STR_APPEND(html->new_token.comment.data,
1321 html->new_token.comment.len, '-');
1322 }
1323 html->state = HTML_STATE_COMMENT;
1324 goto reconsume;
1325 }
1326 break;
1327 case HTML_STATE_COMMENT_END_BANG:
1328 switch (cc) {
1329 case '-':
1330 if (!html->ignore_comment_data) {
1331 STR_APPEND(html->new_token.comment.data,
1332 html->new_token.comment.len, '-');
1333 STR_APPEND(html->new_token.comment.data,
1334 html->new_token.comment.len, '-');
1335 STR_APPEND(html->new_token.comment.data,
1336 html->new_token.comment.len, '!');
1337 }
1338 html->state = HTML_STATE_COMMENT_END_DASH;
1339 break;
1340 case '>':
1341 html->error = HTML_ERROR_INCORRECTLY_CLOSED_COMMENT;
1342 html->state = HTML_STATE_DATA;
1343 html->new_token.type = HTML_TOKEN_COMMENT;
1344 html_emit_token(html, &html->new_token);
1345 break;
1346 case EOF:
1347 html->error = HTML_ERROR_EOF_IN_COMMENT;
1348 html->new_token.type = HTML_TOKEN_COMMENT;
1349 html_emit_token(html, &html->new_token);
1350 html_emit_eof_token(html);
1351 break;
1352 default:
1353 if (!html->ignore_comment_data) {
1354 STR_APPEND(html->new_token.comment.data,
1355 html->new_token.comment.len, '-');
1356 STR_APPEND(html->new_token.comment.data,
1357 html->new_token.comment.len, '-');
1358 STR_APPEND(html->new_token.comment.data,
1359 html->new_token.comment.len, '!');
1360 }
1361 html->state = HTML_STATE_COMMENT;
1362 goto reconsume;
1363 }
1364 break;
1365 case HTML_STATE_DOCTYPE:
1366 switch (cc) {
1367 case '\t':
1368 case '\n':
1369 case '\f':
1370 case ' ':
1371 html->state = HTML_STATE_BEFORE_DOCTYPE_NAME;
1372 break;
1373 case '>':
1374 html->state = HTML_STATE_BEFORE_DOCTYPE_NAME;
1375 goto reconsume;
1376 case EOF:
1377 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1378 html->new_token.doctype.force_quirks = true;
1379 html_prep_new_token(html, HTML_TOKEN_DOCTYPE);
1380 html_emit_token(html, &html->new_token);
1381 html_emit_eof_token(html);
1382 break;
1383 default:
1384 html->error = HTML_ERROR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME;
1385 html->state = HTML_STATE_BEFORE_DOCTYPE_NAME;
1386 goto reconsume;
1387 }
1388 break;
1389 case HTML_STATE_BEFORE_DOCTYPE_NAME:
1390 switch (cc) {
1391 case '\t':
1392 case '\n':
1393 case '\f':
1394 case ' ':
1395 /* ignore */
1396 break;
1397 case '\0':
1398 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
1399 html_prep_new_token(html, HTML_TOKEN_DOCTYPE);
1400 STR_APPEND(html->new_token.tag.name,
1401 html->new_token.tag.name_len, '!');
1402 html->state = HTML_STATE_DOCTYPE_NAME;
1403 break;
1404 case EOF:
1405 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1406 html->new_token.doctype.force_quirks = true;
1407 html_prep_new_token(html, HTML_TOKEN_DOCTYPE);
1408 html_emit_token(html, &html->new_token);
1409 html_emit_eof_token(html);
1410 break;
1411 case '>':
1412 html->error = HTML_ERROR_MISSING_DOCTYPE_NAME;
1413 html->new_token.doctype.force_quirks = true;
1414 html_emit_token(html, &html->new_token);
1415 html->state = HTML_STATE_DATA;
1416 break;
1417 default:
1418 if (IS_UPPER_ALPHA(cc))
1419 cc += 0x20;
1420 html_prep_new_token(html, HTML_TOKEN_DOCTYPE);
1421 html->state = HTML_STATE_DOCTYPE_NAME;
1422 STR_APPEND(html->new_token.doctype.name,
1423 html->new_token.doctype.name_len, cc);
1424 break;
1425 }
1426 break;
1427 case HTML_STATE_DOCTYPE_NAME:
1428 switch (cc) {
1429 case '\t':
1430 case '\n':
1431 case '\f':
1432 case ' ':
1433 html->state = HTML_STATE_AFTER_DOCTYPE_NAME;
1434 html->tmp_len = 0;
1435 break;
1436 case '>':
1437 html_emit_token(html, &html->new_token);
1438 html->state = HTML_STATE_DATA;
1439 break;
1440 case '\0':
1441 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
1442 STR_APPEND(html->new_token.doctype.name,
1443 html->new_token.doctype.name_len, HTML_REPLACEMENT_CHARACTER);
1444 break;
1445 case EOF:
1446 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1447 html->new_token.doctype.force_quirks = true;
1448 html_prep_new_token(html, HTML_TOKEN_DOCTYPE);
1449 html_emit_token(html, &html->new_token);
1450 html_emit_eof_token(html);
1451 break;
1452 default:
1453 if (IS_UPPER_ALPHA(cc))
1454 cc += 0x20;
1455 STR_APPEND(html->new_token.doctype.name,
1456 html->new_token.doctype.name_len, cc);
1457 break;
1458 }
1459 break;
1460 case HTML_STATE_AFTER_DOCTYPE_NAME:
1461 switch (cc) {
1462 case '\t':
1463 case '\n':
1464 case '\f':
1465 case ' ':
1466 /* ignore */
1467 break;
1468 case '>':
1469 html->state = HTML_STATE_DATA;
1470 html_emit_token(html, &html->new_token);
1471 break;
1472 case EOF:
1473 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1474 html->new_token.doctype.force_quirks = true;
1475 html_emit_token(html, &html->new_token);
1476 html_emit_eof_token(html);
1477 break;
1478 default:
1479 if ((cc == 'p' || cc == 'P') &&
1480 strncasecmp(html->lookahead, "ublic", 5) == 0) {
1481 html->state = HTML_STATE_AFTER_DOCTYPE_PUBLIC_KEYWORD;
1482 html->lookahead_len = 0;
1483 } else if ((cc == 's' || cc == 'S') &&
1484 strncasecmp(html->lookahead, "ystem", 5) == 0) {
1485 html->state = HTML_STATE_AFTER_DOCTYPE_SYSTEM_KEYWORD;
1486 html->lookahead_len = 0;
1487 } else {
1488 html->error =
1489 HTML_ERROR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME;
1490 html->new_token.doctype.force_quirks = true;
1491 html->state = HTML_STATE_BOGUS_DOCTYPE;
1492 goto reconsume;
1493 }
1494 break;
1495 }
1496 break;
1497 case HTML_STATE_AFTER_DOCTYPE_PUBLIC_KEYWORD:
1498 switch (cc) {
1499 case '\t':
1500 case '\n':
1501 case '\f':
1502 case ' ':
1503 html->state = HTML_STATE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER;
1504 break;
1505 case '"':
1506 html->error =
1507 HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD;
1508 memset(html->new_token.doctype.public_identifier, 0,
1509 sizeof(html->new_token.doctype.public_identifier));
1510 html->new_token.doctype.public_identifier_len = 0;
1511 html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED;
1512 break;
1513 case '\'':
1514 html->error =
1515 HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD;
1516 memset(html->new_token.doctype.public_identifier, 0,
1517 sizeof(html->new_token.doctype.public_identifier));
1518 html->new_token.doctype.public_identifier_len = 0;
1519 html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
1520 break;
1521 case '>':
1522 html->error = HTML_ERROR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER;
1523 html->new_token.doctype.force_quirks = true;
1524 html->state = HTML_STATE_DATA;
1525 html_emit_token(html, &html->new_token);
1526 break;
1527 case EOF:
1528 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1529 html->new_token.doctype.force_quirks = true;
1530 html_emit_token(html, &html->new_token);
1531 html_emit_eof_token(html);
1532 break;
1533 default:
1534 html->error =
1535 HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER;
1536 html->new_token.doctype.force_quirks = true;
1537 html->state = HTML_STATE_BOGUS_DOCTYPE;
1538 goto reconsume;
1539 }
1540 break;
1541 case HTML_STATE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
1542 switch (cc) {
1543 case '\t':
1544 case '\n':
1545 case '\f':
1546 case ' ':
1547 /* ignore */
1548 break;
1549 case '"':
1550 memset(html->new_token.doctype.public_identifier, 0,
1551 sizeof(html->new_token.doctype.public_identifier));
1552 html->new_token.doctype.public_identifier_len = 0;
1553 html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED;
1554 break;
1555 case '\'':
1556 memset(html->new_token.doctype.public_identifier, 0,
1557 sizeof(html->new_token.doctype.public_identifier));
1558 html->new_token.doctype.public_identifier_len = 0;
1559 html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
1560 break;
1561 case '>':
1562 html->error = HTML_ERROR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER;
1563 html->new_token.doctype.force_quirks = true;
1564 html->state = HTML_STATE_DATA;
1565 html_emit_token(html, &html->new_token);
1566 break;
1567 case EOF:
1568 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1569 html->new_token.doctype.force_quirks = true;
1570 html_emit_token(html, &html->new_token);
1571 html_emit_eof_token(html);
1572 break;
1573 default:
1574 html->error =
1575 HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER;
1576 html->new_token.doctype.force_quirks = true;
1577 html->state = HTML_STATE_BOGUS_DOCTYPE;
1578 goto reconsume;
1579 }
1580 break;
1581 case HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
1582 switch (cc) {
1583 case '"':
1584 html->state = HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER;
1585 break;
1586 case '\0':
1587 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
1588 STR_APPEND(html->new_token.doctype.public_identifier,
1589 html->new_token.doctype.public_identifier_len,
1590 HTML_REPLACEMENT_CHARACTER);
1591 break;
1592 case '>':
1593 html->error = HTML_ERROR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER;
1594 html->new_token.doctype.force_quirks = true;
1595 html->state = HTML_STATE_DATA;
1596 html_emit_token(html, &html->new_token);
1597 break;
1598 case EOF:
1599 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1600 html->new_token.doctype.force_quirks = true;
1601 html_emit_token(html, &html->new_token);
1602 html_emit_eof_token(html);
1603 break;
1604 default:
1605 STR_APPEND(html->new_token.doctype.public_identifier,
1606 html->new_token.doctype.public_identifier_len, cc);
1607 break;
1608 }
1609 break;
1610 case HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
1611 switch (cc) {
1612 case '\'':
1613 html->state = HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER;
1614 break;
1615 case '\0':
1616 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
1617 STR_APPEND(html->new_token.doctype.public_identifier,
1618 html->new_token.doctype.public_identifier_len,
1619 HTML_REPLACEMENT_CHARACTER);
1620 break;
1621 case '>':
1622 html->error = HTML_ERROR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER;
1623 html->new_token.doctype.force_quirks = true;
1624 html->state = HTML_STATE_DATA;
1625 html_emit_token(html, &html->new_token);
1626 break;
1627 case EOF:
1628 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1629 html->new_token.doctype.force_quirks = true;
1630 html_emit_token(html, &html->new_token);
1631 html_emit_eof_token(html);
1632 break;
1633 default:
1634 STR_APPEND(html->new_token.doctype.public_identifier,
1635 html->new_token.doctype.public_identifier_len, cc);
1636 break;
1637 }
1638 break;
1639 case HTML_STATE_AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
1640 switch (cc) {
1641 case '\t':
1642 case '\n':
1643 case '\f':
1644 case ' ':
1645 html->state =
1646 HTML_STATE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS;
1647 break;
1648 case '>':
1649 html->state = HTML_STATE_DATA;
1650 html_emit_token(html, &html->new_token);
1651 break;
1652 case '"':
1653 html->error =
1654 HTML_ERROR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS;
1655 memset(html->new_token.doctype.system_identifier, 0,
1656 sizeof(html->new_token.doctype.system_identifier));
1657 html->new_token.doctype.system_identifier_len = 0;
1658 html->state = HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
1659 break;
1660 case '\'':
1661 html->error =
1662 HTML_ERROR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS;
1663 memset(html->new_token.doctype.system_identifier, 0,
1664 sizeof(html->new_token.doctype.system_identifier));
1665 html->new_token.doctype.system_identifier_len = 0;
1666 html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
1667 break;
1668 case EOF:
1669 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1670 html->new_token.doctype.force_quirks = true;
1671 html_emit_token(html, &html->new_token);
1672 html_emit_eof_token(html);
1673 break;
1674 default:
1675 html->error =
1676 HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
1677 html->new_token.doctype.force_quirks = true;
1678 html->state = HTML_STATE_BOGUS_DOCTYPE;
1679 goto reconsume;
1680 }
1681 break;
1682 case HTML_STATE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
1683 switch (cc) {
1684 case '\t':
1685 case '\n':
1686 case '\f':
1687 case ' ':
1688 /* ignore */
1689 break;
1690 case '>':
1691 html->state = HTML_STATE_DATA;
1692 html_emit_token(html, &html->new_token);
1693 break;
1694 case '"':
1695 memset(html->new_token.doctype.system_identifier, 0,
1696 sizeof(html->new_token.doctype.system_identifier));
1697 html->new_token.doctype.system_identifier_len = 0;
1698 html->state = HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
1699 break;
1700 case '\'':
1701 memset(html->new_token.doctype.system_identifier, 0,
1702 sizeof(html->new_token.doctype.system_identifier));
1703 html->new_token.doctype.system_identifier_len = 0;
1704 html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
1705 break;
1706 case EOF:
1707 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1708 html->new_token.doctype.force_quirks = true;
1709 html_emit_token(html, &html->new_token);
1710 html_emit_eof_token(html);
1711 break;
1712 default:
1713 html->error =
1714 HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
1715 html->new_token.doctype.force_quirks = true;
1716 html->state = HTML_STATE_BOGUS_DOCTYPE;
1717 goto reconsume;
1718 }
1719 break;
1720 case HTML_STATE_AFTER_DOCTYPE_SYSTEM_KEYWORD:
1721 switch (cc) {
1722 case '\t':
1723 case '\n':
1724 case '\f':
1725 case ' ':
1726 html->state = HTML_STATE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
1727 break;
1728 case '"':
1729 html->error =
1730 HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD;
1731 memset(html->new_token.doctype.system_identifier, 0,
1732 sizeof(html->new_token.doctype.system_identifier));
1733 html->new_token.doctype.system_identifier_len = 0;
1734 html->state = HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
1735 break;
1736 case '\'':
1737 html->error =
1738 HTML_ERROR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD;
1739 memset(html->new_token.doctype.system_identifier, 0,
1740 sizeof(html->new_token.doctype.system_identifier));
1741 html->new_token.doctype.system_identifier_len = 0;
1742 html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
1743 break;
1744 case '>':
1745 html->error = HTML_ERROR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER;
1746 html->new_token.doctype.force_quirks = true;
1747 html_emit_token(html, &html->new_token);
1748 break;
1749 case EOF:
1750 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1751 html->new_token.doctype.force_quirks = true;
1752 html_emit_token(html, &html->new_token);
1753 html_emit_eof_token(html);
1754 break;
1755 default:
1756 html->error =
1757 HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
1758 html->new_token.doctype.force_quirks = true;
1759 html->state = HTML_STATE_BOGUS_DOCTYPE;
1760 goto reconsume;
1761 }
1762 break;
1763 case HTML_STATE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
1764 switch (cc) {
1765 case '\t':
1766 case '\n':
1767 case '\f':
1768 case ' ':
1769 /* ignore */
1770 break;
1771 case '"':
1772 memset(html->new_token.doctype.system_identifier, 0,
1773 sizeof(html->new_token.doctype.system_identifier));
1774 html->new_token.doctype.system_identifier_len = 0;
1775 html->state = HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
1776 break;
1777 case '\'':
1778 memset(html->new_token.doctype.system_identifier, 0,
1779 sizeof(html->new_token.doctype.system_identifier));
1780 html->new_token.doctype.system_identifier_len = 0;
1781 html->state = HTML_STATE_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
1782 break;
1783 case '>':
1784 html->error = HTML_ERROR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER;
1785 html->new_token.doctype.force_quirks = true;
1786 html_emit_token(html, &html->new_token);
1787 break;
1788 case EOF:
1789 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1790 html->new_token.doctype.force_quirks = true;
1791 html_emit_token(html, &html->new_token);
1792 html_emit_eof_token(html);
1793 break;
1794 default:
1795 html->error =
1796 HTML_ERROR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
1797 html->new_token.doctype.force_quirks = true;
1798 html->state = HTML_STATE_BOGUS_DOCTYPE;
1799 goto reconsume;
1800 }
1801 break;
1802 case HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
1803 switch (cc) {
1804 case '"':
1805 html->state = HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER;
1806 break;
1807 case '\0':
1808 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
1809 STR_APPEND(html->new_token.doctype.system_identifier,
1810 html->new_token.doctype.system_identifier_len,
1811 HTML_REPLACEMENT_CHARACTER);
1812 break;
1813 case '>':
1814 html->error = HTML_ERROR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER;
1815 html->new_token.doctype.force_quirks = true;
1816 html_emit_token(html, &html->new_token);
1817 break;
1818 case EOF:
1819 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1820 html->new_token.doctype.force_quirks = true;
1821 html_emit_token(html, &html->new_token);
1822 html_emit_eof_token(html);
1823 break;
1824 default:
1825 STR_APPEND(html->new_token.doctype.system_identifier,
1826 html->new_token.doctype.system_identifier_len, cc);
1827 break;
1828 }
1829 break;
1830 case HTML_STATE_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
1831 switch (cc) {
1832 case '\'':
1833 html->state = HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER;
1834 break;
1835 case '\0':
1836 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
1837 STR_APPEND(html->new_token.doctype.system_identifier,
1838 html->new_token.doctype.system_identifier_len,
1839 HTML_REPLACEMENT_CHARACTER);
1840 break;
1841 case '>':
1842 html->error = HTML_ERROR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER;
1843 html->new_token.doctype.force_quirks = true;
1844 html_emit_token(html, &html->new_token);
1845 break;
1846 case EOF:
1847 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1848 html->new_token.doctype.force_quirks = true;
1849 html_emit_token(html, &html->new_token);
1850 html_emit_eof_token(html);
1851 break;
1852 default:
1853 STR_APPEND(html->new_token.doctype.system_identifier,
1854 html->new_token.doctype.system_identifier_len, cc);
1855 break;
1856 }
1857 break;
1858 case HTML_STATE_AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
1859 switch (cc) {
1860 case '\t':
1861 case '\n':
1862 case '\f':
1863 case ' ':
1864 /* ignore */
1865 break;
1866 case '>':
1867 html->state = HTML_STATE_DATA;
1868 html_emit_token(html, &html->new_token);
1869 break;
1870 case EOF:
1871 html->error = HTML_ERROR_EOF_IN_DOCTYPE;
1872 html->new_token.doctype.force_quirks = true;
1873 html_emit_token(html, &html->new_token);
1874 html_emit_eof_token(html);
1875 break;
1876 default:
1877 html->error =
1878 HTML_ERROR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER;
1879 html->state = HTML_STATE_BOGUS_DOCTYPE;
1880 goto reconsume;
1881 }
1882 break;
1883 case HTML_STATE_BOGUS_DOCTYPE:
1884 switch (cc) {
1885 case '>':
1886 html->state = HTML_STATE_DATA;
1887 html_emit_token(html, &html->new_token);
1888 break;
1889 case '\0':
1890 html->error = HTML_ERROR_UNEXPECTED_NULL_CHARACTER;
1891 /* ignore */
1892 break;
1893 case EOF:
1894 html_emit_token(html, &html->new_token);
1895 html_emit_eof_token(html);
1896 break;
1897 default:
1898 /* ignore */
1899 break;
1900 }
1901 break;
1902 case HTML_STATE_CDATA_SECTION:
1903 switch (cc) {
1904 case ']':
1905 html->state = HTML_STATE_CDATA_SECTION_BRACKET;
1906 break;
1907 case EOF:
1908 html->error = HTML_ERROR_EOF_IN_CDATA;
1909 html_emit_eof_token(html);
1910 break;
1911 default:
1912 if (!html->ignore_comment_data)
1913 html_emit_char_token(html, cc);
1914 break;
1915 }
1916 break;
1917 case HTML_STATE_CDATA_SECTION_BRACKET:
1918 switch (cc) {
1919 case ']':
1920 html->state = HTML_STATE_CDATA_SECTION_END;
1921 break;
1922 default:
1923 if (!html->ignore_comment_data)
1924 html_emit_char_token(html, ']');
1925 html->state = HTML_STATE_CDATA_SECTION;
1926 goto reconsume;
1927 }
1928 break;
1929 case HTML_STATE_CDATA_SECTION_END:
1930 switch (cc) {
1931 case ']':
1932 if (!html->ignore_comment_data)
1933 html_emit_char_token(html, ']');
1934 break;
1935 case '>':
1936 html->state = HTML_STATE_DATA;
1937 break;
1938 default:
1939 if (!html->ignore_comment_data) {
1940 html_emit_char_token(html, ']');
1941 html_emit_char_token(html, ']');
1942 }
1943 html->state = HTML_STATE_CDATA_SECTION;
1944 goto reconsume;
1945 }
1946 break;
1947 case HTML_STATE_CHARACTER_REFERENCE:
1948 STR_APPEND(html->tmp, html->tmp_len, '&');
1949
1950 if (cc == '#') {
1951 STR_APPEND(html->tmp, html->tmp_len, cc);
1952 html->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE;
1953 break;
1954 }
1955 if (IS_ALPHANUMERIC(cc)) {
1956 html->state = HTML_STATE_NAMED_CHARACTER_REFERENCE;
1957 goto reconsume;
1958 }
1959
1960 /* "flush code points consumed as a character reference" */
1961 if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) {
1962 /* consumed as part of an attribute */
1963 for (n = 0; n < html->tmp_len; n++) {
1964 attr = &NEW_TOKEN_LAST_ATTR;
1965 STR_APPEND(attr->val, attr->val_len, html->tmp[n]);
1966 }
1967 } else {
1968 /* TODO: check return state for comment ones if ignoring */
1969
1970 for (n = 0; n < html->tmp_len; n++)
1971 html_emit_char_token(html, html->tmp[n]);
1972 }
1973 html->tmp_len = 0;
1974 html->state = html->return_state;
1975 goto reconsume;
1976 case HTML_STATE_NAMED_CHARACTER_REFERENCE:
1977 found_entity = NULL;
1978
1979 STR_APPEND(html->tmp, html->tmp_len, cc);
1980
1981 for (n = 0; n < html->lookahead_len; n++) {
1982 STR_APPEND(html->tmp, html->tmp_len, html->lookahead[n]);
1983 if (html->lookahead[n] == ';')
1984 break;
1985 }
1986
1987 HTML_DEBUG((": trying to match '%s'", html->tmp));
1988
1989 found_entity = NULL;
1990 for (j = 0; html_entities[j].entity != NULL; j++) {
1991 for (i = 0; ; i++) {
1992 if (html_entities[j].entity[i] == '\0') {
1993 /*
1994 * If we have an ; in our buffer, match the longer
1995 * version of this entity instead (&amp; instead of
1996 * &amp)
1997 */
1998 if (html_entities[j].entity[i - 1] != ';' &&
1999 html->tmp[i] == ';')
2000 goto next_entity;
2001 found_entity = &html_entities[j];
2002 HTML_DEBUG((": matched lookahead to entity '%s'",
2003 found_entity->entity));
2004 html_lookahead_consume(html, i - 2);
2005 break;
2006 }
2007 if (i >= html->tmp_len ||
2008 html_entities[j].entity[i] != html->tmp[i])
2009 goto next_entity;
2010 }
2011 next_entity:
2012 continue;
2013 }
2014
2015 if (found_entity != NULL) {
2016 if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE &&
2017 html->tmp[html->tmp_len - 1] != ';' &&
2018 (html->lookahead[0] == '=' ||
2019 IS_ALPHANUMERIC(html->lookahead[0]))) {
2020 /*
2021 * "for historical reasons, flush code points consumed as a
2022 * character reference and switch to the return state."
2023 */
2024 HTML_DEBUG((": doing historical flush thing"));
2025 attr = &NEW_TOKEN_LAST_ATTR;
2026 for (n = 0; n < html->tmp_len; n++) {
2027 STR_APPEND(attr->val, attr->val_len, html->tmp[n]);
2028 }
2029 html->tmp_len = 0;
2030 html->state = html->return_state;
2031 break;
2032 }
2033
2034 /* otherwise... */
2035 if (html->tmp[html->tmp_len - 1] != ';')
2036 html->error =
2037 HTML_ERROR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE;
2038
2039 html->tmp_len = 0;
2040
2041 if ((j = (found_entity->codepoint >> 24) & 0xff))
2042 html->tmp[html->tmp_len++] = j;
2043 if ((j = (found_entity->codepoint >> 16) & 0xff))
2044 html->tmp[html->tmp_len++] = j;
2045 if ((j = (found_entity->codepoint >> 8) & 0xff))
2046 html->tmp[html->tmp_len++] = j;
2047 if ((j = found_entity->codepoint & 0xff))
2048 html->tmp[html->tmp_len++] = j;
2049
2050 /* fall through */
2051 } else {
2052 HTML_DEBUG((": no entity found for '%s'", html->tmp));
2053
2054 /* pretend we didn't copy anything into tmp after & and cc */
2055 html->tmp_len = 2;
2056 html->tmp[html->tmp_len] = '\0';
2057 }
2058
2059 if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) {
2060 attr = &NEW_TOKEN_LAST_ATTR;
2061 for (n = 0; n < html->tmp_len; n++) {
2062 STR_APPEND(attr->val, attr->val_len, html->tmp[n]);
2063 }
2064 HTML_DEBUG((": attribute %s=\"%s\"", attr->name, attr->val));
2065 } else {
2066 for (j = 0; j < html->tmp_len; j++)
2067 html_emit_char_token(html, html->tmp[j]);
2068 }
2069
2070 html->tmp_len = 0;
2071 if (found_entity == NULL)
2072 html->state = HTML_STATE_AMBIGUOUS_AMPERSAND;
2073 else
2074 html->state = html->return_state;
2075 break;
2076 case HTML_STATE_AMBIGUOUS_AMPERSAND:
2077 if (IS_ALPHANUMERIC(cc)) {
2078 if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) {
2079 attr = &NEW_TOKEN_LAST_ATTR;
2080 STR_APPEND(attr->val, attr->val_len, cc);
2081 } else {
2082 html_emit_char_token(html, cc);
2083 }
2084 break;
2085 }
2086 if (cc == ';') {
2087 html->error = HTML_ERROR_UNKNOWN_NAMED_CHARACTER_REFERENCE;
2088 html->state = html->return_state;
2089 goto reconsume;
2090 }
2091 html->state = html->return_state;
2092 goto reconsume;
2093 case HTML_STATE_NUMERIC_CHARACTER_REFERENCE:
2094 html->char_ref_code = 0;
2095
2096 switch (cc) {
2097 case 'x':
2098 case 'X':
2099 STR_APPEND(html->tmp, html->tmp_len, cc);
2100 html->state = HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE_START;
2101 break;
2102 default:
2103 html->state = HTML_STATE_DECIMAL_CHARACTER_REFERENCE_START;
2104 goto reconsume;
2105 }
2106 break;
2107 case HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE_START:
2108 if (IS_HEX_DIGIT(cc)) {
2109 html->state = HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE;
2110 goto reconsume;
2111 }
2112
2113 html->error =
2114 HTML_ERROR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE;
2115
2116 /* "flush code points consumed as a character reference" */
2117 if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) {
2118 /* consumed as part of an attribute */
2119 for (n = 0; n < html->tmp_len; n++) {
2120 attr = &NEW_TOKEN_LAST_ATTR;
2121 STR_APPEND(attr->val, attr->val_len, html->tmp[n]);
2122 }
2123 } else {
2124 for (n = 0; n < html->tmp_len; n++)
2125 html_emit_char_token(html, html->tmp[n]);
2126 }
2127 html->state = html->return_state;
2128 goto reconsume;
2129 case HTML_STATE_DECIMAL_CHARACTER_REFERENCE_START:
2130 if (IS_DIGIT(cc)) {
2131 html->state = HTML_STATE_DECIMAL_CHARACTER_REFERENCE;
2132 goto reconsume;
2133 }
2134
2135 html->error =
2136 HTML_ERROR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE;
2137
2138 /* "flush code points consumed as a character reference" */
2139 if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) {
2140 /* consumed as part of an attribute */
2141 for (n = 0; n < html->tmp_len; n++) {
2142 attr = &NEW_TOKEN_LAST_ATTR;
2143 STR_APPEND(attr->val, attr->val_len, html->tmp[n]);
2144 }
2145 } else {
2146 for (n = 0; n < html->tmp_len; n++)
2147 html_emit_char_token(html, html->tmp[n]);
2148 }
2149 html->state = html->return_state;
2150 goto reconsume;
2151 case HTML_STATE_HEXADECIMAL_CHARACTER_REFERENCE:
2152 if (IS_DIGIT(cc)) {
2153 html->char_ref_code *= 16;
2154 html->char_ref_code += (cc - 0x30);
2155 } else if (IS_UPPER_HEX_DIGIT(cc)) {
2156 html->char_ref_code *= 16;
2157 html->char_ref_code += (cc - 0x37);
2158 } else if (IS_LOWER_HEX_DIGIT(cc)) {
2159 html->char_ref_code *= 16;
2160 html->char_ref_code += (cc - 0x57);
2161 } else if (cc == ';') {
2162 html->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END;
2163 goto reconsume;
2164 } else {
2165 html->error =
2166 HTML_ERROR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE;
2167 html->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END;
2168 goto reconsume;
2169 }
2170 break;
2171 case HTML_STATE_DECIMAL_CHARACTER_REFERENCE:
2172 if (IS_DIGIT(cc)) {
2173 html->char_ref_code *= 10;
2174 html->char_ref_code += (cc - 0x30);
2175 } else if (cc == ';') {
2176 html->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END;
2177 goto reconsume;
2178 } else {
2179 html->error =
2180 HTML_ERROR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE;
2181 html->state = HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END;
2182 goto reconsume;
2183 }
2184 break;
2185 case HTML_STATE_NUMERIC_CHARACTER_REFERENCE_END:
2186 /* this state does not consume a character */
2187
2188 if (html->char_ref_code == 0) {
2189 html->error = HTML_ERROR_NULL_CHARACTER_REFERENCE;
2190 html->char_ref_code = 0xfffd;
2191 } else if (html->char_ref_code > 0x10ffff) {
2192 html->error =
2193 HTML_ERROR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE;
2194 html->char_ref_code = 0xfffd;
2195 } else if (IS_SURROGATE(html->char_ref_code)) {
2196 html->error = HTML_ERROR_SURROGATE_CHARACTER_REFERENCE;
2197 html->char_ref_code = 0xfffd;
2198 } else if (IS_NONCHARACTER(html->char_ref_code)) {
2199 html->error = HTML_ERROR_NONCHARACTER_CHARACTER_REFERENCE;
2200 } else if (html->char_ref_code == 0x0d ||
2201 (IS_CONTROL(html->char_ref_code) &&
2202 !IS_WHITESPACE(html->char_ref_code))) {
2203 html->error = HTML_ERROR_CONTROL_CHARACTER_REFERENCE;
2204 /* TODO: lookup in table */
2205 }
2206
2207 html->tmp[0] = html->char_ref_code;
2208 html->tmp_len = 1;
2209
2210 /* "flush code points consumed as a character reference" */
2211 if (CONSUMED_AS_PART_OF_AN_ATTRIBUTE) {
2212 /* consumed as part of an attribute */
2213 for (n = 0; n < html->tmp_len; n++) {
2214 attr = &NEW_TOKEN_LAST_ATTR;
2215 STR_APPEND(attr->val, attr->val_len, html->tmp[n]);
2216 }
2217 } else {
2218 for (n = 0; n < html->tmp_len; n++)
2219 html_emit_char_token(html, html->tmp[n]);
2220 }
2221
2222 html->state = html->return_state;
2223 break;
2224 default:
2225 panic("bogus tokenize state %d", html->state);
2226 }
2227
2228 if (html->state != was_state)
2229 HTML_DEBUG((": exited state %d", html_state_names[html->state]));
2230
2231 if (html->error) {
2232 HTML_DEBUG((": error %s", html_error_strings[html->error]));
2233 html->error = 0;
2234 }
2235 HTML_DEBUG(("\r"));
2236 }
2237
2238 void
2239 html_tokenize_finish(struct html_page *html)
2240 {
2241 if (html->lookahead_len) {
2242 HTML_DEBUG(("finish requested, tokenizing remaining %d lookahead\r",
2243 html->lookahead_len));
2244
2245 while (html->lookahead_len)
2246 html_tokenize(html, EOF);
2247 }
2248
2249 html_tokenize(html, EOF);
2250
2251 html_stop_parsing(html);
2252 }
2253
2254 void
2255 html_prep_new_token(struct html_page *html, html_token_type token_type)
2256 {
2257 memset(&html->new_token, 0, sizeof(html_token));
2258 html->new_token.type = token_type;
2259 }
2260
2261 struct html_attr *
2262 html_prep_new_attribute(struct html_page *html, struct html_tag *tag)
2263 {
2264 if (tag->attrs_count >= nitems(tag->attrs))
2265 panic("tag attr overflow");
2266
2267 tag->attrs_count++;
2268 tag->attrs[tag->attrs_count - 1].name_len = 0;
2269 tag->attrs[tag->attrs_count - 1].name[0] = '\0';
2270 tag->attrs[tag->attrs_count - 1].val_len = 0;
2271 tag->attrs[tag->attrs_count - 1].val[0] = '\0';
2272
2273 return &tag->attrs[tag->attrs_count - 1];
2274 }
2275
2276 bool
2277 html_appropriate_end_tag_token(struct html_page *html, html_token *token)
2278 {
2279 /* https://html.spec.whatwg.org/multipage/parsing.html#tokenization
2280 * "an end tag token whose tag name matches the tag name of the last start
2281 * tag to have been emitted"
2282 */
2283 if (html->open_count <= 0)
2284 return false;
2285
2286 /* TODO: store last start tag to have been emitted and check that */
2287 return (strcmp(html->current_node->name, html->new_token.tag.name) == 0);
2288 }
2289
2290 html_tag_type
2291 html_find_tag_type(char *name)
2292 {
2293 long idx;
2294
2295 idx = strcaseidx(name, html_tag_names);
2296 if (idx >= 0)
2297 return idx;
2298
2299 HTML_DEBUG((": html_find_tag_type couldn't find %s", name));
2300 return 0;
2301 }
2302
2303 #endif /* HTML_ENABLE */