/*
2 * Copyright (c) 2007, Google Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are
7 * met:
8 *
9 * * Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following disclaimer
13 * in the documentation and/or other materials provided with the
14 * distribution.
15 * * Neither the name of Google Inc. nor the names of its
16 * contributors may be used to endorse or promote products derived from
17 * this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * ---
31 *
32 * Author: falmeida@google.com (Filipe Almeida)
33 */
34
/* TODO(falmeida): Breaks on NUL ('\0') characters in the stream. Fix.
 */
37
38#include <stdio.h>
39#include <stdlib.h>
40#include <string.h>
41#include <ctype.h>
42#include <assert.h>
43
44#include "htmlparser/statemachine.h"
45#include "htmlparser/htmlparser.h"
46#include "htmlparser/jsparser.h"
47
48/* So we can support both C and C++ compilers, we use the CAST() macro instead
49 * of using C style casts or static_cast<>() directly.
50 */
#ifndef __cplusplus
  /* Plain C: a classic cast expression. */
  #define CAST(type, expression) ((type)(expression))
#else
  /* C++: route the conversion through static_cast<>. */
  #define CAST(type, expression) (static_cast<type>(expression))
#endif
56
57#ifdef __cplusplus
58namespace ctemplate_htmlparser {
59#endif
60
61/* Generated state machine definition. */
62#include "htmlparser/htmlparser_fsm.h"
63
/* True when the attribute name begins with the two characters "on". */
#define is_js_attribute(attr) (strncmp((attr), "on", 2) == 0)
/* True when the attribute name is exactly "style" (case sensitive). */
#define is_style_attribute(attr) (!strcmp((attr), "style"))
66
/* Named html entities understood by the entity filter.
 *
 * Each row maps an entity name (without the leading '&' and the trailing
 * terminator) to the text it decodes to. The table is terminated by a row
 * whose members are both NULL.
 */
static struct entityfilter_table_s {
  const char *entity;  /* Entity name, e.g. "lt". */
  const char *value;   /* Replacement text, e.g. "<". */
} entityfilter_table[] = {
  { "lt",   "<"  },
  { "gt",   ">"  },
  { "quot", "\"" },
  { "amp",  "&"  },
  { "apos", "'"  },
  { NULL,   NULL }
};
79
80/* Utility functions */
81
/* Bounded string copy that never writes more than min(dst_size, src_size)
 * bytes into dst.
 *
 * The last byte of the copied range is always overwritten with '\0', so
 * src_size is expected to count the terminator of src. When either size is
 * zero dst is left untouched.
 */
static inline void nopad_strncpy(char *dst, const char *src, size_t dst_size,
                                 size_t src_size)
{
  const size_t limit = (src_size < dst_size) ? src_size : dst_size;

  strncpy(dst, src, limit);
  if (limit != 0)
    dst[limit - 1] = '\0';
}
94
95/* Converts the internal state into the external superstate.
96 */
97static int state_external(int st)
98{
99 if (st == STATEMACHINE_ERROR)
100 return HTMLPARSER_STATE_ERROR;
101 else
102 return htmlparser_states_external[st];
103}
104
/* Tells whether chr is one of the characters treated as html whitespace
 * here: space, tab, line feed or carriage return.
 *
 * From: http://www.w3.org/TR/html401/struct/text.html#h-9.1
 *
 * Returns 1 for a whitespace character and 0 otherwise.
 */
static inline int html_isspace(char chr)
{
  switch (chr) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return 1;
    default:
      return 0;
  }
}
117
/* Tells whether an attribute is expected to hold a url.
 *
 * The names were taken from http://www.w3.org/TR/html4/index/attributes.html
 * ("dynsrc" comes from msdn). The comparison is case sensitive.
 *
 * Returns 1 when attr matches one of the known names, 0 otherwise (including
 * when attr is NULL).
 */
static int is_uri_attribute(char *attr)
{
  static const char *const uri_attributes[] = {
    "action",
    /* TODO(falmeida): This is a uri list. Should we treat it diferently? */
    "archive",
    "background",
    "cite",
    "classid",
    "codebase",
    "data",
    "dynsrc",
    "href",
    "longdesc",
    "src",
    "usemap"
  };
  size_t i;

  if (attr == NULL)
    return 0;

  for (i = 0; i < sizeof(uri_attributes) / sizeof(uri_attributes[0]); i++) {
    if (strcmp(attr, uri_attributes[i]) == 0)
      return 1;
  }

  return 0;
}
180
181/* Convert a string to lower case characters inplace.
182 */
183static void tolower_str(char *s)
184{
185 while (*s != '\0') {
186 *s = CAST(char, tolower(CAST(unsigned char,*s)));
187 s++;
188 }
189}
190
/* Skips a leading run of html whitespace and decimal digits.
 *
 * Returns a pointer to the first character of value that is neither; this
 * may be the terminating NUL.
 */
static const char *ignore_spaces_or_digits(const char *value) {
  for (;;) {
    const char c = *value;

    if (!html_isspace(c) && (c < '0' || c > '9'))
      return value;
    value++;
  }
}
197
/* Skips a leading run of html whitespace.
 *
 * Returns a pointer to the first non-whitespace character of value; this may
 * be the terminating NUL.
 */
static const char *ignore_spaces(const char *value) {
  const char *p;

  for (p = value; html_isspace(*p); p++) {
    /* Nothing to do, just advance. */
  }

  return p;
}
204
/* Result of meta_redirect_type().
 */
enum meta_redirect_type_enum {
  /* No url was identified in the analyzed value. */
  META_REDIRECT_TYPE_NONE = 0,
  /* The analyzed value ends exactly where the url would start. */
  META_REDIRECT_TYPE_URL_START = 1,
  /* The analyzed value ends in the middle or at the end of the url. */
  META_REDIRECT_TYPE_URL = 2
};
212
213/* Analyzes a string for the presence of a meta refresh type url.
214 *
215 * This function receives the value of the content attribute of a meta tag and
216 * parses it in order to identify if a url is going to be present. This is the
217 * format of such tag:
218 *
219 * <meta http-equiv="refresh" content="5; URL=http://www.google.com">
220 *
221 * Using a regular expression library would be the most obvious way to implement
222 * this functionality, but introducing such a dependency is undesirable. We
223 * opted instead to parse programmaticly since the expression is simple enough.
224 *
225 * For reference, this is the spec on the meta http refresh tag:
226 * http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh
227 *
228 * If the value has no content after the expression, we know we are at the start
229 * of the URL. Otherwise we are past the start of the URL.
230 *
231 *
232 * Returns:
233 *
234 * This functions returns one of the following values:
235 * META_REDIRECT_TYPE_NONE - A url was not identified in the input string.
236 * META_REDIRECT_TYPE_URL_START - The input string ends exactly at the start
237 * of the url.
238 * META_REDIRECT_TYPE_URL - The input string ends somewhere in the middle or
239 * the end of the url.
240 *
241 * A few examples:
242 * "5"
243 * Returns META_REDIRECT_TYPE_NONE since we don't expect a url to follow.
244 *
245 * "5; URL = "
246 * The function returns META_REDIRECT_TYPE_URL_START since we expect a url to
247 * follow.
248 *
249 * "5; URL = http://www.google.com/?"
250 * Returns META_REDIRECT_TYPE_URL since the input value terminates in the
251 * middle or end of a url.
252 *
253 *
254 * Caveats: We are only recording up to 256 characters of attribute values, so
255 * our analysis is limited to that. This shouldn't be an issue in practice
256 * though as it would be unexpected for the part of the string that we are
257 * matching to be so long.
258 */
259enum meta_redirect_type_enum meta_redirect_type(const char *value) {
260
261 if (value == NULL)
262 return META_REDIRECT_TYPE_NONE;
263
264 /* Match while [ \t\r\n0-9]* */
265 value = ignore_spaces_or_digits(value);
266
267 /* Verify that we got a semi-colon character */
268 if (*value != ';')
269 return META_REDIRECT_TYPE_NONE;
270 value++;
271
272 /* Match while [ \t\r\n]* */
273 value = ignore_spaces(value);
274
275 /* Validate that we have 'URL' */
276 if (strncasecmp(value, "url", strlen("url")) != 0)
277 return META_REDIRECT_TYPE_NONE;
278
279 value += strlen("url");
280
281 /* Match while [ \t\r\n]* */
282 value = ignore_spaces(value);
283
284 if (*value != '=')
285 return META_REDIRECT_TYPE_NONE;
286 value++;
287
288 /* Match while [ \t\r\n]* */
289 value = ignore_spaces(value);
290
291 /* The HTML5 spec allows for the url to be quoted, so we skip a single or
292 * double quote if we find one.
293 */
294 if (*value == '"' || *value == '\'')
295 value++;
296
297 if (*value == '\0')
298 return META_REDIRECT_TYPE_URL_START;
299 else
300 return META_REDIRECT_TYPE_URL;
301}
302
303
304/* Resets the entityfilter to it's initial state so it can be reused.
305 */
306void entityfilter_reset(entityfilter_ctx *ctx)
307{
308 ctx->buffer[0] = 0;
309 ctx->buffer_pos = 0;
310 ctx->in_entity = 0;
311}
312
313/* Initializes a new entity filter object.
314 */
315entityfilter_ctx *entityfilter_new()
316{
317 entityfilter_ctx *ctx;
318 ctx = CAST(entityfilter_ctx *,
319 malloc(sizeof(entityfilter_ctx)));
320
321 if (ctx == NULL)
322 return NULL;
323 ctx->buffer[0] = 0;
324 ctx->buffer_pos = 0;
325 ctx->in_entity = 0;
326
327 return ctx;
328}
329
330/* Copies the context of the entityfilter pointed to by src to the entityfilter
331 * dst.
332 */
333void entityfilter_copy(entityfilter_ctx *dst, entityfilter_ctx *src)
334{
335 assert(src != NULL);
336 assert(dst != NULL);
337 assert(src != dst);
338 memcpy(dst, src, sizeof(entityfilter_ctx));
339}
340
341
342/* Deallocates an entity filter object.
343 */
344void entityfilter_delete(entityfilter_ctx *ctx)
345{
346 free(ctx);
347}
348
/* Parses s as a hexadecimal number with strtol() and stores the character
 * with that value, followed by a NUL terminator, in output.
 *
 * The number is narrowed to a single char, and input that strtol() cannot
 * parse yields the value 0, i.e. an empty string.
 *
 * The provided output char array must be at least 2 chars long.
 *
 * Returns output.
 */
static const char *parse_hex(const char *s, char *output)
{
  const int code = strtol(s, NULL, 16);

  output[0] = code;
  output[1] = '\0';

  /* TODO(falmeida): Make this function return void */
  return output;
}
363
/* Parses s as a decimal number with strtol() and stores the character with
 * that value, followed by a NUL terminator, in output.
 *
 * The number is narrowed to a single char, and input that strtol() cannot
 * parse yields the value 0, i.e. an empty string.
 *
 * The provided output char array must be at least 2 chars long.
 *
 * Returns output.
 */
static const char *parse_dec(const char *s, char *output)
{
  const int code = strtol(s, NULL, 10);

  output[0] = code;
  output[1] = '\0';

  return output;
}
377
378/* Converts a string with an html entity to it's encoded form, which is written
379 * to the output string.
380 */
381static const char *entity_convert(const char *s, char *output, char terminator)
382{
383 /* TODO(falmeida): Handle wide char encodings */
384 struct entityfilter_table_s *t = entityfilter_table;
385
386 if (s[0] == '#') {
387 if (s[1] == 'x' || s[1] == 'X') { /* hex */
388 return parse_hex(s + 2, output);
389 } else { /* decimal */
390 return parse_dec(s + 1, output);
391 }
392 }
393
394 while (t->entity != NULL) {
395 if (strcasecmp(t->entity, s) == 0)
396 return t->value;
397 t++;
398 }
399
400 snprintf(output, HTMLPARSER_MAX_ENTITY_SIZE, "&%s%c", s, terminator);
401 output[HTMLPARSER_MAX_ENTITY_SIZE - 1] = '\0';
402
403 return output;
404}
405
406
407/* Processes a character from the input stream and decodes any html entities
408 * in the processed input stream.
409 */
410const char *entityfilter_process(entityfilter_ctx *ctx, char c)
411{
412 if (ctx->in_entity) {
413 if (c == ';' || html_isspace(c)) {
414 ctx->in_entity = 0;
415 ctx->buffer[ctx->buffer_pos] = '\0';
416 ctx->buffer_pos = 0;
417 return entity_convert(ctx->buffer, ctx->output, c);
418 } else {
419 ctx->buffer[ctx->buffer_pos++] = c;
420 if (ctx->buffer_pos >= HTMLPARSER_MAX_ENTITY_SIZE - 2) {
421 /* No more buffer to use, finalize and return.
422 * We need two characters left, one for the '&' character and
423 * another for the NULL termination. */
424 ctx->buffer[ctx->buffer_pos] = '\0';
425 ctx->in_entity=0;
426 ctx->buffer_pos = 0;
427 snprintf(ctx->output, HTMLPARSER_MAX_ENTITY_SIZE, "&%s",
428 ctx->buffer);
429 ctx->output[HTMLPARSER_MAX_ENTITY_SIZE - 1] = '\0';
430 return ctx->output;
431 }
432 }
433 } else {
434 if (c == '&') {
435 ctx->in_entity = 1;
436 ctx->buffer_pos = 0;
437 } else {
438 ctx->output[0] = c;
439 ctx->output[1] = 0;
440 return ctx->output;
441 }
442 }
443 return "";
444}
445
446/* Called when the parser enters a new tag. Starts recording it's name into
447 * html->tag.
448 */
449static void enter_tag_name(statemachine_ctx *ctx, int start, char chr, int end)
450{
451 htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user);
452 assert(html != NULL);
453
454 html->tag[0] = '\0';
455 statemachine_start_record(ctx);
456}
457
458/* Called when the parser exits the tag name in order to finalize the recording.
459 *
460 * It converts the tag name to lowercase, and if the tag was closed, just
461 * clears html->tag.
462 */
463static void exit_tag_name(statemachine_ctx *ctx, int start, char chr, int end)
464{
465 htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user);
466 assert(html != NULL);
467
468 nopad_strncpy(html->tag, statemachine_stop_record(ctx),
469 HTMLPARSER_MAX_STRING, statemachine_record_length(ctx));
470
471 tolower_str(html->tag);
472
473 if (html->tag[0] == '/')
474 html->tag[0] = '\0';
475}
476
477/* Called when the parser enters a new tag. Starts recording it's name into
478 * html->attr
479 */
480static void enter_attr(statemachine_ctx *ctx, int start, char chr, int end)
481{
482 htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user);
483 assert(html != NULL);
484
485 html->attr[0] = '\0';
486 statemachine_start_record(ctx);
487}
488
489/* Called when the parser exits the attribute name in order to finalize the
490 * recording.
491 *
492 * It converts the tag name to lowercase.
493 */
494static void exit_attr(statemachine_ctx *ctx, int start, char chr, int end)
495{
496 htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user);
497 assert(html != NULL);
498
499 nopad_strncpy(html->attr, statemachine_stop_record(ctx),
500 HTMLPARSER_MAX_STRING, statemachine_record_length(ctx));
501
502 tolower_str(html->attr);
503}
504
505/* Called when we enter an attribute value.
506 *
507 * Keeps track of a position index inside the value and initializes the
508 * javascript state machine for attributes that accept javascript.
509 */
510static void enter_value(statemachine_ctx *ctx, int start, char chr, int end)
511{
512 htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user);
513 assert(html != NULL);
514
515 html->value_index = 0;
516
517 if (is_js_attribute(html->attr)) {
518 entityfilter_reset(html->entityfilter);
519 jsparser_reset(html->jsparser);
520 html->in_js = 1;
521 } else {
522 html->in_js = 0;
523 }
524}
525
526/* Called when we enter the contents of an attribute value.
527 *
528 * Initializes the recording of the contents of the value.
529 */
530static void enter_value_content(statemachine_ctx *ctx, int start, char chr,
531 int end)
532{
533 htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user);
534 assert(html != NULL);
535
536 html->value[0] = '\0';
537 statemachine_start_record(ctx);
538}
539
540/* Called when we exit the contents of an attribute value.
541 *
542 * Finalizes the recording of the contents of the value.
543 */
544static void exit_value_content(statemachine_ctx *ctx, int start, char chr,
545 int end)
546{
547 htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user);
548 assert(html != NULL);
549
550 nopad_strncpy(html->value, statemachine_stop_record(ctx),
551 HTMLPARSER_MAX_STRING, statemachine_record_length(ctx));
552
553 html->in_js = 0;
554}
555
556/* Called for every character inside an attribute value.
557 *
558 * Used to process javascript and keep track of the position index inside the
559 * attribute value.
560 */
561static void in_state_value(statemachine_ctx *ctx, int start, char chr, int end)
562{
563 htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user);
564 assert(html != NULL);
565
566 html->value_index++;
567
568 if (html->in_js == 1) {
569 const char *output;
570 output = entityfilter_process(html->entityfilter, chr);
571 jsparser_parse_str(html->jsparser, output);
572 }
573}
574
575/* Called everytime the parser leaves a tag definition.
576 *
577 * When we encounter a script tag, we initialize the js parser and switch the
578 * state to cdata. We also switch to the cdata state when we encounter any
579 * other CDATA/RCDATA tag (style, title or textarea) except that we do not
580 * initialize the js parser.
581 *
582 * To simplify the code, we treat RCDATA and CDATA sections the same since the
583 * differences between them don't affect the context we are in.
584 */
585static void tag_close(statemachine_ctx *ctx, int start, char chr, int end)
586{
587 htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user);
588 assert(html != NULL);
589
590 if (strcmp(html->tag, "script") == 0) {
591 ctx->next_state = HTMLPARSER_STATE_INT_CDATA_TEXT;
592 jsparser_reset(html->jsparser);
593 html->in_js = 1;
594 } else if (strcmp(html->tag, "style") == 0 ||
595 strcmp(html->tag, "title") == 0 ||
596 strcmp(html->tag, "textarea") == 0) {
597 ctx->next_state = HTMLPARSER_STATE_INT_CDATA_TEXT;
598 html->in_js = 0;
599 }
600}
601
602/* Called inside cdata blocks in order to parse the javascript.
603 *
604 * Calls the javascript parser if currently in a script tag.
605 */
606static void in_state_cdata(statemachine_ctx *ctx, int start, char chr, int end)
607{
608 htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user);
609 assert(html != NULL);
610
611 if (html->in_js)
612 jsparser_parse_chr(html->jsparser, chr);
613}
614
615/* Called if we encounter a '<' character in a cdata section.
616 *
617 * When encountering a '<' character inside cdata, we need to find the closing
618 * tag name in order to know if the tag is going to be closed or not, so we
619 * start recording the name of what could be the closing tag.
620 */
621static void enter_state_cdata_may_close(statemachine_ctx *ctx, int start,
622 char chr, int end)
623{
624 statemachine_start_record(ctx);
625}
626
627/* Called when we finish reading what could be a closing cdata tag.
628 *
629 * Checks if the closing tag name matches the current entity, and if so closes
630 * the element.
631 */
632static void exit_state_cdata_may_close(statemachine_ctx *ctx, int start,
633 char chr, int end)
634{
635 htmlparser_ctx *html = CAST(htmlparser_ctx *, ctx->user);
636 const char *cdata_close_tag;
637 assert(html != NULL);
638
639 cdata_close_tag = statemachine_stop_record(ctx);
640 assert(cdata_close_tag[0] == '/');
641
642 if (strcasecmp(&cdata_close_tag[1], html->tag) == 0 &&
643 (chr == '>' || html_isspace(chr))) { /* Make sure we have a delimiter */
644 html->tag[0] = '\0'; /* Empty tag mimicking exit_tag_name(). */
645 html->in_js = 0; /* In case this was a script tag. */
646 } else {
647 /* Does not close the CDATA section. Go back to CDATA. */
648 ctx->next_state = HTMLPARSER_STATE_INT_CDATA_TEXT;
649 }
650}
651
652/* Resets the parser to it's initial state and changes the parser mode.
653 */
654void htmlparser_reset_mode(htmlparser_ctx *ctx, int mode)
655{
656 assert(ctx != NULL);
657 statemachine_reset(ctx->statemachine);
658 ctx->in_js = 0;
659 ctx->tag[0] = '\0';
660 ctx->attr[0] = '\0';
661 ctx->value[0] = '\0';
662
663 jsparser_reset(ctx->jsparser);
664
665 switch (mode) {
666 case HTMLPARSER_MODE_HTML:
667 ctx->statemachine->current_state = HTMLPARSER_STATE_INT_TEXT;
668 break;
669 case HTMLPARSER_MODE_JS:
670 ctx->statemachine->current_state = HTMLPARSER_STATE_INT_JS_FILE;
671 ctx->in_js = 1;
672 break;
673 case HTMLPARSER_MODE_CSS:
674 ctx->statemachine->current_state = HTMLPARSER_STATE_INT_CSS_FILE;
675 break;
676 case HTMLPARSER_MODE_HTML_IN_TAG:
677 ctx->statemachine->current_state = HTMLPARSER_STATE_INT_TAG_SPACE;
678 break;
679 default:
680 assert("Invalid mode in htmlparser_reset_mode()." && 0);
681 }
682}
683
684/* Resets the parser to it's initial state and to the default mode, which
685 * is MODE_HTML.
686 */
687void htmlparser_reset(htmlparser_ctx *ctx)
688{
689 assert(ctx != NULL);
690 htmlparser_reset_mode(ctx, HTMLPARSER_MODE_HTML);
691}
692
693/* Creates a new state machine definition and initializes the events for the
694 * state transitions.
695 *
696 * Although each instance of the parser has it's own private instance of a
697 * statemachine definition, they are still identical across html parser objects
698 * and are never modified after creation. As such, changes to this definition
699 * should not occur outside this function and should not depend on properties
700 * of this particular parser instance as in the future we may opt to use a
701 * single public definition across parser objects.
702 */
703static statemachine_definition *create_statemachine_definition()
704{
705 statemachine_definition *def;
706 def = statemachine_definition_new(HTMLPARSER_NUM_STATES);
707 if (def == NULL)
708 return NULL;
709
710 statemachine_definition_populate(def, htmlparser_state_transitions,
711 htmlparser_states_internal_names);
712
713 statemachine_enter_state(def, HTMLPARSER_STATE_INT_TAG_NAME,
714 enter_tag_name);
715 statemachine_exit_state(def, HTMLPARSER_STATE_INT_TAG_NAME, exit_tag_name);
716
717 statemachine_enter_state(def, HTMLPARSER_STATE_INT_ATTR, enter_attr);
718 statemachine_exit_state(def, HTMLPARSER_STATE_INT_ATTR, exit_attr);
719
720 statemachine_enter_state(def, HTMLPARSER_STATE_INT_TAG_CLOSE, tag_close);
721
722 /* CDATA states. We must list all cdata and javascript states here. */
723 /* TODO(falmeida): Declare this list in htmlparser_fsm.config so it doesn't
724 * go out of sync.
725 */
726 statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_TEXT, in_state_cdata);
727 statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_START,
728 in_state_cdata);
729 statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_START_DASH,
730 in_state_cdata);
731 statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_BODY,
732 in_state_cdata);
733 statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_DASH,
734 in_state_cdata);
735 statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_COMMENT_DASH_DASH,
736 in_state_cdata);
737 statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_LT, in_state_cdata);
738 statemachine_in_state(def, HTMLPARSER_STATE_INT_CDATA_MAY_CLOSE,
739 in_state_cdata);
740
741 /* For simplification, we treat the javascript mode as if it were cdata. */
742 statemachine_in_state(def, HTMLPARSER_STATE_INT_JS_FILE, in_state_cdata);
743
744 statemachine_enter_state(def, HTMLPARSER_STATE_INT_CDATA_MAY_CLOSE,
745 enter_state_cdata_may_close);
746 statemachine_exit_state(def, HTMLPARSER_STATE_INT_CDATA_MAY_CLOSE,
747 exit_state_cdata_may_close);
748 /* value states */
749 statemachine_enter_state(def, HTMLPARSER_STATE_INT_VALUE, enter_value);
750
751 /* Called when we enter the content of the value */
752 statemachine_enter_state(def, HTMLPARSER_STATE_INT_VALUE_TEXT,
753 enter_value_content);
754 statemachine_enter_state(def, HTMLPARSER_STATE_INT_VALUE_Q,
755 enter_value_content);
756 statemachine_enter_state(def, HTMLPARSER_STATE_INT_VALUE_DQ,
757 enter_value_content);
758
759 /* Called when we exit the content of the value */
760 statemachine_exit_state(def, HTMLPARSER_STATE_INT_VALUE_TEXT,
761 exit_value_content);
762 statemachine_exit_state(def, HTMLPARSER_STATE_INT_VALUE_Q,
763 exit_value_content);
764 statemachine_exit_state(def, HTMLPARSER_STATE_INT_VALUE_DQ,
765 exit_value_content);
766
767 statemachine_in_state(def, HTMLPARSER_STATE_INT_VALUE_TEXT, in_state_value);
768 statemachine_in_state(def, HTMLPARSER_STATE_INT_VALUE_Q, in_state_value);
769 statemachine_in_state(def, HTMLPARSER_STATE_INT_VALUE_DQ, in_state_value);
770
771 return def;
772}
773
774
775/* Initializes a new htmlparser instance.
776 *
777 * Returns a pointer to the new instance or NULL if the initialization fails.
778 * Initialization failure is fatal, and if this function fails it may not
779 * deallocate all previsouly allocated memory.
780 */
781htmlparser_ctx *htmlparser_new()
782{
783 htmlparser_ctx *html;
784
785 html = CAST(htmlparser_ctx *, calloc(1, sizeof(htmlparser_ctx)));
786 if (html == NULL)
787 return NULL;
788
789 html->statemachine_def = create_statemachine_definition();
790 if (html->statemachine_def == NULL)
791 return NULL;
792
793 html->statemachine = statemachine_new(html->statemachine_def, html);
794 if (html->statemachine == NULL)
795 return NULL;
796
797 html->jsparser = jsparser_new();
798 if (html->jsparser == NULL)
799 return NULL;
800
801 html->entityfilter = entityfilter_new();
802 if (html->entityfilter == NULL)
803 return NULL;
804
805 htmlparser_reset(html);
806
807 return html;
808}
809
810/* Copies the context of the htmlparser pointed to by src to the htmlparser dst.
811 */
812void htmlparser_copy(htmlparser_ctx *dst, const htmlparser_ctx *src)
813{
814 dst->value_index = src->value_index;
815 dst->in_js = src->in_js;
816 strcpy(dst->tag, src->tag);
817 strcpy(dst->attr, src->attr);
818 strcpy(dst->value, src->value);
819
820 statemachine_copy(dst->statemachine,
821 src->statemachine,
822 dst->statemachine_def,
823 dst);
824
825 jsparser_copy(dst->jsparser, src->jsparser);
826
827 entityfilter_copy(dst->entityfilter, src->entityfilter);
828
829}
830
831/* Receives an htmlparser context and Returns the current html state.
832 */
833int htmlparser_state(htmlparser_ctx *ctx)
834{
835 return state_external(ctx->statemachine->current_state);
836}
837
838/* Parses the input html stream and returns the finishing state.
839 */
840int htmlparser_parse(htmlparser_ctx *ctx, const char *str, int size)
841{
842 int internal_state;
843 internal_state = statemachine_parse(ctx->statemachine, str, size);
844 return state_external(internal_state);
845}
846
847
848/* Returns true if the parser is inside an attribute value and the value is
849 * surrounded by single or double quotes. */
850int htmlparser_is_attr_quoted(htmlparser_ctx *ctx) {
851 int st = statemachine_get_state(ctx->statemachine);
852 if (st == HTMLPARSER_STATE_INT_VALUE_Q_START ||
853 st == HTMLPARSER_STATE_INT_VALUE_Q ||
854 st == HTMLPARSER_STATE_INT_VALUE_DQ_START ||
855 st == HTMLPARSER_STATE_INT_VALUE_DQ)
856 return 1;
857 else
858 return 0;
859}
860
861/* Returns true if the parser is currently in javascript.
862 */
863int htmlparser_in_js(htmlparser_ctx *ctx) {
864 int st = statemachine_get_state(ctx->statemachine);
865
866/* CDATA states plus JS_FILE. We must list all cdata and javascript states
867 * here. */
868/* TODO(falmeida): Declare this list in htmlparser_fsm.config so it doesn't go
869 * out of sync. */
870 if (ctx->in_js &&
871 (st == HTMLPARSER_STATE_INT_CDATA_TEXT ||
872 st == HTMLPARSER_STATE_INT_CDATA_COMMENT_START ||
873 st == HTMLPARSER_STATE_INT_CDATA_COMMENT_START_DASH ||
874 st == HTMLPARSER_STATE_INT_CDATA_COMMENT_BODY ||
875 st == HTMLPARSER_STATE_INT_CDATA_COMMENT_DASH ||
876 st == HTMLPARSER_STATE_INT_CDATA_COMMENT_DASH_DASH ||
877 st == HTMLPARSER_STATE_INT_CDATA_LT ||
878 st == HTMLPARSER_STATE_INT_CDATA_MAY_CLOSE ||
879 st == HTMLPARSER_STATE_INT_JS_FILE))
880 return 1;
881
882 if (state_external(st) == HTMLPARSER_STATE_VALUE && ctx->in_js)
883 return 1;
884 else
885 return 0;
886}
887
888/* Returns the current tag or NULL if not available or we haven't seen the
889 * entire tag yet.
890 */
891const char *htmlparser_tag(htmlparser_ctx *ctx)
892{
893 if (ctx->tag[0] != '\0')
894 return ctx->tag;
895 else
896 return NULL;
897}
898
899/* Returns true if inside an attribute or a value */
900int htmlparser_in_attr(htmlparser_ctx *ctx)
901{
902 int ext_state = state_external(statemachine_get_state(ctx->statemachine));
903 return ext_state == HTMLPARSER_STATE_ATTR ||
904 ext_state == HTMLPARSER_STATE_VALUE;
905}
906
907/* Returns the current attribute name if after an attribute name or in an
908 * attribute value. Returns NULL otherwise. */
909const char *htmlparser_attr(htmlparser_ctx *ctx)
910{
911 if (htmlparser_in_attr(ctx))
912 return ctx->attr;
913 else
914 return NULL;
915}
916
917/* Returns true if the parser is currently inside a CSS construct.
918 */
919int htmlparser_in_css(htmlparser_ctx *ctx) {
920 int state = statemachine_get_state(ctx->statemachine);
921 const char *tag = htmlparser_tag(ctx);
922 int external_state = state_external(state);
923
924 if (state == HTMLPARSER_STATE_INT_CSS_FILE ||
925 (external_state == HTMLPARSER_STATE_VALUE &&
926 htmlparser_attr_type(ctx) == HTMLPARSER_ATTR_STYLE) ||
927 (tag && strcmp(tag, "style") == 0)) {
928 return 1;
929 } else {
930 return 0;
931 }
932}
933
934/* Returns the contents of the current attribute value.
935 */
936const char *htmlparser_value(htmlparser_ctx *ctx)
937{
938 int ext_state = state_external(statemachine_get_state(ctx->statemachine));
939 if (ext_state == HTMLPARSER_STATE_VALUE) {
940 strncpy(ctx->value, statemachine_record_buffer(ctx->statemachine),
941 HTMLPARSER_MAX_STRING);
942 ctx->value[HTMLPARSER_MAX_STRING - 1] = '\0';
943 return ctx->value;
944 } else {
945 return NULL;
946 }
947}
948
949
950/* Returns the current state of the javascript state machine
951 *
952 * Currently only present for testing purposes.
953 */
954int htmlparser_js_state(htmlparser_ctx *ctx)
955{
956 return jsparser_state(ctx->jsparser);
957}
958
959/* True is currently inside a javascript string literal
960 */
961int htmlparser_is_js_quoted(htmlparser_ctx *ctx)
962{
963 if (htmlparser_in_js(ctx)) {
964 int st = jsparser_state(ctx->jsparser);
965 if (st == JSPARSER_STATE_Q ||
966 st == JSPARSER_STATE_DQ)
967 return 1;
968 }
969 return 0;
970}
971
972/* True if currently inside an attribute value
973 */
974int htmlparser_in_value(htmlparser_ctx *ctx)
975{
976 int ext_state = state_external(statemachine_get_state(ctx->statemachine));
977 return ext_state == HTMLPARSER_STATE_VALUE;
978}
979
980/* Returns the position inside the current attribute value
981 */
982int htmlparser_value_index(htmlparser_ctx *ctx)
983{
984 if (htmlparser_in_value(ctx))
985 return ctx->value_index;
986
987 return -1;
988}
989
990/* Returns true if this is the first character of a url inside an attribute.
991 */
992int htmlparser_is_url_start(htmlparser_ctx *ctx)
993{
994 if (htmlparser_attr_type(ctx) == HTMLPARSER_ATTR_URI) {
995 const char* tag = htmlparser_tag(ctx);
996 /*const char* attr =*/ htmlparser_attr(ctx);
997
998 if ((tag && strcmp(tag, "meta") == 0 &&
999 meta_redirect_type(htmlparser_value(ctx)) ==
1000 META_REDIRECT_TYPE_URL_START) ||
1001 htmlparser_value_index(ctx) == 0)
1002 return 1;
1003
1004 }
1005 return 0;
1006}
1007
1008/* Returns the current attribute type.
1009 */
1010int htmlparser_attr_type(htmlparser_ctx *ctx)
1011{
1012 if (!htmlparser_in_attr(ctx))
1013 return HTMLPARSER_ATTR_NONE;
1014
1015 if (is_js_attribute(ctx->attr))
1016 return HTMLPARSER_ATTR_JS;
1017
1018 if (is_uri_attribute(ctx->attr))
1019 return HTMLPARSER_ATTR_URI;
1020
1021 if (is_style_attribute(ctx->attr))
1022 return HTMLPARSER_ATTR_STYLE;
1023
1024 const char* tag = htmlparser_tag(ctx);
1025 const char* attr = htmlparser_attr(ctx);
1026
1027 /* Special logic to handle meta redirect type tags. */
1028 if (tag && strcmp(tag, "meta") == 0 &&
1029 attr && strcmp(attr, "content") == 0) {
1030
1031 const char* value = htmlparser_value(ctx);
1032 meta_redirect_type_enum redirect_type = meta_redirect_type(value);
1033
1034 if (redirect_type == META_REDIRECT_TYPE_URL ||
1035 redirect_type == META_REDIRECT_TYPE_URL_START)
1036 return HTMLPARSER_ATTR_URI;
1037 }
1038
1039 return HTMLPARSER_ATTR_REGULAR;
1040}
1041
1042/* Return the current line number. */
1043int htmlparser_get_line_number(htmlparser_ctx *ctx) {
1044 return statemachine_get_line_number(ctx->statemachine);
1045}
1046
1047/* Set the current line number. */
1048void htmlparser_set_line_number(htmlparser_ctx *ctx, int line) {
1049 statemachine_set_line_number(ctx->statemachine, line);
1050}
1051
1052/* Return the current column number. */
1053int htmlparser_get_column_number(htmlparser_ctx *ctx) {
1054 return statemachine_get_column_number(ctx->statemachine);
1055}
1056
1057/* Set the current column number. */
1058void htmlparser_set_column_number(htmlparser_ctx *ctx, int column) {
1059 statemachine_set_column_number(ctx->statemachine, column);
1060}
1061
1062/* Retrieve a human readable error message in case an error occurred.
1063 *
1064 * NULL is returned if the parser didn't encounter an error.
1065 */
1066const char *htmlparser_get_error_msg(htmlparser_ctx *ctx) {
1067 return statemachine_get_error_msg(ctx->statemachine);
1068}
1069
1070/* Invoked by the caller when text is expanded by the caller.
1071 */
1072int htmlparser_insert_text(htmlparser_ctx *ctx)
1073{
1074 /* TODO(falmeida): Generalize and use a table for these values. */
1075
1076 if (statemachine_get_state(ctx->statemachine) == HTMLPARSER_STATE_INT_VALUE) {
1077 statemachine_set_state(ctx->statemachine, HTMLPARSER_STATE_INT_VALUE_TEXT);
1078 }
1079 return 1;
1080}
1081
1082/* Deallocates an htmlparser context object.
1083 */
1084void htmlparser_delete(htmlparser_ctx *ctx)
1085{
1086 assert(ctx != NULL);
1087 statemachine_definition_delete(ctx->statemachine_def);
1088 statemachine_delete(ctx->statemachine);
1089 jsparser_delete(ctx->jsparser);
1090 entityfilter_delete(ctx->entityfilter);
1091 free(ctx);
1092}
1093
1094#ifdef __cplusplus
} /* namespace ctemplate_htmlparser */
1096#endif