blob: 15be35d2e7ee9a20090e4448a6754abd312bb316 [file] [log] [blame]
Brian Silverman70325d62015-09-20 17:00:43 -04001/* Copyright (c) 2007, Google Inc.
2 * All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 * ---
30 *
31 * Author: falmeida@google.com (Filipe Almeida)
32 */
33
34#include <stdio.h>
35#include <stdlib.h>
36#include <string.h>
37#include <assert.h>
38
39#include "htmlparser/statemachine.h"
40#include "htmlparser/jsparser.h"
41
42/* So we can support both C and C++ compilers, we use the CAST() macro instead
43 * of using C style casts or static_cast<>() directly.
44 */
45#ifdef __cplusplus
46 #define CAST(type, expression) (static_cast<type>(expression))
47#else
48 #define CAST(type, expression) ((type)(expression))
49#endif
50
51#ifdef __cplusplus
52namespace ctemplate_htmlparser {
53#endif /* __cplusplus */
54
55/* Generated state machine definition. */
56#include "htmlparser/jsparser_fsm.h"
57
58/* List of keywords that can precede a regular expression literal. Taken from:
59 * http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html
60 *
61 * 'void' was added to this list.
62 * The list is used as input to a binary search, so it must be kept in a sorted
63 * form.
64 * There are a large number of keywords in here that don't exist in
65 * ecmascript 3, either because they are reserved or because they are part of
66 * ecmascript 4. However they weren't removed in order to keep the list in sync
67 * with the previous document.
68 */
69static const char *regexp_token_prefix[] = {
70 "abstract",
71 "break",
72 "case",
73 "catch",
74 "class",
75 "const",
76 "continue",
77 "debugger",
78 "default",
79 "delete",
80 "do",
81 "else",
82 "enum",
83 "eval",
84 "export",
85 "extends",
86 "field",
87 "final",
88 "finally",
89 "for",
90 "function",
91 "goto",
92 "if",
93 "implements",
94 "import",
95 "in",
96 "instanceof",
97 "native",
98 "new",
99 "package",
100 "private",
101 "protected",
102 "public",
103 "return",
104 "static",
105 "switch",
106 "synchronized",
107 "throw",
108 "throws",
109 "transient",
110 "try",
111 "typeof",
112 "var",
113 "void",
114 "volatile",
115 "while",
116 "with"
117};
118
119/* Utility functions */
120
121/* Converts the internal state into the external superstate.
122 */
123static inline int state_external(int state)
124{
125 assert(state < JSPARSER_NUM_STATES);
126 assert(state >= 0);
127 return jsparser_states_external[state];
128}
129
130/* Returns true if the character is an ecmascript whitespace or line terminator
131 * character. Includes the characters in sections 7.2 and 7.3 of ECMAScript 3
132 * with the exception of unicode space and line terminators:
133 * http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
134 */
135static inline int js_is_whitespace(char c)
136{
137 return c == '\t' || /* Tab 0x09 */
138 c == '\v' || /* Vertical Tab 0x0B */
139 c == '\f' || /* Form Feed 0x0C */
140 c == ' ' || /* Space 0x20 */
141 c == '\xa0' || /* No-Break Space 0xA0 */
142 c == '\n' || /* line Feed 0x0A */
143 c == '\r'; /* Carriage Return 0x0D */
144}
145
146/* Returns true if the character is part of a javascript identifier. The rules
147 * are pretty relaxed here since we don't accept unicode and don't care if the
148 * first character doesn't contain numbers or not.
149 *
150 * For more detail on the limitations of having this relaxed set of characters
151 * please see the comments in_state_js_text().
152 */
153static inline int js_is_identifier(char c) {
154 return (c >= 'a' && c <= 'z') ||
155 (c >= 'A' && c <= 'Z') ||
156 (c >= '0' && c <= '9') ||
157 c == '_' ||
158 c == '$';
159}
160
161/* Appends a character to the ring buffer. Sequences of whitespace and newlines
162 * are folded into one.
163 *
164 * js->buffer_start points to the first character in the buffer and
165 * js->buffer_end points to the position of the next character to be written,
166 * or one plus the last character written. If the buffer is full there will be
167 * an empty slot position so we can diferentiate an empty buffer from a full
168 * buffer.
169 *
170 * If the buffer is empty, then:
171 * js->buffer_start == js->buffer_end.
172 * If the buffer is full, then:
173 * (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE == js->buffer_start.
174 *
175 */
176void jsparser_buffer_append_chr(jsparser_ctx *js, char chr)
177{
178 /* Fold whitespace so we have enough space in the buffer. */
179 if (js_is_whitespace(chr) &&
180 js_is_whitespace(jsparser_buffer_get(js, -1))) {
181 return;
182 }
183
184 js->buffer[js->buffer_end] = chr;
185 js->buffer_end = (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE;
186 if (js->buffer_end == js->buffer_start) {
187 js->buffer_start = (js->buffer_end + 1) %
188 JSPARSER_RING_BUFFER_SIZE;
189 }
190}
191
192/* Appends a string to the ring buffer. Sequences of whitespace and newlines
193 * are folded into one.
194 */
195void jsparser_buffer_append_str(jsparser_ctx *js, const char *str)
196{
197 assert(js != NULL);
198 assert(str != NULL);
199
200 while(*str != '\0') {
201 jsparser_buffer_append_chr(js, *str++);
202 }
203}
204
205/* Returns the position relative to the start of the buffer or -1 if past the
206 * size of the buffer..
207 */
208static inline int jsparser_buffer_absolute_pos(jsparser_ctx *js, int pos)
209{
210 int absolute_pos;
211 int buffer_len;
212 assert(pos < 0);
213
214 if(pos <= -JSPARSER_RING_BUFFER_SIZE) {
215 return -1;
216 }
217
218 buffer_len = js->buffer_end - js->buffer_start;
219 if (buffer_len < 0) {
220 buffer_len += JSPARSER_RING_BUFFER_SIZE;
221 }
222
223 if (pos < -buffer_len) {
224 return -1;
225 }
226
227 absolute_pos = (pos + js->buffer_end) % JSPARSER_RING_BUFFER_SIZE;
228 if (absolute_pos < 0) {
229 absolute_pos += JSPARSER_RING_BUFFER_SIZE;
230 }
231
232 return absolute_pos;
233}
234
235/* Returns the last appended character and removes it from the buffer. If the
236 * buffer is empty, then it returns ASCII 0 ('\0').
237 */
238char jsparser_buffer_pop(jsparser_ctx *js)
239{
240 if (js->buffer_start == js->buffer_end) {
241 return '\0';
242 }
243
244 js->buffer_end--;
245 if (js->buffer_end < 0) {
246 js->buffer_end += JSPARSER_RING_BUFFER_SIZE;
247 }
248
249 return js->buffer[js->buffer_end];
250}
251
252/* Returns the value of the character at a certain index in the buffer or an
253 * ASCII 0 ('\0') character if the index is outside the buffer boundaries.
254 *
255 * Index positions are negative, were -1 is the last character appended to the
256 * buffer. Using positive integers for the index will result in undefined
257 * behaviour.
258 */
259char jsparser_buffer_get(jsparser_ctx *js, int pos)
260{
261 int absolute_pos;
262 assert(pos < 0);
263
264 absolute_pos = jsparser_buffer_absolute_pos(js, pos);
265 if (absolute_pos < 0) {
266 return '\0';
267 }
268
269 return js->buffer[absolute_pos];
270}
271
272/* Sets the value of the character at a certain index in the buffer. Returns
273 * true if the write was successful or false if there was an attempt to write
274 * outside of the buffer boundaries.
275 *
276 * Index positions are negative, were -1 is the last character appended to the
277 * buffer. Using positive integers for the index will result in undefined
278 * behaviour.
279 */
280int jsparser_buffer_set(jsparser_ctx *js, int pos, char value)
281{
282 int absolute_pos;
283 assert(pos < 0);
284
285 absolute_pos = jsparser_buffer_absolute_pos(js, pos);
286 if (absolute_pos < 0) {
287 return 0;
288 }
289
290 js->buffer[absolute_pos] = value;
291 return 1;
292}
293
294/* Copies a slice of the buffer to the string pointed to by output. start and
295 * end are the indexes of the sliced region. If start extends beyond the
296 * beginning of the buffer, the slice will only contain character from the
297 * beginning of the buffer.
298 */
299void jsparser_buffer_slice(jsparser_ctx *js, char *output, int start, int end)
300{
301 int pos;
302
303 assert(start <= end);
304 assert(start < 0);
305 assert(end < 0);
306
307 for (pos = start; pos <= end; ++pos) {
308 char c;
309 c = jsparser_buffer_get(js, pos);
310 if (c != '\0') {
311 *output++ = jsparser_buffer_get(js, pos);
312 }
313 }
314 *output++ = '\0';
315}
316
317/* Copy the last javascript identifier or keyword found in the buffer to the
318 * string pointed by identifier.
319 *
320 * For simplicity, we consider an identifier to be a sequence of alphanumeric
321 * characters (as opposed to a digit followed by an alphanumeric character.
322 *
323 * Returns 0 if no identifier was matched, in which case the identifier
324 * argument is replaced with an empty string, or non 0 if the identifier was
325 * found.
326 */
327int jsparser_buffer_last_identifier(jsparser_ctx *js, char *identifier)
328{
329 int end;
330 int pos;
331
332 assert(identifier != NULL);
333
334 end = -1;
335 /* Ignore the optional whitespace delimiter */
336 if (js_is_whitespace(jsparser_buffer_get(js, -1))) {
337 --end;
338 }
339
340 /* Find the beginning of the identifier. This loop ends either when we find a
341 * character that doesn't belong to an identifier, or when we find a '\0'
342 * character, which means we reached the end of the buffer. */
343 for(pos = end; js_is_identifier(jsparser_buffer_get(js, pos)); --pos) {
344 }
345 if (pos + 1 >= end) {
346 identifier[0] = '\0';
347 return 0;
348 }
349
350 jsparser_buffer_slice(js, identifier, pos + 1, end);
351 return 1;
352}
353
354/* Callback used in bsearch() for comparing a string against an array of
355 * strings.
356 */
357static int bsearch_strcmp(const void *a, const void *b)
358{
359 return strcmp(CAST(const char*, a), *CAST(const char * const *, b));
360}
361
362/* Returns true if the token argument can be a token prefix to a javascript
363 * regular expression.
364 *
365 * The token argument is compared against a list of identifiers that can
366 * precede a regular expression in the javascript grammar, and returns true if
367 * the argument is found on that list.
368 */
369static inline int is_regexp_token_prefix(char *token)
370{
371 assert(token != NULL);
372
373 return bsearch(token,
374 regexp_token_prefix,
375 sizeof(regexp_token_prefix) / sizeof(char *),
376 sizeof(char *), bsearch_strcmp) != NULL;
377}
378
379/* Called for every character in state text.
380 *
381 * We copy every character we find when we are in state text to the ring
382 * buffer. This has the side effect of also pushing slash characters that are
383 * part of comments into the buffer, although for parsing purposes these should
384 * be treated as whitespace. This issue is addressed in
385 * enter_state_js_comment_ml_after().
386 */
387static void in_state_js_text(statemachine_ctx *ctx, int start, char chr,
388 int end)
389{
390 jsparser_ctx *js = CAST(jsparser_ctx *, ctx->user);
391 assert(js != NULL);
392
393 jsparser_buffer_append_chr(js, chr);
394}
395
396/* This function is called every time we find a slash ('/') character in the
397 * javascript text (except for slashes that close comments or regexp literals).
398 *
399 * Implements the logic to figure out if this slash character is a division
400 * operator or if it opens a regular expression literal. This is heavily
401 * inspired by the syntactic resynchronization for javascript 2.0:
402 * http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html
403 *
404 * When we receive a '/', we look at the previous non space character to figure
405 * out if it's the ending of a punctuator that can precede a regexp literal, in
406 * which case we assume the current '/' is part of a regular expression literal
407 * (or the opening of a javascript comment, but that part is dealt with in the
408 * state machine). The exceptions to this are unary operators, so we look back
409 * a second character to rule out '++' and '--'. Although it is not
410 * straightforward to figure out if the binary operator is a postfix of the
411 * previous expression or a prefix of the regular expression, we rule out the
412 * later as it is an uncommon practice.
413 *
414 * If we ruled out the previous token to be a valid regexp preceding
415 * punctuator, we extract the last identifier in the buffer and match against a
416 * list of keywords that are known to precede expressions in the grammar. If we
417 * get a match on any of these keywords, then we are opening a regular
418 * expression, if not, then we have a division operator.
419 *
420 * Known cases that are accepted by the grammar but we handle differently,
421 * although I don't believe there is a legitimate usage for those:
422 *
423 * Division of a regular expression:
424 * var result = /test/ / 5;
425 *
426 * Prefix unary increment of a regular expression:
427 * var result = ++/test/;
428 *
429 * Division of an object literal:
430 * { a: 1 } /x/.exec('x');
431 *
432 * We only support ascii right now, so unicode characters in identifiers will
433 * be treated as delimiters, effectively breaking the identifier name where
434 * they appear, and this may cause issues in very specific situations. Namely,
435 * if we have a unicode character in an identifier directly preceding a suffix
436 * that matches one of the keywords in regexp_token_prefix[], if this
437 * identifier precedes a / (slash) character:
438 *
439 * var x = test<unicode char>return / 5;
440 *
441 * We will interpret that slash as the start of a regular expression, when in
442 * reality it is a division operator.
443 */
444static void enter_state_js_slash(statemachine_ctx *ctx, int start, char chr,
445 int end)
446{
447 jsparser_ctx *js;
448 char buffer[JSPARSER_RING_BUFFER_SIZE];
449 int pos;
450
451 assert(ctx != NULL);
452 assert(ctx->user != NULL);
453
454 js = CAST(jsparser_ctx *, ctx->user);
455 assert(js != NULL);
456
457 pos = -1;
458 /* Consume the last whitespace. */
459 if (js_is_whitespace(jsparser_buffer_get(js, pos))) {
460 --pos;
461 }
462
463 switch (jsparser_buffer_get(js, pos)) {
464 /* Ignore unary increment */
465 case '+':
466 if (jsparser_buffer_get(js, pos - 1) != '+') {
467 ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
468 }
469 break;
470
471 /* Ignore unary decrement */
472 case '-':
473 if (jsparser_buffer_get(js, pos - 1) != '-') {
474 ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
475 }
476 break;
477
478 /* List of punctuator endings except ), ], }, + and - */
479 case '=':
480 case '<':
481 case '>':
482 case '&':
483 case '|':
484 case '!':
485 case '%':
486 case '*':
487 case '/':
488 case ',':
489 case ';':
490 case '?':
491 case ':':
492 case '^':
493 case '~':
494 case '{':
495 case '(':
496 case '[':
497 case '}':
498 case '\0':
499 ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
500 break;
501
502 default:
503 if (jsparser_buffer_last_identifier(js, buffer) &&
504 is_regexp_token_prefix(buffer)) {
505 ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
506 }
507 }
508
509 jsparser_buffer_append_chr(js, chr);
510}
511
512/* Called at the end of a javascript comment.
513 *
514 * When we open a comment, the initial '/' was inserted into the ring buffer,
515 * but it is not a token and should be considered whitespace for parsing
516 * purposes.
517 *
518 * When we first saw the '/' character, we didn't yet know if it was the
519 * beginning of a comment, a division operator, or a regexp.
520 *
521 * In this function we just replace the inital '/' with a whitespace character,
522 * unless we had a preceding whitespace character, in which case we just remove
523 * the '/'. This is needed to ensure all spaces in the buffer are correctly
524 * folded.
525 */
526static void enter_state_js_comment_after(statemachine_ctx *ctx, int start,
527 char chr, int end)
528{
529 jsparser_ctx *js;
530
531 assert(ctx != NULL);
532 assert(ctx->user != NULL);
533
534 js = CAST(jsparser_ctx *, ctx->user);
535
536 if (js_is_whitespace(jsparser_buffer_get(js, -2))) {
537 (void)jsparser_buffer_pop(js);
538 } else {
539 jsparser_buffer_set(js, -1, ' ');
540 }
541}
542
543static statemachine_definition *create_statemachine_definition()
544{
545 statemachine_definition *def;
546 def = statemachine_definition_new(JSPARSER_NUM_STATES);
547 if (def == NULL)
548 return NULL;
549
550 /* TODO(falmeida): Check return value. */
551 statemachine_definition_populate(def, jsparser_state_transitions,
552 jsparser_states_internal_names);
553
554 statemachine_in_state(def, JSPARSER_STATE_INT_JS_TEXT,
555 in_state_js_text);
556 statemachine_enter_state(def, JSPARSER_STATE_INT_JS_SLASH,
557 enter_state_js_slash);
558 statemachine_enter_state(def, JSPARSER_STATE_INT_JS_COMMENT_AFTER,
559 enter_state_js_comment_after);
560
561 return def;
562}
563
564
565/* Initializes a new jsparser instance.
566 *
567 * Returns a pointer to the new instance or NULL if the initialization
568 * fails.
569 *
570 * Initialization failure is fatal, and if this function fails it may not
571 * deallocate all previsouly allocated memory.
572 */
573
574jsparser_ctx *jsparser_new()
575{
576 jsparser_ctx *js;
577
578 js = CAST(jsparser_ctx *, calloc(1, sizeof(jsparser_ctx)));
579 if (js == NULL)
580 return NULL;
581
582 js->statemachine_def = create_statemachine_definition();
583 if (js->statemachine_def == NULL)
584 return NULL;
585
586 js->statemachine = statemachine_new(js->statemachine_def, js);
587 if (js->statemachine == NULL)
588 return NULL;
589
590 jsparser_reset(js);
591
592 return js;
593}
594
595/* Returns a pointer to a context which is a duplicate of the jsparser src.
596 */
597jsparser_ctx *jsparser_duplicate(jsparser_ctx *src)
598{
599 jsparser_ctx *dst;
600 assert(src != NULL);
601
602 dst = jsparser_new();
603 if (dst == NULL)
604 return NULL;
605
606 jsparser_copy(dst, src);
607
608 return dst;
609}
610
611/* Copies the context of the jsparser pointed to by src to the jsparser dst.
612 *
613 * The state machine definition is preserved since it is read only.
614 */
615void jsparser_copy(jsparser_ctx *dst, jsparser_ctx *src)
616{
617
618 dst->buffer_start = src->buffer_start;
619 dst->buffer_end = src->buffer_end;
620 memcpy(dst->buffer, src->buffer, sizeof(src->buffer));
621
622 statemachine_copy(dst->statemachine,
623 src->statemachine,
624 dst->statemachine_def,
625 dst);
626
627}
628
629void jsparser_reset(jsparser_ctx *ctx)
630{
631 assert(ctx != NULL);
632 ctx->statemachine->current_state = 0;
633 ctx->buffer_start = 0;
634 ctx->buffer_end = 0;
635}
636
637int jsparser_state(jsparser_ctx *ctx)
638{
639 return state_external(ctx->statemachine->current_state);
640}
641
642int jsparser_parse(jsparser_ctx *ctx, const char *str, int size)
643{
644 int internal_state;
645 internal_state = statemachine_parse(ctx->statemachine, str, size);
646 return state_external(internal_state);
647}
648
649void jsparser_delete(jsparser_ctx *ctx)
650{
651 assert(ctx != NULL);
652 statemachine_delete(ctx->statemachine);
653 statemachine_definition_delete(ctx->statemachine_def);
654 free(ctx);
655}
656
657#ifdef __cplusplus
658} /* namespace security_streamhtmlparser */
659#endif /* __cplusplus */