Blame - src/htmlparser/jsparser.cc - RealtimeRoboticsGroup/test

blob: 15be35d2e7ee9a20090e4448a6754abd312bb316 [file] [log] [blame]

Brian Silverman	70325d6	2015-09-20 17:00:43 -0400	[diff] [blame]	1	/* Copyright (c) 2007, Google Inc.
				2	* All rights reserved.
				3	*
				4	* Redistribution and use in source and binary forms, with or without
				5	* modification, are permitted provided that the following conditions are
				6	* met:
				7	*
				8	* * Redistributions of source code must retain the above copyright
				9	* notice, this list of conditions and the following disclaimer.
				10	* * Redistributions in binary form must reproduce the above
				11	* copyright notice, this list of conditions and the following disclaimer
				12	* in the documentation and/or other materials provided with the
				13	* distribution.
				14	* * Neither the name of Google Inc. nor the names of its
				15	* contributors may be used to endorse or promote products derived from
				16	* this software without specific prior written permission.
				17	*
				18	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				19	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				20	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				21	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				22	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				23	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				24	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				25	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				26	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				27	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				28	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				29	* ---
				30	*
				31	* Author: falmeida@google.com (Filipe Almeida)
				32	*/
				33
				34	#include <stdio.h>
				35	#include <stdlib.h>
				36	#include <string.h>
				37	#include <assert.h>
				38
				39	#include "htmlparser/statemachine.h"
				40	#include "htmlparser/jsparser.h"
				41
				42	/* So we can support both C and C++ compilers, we use the CAST() macro instead
				43	* of using C style casts or static_cast<>() directly.
				44	*/
				45	#ifdef __cplusplus
				46	#define CAST(type, expression) (static_cast<type>(expression))
				47	#else
				48	#define CAST(type, expression) ((type)(expression))
				49	#endif
				50
				51	#ifdef __cplusplus
				52	namespace ctemplate_htmlparser {
				53	#endif /* __cplusplus */
				54
				55	/* Generated state machine definition. */
				56	#include "htmlparser/jsparser_fsm.h"
				57
				58	/* List of keywords that can precede a regular expression literal. Taken from:
				59	* http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html
				60	*
				61	* 'void' was added to this list.
				62	* The list is used as input to a binary search, so it must be kept in a sorted
				63	* form.
				64	* There are a large number of keywords in here that don't exist in
				65	* ecmascript 3, either because they are reserved or because they are part of
				66	* ecmascript 4. However they weren't removed in order to keep the list in sync
				67	* with the previous document.
				68	*/
				69	static const char *regexp_token_prefix[] = {
				70	"abstract",
				71	"break",
				72	"case",
				73	"catch",
				74	"class",
				75	"const",
				76	"continue",
				77	"debugger",
				78	"default",
				79	"delete",
				80	"do",
				81	"else",
				82	"enum",
				83	"eval",
				84	"export",
				85	"extends",
				86	"field",
				87	"final",
				88	"finally",
				89	"for",
				90	"function",
				91	"goto",
				92	"if",
				93	"implements",
				94	"import",
				95	"in",
				96	"instanceof",
				97	"native",
				98	"new",
				99	"package",
				100	"private",
				101	"protected",
				102	"public",
				103	"return",
				104	"static",
				105	"switch",
				106	"synchronized",
				107	"throw",
				108	"throws",
				109	"transient",
				110	"try",
				111	"typeof",
				112	"var",
				113	"void",
				114	"volatile",
				115	"while",
				116	"with"
				117	};
				118
				119	/* Utility functions */
				120
				121	/* Converts the internal state into the external superstate.
				122	*/
				123	static inline int state_external(int state)
				124	{
				125	assert(state < JSPARSER_NUM_STATES);
				126	assert(state >= 0);
				127	return jsparser_states_external[state];
				128	}
				129
				130	/* Returns true if the character is an ecmascript whitespace or line terminator
				131	* character. Includes the characters in sections 7.2 and 7.3 of ECMAScript 3
				132	* with the exception of unicode space and line terminators:
				133	* http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
				134	*/
				135	static inline int js_is_whitespace(char c)
				136	{
				137	return c == '\t' \|\| /* Tab 0x09 */
				138	c == '\v' \|\| /* Vertical Tab 0x0B */
				139	c == '\f' \|\| /* Form Feed 0x0C */
				140	c == ' ' \|\| /* Space 0x20 */
				141	c == '\xa0' \|\| /* No-Break Space 0xA0 */
				142	c == '\n' \|\| /* line Feed 0x0A */
				143	c == '\r'; /* Carriage Return 0x0D */
				144	}
				145
				146	/* Returns true if the character is part of a javascript identifier. The rules
				147	* are pretty relaxed here since we don't accept unicode and don't care if the
				148	* first character doesn't contain numbers or not.
				149	*
				150	* For more detail on the limitations of having this relaxed set of characters
				151	* please see the comments in_state_js_text().
				152	*/
				153	static inline int js_is_identifier(char c) {
				154	return (c >= 'a' && c <= 'z') \|\|
				155	(c >= 'A' && c <= 'Z') \|\|
				156	(c >= '0' && c <= '9') \|\|
				157	c == '_' \|\|
				158	c == '$';
				159	}
				160
				161	/* Appends a character to the ring buffer. Sequences of whitespace and newlines
				162	* are folded into one.
				163	*
				164	* js->buffer_start points to the first character in the buffer and
				165	* js->buffer_end points to the position of the next character to be written,
				166	* or one plus the last character written. If the buffer is full there will be
				167	* an empty slot position so we can diferentiate an empty buffer from a full
				168	* buffer.
				169	*
				170	* If the buffer is empty, then:
				171	* js->buffer_start == js->buffer_end.
				172	* If the buffer is full, then:
				173	* (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE == js->buffer_start.
				174	*
				175	*/
				176	void jsparser_buffer_append_chr(jsparser_ctx *js, char chr)
				177	{
				178	/* Fold whitespace so we have enough space in the buffer. */
				179	if (js_is_whitespace(chr) &&
				180	js_is_whitespace(jsparser_buffer_get(js, -1))) {
				181	return;
				182	}
				183
				184	js->buffer[js->buffer_end] = chr;
				185	js->buffer_end = (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE;
				186	if (js->buffer_end == js->buffer_start) {
				187	js->buffer_start = (js->buffer_end + 1) %
				188	JSPARSER_RING_BUFFER_SIZE;
				189	}
				190	}
				191
				192	/* Appends a string to the ring buffer. Sequences of whitespace and newlines
				193	* are folded into one.
				194	*/
				195	void jsparser_buffer_append_str(jsparser_ctx js, const char str)
				196	{
				197	assert(js != NULL);
				198	assert(str != NULL);
				199
				200	while(*str != '\0') {
				201	jsparser_buffer_append_chr(js, *str++);
				202	}
				203	}
				204
				205	/* Returns the position relative to the start of the buffer or -1 if past the
				206	* size of the buffer..
				207	*/
				208	static inline int jsparser_buffer_absolute_pos(jsparser_ctx *js, int pos)
				209	{
				210	int absolute_pos;
				211	int buffer_len;
				212	assert(pos < 0);
				213
				214	if(pos <= -JSPARSER_RING_BUFFER_SIZE) {
				215	return -1;
				216	}
				217
				218	buffer_len = js->buffer_end - js->buffer_start;
				219	if (buffer_len < 0) {
				220	buffer_len += JSPARSER_RING_BUFFER_SIZE;
				221	}
				222
				223	if (pos < -buffer_len) {
				224	return -1;
				225	}
				226
				227	absolute_pos = (pos + js->buffer_end) % JSPARSER_RING_BUFFER_SIZE;
				228	if (absolute_pos < 0) {
				229	absolute_pos += JSPARSER_RING_BUFFER_SIZE;
				230	}
				231
				232	return absolute_pos;
				233	}
				234
				235	/* Returns the last appended character and removes it from the buffer. If the
				236	* buffer is empty, then it returns ASCII 0 ('\0').
				237	*/
				238	char jsparser_buffer_pop(jsparser_ctx *js)
				239	{
				240	if (js->buffer_start == js->buffer_end) {
				241	return '\0';
				242	}
				243
				244	js->buffer_end--;
				245	if (js->buffer_end < 0) {
				246	js->buffer_end += JSPARSER_RING_BUFFER_SIZE;
				247	}
				248
				249	return js->buffer[js->buffer_end];
				250	}
				251
				252	/* Returns the value of the character at a certain index in the buffer or an
				253	* ASCII 0 ('\0') character if the index is outside the buffer boundaries.
				254	*
				255	* Index positions are negative, were -1 is the last character appended to the
				256	* buffer. Using positive integers for the index will result in undefined
				257	* behaviour.
				258	*/
				259	char jsparser_buffer_get(jsparser_ctx *js, int pos)
				260	{
				261	int absolute_pos;
				262	assert(pos < 0);
				263
				264	absolute_pos = jsparser_buffer_absolute_pos(js, pos);
				265	if (absolute_pos < 0) {
				266	return '\0';
				267	}
				268
				269	return js->buffer[absolute_pos];
				270	}
				271
				272	/* Sets the value of the character at a certain index in the buffer. Returns
				273	* true if the write was successful or false if there was an attempt to write
				274	* outside of the buffer boundaries.
				275	*
				276	* Index positions are negative, were -1 is the last character appended to the
				277	* buffer. Using positive integers for the index will result in undefined
				278	* behaviour.
				279	*/
				280	int jsparser_buffer_set(jsparser_ctx *js, int pos, char value)
				281	{
				282	int absolute_pos;
				283	assert(pos < 0);
				284
				285	absolute_pos = jsparser_buffer_absolute_pos(js, pos);
				286	if (absolute_pos < 0) {
				287	return 0;
				288	}
				289
				290	js->buffer[absolute_pos] = value;
				291	return 1;
				292	}
				293
				294	/* Copies a slice of the buffer to the string pointed to by output. start and
				295	* end are the indexes of the sliced region. If start extends beyond the
				296	* beginning of the buffer, the slice will only contain character from the
				297	* beginning of the buffer.
				298	*/
				299	void jsparser_buffer_slice(jsparser_ctx js, char output, int start, int end)
				300	{
				301	int pos;
				302
				303	assert(start <= end);
				304	assert(start < 0);
				305	assert(end < 0);
				306
				307	for (pos = start; pos <= end; ++pos) {
				308	char c;
				309	c = jsparser_buffer_get(js, pos);
				310	if (c != '\0') {
				311	*output++ = jsparser_buffer_get(js, pos);
				312	}
				313	}
				314	*output++ = '\0';
				315	}
				316
				317	/* Copy the last javascript identifier or keyword found in the buffer to the
				318	* string pointed by identifier.
				319	*
				320	* For simplicity, we consider an identifier to be a sequence of alphanumeric
				321	* characters (as opposed to a digit followed by an alphanumeric character.
				322	*
				323	* Returns 0 if no identifier was matched, in which case the identifier
				324	* argument is replaced with an empty string, or non 0 if the identifier was
				325	* found.
				326	*/
				327	int jsparser_buffer_last_identifier(jsparser_ctx js, char identifier)
				328	{
				329	int end;
				330	int pos;
				331
				332	assert(identifier != NULL);
				333
				334	end = -1;
				335	/* Ignore the optional whitespace delimiter */
				336	if (js_is_whitespace(jsparser_buffer_get(js, -1))) {
				337	--end;
				338	}
				339
				340	/* Find the beginning of the identifier. This loop ends either when we find a
				341	* character that doesn't belong to an identifier, or when we find a '\0'
				342	* character, which means we reached the end of the buffer. */
				343	for(pos = end; js_is_identifier(jsparser_buffer_get(js, pos)); --pos) {
				344	}
				345	if (pos + 1 >= end) {
				346	identifier[0] = '\0';
				347	return 0;
				348	}
				349
				350	jsparser_buffer_slice(js, identifier, pos + 1, end);
				351	return 1;
				352	}
				353
				354	/* Callback used in bsearch() for comparing a string against an array of
				355	* strings.
				356	*/
				357	static int bsearch_strcmp(const void a, const void b)
				358	{
				359	return strcmp(CAST(const char, a), CAST(const char * const *, b));
				360	}
				361
				362	/* Returns true if the token argument can be a token prefix to a javascript
				363	* regular expression.
				364	*
				365	* The token argument is compared against a list of identifiers that can
				366	* precede a regular expression in the javascript grammar, and returns true if
				367	* the argument is found on that list.
				368	*/
				369	static inline int is_regexp_token_prefix(char *token)
				370	{
				371	assert(token != NULL);
				372
				373	return bsearch(token,
				374	regexp_token_prefix,
				375	sizeof(regexp_token_prefix) / sizeof(char *),
				376	sizeof(char *), bsearch_strcmp) != NULL;
				377	}
				378
				379	/* Called for every character in state text.
				380	*
				381	* We copy every character we find when we are in state text to the ring
				382	* buffer. This has the side effect of also pushing slash characters that are
				383	* part of comments into the buffer, although for parsing purposes these should
				384	* be treated as whitespace. This issue is addressed in
				385	* enter_state_js_comment_ml_after().
				386	*/
				387	static void in_state_js_text(statemachine_ctx *ctx, int start, char chr,
				388	int end)
				389	{
				390	jsparser_ctx js = CAST(jsparser_ctx , ctx->user);
				391	assert(js != NULL);
				392
				393	jsparser_buffer_append_chr(js, chr);
				394	}
				395
				396	/* This function is called every time we find a slash ('/') character in the
				397	* javascript text (except for slashes that close comments or regexp literals).
				398	*
				399	* Implements the logic to figure out if this slash character is a division
				400	* operator or if it opens a regular expression literal. This is heavily
				401	* inspired by the syntactic resynchronization for javascript 2.0:
				402	* http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html
				403	*
				404	* When we receive a '/', we look at the previous non space character to figure
				405	* out if it's the ending of a punctuator that can precede a regexp literal, in
				406	* which case we assume the current '/' is part of a regular expression literal
				407	* (or the opening of a javascript comment, but that part is dealt with in the
				408	* state machine). The exceptions to this are unary operators, so we look back
				409	* a second character to rule out '++' and '--'. Although it is not
				410	* straightforward to figure out if the binary operator is a postfix of the
				411	* previous expression or a prefix of the regular expression, we rule out the
				412	* later as it is an uncommon practice.
				413	*
				414	* If we ruled out the previous token to be a valid regexp preceding
				415	* punctuator, we extract the last identifier in the buffer and match against a
				416	* list of keywords that are known to precede expressions in the grammar. If we
				417	* get a match on any of these keywords, then we are opening a regular
				418	* expression, if not, then we have a division operator.
				419	*
				420	* Known cases that are accepted by the grammar but we handle differently,
				421	* although I don't believe there is a legitimate usage for those:
				422	*
				423	* Division of a regular expression:
				424	* var result = /test/ / 5;
				425	*
				426	* Prefix unary increment of a regular expression:
				427	* var result = ++/test/;
				428	*
				429	* Division of an object literal:
				430	* { a: 1 } /x/.exec('x');
				431	*
				432	* We only support ascii right now, so unicode characters in identifiers will
				433	* be treated as delimiters, effectively breaking the identifier name where
				434	* they appear, and this may cause issues in very specific situations. Namely,
				435	* if we have a unicode character in an identifier directly preceding a suffix
				436	* that matches one of the keywords in regexp_token_prefix[], if this
				437	* identifier precedes a / (slash) character:
				438	*
				439	* var x = test<unicode char>return / 5;
				440	*
				441	* We will interpret that slash as the start of a regular expression, when in
				442	* reality it is a division operator.
				443	*/
				444	static void enter_state_js_slash(statemachine_ctx *ctx, int start, char chr,
				445	int end)
				446	{
				447	jsparser_ctx *js;
				448	char buffer[JSPARSER_RING_BUFFER_SIZE];
				449	int pos;
				450
				451	assert(ctx != NULL);
				452	assert(ctx->user != NULL);
				453
				454	js = CAST(jsparser_ctx *, ctx->user);
				455	assert(js != NULL);
				456
				457	pos = -1;
				458	/* Consume the last whitespace. */
				459	if (js_is_whitespace(jsparser_buffer_get(js, pos))) {
				460	--pos;
				461	}
				462
				463	switch (jsparser_buffer_get(js, pos)) {
				464	/* Ignore unary increment */
				465	case '+':
				466	if (jsparser_buffer_get(js, pos - 1) != '+') {
				467	ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
				468	}
				469	break;
				470
				471	/* Ignore unary decrement */
				472	case '-':
				473	if (jsparser_buffer_get(js, pos - 1) != '-') {
				474	ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
				475	}
				476	break;
				477
				478	/* List of punctuator endings except ), ], }, + and - */
				479	case '=':
				480	case '<':
				481	case '>':
				482	case '&':
				483	case '\|':
				484	case '!':
				485	case '%':
				486	case '*':
				487	case '/':
				488	case ',':
				489	case ';':
				490	case '?':
				491	case ':':
				492	case '^':
				493	case '~':
				494	case '{':
				495	case '(':
				496	case '[':
				497	case '}':
				498	case '\0':
				499	ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
				500	break;
				501
				502	default:
				503	if (jsparser_buffer_last_identifier(js, buffer) &&
				504	is_regexp_token_prefix(buffer)) {
				505	ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
				506	}
				507	}
				508
				509	jsparser_buffer_append_chr(js, chr);
				510	}
				511
				512	/* Called at the end of a javascript comment.
				513	*
				514	* When we open a comment, the initial '/' was inserted into the ring buffer,
				515	* but it is not a token and should be considered whitespace for parsing
				516	* purposes.
				517	*
				518	* When we first saw the '/' character, we didn't yet know if it was the
				519	* beginning of a comment, a division operator, or a regexp.
				520	*
				521	* In this function we just replace the inital '/' with a whitespace character,
				522	* unless we had a preceding whitespace character, in which case we just remove
				523	* the '/'. This is needed to ensure all spaces in the buffer are correctly
				524	* folded.
				525	*/
				526	static void enter_state_js_comment_after(statemachine_ctx *ctx, int start,
				527	char chr, int end)
				528	{
				529	jsparser_ctx *js;
				530
				531	assert(ctx != NULL);
				532	assert(ctx->user != NULL);
				533
				534	js = CAST(jsparser_ctx *, ctx->user);
				535
				536	if (js_is_whitespace(jsparser_buffer_get(js, -2))) {
				537	(void)jsparser_buffer_pop(js);
				538	} else {
				539	jsparser_buffer_set(js, -1, ' ');
				540	}
				541	}
				542
				543	static statemachine_definition *create_statemachine_definition()
				544	{
				545	statemachine_definition *def;
				546	def = statemachine_definition_new(JSPARSER_NUM_STATES);
				547	if (def == NULL)
				548	return NULL;
				549
				550	/* TODO(falmeida): Check return value. */
				551	statemachine_definition_populate(def, jsparser_state_transitions,
				552	jsparser_states_internal_names);
				553
				554	statemachine_in_state(def, JSPARSER_STATE_INT_JS_TEXT,
				555	in_state_js_text);
				556	statemachine_enter_state(def, JSPARSER_STATE_INT_JS_SLASH,
				557	enter_state_js_slash);
				558	statemachine_enter_state(def, JSPARSER_STATE_INT_JS_COMMENT_AFTER,
				559	enter_state_js_comment_after);
				560
				561	return def;
				562	}
				563
				564
				565	/* Initializes a new jsparser instance.
				566	*
				567	* Returns a pointer to the new instance or NULL if the initialization
				568	* fails.
				569	*
				570	* Initialization failure is fatal, and if this function fails it may not
				571	* deallocate all previsouly allocated memory.
				572	*/
				573
				574	jsparser_ctx *jsparser_new()
				575	{
				576	jsparser_ctx *js;
				577
				578	js = CAST(jsparser_ctx *, calloc(1, sizeof(jsparser_ctx)));
				579	if (js == NULL)
				580	return NULL;
				581
				582	js->statemachine_def = create_statemachine_definition();
				583	if (js->statemachine_def == NULL)
				584	return NULL;
				585
				586	js->statemachine = statemachine_new(js->statemachine_def, js);
				587	if (js->statemachine == NULL)
				588	return NULL;
				589
				590	jsparser_reset(js);
				591
				592	return js;
				593	}
				594
				595	/* Returns a pointer to a context which is a duplicate of the jsparser src.
				596	*/
				597	jsparser_ctx jsparser_duplicate(jsparser_ctx src)
				598	{
				599	jsparser_ctx *dst;
				600	assert(src != NULL);
				601
				602	dst = jsparser_new();
				603	if (dst == NULL)
				604	return NULL;
				605
				606	jsparser_copy(dst, src);
				607
				608	return dst;
				609	}
				610
				611	/* Copies the context of the jsparser pointed to by src to the jsparser dst.
				612	*
				613	* The state machine definition is preserved since it is read only.
				614	*/
				615	void jsparser_copy(jsparser_ctx dst, jsparser_ctx src)
				616	{
				617
				618	dst->buffer_start = src->buffer_start;
				619	dst->buffer_end = src->buffer_end;
				620	memcpy(dst->buffer, src->buffer, sizeof(src->buffer));
				621
				622	statemachine_copy(dst->statemachine,
				623	src->statemachine,
				624	dst->statemachine_def,
				625	dst);
				626
				627	}
				628
				629	void jsparser_reset(jsparser_ctx *ctx)
				630	{
				631	assert(ctx != NULL);
				632	ctx->statemachine->current_state = 0;
				633	ctx->buffer_start = 0;
				634	ctx->buffer_end = 0;
				635	}
				636
				637	int jsparser_state(jsparser_ctx *ctx)
				638	{
				639	return state_external(ctx->statemachine->current_state);
				640	}
				641
				642	int jsparser_parse(jsparser_ctx ctx, const char str, int size)
				643	{
				644	int internal_state;
				645	internal_state = statemachine_parse(ctx->statemachine, str, size);
				646	return state_external(internal_state);
				647	}
				648
				649	void jsparser_delete(jsparser_ctx *ctx)
				650	{
				651	assert(ctx != NULL);
				652	statemachine_delete(ctx->statemachine);
				653	statemachine_definition_delete(ctx->statemachine_def);
				654	free(ctx);
				655	}
				656
				657	#ifdef __cplusplus
				658	} /* namespace security_streamhtmlparser */
				659	#endif /* __cplusplus */