Blame - third_party/ctemplate/src/template_modifiers.cc - RealtimeRoboticsGroup/test

blob: 4e35281dcb17f467579db2f81969b412ae0ada05 [file] [log] [blame]

Brian Silverman	8ab8a65	2015-09-21 17:49:11 -0400	[diff] [blame^]	1	// Copyright (c) 2007, Google Inc.
				2	// All rights reserved.
				3	//
				4	// Redistribution and use in source and binary forms, with or without
				5	// modification, are permitted provided that the following conditions are
				6	// met:
				7	//
				8	// * Redistributions of source code must retain the above copyright
				9	// notice, this list of conditions and the following disclaimer.
				10	// * Redistributions in binary form must reproduce the above
				11	// copyright notice, this list of conditions and the following disclaimer
				12	// in the documentation and/or other materials provided with the
				13	// distribution.
				14	// * Neither the name of Google Inc. nor the names of its
				15	// contributors may be used to endorse or promote products derived from
				16	// this software without specific prior written permission.
				17	//
				18	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				19	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				20	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				21	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				22	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				23	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				24	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				25	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				26	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				27	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				28	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				29
				30	// ---
				31	// Author: csilvers@google.com (Craig Silverstein)
				32	//
				33	// template_modifiers.h has a description of what each escape-routine does.
				34	//
				35	// When creating a new modifier, you must subclass TemplateModifier
				36	// and define your own Modify() method. This method takes the string
				37	// to be modified as a char*/int pair. It then emits the modified
				38	// version of the string to outbuf. Outbuf is an ExpandEmitter, as
				39	// defined in template_modifiers.h. It's a very simple type that
				40	// supports appending to a data stream.
				41	//
				42	// Be very careful editing an existing modifier. Subtle changes can
				43	// introduce the possibility for cross-site scripting attacks. If you
				44	// do change a modifier, be careful that it does not affect
				45	// the list of Safe XSS Alternatives.
				46	//
				47
				48	#include <config.h>
				49	#include <stdlib.h>
				50	#include <assert.h>
				51	#include <string.h>
				52	#include <string>
				53	#include <vector>
				54	#include "htmlparser/htmlparser_cpp.h"
				55	#include <ctemplate/template_modifiers.h>
				56	#include "template_modifiers_internal.h"
				57	#include <ctemplate/per_expand_data.h>
				58	using std::string;
				59	using std::vector;
				60
// Length of a string literal, not counting the terminating NUL, computed
// at compile time.  The surrounding ""s make the macro fail to compile
// when 's' is not a string literal.
#define strliterallen(s) (sizeof("" s "") - 1)

// Really we should be using uint16_t or something, but this is good
// enough, and more portable...  Note that, despite the name, this is a
// plain unsigned int and so is not guaranteed to be exactly 16 bits wide.
typedef unsigned int uint16;
				66
				67	namespace URL {
				68	bool HasInsecureProtocol(const char* in, int inlen) {
				69	if (inlen > strliterallen("http://") &&
				70	strncasecmp(in, "http://", strliterallen("http://")) == 0) {
				71	return false; // We're ok, it's an http protocol
				72	}
				73	if (inlen > strliterallen("https://") &&
				74	strncasecmp(in, "https://", strliterallen("https://")) == 0) {
				75	return false; // https is ok as well
				76	}
				77	if (inlen > strliterallen("ftp://") &&
				78	strncasecmp(in, "ftp://", strliterallen("ftp://")) == 0) {
				79	return false; // and ftp
				80	}
				81	return true;
				82	}
				83	} // namespace URL
				84
				85	namespace ctemplate {
				86
				87	using ctemplate_htmlparser::HtmlParser;
				88
				89	// A most-efficient way to append a string literal to the var named 'out'.
				90	// The ""s ensure literal is actually a string literal
				91	#define APPEND(literal) out->Emit("" literal "", sizeof(literal)-1)
				92
				93	// Check whether the string of length len is identical to the literal.
				94	// The ""s ensure literal is actually a string literal
				95	#define STR_IS(str, len, literal) \
				96	((len) == sizeof("" literal "") - 1 && \
				97	memcmp(str, literal, sizeof("" literal "") - 1) == 0)
				98
				99	TemplateModifier::~TemplateModifier() {}
				100
				101	void NullModifier::Modify(const char* in, size_t inlen,
				102	const PerExpandData*,
				103	ExpandEmitter* out, const string& arg) const {
				104	out->Emit(in, inlen);
				105	}
				106	NullModifier null_modifier;
				107
				108	static inline void EmitRun(const char* start, const char* limit,
				109	ExpandEmitter* out) {
				110	if (start < limit) {
				111	out->Emit(start, (limit - start));
				112	}
				113	}
				114
				115	void HtmlEscape::Modify(const char* in, size_t inlen,
				116	const PerExpandData*,
				117	ExpandEmitter* out, const string& arg) const {
				118	const char* pos = in;
				119	const char* start = pos;
				120	const char* const limit = in + inlen;
				121	while (pos < limit) {
				122	switch (*pos) {
				123	default:
				124	// Increment our counter and look at the next character.
				125	++pos;
				126	continue;
				127
				128	case '&': EmitRun(start, pos, out); APPEND("&"); break;
				129	case '"': EmitRun(start, pos, out); APPEND("""); break;
				130	case '\'': EmitRun(start, pos, out); APPEND("'"); break;
				131	case '<': EmitRun(start, pos, out); APPEND("<"); break;
				132	case '>': EmitRun(start, pos, out); APPEND(">"); break;
				133
				134	case '\r': case '\n': case '\v': case '\f': case '\t':
				135	EmitRun(start, pos, out); APPEND(" "); break;
				136	}
				137	start = ++pos;
				138	}
				139	EmitRun(start, pos, out);
				140	}
				141	HtmlEscape html_escape;
				142
				143	void PreEscape::Modify(const char* in, size_t inlen,
				144	const PerExpandData*,
				145	ExpandEmitter* out, const string& arg) const {
				146	const char* pos = in;
				147	const char* start = pos;
				148	const char* const limit = in + inlen;
				149	while (pos < limit) {
				150	switch (*pos) {
				151	default:
				152	// Increment our counter and look at the next character.
				153	++pos;
				154	continue;
				155
				156	// Unlike HtmlEscape, we leave whitespace as is.
				157	case '&': EmitRun(start, pos, out); APPEND("&"); break;
				158	case '"': EmitRun(start, pos, out); APPEND("""); break;
				159	case '\'': EmitRun(start, pos, out); APPEND("'"); break;
				160	case '<': EmitRun(start, pos, out); APPEND("<"); break;
				161	case '>': EmitRun(start, pos, out); APPEND(">"); break;
				162	}
				163	start = ++pos;
				164	}
				165	EmitRun(start, pos, out);
				166	}
				167	PreEscape pre_escape;
				168
				169	// We encode the presence and ordering of unclosed tags in a string, using the
				170	// letters b, i, s, and e to stand for <b>, <i>, <span>, and <em> respectively.
				171	// The most recently opened tag is appended onto the end of the string, so in
				172	// the common case of properly nested tags, we need only look at the last
				173	// character. If we don't find it there, we need to continue looking at
				174	// everything until we find it, because tags may not necessarily be in order.
				175	// Similarly, when we add a tag, we need to check each existing tag for a match
				176	// so that we don't nest.
				177	class UnclosedSnippetTags {
				178	public:
				179	// We could use ordinary ints for the enum values, but using mnemonic
				180	// characters potentially makes debugging easier.
				181	typedef enum {
				182	TAG_B = 'b',
				183	TAG_I = 'i',
				184	TAG_EM = 'e',
				185	TAG_SPAN = 's',
				186	} Tag;
				187
				188	UnclosedSnippetTags() : tag_length(0) {
				189	memset(tags, 0, 5);
				190	}
				191
				192	// Adds a tag to the set of open tags if it's not already open, or otherwise
				193	// return false.
				194	inline bool MaybeAdd(Tag tag) {
				195	if (strchr(tags, tag)) {
				196	return false;
				197	} else {
				198	tags[tag_length++] = tag;
				199	return true;
				200	}
				201	}
				202
				203	// Removes a tag from the set of open tags if it's open, or otherwise return
				204	// false.
				205	inline bool MaybeRemove(Tag tag) {
				206	char* tag_location = strchr(tags, tag);
				207	if (tag_location) {
				208	for (char* c = tag_location; *c; ++c) {
				209	// Have to copy all later tags down by one so we don't leave a gap in the
				210	// array.
				211	c = (c + 1);
				212	}
				213	--tag_length;
				214	return true;
				215	} else {
				216	return false;
				217	}
				218	}
				219
				220	inline void PrintClosingTags(ExpandEmitter* out) {
				221	for (int i = tag_length; i >= 0; --i) {
				222	switch (tags[i]) {
				223	case TAG_B:
				224	out->Emit("</b>"); break;
				225	case TAG_I:
				226	out->Emit("</i>"); break;
				227	case TAG_EM:
				228	out->Emit("</em>"); break;
				229	case TAG_SPAN:
				230	out->Emit("</span>"); break;
				231	}
				232	}
				233	}
				234
				235	private:
				236	char tags[5];
				237	int tag_length;
				238	};
				239
				240	void SnippetEscape::Modify(const char* in, size_t inlen,
				241	const PerExpandData*,
				242	ExpandEmitter* out, const string& arg) const {
				243	UnclosedSnippetTags unclosed;
				244	const char* pos = in;
				245	const char* start = pos;
				246	const char* const limit = in + inlen;
				247	while (pos < limit) {
				248	switch (*pos) {
				249	default:
				250	// Increment our counter and look at the next character.
				251	++pos;
				252	continue;
				253
				254	case '<': {
				255	// If there is a permissible tag, just advance pos past it to
				256	// make it part of the current run. Notice the use of
				257	// "continue" below.
				258	const char* const next_pos = pos + 1;
				259	const int chars_left = limit - next_pos;
				260	if ((chars_left >= 2) && !memcmp(next_pos, "b>", 2)
				261	&& unclosed.MaybeAdd(UnclosedSnippetTags::TAG_B)) {
				262	pos += strliterallen("<b>");
				263	continue;
				264	} else if ((chars_left >= 2) && !memcmp(next_pos, "i>", 2)
				265	&& unclosed.MaybeAdd(UnclosedSnippetTags::TAG_I)) {
				266	pos += strliterallen("<i>");
				267	continue;
				268	} else if ((chars_left >= 3) && !memcmp(next_pos, "em>", 3)
				269	&& unclosed.MaybeAdd(UnclosedSnippetTags::TAG_EM)) {
				270	pos += strliterallen("<em>");
				271	continue;
				272	} else if ((chars_left >= 13) && !memcmp(next_pos, "span dir=", 9)
				273	&& (!memcmp(next_pos + 9, "ltr>", 4) \|\|
				274	!memcmp(next_pos + 9, "rtl>", 4))
				275	&& unclosed.MaybeAdd(UnclosedSnippetTags::TAG_SPAN)) {
				276	pos += strliterallen("<span dir=ltr>");
				277	continue;
				278	} else if ((chars_left >= 3) && !memcmp(next_pos, "/b>", 3)
				279	&& unclosed.MaybeRemove(UnclosedSnippetTags::TAG_B)) {
				280	pos += strliterallen("</b>");
				281	continue;
				282	} else if ((chars_left >= 3) && !memcmp(next_pos, "/i>", 3)
				283	&& unclosed.MaybeRemove(UnclosedSnippetTags::TAG_I)) {
				284	pos += strliterallen("</i>");
				285	continue;
				286	} else if ((chars_left >= 4) && !memcmp(next_pos, "/em>", 4)
				287	&& unclosed.MaybeRemove(UnclosedSnippetTags::TAG_EM)) {
				288	pos += strliterallen("</em>");
				289	continue;
				290	} else if ((chars_left >= 6) && !memcmp(next_pos, "/span>", 6)
				291	&& unclosed.MaybeRemove(UnclosedSnippetTags::TAG_SPAN)) {
				292	pos += strliterallen("</span>");
				293	continue;
				294	} else if ((chars_left >= 3) && !memcmp(next_pos, "br>", 3)) {
				295	pos += strliterallen("<br>");
				296	continue;
				297	} else if ((chars_left >= 4) && !memcmp(next_pos, "wbr>", 4)) {
				298	pos += strliterallen("<wbr>");
				299	continue;
				300	}
				301
				302	// Emit the entity and break out of the switch.
				303	EmitRun(start, pos, out);
				304	APPEND("<");
				305	break;
				306	}
				307
				308	case '&':
				309	EmitRun(start, pos, out);
				310	if (pos + 1 < limit && pos[1] == '{') {
				311	// Could be a javascript entity, so we need to escape.
				312	// (Javascript entities are an xss risk in Netscape 4.)
				313	APPEND("&");
				314	} else {
				315	APPEND("&");
				316	}
				317	break;
				318
				319	case '"': EmitRun(start, pos, out); APPEND("""); break;
				320	case '\'': EmitRun(start, pos, out); APPEND("'"); break;
				321	case '>': EmitRun(start, pos, out); APPEND(">"); break;
				322
				323	case '\r': case '\n': case '\v': case '\f': case '\t':
				324	// non-space whitespace
				325	EmitRun(start, pos, out); APPEND(" "); break;
				326
				327	}
				328	start = ++pos;
				329	}
				330	EmitRun(start, pos, out);
				331	unclosed.PrintClosingTags(out);
				332	}
				333	SnippetEscape snippet_escape;
				334
				335	void CleanseAttribute::Modify(const char* in, size_t inlen,
				336	const PerExpandData*,
				337	ExpandEmitter* out, const string& arg) const {
				338	for (size_t i = 0; i < inlen; ++i) {
				339	char c = in[i];
				340	switch (c) {
				341	case '=': {
				342	if (i == 0 \|\| i == (inlen - 1))
				343	out->Emit('_');
				344	else
				345	out->Emit(c);
				346	break;
				347	}
				348	case '-':
				349	case '.':
				350	case '_':
				351	case ':': {
				352	out->Emit(c);
				353	break;
				354	}
				355	default: {
				356	if ((c >= 'a' && c <= 'z') \|\|
				357	(c >= 'A' && c <= 'Z') \|\|
				358	(c >= '0' && c <= '9')) {
				359	out->Emit(c);
				360	} else {
				361	APPEND("_");
				362	}
				363	break;
				364	}
				365	}
				366	}
				367	}
				368	CleanseAttribute cleanse_attribute;
				369
				370	void CleanseCss::Modify(const char* in, size_t inlen,
				371	const PerExpandData*,
				372	ExpandEmitter* out, const string& arg) const {
				373	for (size_t i = 0; i < inlen; ++i) {
				374	char c = in[i];
				375	switch (c) {
				376	case ' ':
				377	case '_':
				378	case '.':
				379	case ',':
				380	case '!':
				381	case '#':
				382	case '%':
				383	case '-': {
				384	out->Emit(c);
				385	break;
				386	}
				387	default: {
				388	if ((c >= 'a' && c <= 'z') \|\|
				389	(c >= 'A' && c <= 'Z') \|\|
				390	(c >= '0' && c <= '9')) {
				391	out->Emit(c);
				392	}
				393	break;
				394	}
				395	}
				396	}
				397	}
				398	CleanseCss cleanse_css;
				399
				400	// CssUrlEscape is used as a chained modifier by ValidateUrl
				401	// (validate_url_and_css_escape) and is not directly exposed.
				402	class CssUrlEscape : public TemplateModifier {
				403	public:
				404	virtual void Modify(const char* in, size_t inlen,
				405	const PerExpandData, ExpandEmitter outbuf,
				406	const string& arg) const;
				407	};
				408
				409	// URL-encodes the characters [\n\r\\'"()<>*] to ensure the URL can be safely
				410	// inserted in a CSS context, e.g:
				411	// . In an '@import url("URL");' statement
				412	// . In a CSS property such as 'background: url("URL");'
				413	// In both locations above, enclosing quotes are optional but parens are not.
				414	// We want to make sure the URL cannot exit the parens enclosure, close a
				415	// STYLE tag or reset the browser's CSS parser (via comments or newlines).
				416	//
				417	// References:
				418	// . CSS 2.1 URLs: http://www.w3.org/TR/CSS21/syndata.html#url
				419	// . CSS 1 URLs: http://www.w3.org/TR/REC-CSS1/#url
				420	void CssUrlEscape::Modify(const char* in, size_t inlen,
				421	const PerExpandData*,
				422	ExpandEmitter* out, const string& arg) const {
				423	for (size_t i = 0; i < inlen; ++i) {
				424	char c = in[i];
				425	switch (c) {
				426	case '\n': APPEND("%0A"); break;
				427	case '\r': APPEND("%0D"); break;
				428	case '"': APPEND("%22"); break;
				429	case '\'': APPEND("%27"); break;
				430	case '(': APPEND("%28"); break;
				431	case ')': APPEND("%29"); break;
				432	case '*': APPEND("%2A"); break;
				433	case '<': APPEND("%3C"); break;
				434	case '>': APPEND("%3E"); break;
				435	case '\\': APPEND("%5C"); break;
				436	default: out->Emit(c); break;
				437	}
				438	}
				439	}
				440	CssUrlEscape css_url_escape;
				441
				442	// These URLs replace unsafe URLs for :U and :I url-escaping modes.
				443	const char* const ValidateUrl::kUnsafeUrlReplacement = "#";
				444	const char* const ValidateUrl::kUnsafeImgSrcUrlReplacement =
				445	"/images/cleardot.gif";
				446
				447	void ValidateUrl::Modify(const char* in, size_t inlen,
				448	const PerExpandData* per_expand_data,
				449	ExpandEmitter* out, const string& arg) const {
				450	const char* slashpos = (char*)memchr(in, '/', inlen);
				451	if (slashpos == NULL) {
				452	slashpos = in + inlen;
				453	}
				454	const void* colonpos = memchr(in, ':', slashpos - in);
				455	// colon before first slash, could be a protocol
				456	if (colonpos != NULL && URL::HasInsecureProtocol(in, inlen)) {
				457	// It's a bad protocol, so return something safe
				458	chained_modifier_.Modify(unsafe_url_replacement_,
				459	unsafe_url_replacement_length_,
				460	per_expand_data,
				461	out,
				462	"");
				463	return;
				464	}
				465	// If we get here, it's a valid url, so just escape it
				466	chained_modifier_.Modify(in, inlen, per_expand_data, out, "");
				467	}
				468	ValidateUrl validate_url_and_html_escape(
				469	html_escape,
				470	ValidateUrl::kUnsafeUrlReplacement);
				471	ValidateUrl validate_url_and_javascript_escape(
				472	javascript_escape,
				473	ValidateUrl::kUnsafeUrlReplacement);
				474	ValidateUrl validate_url_and_css_escape(
				475	css_url_escape,
				476	ValidateUrl::kUnsafeUrlReplacement);
				477	ValidateUrl validate_img_src_url_and_html_escape(
				478	html_escape,
				479	ValidateUrl::kUnsafeImgSrcUrlReplacement);
				480	ValidateUrl validate_img_src_url_and_javascript_escape(
				481	javascript_escape,
				482	ValidateUrl::kUnsafeImgSrcUrlReplacement);
				483	ValidateUrl validate_img_src_url_and_css_escape(
				484	css_url_escape,
				485	ValidateUrl::kUnsafeImgSrcUrlReplacement);
				486
				487	void XmlEscape::Modify(const char* in, size_t inlen,
				488	const PerExpandData*,
				489	ExpandEmitter* out, const string& arg) const {
				490	const char* pos = in;
				491	const char* start = pos;
				492	const char* const limit = in + inlen;
				493	while (pos < limit) {
				494	char ch = *pos;
				495
				496	// According to section 2.2 of the spec
				497	// http://www.w3.org/TR/REC-xml/#charsets control characters in range
				498	// 0x00-0x1F (except \t, \r and \n) are not valid XML characters. In
				499	// particular, conformant parsers are allowed to die when encountering a FF
				500	// char in PCDATA sections. These chars are replaced by a space.
				501	if (ch >= 0x00 && ch < 0x20 && ch != '\t' && ch != '\r' && ch != '\n') {
				502	EmitRun(start, pos, out);
				503	out->Emit(' ');
				504	start = ++pos;
				505	continue;
				506	}
				507
				508	switch (ch) {
				509	default:
				510	// Increment our counter and look at the next character.
				511	++pos;
				512	continue;
				513
				514	case '&': EmitRun(start, pos, out); APPEND("&"); break;
				515	case '"': EmitRun(start, pos, out); APPEND("""); break;
				516	case '\'': EmitRun(start, pos, out); APPEND("'"); break;
				517	case '<': EmitRun(start, pos, out); APPEND("<"); break;
				518	case '>': EmitRun(start, pos, out); APPEND(">"); break;
				519	}
				520	start = ++pos;
				521	}
				522	EmitRun(start, pos, out);
				523	}
				524	XmlEscape xml_escape;
				525
// This table maps initial characters to code lengths.  This could be
// done with a 16-byte table and a shift, but there's a substantial
// performance increase by eliminating the shift.
//
// Layout, indexed by the value of the first byte:
//   0x00-0xBF -> 1
//   0xC0-0xDF -> 2
//   0xE0-0xEF -> 3
//   0xF0-0xFF -> 1
static const char kCodeLengths[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
				550
				551	// Returns the UTF-8 code-unit starting at start, or the special codepoint
				552	// 0xFFFD if the input ends abruptly or is not well-formed UTF-8.
				553	// start -- address of the start of the code unit which also receives the
				554	// address past the end of the code unit returned.
				555	// end -- exclusive end of the string
				556	static inline uint16 UTF8CodeUnit(const char** start, const char *end) {
				557	// Use kCodeLengths table to calculate the length of the code unit
				558	// from the first character.
				559	unsigned char first_char = static_cast<unsigned char>(**start);
				560	size_t code_unit_len = kCodeLengths[first_char];
				561	if (code_unit_len == 1) {
				562	// Return the current byte as a codepoint.
				563	// Either it is a valid single byte codepoint, or it's not part of a valid
				564	// UTF-8 sequence, and so has to be handled individually.
				565	++*start;
				566	return first_char;
				567	}
				568	const char code_unit_end = start + code_unit_len;
				569	if (code_unit_end < *start \|\| code_unit_end > end) { // Truncated code unit.
				570	++*start;
				571	return 0xFFFDU;
				572	}
				573	const char* pos = *start;
				574	uint16 code_unit = *pos & (0xFFU >> code_unit_len);
				575	while (--code_unit_len) {
				576	uint16 tail_byte = *(++pos);
				577	if ((tail_byte & 0xC0U) != 0x80U) { // Malformed code unit.
				578	++*start;
				579	return 0xFFFDU;
				580	}
				581	code_unit = (code_unit << 6) \| (tail_byte & 0x3FU);
				582	}
				583	*start = code_unit_end;
				584	return code_unit;
				585	}
				586
				587	// A good reference is the ECMA standard (3rd ed), section 7.8.4:
				588	// http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
				589	void JavascriptEscape::Modify(const char* in, size_t inlen,
				590	const PerExpandData*,
				591	ExpandEmitter* out, const string& arg) const {
				592	const char* pos = in;
				593	const char* start = pos;
				594	const char* const limit = in + inlen;
				595
				596	if (limit < in) { return; }
				597
				598	while (pos < limit) {
				599	const char* next_pos = pos;
				600	uint16 code_unit = UTF8CodeUnit(&next_pos, limit);
				601
				602	// Test for 16-bit values outside the switch below, because gcc
				603	// will emit chained branches rather than a jump table for such a
				604	// wide range of values.
				605	if (code_unit & 0xFF00) {
				606	// Linebreaks according to EcmaScript 262 which cannot appear in strings.
				607	if (code_unit == 0x2028) {
				608	// Line separator
				609	EmitRun(start, pos, out); APPEND("\\u2028");
				610	} else if (code_unit == 0x2029) {
				611	// Paragraph separator
				612	EmitRun(start, pos, out); APPEND("\\u2029");
				613	} else {
				614	pos = next_pos;
				615	continue;
				616	}
				617	} else {
				618	switch (code_unit) {
				619	default:
				620	// Increment our counter and look at the next character.
				621	pos = next_pos;
				622	continue;
				623
				624	case '\0': EmitRun(start, pos, out); APPEND("\\x00"); break;
				625	case '"': EmitRun(start, pos, out); APPEND("\\x22"); break;
				626	case '\'': EmitRun(start, pos, out); APPEND("\\x27"); break;
				627	case '\\': EmitRun(start, pos, out); APPEND("\\\\"); break;
				628	case '\t': EmitRun(start, pos, out); APPEND("\\t"); break;
				629	case '\r': EmitRun(start, pos, out); APPEND("\\r"); break;
				630	case '\n': EmitRun(start, pos, out); APPEND("\\n"); break;
				631	case '\b': EmitRun(start, pos, out); APPEND("\\b"); break;
				632	case '\f': EmitRun(start, pos, out); APPEND("\\f"); break;
				633	case '&': EmitRun(start, pos, out); APPEND("\\x26"); break;
				634	case '<': EmitRun(start, pos, out); APPEND("\\x3c"); break;
				635	case '>': EmitRun(start, pos, out); APPEND("\\x3e"); break;
				636	case '=': EmitRun(start, pos, out); APPEND("\\x3d"); break;
				637
				638	case '\v':
				639	// Do not escape vertical tabs to "\\v" since it is interpreted as 'v'
				640	// by JScript according to section 2.1 of
				641	// http://wiki.ecmascript.org/lib/exe/fetch.php?
				642	// id=resources%3Aresources&cache=cache&
				643	// media=resources:jscriptdeviationsfromes3.pdf
				644	EmitRun(start, pos, out); APPEND("\\x0b"); break;
				645	}
				646	}
				647	start = pos = next_pos;
				648	}
				649	EmitRun(start, pos, out);
				650	}
				651	JavascriptEscape javascript_escape;
				652
				653
				654	void JavascriptNumber::Modify(const char* in, size_t inlen,
				655	const PerExpandData*,
				656	ExpandEmitter* out, const string& arg) const {
				657	if (inlen == 0)
				658	return;
				659
				660	if (STR_IS(in, inlen, "true") \|\| STR_IS(in, inlen, "false")) {
				661	out->Emit(in, inlen);
				662	return;
				663	}
				664
				665	bool valid = true;
				666	if (in[0] == '0' && inlen > 2 && (in[1] == 'x' \|\| in[1] == 'X')) {
				667	// There must be at least one hex digit after the 0x for it to be valid.
				668	// Hex number. Check that it is of the form 0(x\|X)[0-9A-Fa-f]+
				669	for (size_t i = 2; i < inlen; i++) {
				670	char c = in[i];
				671	if (!((c >= 'a' && c <= 'f') \|\|
				672	(c >= 'A' && c <= 'F') \|\|
				673	(c >= '0' && c <= '9'))) {
				674	valid = false;
				675	break;
				676	}
				677	}
				678	} else {
				679	// Must be a base-10 (or octal) number.
				680	// Check that it has the form [0-9+-.eE]+
				681	for (size_t i = 0; i < inlen; i++) {
				682	char c = in[i];
				683	if (!((c >= '0' && c <= '9') \|\|
				684	c == '+' \|\| c == '-' \|\| c == '.' \|\|
				685	c == 'e' \|\| c == 'E')) {
				686	valid = false;
				687	break;
				688	}
				689	}
				690	}
				691	if (valid) {
				692	out->Emit(in, inlen); // Number was valid, output it.
				693	} else {
				694	APPEND("null"); // Number was not valid, output null instead.
				695	}
				696	}
				697	JavascriptNumber javascript_number;
				698
				699	static inline bool IsUrlQueryEscapeSafeChar(unsigned char c) {
				700	// Everything not matching [0-9a-zA-Z.,_*/~!()-] is escaped.
				701	static unsigned long _safe_characters[8] = {
				702	0x00000000L, 0x03fff702L, 0x87fffffeL, 0x47fffffeL,
				703	0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L
				704	};
				705
				706	return (_safe_characters[(c)>>5] & (1 << ((c) & 31)));
				707	}
				708
				709	void UrlQueryEscape::Modify(const char* in, size_t inlen,
				710	const PerExpandData*,
				711	ExpandEmitter* out, const string& arg) const {
				712	const char* pos = in;
				713	const char* const limit = in + inlen;
				714	while (true) {
				715	// Peel off any initial runs of safe characters and emit them all
				716	// at once.
				717	const char* start = pos;
				718	while (pos < limit && IsUrlQueryEscapeSafeChar(*pos)) {
				719	pos++;
				720	}
				721	EmitRun(start, pos, out);
				722
				723	// Now deal with a single unsafe character.
				724	if (pos < limit) {
				725	unsigned char c = *pos;
				726	if (c == ' ') {
				727	out->Emit('+');
				728	} else {
				729	out->Emit('%');
				730	out->Emit(((c>>4) < 10 ? ((c>>4) + '0') : (((c>>4) - 10) + 'A')));
				731	out->Emit(((c&0xf) < 10 ? ((c&0xf) + '0') : (((c&0xf) - 10) + 'A')));
				732	}
				733	pos++;
				734	} else {
				735	// We're done!
				736	break;
				737	}
				738	}
				739	}
				740	UrlQueryEscape url_query_escape;
				741
				742	// For more information on escaping JSON, see section 2.5 in
				743	// http://www.ietf.org/rfc/rfc4627.txt.
				744	// Escaping '&', '<', '>' is optional in the JSON proposed RFC
				745	// but alleviates concerns with content sniffing if JSON is used
				746	// in a context where the browser may attempt to interpret HTML.
				747	void JsonEscape::Modify(const char* in, size_t inlen,
				748	const PerExpandData*,
				749	ExpandEmitter* out, const string& arg) const {
				750	const char* pos = in;
				751	const char* start = pos;
				752	const char* const limit = in + inlen;
				753	while (pos < limit) {
				754	switch (*pos) {
				755	default:
				756	// Increment our counter and look at the next character.
				757	++pos;
				758	continue;
				759
				760	case '"': EmitRun(start, pos, out); APPEND("\\\""); break;
				761	case '\\': EmitRun(start, pos, out); APPEND("\\\\"); break;
				762	case '/': EmitRun(start, pos, out); APPEND("\\/"); break;
				763	case '\b': EmitRun(start, pos, out); APPEND("\\b"); break;
				764	case '\f': EmitRun(start, pos, out); APPEND("\\f"); break;
				765	case '\n': EmitRun(start, pos, out); APPEND("\\n"); break;
				766	case '\r': EmitRun(start, pos, out); APPEND("\\r"); break;
				767	case '\t': EmitRun(start, pos, out); APPEND("\\t"); break;
				768	case '&': EmitRun(start, pos, out); APPEND("\\u0026"); break;
				769	case '<': EmitRun(start, pos, out); APPEND("\\u003C"); break;
				770	case '>': EmitRun(start, pos, out); APPEND("\\u003E"); break;
				771	}
				772	start = ++pos;
				773	}
				774	EmitRun(start, pos, out);
				775	}
				776	JsonEscape json_escape;
				777
				778	void PrefixLine::Modify(const char* in, size_t inlen,
				779	const PerExpandData*,
				780	ExpandEmitter* out, const string& arg) const {
				781	while (inlen > 0) {
				782	const char* nl = (const char*)memchr(in, '\n', inlen);
				783	const char* cr = (const char*)memchr(in, '\r', nl ? nl - in : inlen);
				784	size_t linelen;
				785	if (nl == NULL && cr == NULL) {
				786	// We're at the last line
				787	out->Emit(in, inlen);
				788	break;
				789	} else {
				790	// One or both of \r and \n is set; point to the first char past
				791	// the newline. Note for \r\n, that's the char after the \n,
				792	// otherwise, it's the char past the \r or the \n we see.
				793	if ((nl == NULL) != (cr == NULL)) // one is set, the other is NULL
				794	linelen = (nl ? nl : cr) + 1 - in;
				795	else if (nl == cr + 1 \|\| nl < cr) // \r\n, or \n comes first
				796	linelen = nl + 1 - in;
				797	else
				798	linelen = cr + 1 - in;
				799	}
				800	out->Emit(in, linelen);
				801	out->Emit(arg); // a new line, so emit the prefix
				802	in += linelen;
				803	inlen -= linelen;
				804	assert(inlen >= 0);
				805	}
				806	}
				807	PrefixLine prefix_line;
				808
				809
				810	// Must be at least one more than the maximum number of alternative modifiers
				811	// specified in any given element of g_modifiers.
				812	# define MAX_SAFE_ALTERNATIVES 10 // If the compiler complains, increase it.
				813
				814	// Use the empty string if you want a modifier not to have a long-name.
				815	// Use '\0' if you want a modifier not to have a short-name.
				816	// Note: not all modifiers are in this array:
				817	// 1) SnippetEscape: use html_escape_with_arg=snippet to get this
				818	// 2) CleanseAttribute: use html_escape_with_arg=attribute to get this
				819	// 3) ValidateUrl: use html_escape_with_arg=url to get this
				820	//
				821	// Some modifiers define other modifiers that are safe replacements
				822	// from an XSS perspective. Replacements are not commutative so for
				823	// example H=pre considers H=attribute a safe replacement to it
				824	// but H=attribute has no safe replacements.
				825	// This struct is not pretty but allows the definitions to be
				826	// done without the need for a global initialization method.
				827	// Be very careful making a change to g_modifiers as modifiers
				828	// point to other ones within that same array so elements
				829	// may not be re-ordered easily. Also you need to change
				830	// the global g_am_dirs correspondingly.
				831	//
				832	static struct ModifierWithAlternatives {
				833	ModifierInfo modifier_info;
				834	ModifierInfo* safe_alt_mods[MAX_SAFE_ALTERNATIVES];
				835	} g_modifiers[] = {
				836	/* 0 */ { ModifierInfo("cleanse_css", 'c',
				837	XSS_WEB_STANDARD, &cleanse_css),
				838	{&g_modifiers[16].modifier_info, // url_escape_with_arg=css
				839	// img_src_url_escape_with_arg=css
				840	&g_modifiers[19].modifier_info} },
				841	/* 1 */ { ModifierInfo("html_escape", 'h',
				842	XSS_WEB_STANDARD, &html_escape),
				843	{&g_modifiers[2].modifier_info, // html_escape_with_arg=snippet
				844	&g_modifiers[3].modifier_info, // html_escape_with_arg=pre
				845	&g_modifiers[4].modifier_info, // html_escape_with_arg=attribute
				846	&g_modifiers[5].modifier_info, // html_escape_with_arg=url
				847	&g_modifiers[8].modifier_info, // pre_escape
				848	&g_modifiers[9].modifier_info, // url_query_escape
				849	&g_modifiers[11].modifier_info, // url_escape_with_arg=html
				850	&g_modifiers[12].modifier_info, // url_escape_with_arg=query
				851	// img_src_url_escape_with_arg=html
				852	&g_modifiers[18].modifier_info} },
				853	/* 2 */ { ModifierInfo("html_escape_with_arg=snippet", 'H',
				854	XSS_WEB_STANDARD, &snippet_escape),
				855	{&g_modifiers[1].modifier_info, // html_escape
				856	&g_modifiers[3].modifier_info, // html_escape_with_arg=pre
				857	&g_modifiers[4].modifier_info, // html_escape_with_arg=attribute
				858	&g_modifiers[8].modifier_info, // pre_escape
				859	&g_modifiers[9].modifier_info, // url_query_escape
				860	&g_modifiers[12].modifier_info} }, // url_escape_with_arg=query
				861	/* 3 */ { ModifierInfo("html_escape_with_arg=pre", 'H',
				862	XSS_WEB_STANDARD, &pre_escape),
				863	{&g_modifiers[1].modifier_info, // html_escape
				864	&g_modifiers[2].modifier_info, // html_escape_with_arg=snippet
				865	&g_modifiers[4].modifier_info, // html_escape_with_arg=attribute
				866	&g_modifiers[8].modifier_info, // pre_escape
				867	&g_modifiers[9].modifier_info, // url_query_escape
				868	&g_modifiers[12].modifier_info} }, // url_escape_with_arg=query
				869	/* 4 */ { ModifierInfo("html_escape_with_arg=attribute", 'H',
				870	XSS_WEB_STANDARD, &cleanse_attribute), {} },
				871	/* 5 */ { ModifierInfo("html_escape_with_arg=url", 'H',
				872	XSS_WEB_STANDARD, &validate_url_and_html_escape),
				873	// img_src_url_escape_with_arg=html
				874	{&g_modifiers[18].modifier_info} },
				875	/* 6 */ { ModifierInfo("javascript_escape", 'j',
				876	XSS_WEB_STANDARD, &javascript_escape),
				877	{&g_modifiers[7].modifier_info, // json_escape
				878	&g_modifiers[10].modifier_info, // url_escape_with_arg=javascript
				879	// img_src_url_escape_with_arg=javascript
				880	&g_modifiers[17].modifier_info} },
				881	/* 7 */ { ModifierInfo("json_escape", 'o', XSS_WEB_STANDARD, &json_escape),
				882	{&g_modifiers[6].modifier_info} }, // javascript_escape
				883	/* 8 */ { ModifierInfo("pre_escape", 'p', XSS_WEB_STANDARD, &pre_escape),
				884	{&g_modifiers[1].modifier_info, // html_escape
				885	&g_modifiers[2].modifier_info, // html_escape_with_arg=snippet
				886	&g_modifiers[3].modifier_info, // html_escape_with_arg=pre
				887	&g_modifiers[4].modifier_info, // html_escape_with_arg=attr...
				888	&g_modifiers[9].modifier_info, // url_query_escape
				889	&g_modifiers[12].modifier_info} }, // url_escape_with_arg=query
				890	/* 9 */ { ModifierInfo("url_query_escape", 'u',
				891	XSS_WEB_STANDARD, &url_query_escape), {} },
				892	/* 10 */ { ModifierInfo("url_escape_with_arg=javascript", 'U',
				893	XSS_WEB_STANDARD,
				894	&validate_url_and_javascript_escape),
				895	// img_src_url_escape_with_arg=javascript
				896	{&g_modifiers[17].modifier_info} },
				897	/* 11 */ { ModifierInfo("url_escape_with_arg=html", 'U',
				898	XSS_WEB_STANDARD, &validate_url_and_html_escape),
				899	// img_src_url_escape_with_arg=html
				900	{&g_modifiers[18].modifier_info} },
				901	/* 12 */ { ModifierInfo("url_escape_with_arg=query", 'U',
				902	XSS_WEB_STANDARD, &url_query_escape), {} },
				903	/* 13 */ { ModifierInfo("none", '\0', XSS_SAFE, &null_modifier), {} },
				904	/* 14 */ { ModifierInfo("xml_escape", '\0', XSS_WEB_STANDARD, &xml_escape),
				905	{&g_modifiers[1].modifier_info, // html_escape
				906	&g_modifiers[4].modifier_info,} }, // H=attribute
				907	/* 15 */ { ModifierInfo("javascript_escape_with_arg=number", 'J',
				908	XSS_WEB_STANDARD, &javascript_number), {} },
				909	/* 16 */ { ModifierInfo("url_escape_with_arg=css", 'U',
				910	XSS_WEB_STANDARD, &validate_url_and_css_escape), {} },
				911	/* 17 */ { ModifierInfo("img_src_url_escape_with_arg=javascript", 'I',
				912	XSS_WEB_STANDARD,
				913	&validate_img_src_url_and_javascript_escape), {} },
				914	/* 18 */ { ModifierInfo("img_src_url_escape_with_arg=html", 'I',
				915	XSS_WEB_STANDARD,
				916	&validate_img_src_url_and_html_escape), {} },
				917	/* 19 */ { ModifierInfo("img_src_url_escape_with_arg=css", 'I',
				918	XSS_WEB_STANDARD,
				919	&validate_img_src_url_and_css_escape), {} },
				920	};
				921
				922	static vector<const ModifierInfo*> g_extension_modifiers;
				923	static vector<const ModifierInfo*> g_unknown_modifiers;
				924
				925	// Returns whether or not candidate can be safely (w.r.t XSS)
				926	// used in lieu of our ModifierInfo. This is true iff:
				927	// 1. Both have the same modifier function OR
				928	// 2. Candidate's modifier function is in our ModifierInfo's
				929	// list (vector) of safe alternative modifier functions.
				930	//
				931	// This is used with the auto-escaping code, which automatically
				932	// figures out which modifier to apply to a variable based on the
				933	// variable's context (in an html "<A HREF", for instance). Some
				934	// built-in modifiers are considered safe alternatives from the perspective
				935	// of preventing XSS (cross-site-scripting) attacks, in which case
				936	// the auto-escaper should allow the choice of which to use in the
				937	// template. This is intended only for internal use as it is dangerous
				938	// and complicated to figure out which modifier is an XSS-safe
				939	// replacement for a given one. Custom modifiers currently may not
				940	// indicate safe replacements, only built-in ones may do so.
				941	//
				942	// Note that this function is not commutative therefore
				943	// IsSafeXSSAlternative(a, b) may not be equal to IsSafeXSSAlternative(b, a).
				944	bool IsSafeXSSAlternative(const ModifierInfo& our,
				945	const ModifierInfo& candidate) {
				946	// Succeeds even for non built-in modifiers but no harm.
				947	if (our.modifier == candidate.modifier)
				948	return true;
				949
				950	for (const ModifierWithAlternatives* mod_with_alts = g_modifiers;
				951	mod_with_alts < g_modifiers + sizeof(g_modifiers)/sizeof(*g_modifiers);
				952	++mod_with_alts) {
				953	if (mod_with_alts->modifier_info.long_name == our.long_name)
				954	// We found our Modifier in the built-in array g_modifiers.
				955	for (int i = 0; mod_with_alts->safe_alt_mods[i] != NULL &&
				956	i < MAX_SAFE_ALTERNATIVES; ++i)
				957	if (mod_with_alts->safe_alt_mods[i]->long_name == candidate.long_name)
				958	// We found candidate in our Modifier's list of safe alternatives.
				959	return true;
				960	}
				961	// our is not built-in or candidate is not a safe replacement to our.
				962	return false;
				963	}
				964
				// Returns true when long_name begins with the extension prefix "x-".
				// strncmp stops at a terminating NUL, so names shorter than two
				// characters (including "") are handled without reading past their end;
				// it also never reads more than two characters, so callers that pass a
				// non-terminated buffer of at least two characters remain safe.
				static inline bool IsExtensionModifier(const char* long_name) {
				  return strncmp(long_name, "x-", 2) == 0;
				}
				968
				969	static bool AddModifierCommon(const char* long_name,
				970	const TemplateModifier* modifier, bool xss_safe) {
				971	if (!IsExtensionModifier(long_name))
				972	return false;
				973
				974	// TODO(csilvers): store in a map or multimap, rather than a vector
				975	for (vector<const ModifierInfo*>::const_iterator mod =
				976	g_extension_modifiers.begin();
				977	mod != g_extension_modifiers.end();
				978	++mod) {
				979	// Check if mod has the same name as us. For modifiers that also take
				980	// values, this is everything before the =. The only time it's ok to
				981	// have the same name is when we have different modval specializations:
				982	// "foo=bar" and "foo=baz" are both valid names. Note "foo" and
				983	// "foo=bar" is not valid: foo has no modval, but "foo=bar" does.
				984	const size_t new_modifier_namelen = strcspn(long_name, "=");
				985	const size_t existing_modifier_namelen = strcspn((*mod)->long_name.c_str(),
				986	"=");
				987	if (new_modifier_namelen == existing_modifier_namelen &&
				988	!memcmp(long_name, (*mod)->long_name.c_str(), new_modifier_namelen)) {
				989	if (long_name[new_modifier_namelen] == '=' &&
				990	(*mod)->long_name[existing_modifier_namelen] == '=' &&
				991	(*mod)->long_name != long_name) {
				992	// It's ok, we're different specializations!
				993	} else {
				994	// It's not ok: we have the same name and no good excuse.
				995	return false;
				996	}
				997	}
				998	}
				999
				1000	g_extension_modifiers.push_back(
				1001	new ModifierInfo(long_name, '\0',
				1002	xss_safe ? XSS_SAFE : XSS_UNIQUE,
				1003	modifier));
				1004	return true;
				1005	}
				1006
				1007	// Modifier added with XSS_UNIQUE XssClass.
				1008	bool AddModifier(const char* long_name,
				1009	const TemplateModifier* modifier) {
				1010	return AddModifierCommon(long_name, modifier, false);
				1011	}
				1012
				1013	// Modifier added with XSS_SAFE XssClass.
				1014	bool AddXssSafeModifier(const char* long_name,
				1015	const TemplateModifier* modifier) {
				1016	return AddModifierCommon(long_name, modifier, true);
				1017	}
				1018
				1019	// If candidate_match is a better match for modname/modval than bestmatch,
				1020	// update bestmatch. To be a better match, three conditions must be met:
				1021	// 1) The candidate's name must match modname
				1022	// 2) If the candidate is a specialization (that is, its name is of the
				1023	// form "foo=bar"), then modval matches the specialization value.
				1024	// 3) If the candidate is not a specialization, bestmatch isn't a
				1025	// specialization either.
				1026	// Condition (3) makes sure that if we match the ModifierInfo with name
				1027	// "foo=bar", we don't claim the ModifierInfo "foo=" is a better match.
				1028	// Recall that by definition, modval will always start with a '=' if present.
				1029	static void UpdateBestMatch(const char* modname, size_t modname_len,
				1030	const char* modval, size_t modval_len,
				1031	const ModifierInfo* candidate_match,
				1032	const ModifierInfo** best_match) {
				1033	// It's easiest to handle the two case differently: (1) candidate_match
				1034	// refers to a modifier that expects a modifier-value; (2) it doesn't.
				1035	if (candidate_match->modval_required) {
				1036	// To be a match, we have to fulfill three requirements: we have a
				1037	// modval, our modname matches candidate_match's modname (either
				1038	// shortname or longname), and our modval is consistent with the
				1039	// value specified in the longname (whatever might follow the =).
				1040	const char* const longname_start = candidate_match->long_name.c_str();
				1041	const char* const equals = strchr(longname_start, '=');
				1042	assert(equals != NULL);
				1043	if (modval_len > 0 &&
				1044	((modname_len == 1 && *modname == candidate_match->short_name) \|\|
				1045	(modname_len == equals - longname_start &&
				1046	memcmp(modname, longname_start, modname_len) == 0)) &&
				1047	((equals[1] == '\0') \|\| // name is "foo=" (not a specialization)
				1048	(modval_len
				1049	== longname_start + candidate_match->long_name.size() - equals &&
				1050	memcmp(modval, equals, modval_len) == 0))) {
				1051	// Condition (3) above is satisfied iff our longname is longer than
				1052	// best-match's longname (so we prefer "foo=bar" to "foo=").
				1053	if (*best_match == NULL \|\|
				1054	candidate_match->long_name.size() > (*best_match)->long_name.size())
				1055	*best_match = candidate_match;
				1056	}
				1057	} else {
				1058	// In this case, to be a match: we must not have a modval. Our
				1059	// modname still must match modinfo's modname (either short or long).
				1060	if (modval_len == 0 &&
				1061	((modname_len == 1 && *modname == candidate_match->short_name) \|\|
				1062	(modname_len == candidate_match->long_name.size() &&
				1063	!memcmp(modname, candidate_match->long_name.data(), modname_len)))) {
				1064	// In the no-modval case, only one match should exist.
				1065	assert(*best_match == NULL);
				1066	*best_match = candidate_match;
				1067	}
				1068	}
				1069	}
				1070
				1071	const ModifierInfo* FindModifier(const char* modname, size_t modname_len,
				1072	const char* modval, size_t modval_len) {
				1073	// More than one modifier can match, in the case of modval specializations
				1074	// (e.g., the modifier "foo=" and "foo=bar" will both match on input of
				1075	// modname="foo", modval="bar"). In that case, we take the ModifierInfo
				1076	// with the longest longname, since that's the most specialized match.
				1077	const ModifierInfo* best_match = NULL;
				1078	if (modname_len >= 2 && IsExtensionModifier(modname)) {
				1079	for (vector<const ModifierInfo*>::const_iterator mod =
				1080	g_extension_modifiers.begin();
				1081	mod != g_extension_modifiers.end();
				1082	++mod) {
				1083	UpdateBestMatch(modname, modname_len, modval, modval_len,
				1084	*mod, &best_match);
				1085	}
				1086	if (best_match != NULL)
				1087	return best_match;
				1088
				1089	for (vector<const ModifierInfo*>::const_iterator mod =
				1090	g_unknown_modifiers.begin();
				1091	mod != g_unknown_modifiers.end();
				1092	++mod) {
				1093	UpdateBestMatch(modname, modname_len, modval, modval_len,
				1094	*mod, &best_match);
				1095	}
				1096	if (best_match != NULL)
				1097	return best_match;
				1098	// This is the only situation where we can pass in a modifier of NULL.
				1099	// It means "we don't know about this modifier-name."
				1100	string fullname(modname, modname_len);
				1101	if (modval_len) {
				1102	fullname.append(modval, modval_len);
				1103	}
				1104	// TODO(csilvers): store in a map or multimap, rather than a vector
				1105	g_unknown_modifiers.push_back(new ModifierInfo(fullname, '\0',
				1106	XSS_UNIQUE, NULL));
				1107	return g_unknown_modifiers.back();
				1108	} else {
				1109	for (const ModifierWithAlternatives* mod_with_alts = g_modifiers;
				1110	mod_with_alts < g_modifiers + sizeof(g_modifiers)/sizeof(*g_modifiers);
				1111	++mod_with_alts) {
				1112	UpdateBestMatch(modname, modname_len, modval, modval_len,
				1113	&mod_with_alts->modifier_info, &best_match);
				1114	}
				1115	return best_match;
				1116	}
				1117	}
				1118
				1119	// For escaping variables under the auto-escape mode:
				1120	// Each directive below maps to a distinct sequence of
				1121	// escaping directives (i.e a vector<ModifierAndValue>) applied
				1122	// to a variable during run-time substitution.
				1123	// The directives are stored in a global array (g_mods_ae)
				1124	// initialized under lock in InitializeGlobalModifiers.
				// Index into g_am_dirs[]; the enumerators must stay in the same order as
				// that array's initializer.
				enum AutoModifyDirective {
				  AM_EMPTY,          // Unused, kept as marker.
				  AM_HTML,           // g_am_html
				  AM_HTML_UNQUOTED,  // g_am_html_unquoted
				  AM_JS,             // g_am_js
				  AM_JS_NUMBER,      // g_am_js_number
				  AM_URL_HTML,       // g_am_url_html
				  AM_URL_QUERY,      // g_am_url_query
				  AM_STYLE,          // g_am_style
				  AM_XML,            // g_am_xml
				  NUM_ENTRIES_AM     // Number of directives; sizes g_am_dirs[].
				};
				1137
				1138	// Populates the global vector of hard-coded modifiers that
				1139	// Auto-Escape may pick. We point to the appropriate modifier in
				1140	// the global g_modifiers.
				1141	// Reference these globals via the global array g_am_dirs[] for consistency.
				1142	// Note: We allow for more than one ModifierAndValue in the array hence
				1143	// the need to terminate with a Null marker. However currently all the
				1144	// escaping directives have exactly one ModifierAndValue.
				1145	static const ModifierAndValue g_am_empty[] = {
				1146	ModifierAndValue(NULL, "", 0)
				1147	};
				1148	static const ModifierAndValue g_am_html[] = {
				1149	ModifierAndValue(&g_modifiers[1].modifier_info, "", 0),
				1150	ModifierAndValue(NULL, "", 0)
				1151	};
				1152	static const ModifierAndValue g_am_html_unquoted[] = {
				1153	ModifierAndValue(&g_modifiers[4].modifier_info, "=attribute", 10),
				1154	ModifierAndValue(NULL, "", 0)
				1155	};
				1156	static const ModifierAndValue g_am_js[] = {
				1157	ModifierAndValue(&g_modifiers[6].modifier_info, "", 0),
				1158	ModifierAndValue(NULL, "", 0)
				1159	};
				1160	static const ModifierAndValue g_am_js_number[] = {
				1161	ModifierAndValue(&g_modifiers[15].modifier_info, "=number", 7),
				1162	ModifierAndValue(NULL, "", 0)
				1163	};
				1164	static const ModifierAndValue g_am_url_html[] = {
				1165	ModifierAndValue(&g_modifiers[11].modifier_info, "=html", 5),
				1166	ModifierAndValue(NULL, "", 0)
				1167	};
				1168	static const ModifierAndValue g_am_url_query[] = {
				1169	ModifierAndValue(&g_modifiers[9].modifier_info, "", 0),
				1170	ModifierAndValue(NULL, "", 0)
				1171	};
				1172	static const ModifierAndValue g_am_style[] = {
				1173	ModifierAndValue(&g_modifiers[0].modifier_info, "", 0),
				1174	ModifierAndValue(NULL, "", 0)
				1175	};
				1176	static const ModifierAndValue g_am_xml[] = {
				1177	ModifierAndValue(&g_modifiers[14].modifier_info, "", 0),
				1178	ModifierAndValue(NULL, "", 0)
				1179	};
				1180
				1181	static const ModifierAndValue* g_am_dirs[NUM_ENTRIES_AM] = {
				1182	g_am_empty, /* AM_EMPTY */
				1183	g_am_html, /* AM_HTML */
				1184	g_am_html_unquoted, /* AM_HTML_UNQUOTED */
				1185	g_am_js, /* AM_JS */
				1186	g_am_js_number, /* AM_JS_NUMBER */
				1187	g_am_url_html, /* AM_URL_HTML */
				1188	g_am_url_query, /* AM_URL_QUERY */
				1189	g_am_style, /* AM_STYLE */
				1190	g_am_xml, /* AM_XML */
				1191	};
				1192
				1193	string PrettyPrintOneModifier(const ModifierAndValue& modval) {
				1194	string out;
				1195	out.append(":");
				1196	if (modval.modifier_info->short_name) // short_name is a char.
				1197	out.append(1, modval.modifier_info->short_name);
				1198	else
				1199	out.append(modval.modifier_info->long_name);
				1200	if (modval.value_len != 0)
				1201	out.append(modval.value, modval.value_len);
				1202	return out;
				1203	}
				1204
				1205	string PrettyPrintModifiers(const vector<const ModifierAndValue*>& modvals,
				1206	const string& separator) {
				1207	string out;
				1208	for (vector<const ModifierAndValue*>::const_iterator it =
				1209	modvals.begin(); it != modvals.end(); ++it) {
				1210	if (it != modvals.begin())
				1211	out.append(separator);
				1212	out.append(PrettyPrintOneModifier(**it));
				1213	}
				1214	return out;
				1215	}
				1216
				1217	// Return the sequence of escaping directives to apply for the given context.
				1218	// An empty vector indicates an error occurred. Currently we never need
				1219	// to chain escaping directives hence on success, the vector is always of
				1220	// size 1. This may change in the future.
				1221	vector<const ModifierAndValue*> GetModifierForHtmlJs(
				1222	HtmlParser* htmlparser, string* error_msg) {
				1223	assert(htmlparser);
				1224	assert(error_msg);
				1225	vector<const ModifierAndValue*> modvals;
				1226
				1227	// Two cases of being inside javascript:
				1228	// 1. Inside raw javascript (within a <script> tag). If the value
				1229	// is quoted we apply javascript_escape, if not we have to coerce
				1230	// it to a safe value due to the risk of javascript code execution
				1231	// hence apply :J=number. If arbitrary code needs to be inserted
				1232	// at run-time, the developer must use :none.
				1233	// 2. In the value of an attribute that takes javascript such
				1234	// as onmouseevent in '<a href="someUrl" onmousevent="{{EVENT}}">'.
				1235	// That will be covered in the STATE_VALUE state logic below.
				1236	if (htmlparser->InJavascript() &&
				1237	htmlparser->state() != HtmlParser::STATE_VALUE) {
				1238	if (htmlparser->IsJavascriptQuoted()) {
				1239	modvals.push_back(g_am_dirs[AM_JS]);
				1240	assert(modvals.size() == 1);
				1241	return modvals;
				1242	} else {
				1243	modvals.push_back(g_am_dirs[AM_JS_NUMBER]);
				1244	assert(modvals.size() == 1);
				1245	return modvals;
				1246	}
				1247	}
				1248	switch (htmlparser->state()) {
				1249	case HtmlParser::STATE_VALUE:{
				1250	string attribute_name = htmlparser->attribute();
				1251	switch (htmlparser->AttributeType()) {
				1252	case HtmlParser::ATTR_URI:
				1253	// Case 1: The URL is quoted:
				1254	// . Apply :U=html if it is a complete URL or :h if it is a fragment.
				1255	// Case 2: The URL is not quoted:
				1256	// . If it is a complete URL, we have no safe modifiers that
				1257	// won't break it so we have to fail.
				1258	// . If it is a URL fragment, then :u is safe and not likely to
				1259	// break the URL.
				1260	if (!htmlparser->IsAttributeQuoted()) {
				1261	if (htmlparser->IsUrlStart()) { // Complete URL.
				1262	error_msg->append("Value of URL attribute \"" + attribute_name +
				1263	"\" must be enclosed in quotes.");
				1264	assert(modvals.empty());
				1265	return modvals; // Empty
				1266	} else { // URL fragment.
				1267	modvals.push_back(g_am_dirs[AM_URL_QUERY]);
				1268	}
				1269	} else {
				1270	// Only validate the URL if we have a complete URL,
				1271	// otherwise simply html_escape.
				1272	if (htmlparser->IsUrlStart())
				1273	modvals.push_back(g_am_dirs[AM_URL_HTML]);
				1274	else
				1275	modvals.push_back(g_am_dirs[AM_HTML]);
				1276	}
				1277	break;
				1278	case HtmlParser::ATTR_REGULAR:
				1279	// If the value is quoted, simply HTML escape, otherwise
				1280	// apply stricter escaping using H=attribute.
				1281	if (htmlparser->IsAttributeQuoted())
				1282	modvals.push_back(g_am_dirs[AM_HTML]);
				1283	else
				1284	modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
				1285	break;
				1286	case HtmlParser::ATTR_STYLE:
				1287	// If the value is quoted apply :c, otherwise fail.
				1288	if (htmlparser->IsAttributeQuoted()) {
				1289	modvals.push_back(g_am_dirs[AM_STYLE]);
				1290	} else {
				1291	error_msg->append("Value of style attribute \"" + attribute_name +
				1292	"\" must be enclosed in quotes.");
				1293	assert(modvals.empty());
				1294	return modvals; // Empty
				1295	}
				1296	break;
				1297	case HtmlParser::ATTR_JS:
				1298	// We require javascript accepting attributes (such as onclick)
				1299	// to be HTML quoted, otherwise they are vulnerable to
				1300	// HTML attribute insertion via the use of whitespace.
				1301	if (!htmlparser->IsAttributeQuoted()) {
				1302	error_msg->append("Value of javascript attribute \"" +
				1303	attribute_name +
				1304	"\" must be enclosed in quotes.");
				1305	assert(modvals.empty());
				1306	return modvals; // Empty
				1307	}
				1308	// If the variable is quoted apply javascript_escape otherwise
				1309	// apply javascript_number which will ensure it is safe against
				1310	// code injection.
				1311	// Note: We normally need to HTML escape after javascript escape
				1312	// but the javascript escape implementation provided makes the
				1313	// HTML escape redundant so simply javascript escape.
				1314	if (htmlparser->IsJavascriptQuoted())
				1315	modvals.push_back(g_am_dirs[AM_JS]);
				1316	else
				1317	modvals.push_back(g_am_dirs[AM_JS_NUMBER]);
				1318	break;
				1319	case HtmlParser::ATTR_NONE:
				1320	assert("We should be in attribute!" && 0);
				1321	default:
				1322	assert("Should not be able to get here." && 0);
				1323	return modvals; // Empty
				1324	}
				1325	// In STATE_VALUE particularly, the parser may get out of sync with
				1326	// the correct state - that the browser sees - due to the fact that
				1327	// it does not get to parse run-time content (variables). So we tell
				1328	// the parser there is content that will be expanded here.
				1329	// A good example is:
				1330	// <a href={{URL}} alt={{NAME}}>
				1331	// The parser sees <a href= alt=> and interprets 'alt=' to be
				1332	// the value of href.
				1333	htmlparser->InsertText(); // Ignore return value.
				1334	assert(modvals.size() == 1);
				1335	return modvals;
				1336	}
				1337	case HtmlParser::STATE_TAG:{
				1338	// Apply H=attribute to tag names since they are alphabetic.
				1339	// Examples of tag names: TITLE, BODY, A and BR.
				1340	modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
				1341	assert(modvals.size() == 1);
				1342	return modvals;
				1343	}
				1344	case HtmlParser::STATE_ATTR:{
				1345	// Apply H=attribute to attribute names since they are alphabetic.
				1346	// Examples of attribute names: HREF, SRC and WIDTH.
				1347	modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
				1348	assert(modvals.size() == 1);
				1349	return modvals;
				1350	}
				1351	case HtmlParser::STATE_COMMENT:
				1352	case HtmlParser::STATE_TEXT:{
				1353	// Apply :h to regular HTML text and :c if within a style tag.
				1354	if (htmlparser->InCss())
				1355	modvals.push_back(g_am_dirs[AM_STYLE]);
				1356	else
				1357	modvals.push_back(g_am_dirs[AM_HTML]);
				1358	assert(modvals.size() == 1);
				1359	return modvals;
				1360	}
				1361	default:{
				1362	assert("Should not be able to get here." && 0);
				1363	return modvals; // Empty
				1364	}
				1365	}
				1366	assert("Should not be able to get here." && 0);
				1367	return modvals; // Empty
				1368	}
				1369
				1370	// TODO(jad): Memoize all GetModifierForXXX functions below.
				1371	// They don't depend on parser context (from csilvers).
				1372	vector<const ModifierAndValue> GetModifierForCss(HtmlParser htmlparser,
				1373	string* error_msg) {
				1374	vector<const ModifierAndValue*> modvals;
				1375	modvals.push_back(g_am_dirs[AM_STYLE]);
				1376	return modvals;
				1377	}
				1378
				1379	vector<const ModifierAndValue> GetModifierForXml(HtmlParser htmlparser,
				1380	string* error_msg) {
				1381	vector<const ModifierAndValue*> modvals;
				1382	modvals.push_back(g_am_dirs[AM_XML]);
				1383	return modvals;
				1384	}
				1385
				1386	vector<const ModifierAndValue> GetModifierForJson(HtmlParser htmlparser,
				1387	string* error_msg) {
				1388	vector<const ModifierAndValue*> modvals;
				1389	modvals.push_back(g_am_dirs[AM_JS]);
				1390	return modvals;
				1391	}
				1392
				1393	vector<const ModifierAndValue*> GetDefaultModifierForHtml() {
				1394	vector<const ModifierAndValue*> modvals;
				1395	modvals.push_back(g_am_dirs[AM_HTML]);
				1396	return modvals;
				1397	}
				1398
				1399	vector<const ModifierAndValue*> GetDefaultModifierForJs() {
				1400	vector<const ModifierAndValue*> modvals;
				1401	modvals.push_back(g_am_dirs[AM_JS]);
				1402	return modvals;
				1403	}
				1404
				1405	vector<const ModifierAndValue*> GetDefaultModifierForCss() {
				1406	return GetModifierForCss(NULL, NULL);
				1407	}
				1408
				1409	vector<const ModifierAndValue*> GetDefaultModifierForXml() {
				1410	return GetModifierForXml(NULL, NULL);
				1411	}
				1412
				1413	vector<const ModifierAndValue*> GetDefaultModifierForJson() {
				1414	return GetModifierForJson(NULL, NULL);
				1415	}
				1416
				1417	}