// blob: 4e35281dcb17f467579db2f81969b412ae0ada05 [file] [log] [blame]
// Copyright (c) 2007, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// ---
// Author: csilvers@google.com (Craig Silverstein)
//
// template_modifiers.h has a description of what each escape-routine does.
//
// When creating a new modifier, you must subclass TemplateModifier
// and define your own Modify() method. This method takes the string
// to be modified as a char*/int pair. It then emits the modified
// version of the string to outbuf. Outbuf is an ExpandEmitter, as
// defined in template_modifiers.h. It's a very simple type that
// supports appending to a data stream.
//
// Be very careful editing an existing modifier. Subtle changes can
// introduce the possibility for cross-site scripting attacks. If you
// do change a modifier, be careful that it does not affect
// the list of Safe XSS Alternatives.
//
#include <config.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <string>
#include <vector>
#include "htmlparser/htmlparser_cpp.h"
#include <ctemplate/template_modifiers.h>
#include "template_modifiers_internal.h"
#include <ctemplate/per_expand_data.h>
using std::string;
using std::vector;
// Length of the string literal s, not counting the terminating NUL.
// Pasting "" on both sides makes the macro fail to compile unless s is
// really a string literal.
#define strliterallen(s) (sizeof("" s "") - 1)
// Really we should be using uint16_t or something, but this is good
// enough, and more portable...
// Note: as declared, this is a plain unsigned int, so it may be wider
// than 16 bits.
typedef unsigned int uint16;
namespace URL {
// Returns false when the first inlen bytes of in start (case-insensitively)
// with one of the accepted scheme prefixes -- "http://", "https://" or
// "ftp://" -- and at least one more byte follows the prefix.  Returns true
// for everything else.
bool HasInsecureProtocol(const char* in, int inlen) {
  static const struct {
    const char* prefix;
    size_t length;
  } kAcceptedSchemes[] = {
    { "http://",  strliterallen("http://")  },
    { "https://", strliterallen("https://") },
    { "ftp://",   strliterallen("ftp://")   },
  };
  const size_t scheme_count =
      sizeof(kAcceptedSchemes) / sizeof(*kAcceptedSchemes);
  for (size_t i = 0; i < scheme_count; ++i) {
    // The input must be strictly longer than the prefix itself.
    if (inlen > kAcceptedSchemes[i].length &&
        strncasecmp(in, kAcceptedSchemes[i].prefix,
                    kAcceptedSchemes[i].length) == 0) {
      return false;
    }
  }
  return true;
}
}  // namespace URL
namespace ctemplate {
using ctemplate_htmlparser::HtmlParser;
// A most-efficient way to append a string literal to the var named 'out'.
// The ""s ensure literal is actually a string literal.
// The length is computed at compile time with sizeof, so no strlen() call
// is needed.  The macro expects a variable called 'out' to be in scope at
// the point of use.
#define APPEND(literal) out->Emit("" literal "", sizeof(literal)-1)
// Check whether the string of length len is identical to the literal.
// The ""s ensure literal is actually a string literal.
// The length is compared first, so memcmp only runs when both sides have
// the same number of bytes.
#define STR_IS(str, len, literal) \
((len) == sizeof("" literal "") - 1 && \
memcmp(str, literal, sizeof("" literal "") - 1) == 0)
// Out-of-line definition of the destructor of the modifier base class.
// The body is intentionally empty.
TemplateModifier::~TemplateModifier() {}
// Pass-through modifier: forwards the inlen bytes at in to out without
// changing them.  The expansion data and the modifier argument are unused.
void NullModifier::Modify(const char* in,
                          size_t inlen,
                          const PerExpandData*,
                          ExpandEmitter* out,
                          const string& arg) const {
  out->Emit(in, inlen);
}

// Global instance of the pass-through modifier.
NullModifier null_modifier;
// Emits the bytes in the half-open range [start, limit) to out.  Nothing is
// emitted when the range is empty (start at or past limit).
static inline void EmitRun(const char* start, const char* limit,
                           ExpandEmitter* out) {
  if (start >= limit) {
    return;
  }
  out->Emit(start, (limit - start));
}
// Replaces & " ' < > with the corresponding HTML entities and turns every
// \r, \n, \v, \f and \t into a single space.  Stretches of bytes that need
// no escaping are emitted with one EmitRun() call each.
void HtmlEscape::Modify(const char* in, size_t inlen,
                        const PerExpandData*,
                        ExpandEmitter* out, const string& arg) const {
  const char* const end = in + inlen;
  // First scanned byte that has not been emitted yet.
  const char* run_start = in;
  for (const char* cur = in; cur < end; ++cur) {
    switch (*cur) {
      case '&':
        EmitRun(run_start, cur, out);
        APPEND("&amp;");
        break;
      case '"':
        EmitRun(run_start, cur, out);
        APPEND("&quot;");
        break;
      case '\'':
        EmitRun(run_start, cur, out);
        APPEND("&#39;");
        break;
      case '<':
        EmitRun(run_start, cur, out);
        APPEND("&lt;");
        break;
      case '>':
        EmitRun(run_start, cur, out);
        APPEND("&gt;");
        break;
      case '\r':
      case '\n':
      case '\v':
      case '\f':
      case '\t':
        EmitRun(run_start, cur, out);
        APPEND(" ");
        break;
      default:
        // Ordinary byte: keep it in the pending run and move on.
        continue;
    }
    // The special byte was replaced; the next run starts right after it.
    run_start = cur + 1;
  }
  EmitRun(run_start, end, out);
}

HtmlEscape html_escape;
// Replaces & " ' < > with the corresponding HTML entities.  Unlike
// HtmlEscape, whitespace characters are passed through untouched.
void PreEscape::Modify(const char* in, size_t inlen,
                       const PerExpandData*,
                       ExpandEmitter* out, const string& arg) const {
  const char* const end = in + inlen;
  // First scanned byte that has not been emitted yet.
  const char* run_start = in;
  for (const char* cur = in; cur < end; ++cur) {
    switch (*cur) {
      case '&':
        EmitRun(run_start, cur, out);
        APPEND("&amp;");
        break;
      case '"':
        EmitRun(run_start, cur, out);
        APPEND("&quot;");
        break;
      case '\'':
        EmitRun(run_start, cur, out);
        APPEND("&#39;");
        break;
      case '<':
        EmitRun(run_start, cur, out);
        APPEND("&lt;");
        break;
      case '>':
        EmitRun(run_start, cur, out);
        APPEND("&gt;");
        break;
      default:
        // Ordinary byte: keep it in the pending run and move on.
        continue;
    }
    run_start = cur + 1;
  }
  EmitRun(run_start, end, out);
}

PreEscape pre_escape;
// We encode the presence and ordering of unclosed tags in a string, using the
// letters b, i, s, and e to stand for <b>, <i>, <span>, and <em> respectively.
// The most recently opened tag is appended onto the end of the string, so in
// the common case of properly nested tags, we need only look at the last
// character. If we don't find it there, we need to continue looking at
// everything until we find it, because tags may not necessarily be in order.
// Similarly, when we add a tag, we need to check each existing tag for a match
// so that we don't nest.
// Tracks which of the tags <b>, <i>, <em> and <span> are currently open,
// in the order in which they were opened.
class UnclosedSnippetTags {
 public:
  // Mnemonic characters are used as enum values (instead of small ints)
  // because they can make the tag string easier to read when debugging.
  typedef enum {
    TAG_B = 'b',
    TAG_I = 'i',
    TAG_EM = 'e',
    TAG_SPAN = 's',
  } Tag;

  UnclosedSnippetTags() : tag_length(0) {
    memset(tags, 0, sizeof(tags));
  }

  // Records tag as open and returns true, unless it is already open, in
  // which case nothing changes and false is returned.
  inline bool MaybeAdd(Tag tag) {
    if (strchr(tags, tag) != NULL) {
      return false;
    }
    tags[tag_length] = tag;
    ++tag_length;
    return true;
  }

  // Marks tag as closed and returns true, unless it is not open, in which
  // case nothing changes and false is returned.
  inline bool MaybeRemove(Tag tag) {
    char* const found = strchr(tags, tag);
    if (found == NULL) {
      return false;
    }
    // Slide the later tags (and the terminating NUL) down by one so that
    // no gap is left in the array.
    memmove(found, found + 1, strlen(found));
    --tag_length;
    return true;
  }

  // Emits a closing tag for every tag that is still open, most recently
  // opened first.
  inline void PrintClosingTags(ExpandEmitter* out) {
    // tags[tag_length] is always the NUL terminator, which matches no
    // case below, so iteration starts at the last real entry.
    int i = tag_length;
    while (i-- > 0) {
      switch (tags[i]) {
        case TAG_B:
          out->Emit("</b>");
          break;
        case TAG_I:
          out->Emit("</i>");
          break;
        case TAG_EM:
          out->Emit("</em>");
          break;
        case TAG_SPAN:
          out->Emit("</span>");
          break;
      }
    }
  }

 private:
  // Open tags as a NUL-terminated string; room for the four distinct tags
  // plus the terminator.
  char tags[5];
  // Number of tags currently stored in tags.
  int tag_length;
};
// Escapes text like an HTML escaper, except that a small whitelist of tags
// is passed through unescaped: <b>, <i>, <em>, <span dir=ltr>,
// <span dir=rtl>, their closing tags, <br> and <wbr>.  Opening tags are
// only let through if that tag is not already open, and closing tags only
// if that tag is open; anything else has its '<' escaped.  Tags still open
// at the end of the input are closed by PrintClosingTags().
// The order of the tests in the else-if chain matters: MaybeAdd() and
// MaybeRemove() change the open-tag state and are only reached when the
// preceding length and memcmp checks of the same condition succeed.
void SnippetEscape::Modify(const char* in, size_t inlen,
const PerExpandData*,
ExpandEmitter* out, const string& arg) const {
UnclosedSnippetTags unclosed;
const char* pos = in;
const char* start = pos;
const char* const limit = in + inlen;
while (pos < limit) {
switch (*pos) {
default:
// Increment our counter and look at the next character.
++pos;
continue;
case '<': {
// If there is a permissible tag, just advance pos past it to
// make it part of the current run. Notice the use of
// "continue" below.
const char* const next_pos = pos + 1;
// Number of bytes available after the '<'.
const int chars_left = limit - next_pos;
if ((chars_left >= 2) && !memcmp(next_pos, "b>", 2)
&& unclosed.MaybeAdd(UnclosedSnippetTags::TAG_B)) {
pos += strliterallen("<b>");
continue;
} else if ((chars_left >= 2) && !memcmp(next_pos, "i>", 2)
&& unclosed.MaybeAdd(UnclosedSnippetTags::TAG_I)) {
pos += strliterallen("<i>");
continue;
} else if ((chars_left >= 3) && !memcmp(next_pos, "em>", 3)
&& unclosed.MaybeAdd(UnclosedSnippetTags::TAG_EM)) {
pos += strliterallen("<em>");
continue;
} else if ((chars_left >= 13) && !memcmp(next_pos, "span dir=", 9)
&& (!memcmp(next_pos + 9, "ltr>", 4) ||
!memcmp(next_pos + 9, "rtl>", 4))
&& unclosed.MaybeAdd(UnclosedSnippetTags::TAG_SPAN)) {
// "<span dir=ltr>" and "<span dir=rtl>" have the same length.
pos += strliterallen("<span dir=ltr>");
continue;
} else if ((chars_left >= 3) && !memcmp(next_pos, "/b>", 3)
&& unclosed.MaybeRemove(UnclosedSnippetTags::TAG_B)) {
pos += strliterallen("</b>");
continue;
} else if ((chars_left >= 3) && !memcmp(next_pos, "/i>", 3)
&& unclosed.MaybeRemove(UnclosedSnippetTags::TAG_I)) {
pos += strliterallen("</i>");
continue;
} else if ((chars_left >= 4) && !memcmp(next_pos, "/em>", 4)
&& unclosed.MaybeRemove(UnclosedSnippetTags::TAG_EM)) {
pos += strliterallen("</em>");
continue;
} else if ((chars_left >= 6) && !memcmp(next_pos, "/span>", 6)
&& unclosed.MaybeRemove(UnclosedSnippetTags::TAG_SPAN)) {
pos += strliterallen("</span>");
continue;
} else if ((chars_left >= 3) && !memcmp(next_pos, "br>", 3)) {
// <br> and <wbr> have no closing tag, so they are not tracked.
pos += strliterallen("<br>");
continue;
} else if ((chars_left >= 4) && !memcmp(next_pos, "wbr>", 4)) {
pos += strliterallen("<wbr>");
continue;
}
// Emit the entity and break out of the switch.
EmitRun(start, pos, out);
APPEND("&lt;");
break;
}
case '&':
EmitRun(start, pos, out);
if (pos + 1 < limit && pos[1] == '{') {
// Could be a javascript entity, so we need to escape.
// (Javascript entities are an xss risk in Netscape 4.)
APPEND("&amp;");
} else {
// Any other '&' is emitted unescaped.
APPEND("&");
}
break;
case '"': EmitRun(start, pos, out); APPEND("&quot;"); break;
case '\'': EmitRun(start, pos, out); APPEND("&#39;"); break;
case '>': EmitRun(start, pos, out); APPEND("&gt;"); break;
case '\r': case '\n': case '\v': case '\f': case '\t':
// non-space whitespace
EmitRun(start, pos, out); APPEND(" "); break;
}
// A character was replaced above; the next run starts after it.
start = ++pos;
}
EmitRun(start, pos, out);
unclosed.PrintClosingTags(out);
}
SnippetEscape snippet_escape;
// Emits the input with every character outside [a-zA-Z0-9=._:-] replaced
// by an underscore.  An '=' is also replaced by an underscore when it is
// the first or the last character of the input.
void CleanseAttribute::Modify(const char* in, size_t inlen,
                              const PerExpandData*,
                              ExpandEmitter* out, const string& arg) const {
  for (size_t i = 0; i < inlen; ++i) {
    const char ch = in[i];
    const bool is_alnum = (ch >= 'a' && ch <= 'z') ||
                          (ch >= 'A' && ch <= 'Z') ||
                          (ch >= '0' && ch <= '9');
    if (is_alnum || ch == '-' || ch == '.' || ch == '_' || ch == ':') {
      out->Emit(ch);
    } else if (ch == '=') {
      // '=' is only allowed strictly inside the value.
      const bool at_edge = (i == 0 || i == (inlen - 1));
      if (at_edge) {
        out->Emit('_');
      } else {
        out->Emit(ch);
      }
    } else {
      APPEND("_");
    }
  }
}

CleanseAttribute cleanse_attribute;
// Emits only the characters of the input that are ASCII letters, digits or
// one of ' ', '_', '.', ',', '!', '#', '%', '-'.  Every other character is
// dropped.
void CleanseCss::Modify(const char* in, size_t inlen,
                        const PerExpandData*,
                        ExpandEmitter* out, const string& arg) const {
  const char* const end = in + inlen;
  for (const char* cur = in; cur < end; ++cur) {
    const char ch = *cur;
    const bool is_alnum = (ch >= 'a' && ch <= 'z') ||
                          (ch >= 'A' && ch <= 'Z') ||
                          (ch >= '0' && ch <= '9');
    const bool is_allowed_punct = ch == ' ' || ch == '_' || ch == '.' ||
                                  ch == ',' || ch == '!' || ch == '#' ||
                                  ch == '%' || ch == '-';
    if (is_alnum || is_allowed_punct) {
      out->Emit(ch);
    }
  }
}

CleanseCss cleanse_css;
// CssUrlEscape is used as a chained modifier by ValidateUrl
// (validate_url_and_css_escape) and is not directly exposed.
// Modifier that percent-encodes characters which are unsafe inside a CSS
// url(...) value; the escaping itself is implemented in Modify(), defined
// out of line in this file.
class CssUrlEscape : public TemplateModifier {
public:
// Writes the escaped form of the inlen bytes at in to outbuf.
virtual void Modify(const char* in, size_t inlen,
const PerExpandData*, ExpandEmitter* outbuf,
const string& arg) const;
};
// URL-encodes the characters [\n\r\\'"()<>*] to ensure the URL can be safely
// inserted in a CSS context, e.g:
// . In an '@import url("URL");' statement
// . In a CSS property such as 'background: url("URL");'
// In both locations above, enclosing quotes are optional but parens are not.
// We want to make sure the URL cannot exit the parens enclosure, close a
// STYLE tag or reset the browser's CSS parser (via comments or newlines).
//
// References:
// . CSS 2.1 URLs: http://www.w3.org/TR/CSS21/syndata.html#url
// . CSS 1 URLs: http://www.w3.org/TR/REC-CSS1/#url
// Percent-encodes \n \r " ' ( ) * < > and \ ; every other byte is emitted
// unchanged, one byte at a time.
void CssUrlEscape::Modify(const char* in, size_t inlen,
                          const PerExpandData*,
                          ExpandEmitter* out, const string& arg) const {
  const char* const end = in + inlen;
  for (const char* cur = in; cur != end; ++cur) {
    switch (*cur) {
      case '\n':
        APPEND("%0A");
        break;
      case '\r':
        APPEND("%0D");
        break;
      case '"':
        APPEND("%22");
        break;
      case '\'':
        APPEND("%27");
        break;
      case '(':
        APPEND("%28");
        break;
      case ')':
        APPEND("%29");
        break;
      case '*':
        APPEND("%2A");
        break;
      case '<':
        APPEND("%3C");
        break;
      case '>':
        APPEND("%3E");
        break;
      case '\\':
        APPEND("%5C");
        break;
      default:
        out->Emit(*cur);
        break;
    }
  }
}

CssUrlEscape css_url_escape;
// These URLs replace unsafe URLs for :U and :I url-escaping modes.
// Replacement passed to the ValidateUrl instances registered under the
// url_escape / html_escape_with_arg=url names.
const char* const ValidateUrl::kUnsafeUrlReplacement = "#";
// Replacement passed to the ValidateUrl instances registered under the
// img_src_url_escape names.
const char* const ValidateUrl::kUnsafeImgSrcUrlReplacement =
"/images/cleardot.gif";
// Runs the chained modifier either on the input URL or, when the URL looks
// like it carries a protocol that URL::HasInsecureProtocol() rejects, on
// the configured replacement URL instead.  A protocol is only suspected
// when a ':' appears before the first '/' (or anywhere, if there is no '/').
void ValidateUrl::Modify(const char* in, size_t inlen,
                         const PerExpandData* per_expand_data,
                         ExpandEmitter* out, const string& arg) const {
  const char* const first_slash =
      static_cast<const char*>(memchr(in, '/', inlen));
  // Only the part in front of the first slash can hold a protocol.
  const size_t scheme_area_len =
      (first_slash != NULL) ? static_cast<size_t>(first_slash - in) : inlen;
  const bool has_colon = memchr(in, ':', scheme_area_len) != NULL;

  if (!has_colon || !URL::HasInsecureProtocol(in, inlen)) {
    // No protocol, or an accepted one: escape the URL as given.
    chained_modifier_.Modify(in, inlen, per_expand_data, out, "");
    return;
  }

  // Rejected protocol: escape and emit the safe replacement instead.
  chained_modifier_.Modify(unsafe_url_replacement_,
                           unsafe_url_replacement_length_,
                           per_expand_data,
                           out,
                           "");
}
// Global ValidateUrl instances.  Each one is constructed from an escaping
// modifier and the replacement URL to use for rejected input.
// NOTE(review): javascript_escape is defined further down in this file than
// the instances that take it; presumably the constructor only keeps a
// reference to it -- confirm against the ValidateUrl declaration.
//
// Variants that fall back to kUnsafeUrlReplacement:
ValidateUrl validate_url_and_html_escape(
html_escape,
ValidateUrl::kUnsafeUrlReplacement);
ValidateUrl validate_url_and_javascript_escape(
javascript_escape,
ValidateUrl::kUnsafeUrlReplacement);
ValidateUrl validate_url_and_css_escape(
css_url_escape,
ValidateUrl::kUnsafeUrlReplacement);
// Variants that fall back to kUnsafeImgSrcUrlReplacement:
ValidateUrl validate_img_src_url_and_html_escape(
html_escape,
ValidateUrl::kUnsafeImgSrcUrlReplacement);
ValidateUrl validate_img_src_url_and_javascript_escape(
javascript_escape,
ValidateUrl::kUnsafeImgSrcUrlReplacement);
ValidateUrl validate_img_src_url_and_css_escape(
css_url_escape,
ValidateUrl::kUnsafeImgSrcUrlReplacement);
// Replaces & " ' < > with the corresponding entities and turns control
// characters below 0x20 (other than \t, \r and \n) into spaces.
void XmlEscape::Modify(const char* in, size_t inlen,
                       const PerExpandData*,
                       ExpandEmitter* out, const string& arg) const {
  const char* const end = in + inlen;
  // First scanned byte that has not been emitted yet.
  const char* run_start = in;
  for (const char* cur = in; cur < end; ++cur) {
    const char ch = *cur;
    // Section 2.2 of the XML spec (http://www.w3.org/TR/REC-xml/#charsets)
    // does not allow control characters in the range 0x00-0x1F except for
    // \t, \r and \n, and conformant parsers may reject them (for instance
    // a form feed in PCDATA).  They are replaced by a space.
    if (ch >= 0x00 && ch < 0x20 && ch != '\t' && ch != '\r' && ch != '\n') {
      EmitRun(run_start, cur, out);
      out->Emit(' ');
    } else {
      switch (ch) {
        case '&':
          EmitRun(run_start, cur, out);
          APPEND("&amp;");
          break;
        case '"':
          EmitRun(run_start, cur, out);
          APPEND("&quot;");
          break;
        case '\'':
          EmitRun(run_start, cur, out);
          APPEND("&#39;");
          break;
        case '<':
          EmitRun(run_start, cur, out);
          APPEND("&lt;");
          break;
        case '>':
          EmitRun(run_start, cur, out);
          APPEND("&gt;");
          break;
        default:
          // Ordinary byte: keep it in the pending run and move on.
          continue;
      }
    }
    run_start = cur + 1;
  }
  EmitRun(run_start, end, out);
}

XmlEscape xml_escape;
// This table maps initial characters to code lengths. This could be
// done with a 16-byte table and a shift, but there's a substantial
// performance increase by eliminating the shift.
// The table is indexed by the value of the first byte, 16 entries per row:
// 0x00-0xBF -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xFF -> 1.
// Because 0xF0-0xFF map to 1, bytes that would start a four-byte sequence
// are treated as single bytes by code that uses this table.
static const char kCodeLengths[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
// Decodes the UTF-8 sequence that begins at *start and returns its value,
// or 0xFFFD when the sequence is cut off by end or contains a byte that is
// not a valid continuation byte.
//   start -- in: address of the first byte of the sequence.
//            out: address just past the bytes consumed.  On a truncated or
//            malformed sequence only a single byte is consumed.
//   end   -- exclusive end of the string.
// A first byte whose table length is 1 is returned as-is, whether or not it
// is a valid single-byte character, so that the caller can deal with it
// individually.
static inline uint16 UTF8CodeUnit(const char** start, const char *end) {
  const char* const head = *start;
  const unsigned char lead = static_cast<unsigned char>(*head);
  // The first byte determines how many bytes the sequence should have.
  const size_t length = kCodeLengths[lead];
  if (length == 1) {
    *start = head + 1;
    return lead;
  }

  const char* const sequence_end = head + length;
  if (sequence_end < head || sequence_end > end) {
    // The sequence would run past the end of the input.
    *start = head + 1;
    return 0xFFFDU;
  }

  // Payload bits of the first byte, then six more bits per tail byte.
  uint16 value = *head & (0xFFU >> length);
  for (const char* tail = head + 1; tail != sequence_end; ++tail) {
    const uint16 tail_byte = *tail;
    if ((tail_byte & 0xC0U) != 0x80U) {
      // Not of the form 10xxxxxx.
      *start = head + 1;
      return 0xFFFDU;
    }
    value = (value << 6) | (tail_byte & 0x3FU);
  }
  *start = sequence_end;
  return value;
}
// A good reference is the ECMA standard (3rd ed), section 7.8.4:
// http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
void JavascriptEscape::Modify(const char* in, size_t inlen,
const PerExpandData*,
ExpandEmitter* out, const string& arg) const {
const char* pos = in;
const char* start = pos;
const char* const limit = in + inlen;
if (limit < in) { return; }
while (pos < limit) {
const char* next_pos = pos;
uint16 code_unit = UTF8CodeUnit(&next_pos, limit);
// Test for 16-bit values outside the switch below, because gcc
// will emit chained branches rather than a jump table for such a
// wide range of values.
if (code_unit & 0xFF00) {
// Linebreaks according to EcmaScript 262 which cannot appear in strings.
if (code_unit == 0x2028) {
// Line separator
EmitRun(start, pos, out); APPEND("\\u2028");
} else if (code_unit == 0x2029) {
// Paragraph separator
EmitRun(start, pos, out); APPEND("\\u2029");
} else {
pos = next_pos;
continue;
}
} else {
switch (code_unit) {
default:
// Increment our counter and look at the next character.
pos = next_pos;
continue;
case '\0': EmitRun(start, pos, out); APPEND("\\x00"); break;
case '"': EmitRun(start, pos, out); APPEND("\\x22"); break;
case '\'': EmitRun(start, pos, out); APPEND("\\x27"); break;
case '\\': EmitRun(start, pos, out); APPEND("\\\\"); break;
case '\t': EmitRun(start, pos, out); APPEND("\\t"); break;
case '\r': EmitRun(start, pos, out); APPEND("\\r"); break;
case '\n': EmitRun(start, pos, out); APPEND("\\n"); break;
case '\b': EmitRun(start, pos, out); APPEND("\\b"); break;
case '\f': EmitRun(start, pos, out); APPEND("\\f"); break;
case '&': EmitRun(start, pos, out); APPEND("\\x26"); break;
case '<': EmitRun(start, pos, out); APPEND("\\x3c"); break;
case '>': EmitRun(start, pos, out); APPEND("\\x3e"); break;
case '=': EmitRun(start, pos, out); APPEND("\\x3d"); break;
case '\v':
// Do not escape vertical tabs to "\\v" since it is interpreted as 'v'
// by JScript according to section 2.1 of
// http://wiki.ecmascript.org/lib/exe/fetch.php?
// id=resources%3Aresources&cache=cache&
// media=resources:jscriptdeviationsfromes3.pdf
EmitRun(start, pos, out); APPEND("\\x0b"); break;
}
}
start = pos = next_pos;
}
EmitRun(start, pos, out);
}
JavascriptEscape javascript_escape;
// Emits the input unchanged when it is "true", "false", a hex literal of
// the form 0(x|X)[0-9A-Fa-f]+, or consists solely of the characters
// [0-9+-.eE]; otherwise emits "null".  Empty input produces no output.
void JavascriptNumber::Modify(const char* in, size_t inlen,
                              const PerExpandData*,
                              ExpandEmitter* out, const string& arg) const {
  if (inlen == 0) {
    return;
  }
  if (STR_IS(in, inlen, "true") || STR_IS(in, inlen, "false")) {
    out->Emit(in, inlen);
    return;
  }

  // A hex number needs at least one digit after the 0x prefix; anything
  // else is checked as a base-10 (or octal) number.
  const bool is_hex =
      inlen > 2 && in[0] == '0' && (in[1] == 'x' || in[1] == 'X');
  bool valid = true;
  for (size_t i = is_hex ? 2 : 0; valid && i < inlen; ++i) {
    const char ch = in[i];
    const bool is_digit = (ch >= '0' && ch <= '9');
    if (is_hex) {
      valid = is_digit ||
              (ch >= 'a' && ch <= 'f') ||
              (ch >= 'A' && ch <= 'F');
    } else {
      valid = is_digit ||
              ch == '+' || ch == '-' || ch == '.' ||
              ch == 'e' || ch == 'E';
    }
  }

  if (!valid) {
    APPEND("null");  // Number was not valid, output null instead.
    return;
  }
  out->Emit(in, inlen);  // Number was valid, output it.
}

JavascriptNumber javascript_number;
// Returns true when c may appear unescaped in a URL query component, i.e.
// when it matches [0-9a-zA-Z.,_*/~!()-].  Everything else must be escaped.
static inline bool IsUrlQueryEscapeSafeChar(unsigned char c) {
  // Bitmap with one bit per byte value: word c>>5, bit c&31.  Only the
  // first four words (ASCII) have bits set.
  static const unsigned long kSafeCharacters[8] = {
    0x00000000L, 0x03fff702L, 0x87fffffeL, 0x47fffffeL,
    0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L
  };
  // The mask is built from an unsigned value so that selecting bit 31
  // (needed for '_') never shifts into the sign bit of an int.
  return (kSafeCharacters[c >> 5] & (1UL << (c & 31))) != 0;
}
// Escapes the input for use in a URL query: safe characters are copied in
// runs, a space becomes '+', and every other byte becomes %XX with
// upper-case hex digits.
void UrlQueryEscape::Modify(const char* in, size_t inlen,
                            const PerExpandData*,
                            ExpandEmitter* out, const string& arg) const {
  static const char kHexDigits[] = "0123456789ABCDEF";
  const char* const end = in + inlen;
  const char* cur = in;
  for (;;) {
    // Copy the longest stretch of safe characters in one go.
    const char* const run_start = cur;
    while (cur < end && IsUrlQueryEscapeSafeChar(*cur)) {
      ++cur;
    }
    EmitRun(run_start, cur, out);
    if (cur >= end) {
      break;  // Nothing left to escape.
    }
    // cur now points at exactly one unsafe character.
    const unsigned char byte = *cur;
    ++cur;
    if (byte == ' ') {
      out->Emit('+');
    } else {
      out->Emit('%');
      out->Emit(kHexDigits[byte >> 4]);
      out->Emit(kHexDigits[byte & 0xf]);
    }
  }
}

UrlQueryEscape url_query_escape;
// For more information on escaping JSON, see section 2.5 in
// http://www.ietf.org/rfc/rfc4627.txt.
// Escaping '&', '<', '>' is optional in the JSON proposed RFC
// but alleviates concerns with content sniffing if JSON is used
// in a context where the browser may attempt to interpret HTML.
// Escapes the input for use inside a JSON string: " \ / and the control
// characters \b \f \n \r \t get backslash escapes, and & < > are written
// as \u escapes.  All other bytes are passed through in runs.
void JsonEscape::Modify(const char* in, size_t inlen,
                        const PerExpandData*,
                        ExpandEmitter* out, const string& arg) const {
  const char* const end = in + inlen;
  // First scanned byte that has not been emitted yet.
  const char* run_start = in;
  for (const char* cur = in; cur < end; ++cur) {
    switch (*cur) {
      case '"':
        EmitRun(run_start, cur, out);
        APPEND("\\\"");
        break;
      case '\\':
        EmitRun(run_start, cur, out);
        APPEND("\\\\");
        break;
      case '/':
        EmitRun(run_start, cur, out);
        APPEND("\\/");
        break;
      case '\b':
        EmitRun(run_start, cur, out);
        APPEND("\\b");
        break;
      case '\f':
        EmitRun(run_start, cur, out);
        APPEND("\\f");
        break;
      case '\n':
        EmitRun(run_start, cur, out);
        APPEND("\\n");
        break;
      case '\r':
        EmitRun(run_start, cur, out);
        APPEND("\\r");
        break;
      case '\t':
        EmitRun(run_start, cur, out);
        APPEND("\\t");
        break;
      case '&':
        EmitRun(run_start, cur, out);
        APPEND("\\u0026");
        break;
      case '<':
        EmitRun(run_start, cur, out);
        APPEND("\\u003C");
        break;
      case '>':
        EmitRun(run_start, cur, out);
        APPEND("\\u003E");
        break;
      default:
        // Ordinary byte: keep it in the pending run and move on.
        continue;
    }
    run_start = cur + 1;
  }
  EmitRun(run_start, end, out);
}

JsonEscape json_escape;
// Copies the input to out and emits arg after every line terminator.
// "\n", "\r" and "\r\n" each count as one terminator.  Text after the last
// terminator is emitted as-is; a terminator at the very end of the input
// is still followed by arg.
void PrefixLine::Modify(const char* in, size_t inlen,
                        const PerExpandData*,
                        ExpandEmitter* out, const string& arg) const {
  while (inlen > 0) {
    const char* nl = static_cast<const char*>(memchr(in, '\n', inlen));
    // Only look for a \r in front of the first \n (if there is one), so a
    // non-NULL cr always precedes a non-NULL nl.
    const char* cr = static_cast<const char*>(
        memchr(in, '\r', nl ? nl - in : inlen));
    if (nl == NULL && cr == NULL) {
      // We're at the last line.
      out->Emit(in, inlen);
      break;
    }
    // Length of the current line including its terminator.
    size_t linelen;
    if (cr == NULL) {
      linelen = nl + 1 - in;        // line ends in \n
    } else if (nl == cr + 1) {
      linelen = nl + 1 - in;        // \r\n: consume both characters
    } else {
      linelen = cr + 1 - in;        // lone \r (no \n, or a later one)
    }
    // inlen is unsigned, so guard the subtraction before it happens.
    assert(linelen <= inlen);
    out->Emit(in, linelen);
    out->Emit(arg);                 // a new line, so emit the prefix
    in += linelen;
    inlen -= linelen;
  }
}

PrefixLine prefix_line;
// Must be at least one more than the maximum number of alternative modifiers
// specified in any given element of g_modifiers.
// The unused trailing slots of each list are left NULL, and
// IsSafeXSSAlternative() stops scanning a list at the first NULL entry.
# define MAX_SAFE_ALTERNATIVES 10 // If the compiler complains, increase it.
// Use the empty string if you want a modifier not to have a long-name.
// Use '\0' if you want a modifier not to have a short-name.
// Note: not all modifiers are in this array:
// 1) SnippetEscape: use html_escape_with_arg=snippet to get this
// 2) CleanseAttribute: use html_escape_with_arg=attribute to get this
// 3) ValidateUrl: use html_escape_with_arg=url to get this
//
// Some modifiers define other modifiers that are safe replacements
// from an XSS perspective. Replacements are not commutative so for
// example H=pre considers H=attribute a safe replacement to it
// but H=attribute has no safe replacements.
// This struct is not pretty but allows the definitions to be
// done without the need for a global initialization method.
// Be very careful making a change to g_modifiers as modifiers
// point to other ones within that same array so elements
// may not be re-ordered easily. Also you need to change
// the global g_am_dirs correspondingly.
//
// Table of the built-in modifiers.  Each entry holds the modifier's
// ModifierInfo plus the list of other built-in modifiers that may be used
// in its place.  The lists refer to other entries by array index, so the
// position of every entry (shown in the /* N */ comments) must not change.
static struct ModifierWithAlternatives {
// Description of the modifier itself.
ModifierInfo modifier_info;
// Pointers to the modifier_info of the entries accepted as replacements;
// slots without an initializer are NULL and terminate the list.
ModifierInfo* safe_alt_mods[MAX_SAFE_ALTERNATIVES];
} g_modifiers[] = {
/* 0 */ { ModifierInfo("cleanse_css", 'c',
XSS_WEB_STANDARD, &cleanse_css),
{&g_modifiers[16].modifier_info, // url_escape_with_arg=css
// img_src_url_escape_with_arg=css
&g_modifiers[19].modifier_info} },
/* 1 */ { ModifierInfo("html_escape", 'h',
XSS_WEB_STANDARD, &html_escape),
{&g_modifiers[2].modifier_info, // html_escape_with_arg=snippet
&g_modifiers[3].modifier_info, // html_escape_with_arg=pre
&g_modifiers[4].modifier_info, // html_escape_with_arg=attribute
&g_modifiers[5].modifier_info, // html_escape_with_arg=url
&g_modifiers[8].modifier_info, // pre_escape
&g_modifiers[9].modifier_info, // url_query_escape
&g_modifiers[11].modifier_info, // url_escape_with_arg=html
&g_modifiers[12].modifier_info, // url_escape_with_arg=query
// img_src_url_escape_with_arg=html
&g_modifiers[18].modifier_info} },
/* 2 */ { ModifierInfo("html_escape_with_arg=snippet", 'H',
XSS_WEB_STANDARD, &snippet_escape),
{&g_modifiers[1].modifier_info, // html_escape
&g_modifiers[3].modifier_info, // html_escape_with_arg=pre
&g_modifiers[4].modifier_info, // html_escape_with_arg=attribute
&g_modifiers[8].modifier_info, // pre_escape
&g_modifiers[9].modifier_info, // url_query_escape
&g_modifiers[12].modifier_info} }, // url_escape_with_arg=query
/* 3 */ { ModifierInfo("html_escape_with_arg=pre", 'H',
XSS_WEB_STANDARD, &pre_escape),
{&g_modifiers[1].modifier_info, // html_escape
&g_modifiers[2].modifier_info, // html_escape_with_arg=snippet
&g_modifiers[4].modifier_info, // html_escape_with_arg=attribute
&g_modifiers[8].modifier_info, // pre_escape
&g_modifiers[9].modifier_info, // url_query_escape
&g_modifiers[12].modifier_info} }, // url_escape_with_arg=query
/* 4 */ { ModifierInfo("html_escape_with_arg=attribute", 'H',
XSS_WEB_STANDARD, &cleanse_attribute), {} },
/* 5 */ { ModifierInfo("html_escape_with_arg=url", 'H',
XSS_WEB_STANDARD, &validate_url_and_html_escape),
// img_src_url_escape_with_arg=html
{&g_modifiers[18].modifier_info} },
/* 6 */ { ModifierInfo("javascript_escape", 'j',
XSS_WEB_STANDARD, &javascript_escape),
{&g_modifiers[7].modifier_info, // json_escape
&g_modifiers[10].modifier_info, // url_escape_with_arg=javascript
// img_src_url_escape_with_arg=javascript
&g_modifiers[17].modifier_info} },
/* 7 */ { ModifierInfo("json_escape", 'o', XSS_WEB_STANDARD, &json_escape),
{&g_modifiers[6].modifier_info} }, // javascript_escape
/* 8 */ { ModifierInfo("pre_escape", 'p', XSS_WEB_STANDARD, &pre_escape),
{&g_modifiers[1].modifier_info, // html_escape
&g_modifiers[2].modifier_info, // html_escape_with_arg=snippet
&g_modifiers[3].modifier_info, // html_escape_with_arg=pre
&g_modifiers[4].modifier_info, // html_escape_with_arg=attr...
&g_modifiers[9].modifier_info, // url_query_escape
&g_modifiers[12].modifier_info} }, // url_escape_with_arg=query
/* 9 */ { ModifierInfo("url_query_escape", 'u',
XSS_WEB_STANDARD, &url_query_escape), {} },
/* 10 */ { ModifierInfo("url_escape_with_arg=javascript", 'U',
XSS_WEB_STANDARD,
&validate_url_and_javascript_escape),
// img_src_url_escape_with_arg=javascript
{&g_modifiers[17].modifier_info} },
/* 11 */ { ModifierInfo("url_escape_with_arg=html", 'U',
XSS_WEB_STANDARD, &validate_url_and_html_escape),
// img_src_url_escape_with_arg=html
{&g_modifiers[18].modifier_info} },
/* 12 */ { ModifierInfo("url_escape_with_arg=query", 'U',
XSS_WEB_STANDARD, &url_query_escape), {} },
/* 13 */ { ModifierInfo("none", '\0', XSS_SAFE, &null_modifier), {} },
/* 14 */ { ModifierInfo("xml_escape", '\0', XSS_WEB_STANDARD, &xml_escape),
{&g_modifiers[1].modifier_info, // html_escape
&g_modifiers[4].modifier_info,} }, // H=attribute
/* 15 */ { ModifierInfo("javascript_escape_with_arg=number", 'J',
XSS_WEB_STANDARD, &javascript_number), {} },
/* 16 */ { ModifierInfo("url_escape_with_arg=css", 'U',
XSS_WEB_STANDARD, &validate_url_and_css_escape), {} },
/* 17 */ { ModifierInfo("img_src_url_escape_with_arg=javascript", 'I',
XSS_WEB_STANDARD,
&validate_img_src_url_and_javascript_escape), {} },
/* 18 */ { ModifierInfo("img_src_url_escape_with_arg=html", 'I',
XSS_WEB_STANDARD,
&validate_img_src_url_and_html_escape), {} },
/* 19 */ { ModifierInfo("img_src_url_escape_with_arg=css", 'I',
XSS_WEB_STANDARD,
&validate_img_src_url_and_css_escape), {} },
};
// Modifiers registered at run time through AddModifierCommon(); the entries
// are allocated with new when they are added.
static vector<const ModifierInfo*> g_extension_modifiers;
// NOTE(review): nothing in the visible part of the file uses this vector;
// presumably it collects modifiers whose names were not recognized --
// confirm against the code further down.
static vector<const ModifierInfo*> g_unknown_modifiers;
// Returns whether or not candidate can be safely (w.r.t XSS)
// used in lieu of our ModifierInfo. This is true iff:
// 1. Both have the same modifier function OR
// 2. Candidate's modifier function is in our ModifierInfo's
// list (vector) of safe alternative modifier functions.
//
// This is used with the auto-escaping code, which automatically
// figures out which modifier to apply to a variable based on the
// variable's context (in an html "<A HREF", for instance). Some
// built-in modifiers are considered safe alternatives from the perspective
// of preventing XSS (cross-site-scripting) attacks, in which case
// the auto-escaper should allow the choice of which to use in the
// template. This is intended only for internal use as it is dangerous
// and complicated to figure out which modifier is an XSS-safe
// replacement for a given one. Custom modifiers currently may not
// indicate safe replacements, only built-in ones may do so.
//
// Note that this relation is not symmetric: IsSafeXSSAlternative(a, b)
// may differ from IsSafeXSSAlternative(b, a).
bool IsSafeXSSAlternative(const ModifierInfo& our,
const ModifierInfo& candidate) {
// Succeeds even for non built-in modifiers but no harm.
if (our.modifier == candidate.modifier)
return true;
for (const ModifierWithAlternatives* mod_with_alts = g_modifiers;
mod_with_alts < g_modifiers + sizeof(g_modifiers)/sizeof(*g_modifiers);
++mod_with_alts) {
if (mod_with_alts->modifier_info.long_name == our.long_name)
// We found our Modifier in the built-in array g_modifiers.
for (int i = 0; mod_with_alts->safe_alt_mods[i] != NULL &&
i < MAX_SAFE_ALTERNATIVES; ++i)
if (mod_with_alts->safe_alt_mods[i]->long_name == candidate.long_name)
// We found candidate in our Modifier's list of safe alternatives.
return true;
}
// our is not built-in or candidate is not a safe replacement to our.
return false;
}
// Returns true iff long_name begins with the extension-modifier prefix "x-".
//
// strncmp is used rather than memcmp because it stops at the terminating
// NUL: a name shorter than two characters (including "") is rejected
// without reading bytes beyond the end of the string. For any input with
// at least two readable bytes the result is the same as a 2-byte memcmp.
static inline bool IsExtensionModifier(const char* long_name) {
  return strncmp(long_name, "x-", 2) == 0;
}
static bool AddModifierCommon(const char* long_name,
const TemplateModifier* modifier, bool xss_safe) {
if (!IsExtensionModifier(long_name))
return false;
// TODO(csilvers): store in a map or multimap, rather than a vector
for (vector<const ModifierInfo*>::const_iterator mod =
g_extension_modifiers.begin();
mod != g_extension_modifiers.end();
++mod) {
// Check if mod has the same name as us. For modifiers that also take
// values, this is everything before the =. The only time it's ok to
// have the same name is when we have different modval specializations:
// "foo=bar" and "foo=baz" are both valid names. Note "foo" and
// "foo=bar" is not valid: foo has no modval, but "foo=bar" does.
const size_t new_modifier_namelen = strcspn(long_name, "=");
const size_t existing_modifier_namelen = strcspn((*mod)->long_name.c_str(),
"=");
if (new_modifier_namelen == existing_modifier_namelen &&
!memcmp(long_name, (*mod)->long_name.c_str(), new_modifier_namelen)) {
if (long_name[new_modifier_namelen] == '=' &&
(*mod)->long_name[existing_modifier_namelen] == '=' &&
(*mod)->long_name != long_name) {
// It's ok, we're different specializations!
} else {
// It's not ok: we have the same name and no good excuse.
return false;
}
}
}
g_extension_modifiers.push_back(
new ModifierInfo(long_name, '\0',
xss_safe ? XSS_SAFE : XSS_UNIQUE,
modifier));
return true;
}
// Registers the extension modifier long_name in the XSS_UNIQUE XssClass.
// Returns whatever AddModifierCommon() returns: false when the name is
// rejected, true once the modifier has been added.
bool AddModifier(const char* long_name,
                 const TemplateModifier* modifier) {
  const bool xss_safe = false;
  return AddModifierCommon(long_name, modifier, xss_safe);
}
// Registers the extension modifier long_name in the XSS_SAFE XssClass.
// Returns whatever AddModifierCommon() returns: false when the name is
// rejected, true once the modifier has been added.
bool AddXssSafeModifier(const char* long_name,
                        const TemplateModifier* modifier) {
  const bool xss_safe = true;
  return AddModifierCommon(long_name, modifier, xss_safe);
}
// Replaces *best_match with candidate_match when the candidate is a better
// match for modname/modval. To be a better match, three conditions must
// be met:
//   1) The candidate's name (short or long) must match modname.
//   2) If the candidate is a specialization (that is, its name is of the
//      form "foo=bar"), then modval matches the specialization value.
//   3) If the candidate is not a specialization, bestmatch isn't a
//      specialization either.
// Condition (3) makes sure that if we match the ModifierInfo with name
// "foo=bar", we don't claim the ModifierInfo "foo=" is a better match.
// Recall that by definition, modval will always start with a '=' if present.
static void UpdateBestMatch(const char* modname, size_t modname_len,
                            const char* modval, size_t modval_len,
                            const ModifierInfo* candidate_match,
                            const ModifierInfo** best_match) {
  if (!candidate_match->modval_required) {
    // The candidate takes no modifier-value, so the request must not carry
    // one either, and modname must equal the short or the long name.
    if (modval_len != 0)
      return;
    bool name_matches =
        (modname_len == 1 && *modname == candidate_match->short_name);
    if (!name_matches) {
      name_matches =
          (modname_len == candidate_match->long_name.size() &&
           memcmp(modname, candidate_match->long_name.data(),
                  modname_len) == 0);
    }
    if (!name_matches)
      return;
    // In the no-modval case, only one match should exist.
    assert(*best_match == NULL);
    *best_match = candidate_match;
    return;
  }

  // The candidate expects a modifier-value: its long name looks like
  // "foo=" (accepts any value) or "foo=bar" (a specialization).
  const char* const longname_start = candidate_match->long_name.c_str();
  const char* const equals = strchr(longname_start, '=');
  assert(equals != NULL);

  // Requirement: the request carries a modval at all.
  if (modval_len == 0)
    return;

  // Requirement: modname is the short name, or the part of the long name
  // that precedes the '='.
  const size_t base_name_len = equals - longname_start;
  bool name_matches =
      (modname_len == 1 && *modname == candidate_match->short_name);
  if (!name_matches) {
    name_matches = (modname_len == base_name_len &&
                    memcmp(modname, longname_start, modname_len) == 0);
  }
  if (!name_matches)
    return;

  // Requirement: either the candidate is the generic "foo=" form, or the
  // request's modval equals the candidate's "=bar" suffix exactly.
  const bool accepts_any_value = (equals[1] == '\0');
  if (!accepts_any_value) {
    const size_t suffix_len =
        candidate_match->long_name.size() - base_name_len;
    if (modval_len != suffix_len ||
        memcmp(modval, equals, modval_len) != 0)
      return;
  }

  // Condition (3) above is satisfied iff our longname is longer than
  // best-match's longname (so we prefer "foo=bar" to "foo=").
  if (*best_match == NULL ||
      candidate_match->long_name.size() > (*best_match)->long_name.size()) {
    *best_match = candidate_match;
  }
}
// Looks up the ModifierInfo matching modname (a short or long name, given
// as pointer/length) and modval (the "=value" part, or length 0 if absent).
//
// More than one modifier can match, in the case of modval specializations
// (e.g., the modifier "foo=" and "foo=bar" will both match on input of
// modname="foo", modval="bar").  In that case, we take the ModifierInfo
// with the longest longname, since that's the most specialized match.
//
// For built-in names the result is NULL when nothing matches. For
// extension ("x-") names the result is never NULL: an unmatched name gets
// a placeholder entry that is remembered in g_unknown_modifiers.
const ModifierInfo* FindModifier(const char* modname, size_t modname_len,
                                 const char* modval, size_t modval_len) {
  const ModifierInfo* winner = NULL;

  // Built-in modifiers: anything that is not an "x-" name.
  if (modname_len < 2 || !IsExtensionModifier(modname)) {
    const size_t builtin_count = sizeof(g_modifiers) / sizeof(*g_modifiers);
    for (size_t i = 0; i < builtin_count; ++i) {
      UpdateBestMatch(modname, modname_len, modval, modval_len,
                      &g_modifiers[i].modifier_info, &winner);
    }
    return winner;
  }

  // Extension modifiers registered through AddModifier*().
  for (size_t i = 0; i < g_extension_modifiers.size(); ++i) {
    UpdateBestMatch(modname, modname_len, modval, modval_len,
                    g_extension_modifiers[i], &winner);
  }
  if (winner != NULL)
    return winner;

  // Extension names we have already been asked about but do not know.
  for (size_t i = 0; i < g_unknown_modifiers.size(); ++i) {
    UpdateBestMatch(modname, modname_len, modval, modval_len,
                    g_unknown_modifiers[i], &winner);
  }
  if (winner != NULL)
    return winner;

  // This is the only situation where we can pass in a modifier of NULL.
  // It means "we don't know about this modifier-name."
  string fullname(modname, modname_len);
  if (modval_len) {
    fullname.append(modval, modval_len);
  }
  // TODO(csilvers): store in a map or multimap, rather than a vector
  const ModifierInfo* const placeholder =
      new ModifierInfo(fullname, '\0', XSS_UNIQUE, NULL);
  g_unknown_modifiers.push_back(placeholder);
  return placeholder;
}
// For escaping variables under the auto-escape mode:
// Each directive below maps to a distinct sequence of
// escaping directives (a NULL-terminated array of ModifierAndValue)
// applied to a variable during run-time substitution.
// The enumerators are used as indices into the global array g_am_dirs
// defined below, so their order must match the order of that array's
// initializer.
// NOTE(review): this comment used to say the directives live in a global
// array "g_mods_ae" initialized under lock in InitializeGlobalModifiers;
// neither name is visible in this part of the file -- confirm that the
// old wording was stale.
enum AutoModifyDirective {
AM_EMPTY, // Unused, kept as marker.
AM_HTML, // Quoted attribute values and regular HTML text.
AM_HTML_UNQUOTED, // Unquoted regular attribute values, tag and attribute names.
AM_JS, // Quoted javascript values; also used for JSON.
AM_JS_NUMBER, // Unquoted javascript values.
AM_URL_HTML, // Quoted URL attribute holding a complete URL.
AM_URL_QUERY, // Unquoted URL attribute holding a URL fragment.
AM_STYLE, // CSS: quoted style attributes and text inside a style tag.
AM_XML, // XML content.
NUM_ENTRIES_AM, // Number of directives; sizes g_am_dirs.
};
// Populates the global vector of hard-coded modifiers that
// Auto-Escape may pick. We point to the appropriate modifier in
// the global g_modifiers.
// Reference these globals via the global array g_am_dirs[] for consistency.
// Note: We allow for more than one ModifierAndValue in the array hence
// the need to terminate with a Null marker. However currently all the
// escaping directives have exactly one ModifierAndValue.
//
// Each entry is built from a pointer into g_modifiers, a modifier value
// string and that string's length (the literal lengths below match the
// literals: "=attribute" is 10, "=number" is 7, "=html" is 5).
// NOTE(review): the g_modifiers indices are hard-coded and must stay in
// sync with the order of the g_modifiers table, most of which is outside
// this part of the file -- verify them whenever that table changes.

// AM_EMPTY: only the terminating marker, i.e. no modifier at all.
static const ModifierAndValue g_am_empty[] = {
ModifierAndValue(NULL, "", 0)
};
// AM_HTML: g_modifiers[1] with no modifier value.
static const ModifierAndValue g_am_html[] = {
ModifierAndValue(&g_modifiers[1].modifier_info, "", 0),
ModifierAndValue(NULL, "", 0)
};
// AM_HTML_UNQUOTED: g_modifiers[4] with value "=attribute".
static const ModifierAndValue g_am_html_unquoted[] = {
ModifierAndValue(&g_modifiers[4].modifier_info, "=attribute", 10),
ModifierAndValue(NULL, "", 0)
};
// AM_JS: g_modifiers[6] with no modifier value.
static const ModifierAndValue g_am_js[] = {
ModifierAndValue(&g_modifiers[6].modifier_info, "", 0),
ModifierAndValue(NULL, "", 0)
};
// AM_JS_NUMBER: g_modifiers[15] with value "=number".
static const ModifierAndValue g_am_js_number[] = {
ModifierAndValue(&g_modifiers[15].modifier_info, "=number", 7),
ModifierAndValue(NULL, "", 0)
};
// AM_URL_HTML: g_modifiers[11] with value "=html".
static const ModifierAndValue g_am_url_html[] = {
ModifierAndValue(&g_modifiers[11].modifier_info, "=html", 5),
ModifierAndValue(NULL, "", 0)
};
// AM_URL_QUERY: g_modifiers[9] with no modifier value.
static const ModifierAndValue g_am_url_query[] = {
ModifierAndValue(&g_modifiers[9].modifier_info, "", 0),
ModifierAndValue(NULL, "", 0)
};
// AM_STYLE: g_modifiers[0] with no modifier value.
static const ModifierAndValue g_am_style[] = {
ModifierAndValue(&g_modifiers[0].modifier_info, "", 0),
ModifierAndValue(NULL, "", 0)
};
// AM_XML: g_modifiers[14] with no modifier value.
static const ModifierAndValue g_am_xml[] = {
ModifierAndValue(&g_modifiers[14].modifier_info, "", 0),
ModifierAndValue(NULL, "", 0)
};
// Lookup table from AutoModifyDirective to its NULL-terminated array of
// escaping directives. The initializer order must match the enumerator
// order of AutoModifyDirective, since the enumerators are used as indices.
static const ModifierAndValue* g_am_dirs[NUM_ENTRIES_AM] = {
g_am_empty, /* AM_EMPTY */
g_am_html, /* AM_HTML */
g_am_html_unquoted, /* AM_HTML_UNQUOTED */
g_am_js, /* AM_JS */
g_am_js_number, /* AM_JS_NUMBER */
g_am_url_html, /* AM_URL_HTML */
g_am_url_query, /* AM_URL_QUERY */
g_am_style, /* AM_STYLE */
g_am_xml, /* AM_XML */
};
// Renders one modifier as ":<name><value>", where <name> is the one-letter
// short name when the modifier has one and the long name otherwise, and
// <value> is the modifier value (omitted when its length is zero).
// modval.modifier_info is dereferenced unconditionally, so it must not be
// NULL.
string PrettyPrintOneModifier(const ModifierAndValue& modval) {
  string pretty(":");
  const ModifierInfo& info = *modval.modifier_info;
  if (info.short_name != '\0') {  // short_name is a char.
    pretty.push_back(info.short_name);
  } else {
    pretty += info.long_name;
  }
  if (modval.value_len != 0) {
    pretty.append(modval.value, modval.value_len);
  }
  return pretty;
}
// Renders every modifier in modvals with PrettyPrintOneModifier() and
// joins the results with separator. An empty vector yields an empty
// string.
string PrettyPrintModifiers(const vector<const ModifierAndValue*>& modvals,
                            const string& separator) {
  string joined;
  for (size_t i = 0; i < modvals.size(); ++i) {
    if (i > 0) {
      joined += separator;
    }
    joined += PrettyPrintOneModifier(*modvals[i]);
  }
  return joined;
}
// Return the sequence of escaping directives to apply for the given context.
// An empty vector indicates an error occurred. Currently we never need
// to chain escaping directives hence on success, the vector is always of
// size 1. This may change in the future.
vector<const ModifierAndValue*> GetModifierForHtmlJs(
HtmlParser* htmlparser, string* error_msg) {
assert(htmlparser);
assert(error_msg);
vector<const ModifierAndValue*> modvals;
// Two cases of being inside javascript:
// 1. Inside raw javascript (within a <script> tag). If the value
// is quoted we apply javascript_escape, if not we have to coerce
// it to a safe value due to the risk of javascript code execution
// hence apply :J=number. If arbitrary code needs to be inserted
// at run-time, the developer must use :none.
// 2. In the value of an attribute that takes javascript such
// as onmouseevent in '<a href="someUrl" onmousevent="{{EVENT}}">'.
// That will be covered in the STATE_VALUE state logic below.
if (htmlparser->InJavascript() &&
htmlparser->state() != HtmlParser::STATE_VALUE) {
if (htmlparser->IsJavascriptQuoted()) {
modvals.push_back(g_am_dirs[AM_JS]);
assert(modvals.size() == 1);
return modvals;
} else {
modvals.push_back(g_am_dirs[AM_JS_NUMBER]);
assert(modvals.size() == 1);
return modvals;
}
}
switch (htmlparser->state()) {
case HtmlParser::STATE_VALUE:{
string attribute_name = htmlparser->attribute();
switch (htmlparser->AttributeType()) {
case HtmlParser::ATTR_URI:
// Case 1: The URL is quoted:
// . Apply :U=html if it is a complete URL or :h if it is a fragment.
// Case 2: The URL is not quoted:
// . If it is a complete URL, we have no safe modifiers that
// won't break it so we have to fail.
// . If it is a URL fragment, then :u is safe and not likely to
// break the URL.
if (!htmlparser->IsAttributeQuoted()) {
if (htmlparser->IsUrlStart()) { // Complete URL.
error_msg->append("Value of URL attribute \"" + attribute_name +
"\" must be enclosed in quotes.");
assert(modvals.empty());
return modvals; // Empty
} else { // URL fragment.
modvals.push_back(g_am_dirs[AM_URL_QUERY]);
}
} else {
// Only validate the URL if we have a complete URL,
// otherwise simply html_escape.
if (htmlparser->IsUrlStart())
modvals.push_back(g_am_dirs[AM_URL_HTML]);
else
modvals.push_back(g_am_dirs[AM_HTML]);
}
break;
case HtmlParser::ATTR_REGULAR:
// If the value is quoted, simply HTML escape, otherwise
// apply stricter escaping using H=attribute.
if (htmlparser->IsAttributeQuoted())
modvals.push_back(g_am_dirs[AM_HTML]);
else
modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
break;
case HtmlParser::ATTR_STYLE:
// If the value is quoted apply :c, otherwise fail.
if (htmlparser->IsAttributeQuoted()) {
modvals.push_back(g_am_dirs[AM_STYLE]);
} else {
error_msg->append("Value of style attribute \"" + attribute_name +
"\" must be enclosed in quotes.");
assert(modvals.empty());
return modvals; // Empty
}
break;
case HtmlParser::ATTR_JS:
// We require javascript accepting attributes (such as onclick)
// to be HTML quoted, otherwise they are vulnerable to
// HTML attribute insertion via the use of whitespace.
if (!htmlparser->IsAttributeQuoted()) {
error_msg->append("Value of javascript attribute \"" +
attribute_name +
"\" must be enclosed in quotes.");
assert(modvals.empty());
return modvals; // Empty
}
// If the variable is quoted apply javascript_escape otherwise
// apply javascript_number which will ensure it is safe against
// code injection.
// Note: We normally need to HTML escape after javascript escape
// but the javascript escape implementation provided makes the
// HTML escape redundant so simply javascript escape.
if (htmlparser->IsJavascriptQuoted())
modvals.push_back(g_am_dirs[AM_JS]);
else
modvals.push_back(g_am_dirs[AM_JS_NUMBER]);
break;
case HtmlParser::ATTR_NONE:
assert("We should be in attribute!" && 0);
default:
assert("Should not be able to get here." && 0);
return modvals; // Empty
}
// In STATE_VALUE particularly, the parser may get out of sync with
// the correct state - that the browser sees - due to the fact that
// it does not get to parse run-time content (variables). So we tell
// the parser there is content that will be expanded here.
// A good example is:
// <a href={{URL}} alt={{NAME}}>
// The parser sees <a href= alt=> and interprets 'alt=' to be
// the value of href.
htmlparser->InsertText(); // Ignore return value.
assert(modvals.size() == 1);
return modvals;
}
case HtmlParser::STATE_TAG:{
// Apply H=attribute to tag names since they are alphabetic.
// Examples of tag names: TITLE, BODY, A and BR.
modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
assert(modvals.size() == 1);
return modvals;
}
case HtmlParser::STATE_ATTR:{
// Apply H=attribute to attribute names since they are alphabetic.
// Examples of attribute names: HREF, SRC and WIDTH.
modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
assert(modvals.size() == 1);
return modvals;
}
case HtmlParser::STATE_COMMENT:
case HtmlParser::STATE_TEXT:{
// Apply :h to regular HTML text and :c if within a style tag.
if (htmlparser->InCss())
modvals.push_back(g_am_dirs[AM_STYLE]);
else
modvals.push_back(g_am_dirs[AM_HTML]);
assert(modvals.size() == 1);
return modvals;
}
default:{
assert("Should not be able to get here." && 0);
return modvals; // Empty
}
}
assert("Should not be able to get here." && 0);
return modvals; // Empty
}
// TODO(jad): Memoize all GetModifierForXXX functions below.
//            They don't depend on parser context (from csilvers).

// Returns the single escaping directive used for CSS content (AM_STYLE).
// Neither htmlparser nor error_msg is used.
vector<const ModifierAndValue*> GetModifierForCss(HtmlParser* htmlparser,
                                                  string* error_msg) {
  return vector<const ModifierAndValue*>(1, g_am_dirs[AM_STYLE]);
}
// Returns the single escaping directive used for XML content (AM_XML).
// Neither htmlparser nor error_msg is used.
vector<const ModifierAndValue*> GetModifierForXml(HtmlParser* htmlparser,
                                                  string* error_msg) {
  return vector<const ModifierAndValue*>(1, g_am_dirs[AM_XML]);
}
// Returns the single escaping directive used for JSON content, which is
// the javascript directive (AM_JS). Neither htmlparser nor error_msg is
// used.
vector<const ModifierAndValue*> GetModifierForJson(HtmlParser* htmlparser,
                                                   string* error_msg) {
  return vector<const ModifierAndValue*>(1, g_am_dirs[AM_JS]);
}
// Returns the default escaping for HTML: a one-element vector holding the
// AM_HTML directive.
vector<const ModifierAndValue*> GetDefaultModifierForHtml() {
  return vector<const ModifierAndValue*>(1, g_am_dirs[AM_HTML]);
}
// Returns the default escaping for javascript: a one-element vector
// holding the AM_JS directive.
vector<const ModifierAndValue*> GetDefaultModifierForJs() {
  return vector<const ModifierAndValue*>(1, g_am_dirs[AM_JS]);
}
// Returns the default escaping for CSS by delegating to
// GetModifierForCss(), which ignores both of its arguments.
vector<const ModifierAndValue*> GetDefaultModifierForCss() {
  HtmlParser* const no_parser = NULL;
  string* const no_error_msg = NULL;
  return GetModifierForCss(no_parser, no_error_msg);
}
// Returns the default escaping for XML by delegating to
// GetModifierForXml(), which ignores both of its arguments.
vector<const ModifierAndValue*> GetDefaultModifierForXml() {
  HtmlParser* const no_parser = NULL;
  string* const no_error_msg = NULL;
  return GetModifierForXml(no_parser, no_error_msg);
}
// Returns the default escaping for JSON by delegating to
// GetModifierForJson(), which ignores both of its arguments.
vector<const ModifierAndValue*> GetDefaultModifierForJson() {
  HtmlParser* const no_parser = NULL;
  string* const no_error_msg = NULL;
  return GetModifierForJson(no_parser, no_error_msg);
}
}