| // Copyright (c) 2007, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| // --- |
| // Author: csilvers@google.com (Craig Silverstein) |
| // |
| // template_modifiers.h has a description of what each escape-routine does. |
| // |
| // When creating a new modifier, you must subclass TemplateModifier |
| // and define your own Modify() method. This method takes the string |
| // to be modified as a char*/int pair. It then emits the modified |
| // version of the string to outbuf. Outbuf is an ExpandEmitter, as |
| // defined in template_modifiers.h. It's a very simple type that |
| // supports appending to a data stream. |
| // |
| // Be very careful editing an existing modifier. Subtle changes can |
| // introduce the possibility for cross-site scripting attacks. If you |
| // do change a modifier, be careful that it does not affect |
| // the list of Safe XSS Alternatives. |
| // |
| |
| #include <config.h> |
| #include <stdlib.h> |
| #include <assert.h> |
| #include <string.h> |
| #include <string> |
| #include <vector> |
| #include "htmlparser/htmlparser_cpp.h" |
| #include <ctemplate/template_modifiers.h> |
| #include "template_modifiers_internal.h" |
| #include <ctemplate/per_expand_data.h> |
| using std::string; |
| using std::vector; |
| |
// Length of a string literal, not counting its terminating NUL.  The ""s on
// either side make the macro fail to compile for anything that is not a
// string literal.
#define strliterallen(literal) (sizeof("" literal "") - 1)
| |
// NOTE: despite its name this is a plain 'unsigned int', i.e. at least (and
// on most platforms more than) 16 bits wide.  uint16_t would be the precise
// type, but this is good enough, and more portable...
typedef unsigned int uint16;
| |
namespace URL {
// Returns false when the first bytes of in[0..inlen) spell, ignoring case,
// one of the accepted scheme prefixes ("http://", "https://", "ftp://") AND
// at least one more byte follows the prefix.  Returns true in every other
// case, including input that is exactly one of the prefixes.
bool HasInsecureProtocol(const char* in, int inlen) {
  static const char* const kAcceptedPrefixes[] = {
    "http://",
    "https://",
    "ftp://",
  };
  const size_t num_prefixes =
      sizeof(kAcceptedPrefixes) / sizeof(kAcceptedPrefixes[0]);

  for (size_t i = 0; i < num_prefixes; ++i) {
    const char* const prefix = kAcceptedPrefixes[i];
    const size_t prefix_len = strlen(prefix);
    // The length is compared as an unsigned quantity, exactly as a
    // comparison against a sizeof-based length would be.
    if (static_cast<size_t>(inlen) > prefix_len &&
        strncasecmp(in, prefix, prefix_len) == 0) {
      return false;   // An accepted protocol.
    }
  }
  return true;
}
}  // namespace URL
| |
| namespace ctemplate { |
| |
| using ctemplate_htmlparser::HtmlParser; |
| |
// Appends a string literal to the emitter variable named 'out', which must
// be in scope where the macro is used.  The length is taken from sizeof, so
// no strlen is needed at run time.  The ""s ensure the argument is actually
// a string literal.
#define APPEND(str_literal) \
  out->Emit("" str_literal "", sizeof(str_literal) - 1)
| |
// True when the buflen bytes at buf are byte-for-byte equal to the string
// literal (its terminating NUL excluded).  The ""s ensure the last argument
// is actually a string literal.
#define STR_IS(buf, buflen, str_literal)                        \
  ((buflen) == sizeof("" str_literal "") - 1 &&                 \
   memcmp((buf), str_literal, sizeof("" str_literal "") - 1) == 0)
| |
| TemplateModifier::~TemplateModifier() {} |
| |
| void NullModifier::Modify(const char* in, size_t inlen, |
| const PerExpandData*, |
| ExpandEmitter* out, const string& arg) const { |
| out->Emit(in, inlen); |
| } |
| NullModifier null_modifier; |
| |
| static inline void EmitRun(const char* start, const char* limit, |
| ExpandEmitter* out) { |
| if (start < limit) { |
| out->Emit(start, (limit - start)); |
| } |
| } |
| |
| void HtmlEscape::Modify(const char* in, size_t inlen, |
| const PerExpandData*, |
| ExpandEmitter* out, const string& arg) const { |
| const char* pos = in; |
| const char* start = pos; |
| const char* const limit = in + inlen; |
| while (pos < limit) { |
| switch (*pos) { |
| default: |
| // Increment our counter and look at the next character. |
| ++pos; |
| continue; |
| |
| case '&': EmitRun(start, pos, out); APPEND("&"); break; |
| case '"': EmitRun(start, pos, out); APPEND("""); break; |
| case '\'': EmitRun(start, pos, out); APPEND("'"); break; |
| case '<': EmitRun(start, pos, out); APPEND("<"); break; |
| case '>': EmitRun(start, pos, out); APPEND(">"); break; |
| |
| case '\r': case '\n': case '\v': case '\f': case '\t': |
| EmitRun(start, pos, out); APPEND(" "); break; |
| } |
| start = ++pos; |
| } |
| EmitRun(start, pos, out); |
| } |
| HtmlEscape html_escape; |
| |
| void PreEscape::Modify(const char* in, size_t inlen, |
| const PerExpandData*, |
| ExpandEmitter* out, const string& arg) const { |
| const char* pos = in; |
| const char* start = pos; |
| const char* const limit = in + inlen; |
| while (pos < limit) { |
| switch (*pos) { |
| default: |
| // Increment our counter and look at the next character. |
| ++pos; |
| continue; |
| |
| // Unlike HtmlEscape, we leave whitespace as is. |
| case '&': EmitRun(start, pos, out); APPEND("&"); break; |
| case '"': EmitRun(start, pos, out); APPEND("""); break; |
| case '\'': EmitRun(start, pos, out); APPEND("'"); break; |
| case '<': EmitRun(start, pos, out); APPEND("<"); break; |
| case '>': EmitRun(start, pos, out); APPEND(">"); break; |
| } |
| start = ++pos; |
| } |
| EmitRun(start, pos, out); |
| } |
| PreEscape pre_escape; |
| |
| // We encode the presence and ordering of unclosed tags in a string, using the |
| // letters b, i, s, and e to stand for <b>, <i>, <span>, and <em> respectively. |
| // The most recently opened tag is appended onto the end of the string, so in |
| // the common case of properly nested tags, we need only look at the last |
| // character. If we don't find it there, we need to continue looking at |
| // everything until we find it, because tags may not necessarily be in order. |
| // Similarly, when we add a tag, we need to check each existing tag for a match |
| // so that we don't nest. |
| class UnclosedSnippetTags { |
| public: |
| // We could use ordinary ints for the enum values, but using mnemonic |
| // characters potentially makes debugging easier. |
| typedef enum { |
| TAG_B = 'b', |
| TAG_I = 'i', |
| TAG_EM = 'e', |
| TAG_SPAN = 's', |
| } Tag; |
| |
| UnclosedSnippetTags() : tag_length(0) { |
| memset(tags, 0, 5); |
| } |
| |
| // Adds a tag to the set of open tags if it's not already open, or otherwise |
| // return false. |
| inline bool MaybeAdd(Tag tag) { |
| if (strchr(tags, tag)) { |
| return false; |
| } else { |
| tags[tag_length++] = tag; |
| return true; |
| } |
| } |
| |
| // Removes a tag from the set of open tags if it's open, or otherwise return |
| // false. |
| inline bool MaybeRemove(Tag tag) { |
| char* tag_location = strchr(tags, tag); |
| if (tag_location) { |
| for (char* c = tag_location; *c; ++c) { |
| // Have to copy all later tags down by one so we don't leave a gap in the |
| // array. |
| *c = *(c + 1); |
| } |
| --tag_length; |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| inline void PrintClosingTags(ExpandEmitter* out) { |
| for (int i = tag_length; i >= 0; --i) { |
| switch (tags[i]) { |
| case TAG_B: |
| out->Emit("</b>"); break; |
| case TAG_I: |
| out->Emit("</i>"); break; |
| case TAG_EM: |
| out->Emit("</em>"); break; |
| case TAG_SPAN: |
| out->Emit("</span>"); break; |
| } |
| } |
| } |
| |
| private: |
| char tags[5]; |
| int tag_length; |
| }; |
| |
| void SnippetEscape::Modify(const char* in, size_t inlen, |
| const PerExpandData*, |
| ExpandEmitter* out, const string& arg) const { |
| UnclosedSnippetTags unclosed; |
| const char* pos = in; |
| const char* start = pos; |
| const char* const limit = in + inlen; |
| while (pos < limit) { |
| switch (*pos) { |
| default: |
| // Increment our counter and look at the next character. |
| ++pos; |
| continue; |
| |
| case '<': { |
| // If there is a permissible tag, just advance pos past it to |
| // make it part of the current run. Notice the use of |
| // "continue" below. |
| const char* const next_pos = pos + 1; |
| const int chars_left = limit - next_pos; |
| if ((chars_left >= 2) && !memcmp(next_pos, "b>", 2) |
| && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_B)) { |
| pos += strliterallen("<b>"); |
| continue; |
| } else if ((chars_left >= 2) && !memcmp(next_pos, "i>", 2) |
| && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_I)) { |
| pos += strliterallen("<i>"); |
| continue; |
| } else if ((chars_left >= 3) && !memcmp(next_pos, "em>", 3) |
| && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_EM)) { |
| pos += strliterallen("<em>"); |
| continue; |
| } else if ((chars_left >= 13) && !memcmp(next_pos, "span dir=", 9) |
| && (!memcmp(next_pos + 9, "ltr>", 4) || |
| !memcmp(next_pos + 9, "rtl>", 4)) |
| && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_SPAN)) { |
| pos += strliterallen("<span dir=ltr>"); |
| continue; |
| } else if ((chars_left >= 3) && !memcmp(next_pos, "/b>", 3) |
| && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_B)) { |
| pos += strliterallen("</b>"); |
| continue; |
| } else if ((chars_left >= 3) && !memcmp(next_pos, "/i>", 3) |
| && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_I)) { |
| pos += strliterallen("</i>"); |
| continue; |
| } else if ((chars_left >= 4) && !memcmp(next_pos, "/em>", 4) |
| && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_EM)) { |
| pos += strliterallen("</em>"); |
| continue; |
| } else if ((chars_left >= 6) && !memcmp(next_pos, "/span>", 6) |
| && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_SPAN)) { |
| pos += strliterallen("</span>"); |
| continue; |
| } else if ((chars_left >= 3) && !memcmp(next_pos, "br>", 3)) { |
| pos += strliterallen("<br>"); |
| continue; |
| } else if ((chars_left >= 4) && !memcmp(next_pos, "wbr>", 4)) { |
| pos += strliterallen("<wbr>"); |
| continue; |
| } |
| |
| // Emit the entity and break out of the switch. |
| EmitRun(start, pos, out); |
| APPEND("<"); |
| break; |
| } |
| |
| case '&': |
| EmitRun(start, pos, out); |
| if (pos + 1 < limit && pos[1] == '{') { |
| // Could be a javascript entity, so we need to escape. |
| // (Javascript entities are an xss risk in Netscape 4.) |
| APPEND("&"); |
| } else { |
| APPEND("&"); |
| } |
| break; |
| |
| case '"': EmitRun(start, pos, out); APPEND("""); break; |
| case '\'': EmitRun(start, pos, out); APPEND("'"); break; |
| case '>': EmitRun(start, pos, out); APPEND(">"); break; |
| |
| case '\r': case '\n': case '\v': case '\f': case '\t': |
| // non-space whitespace |
| EmitRun(start, pos, out); APPEND(" "); break; |
| |
| } |
| start = ++pos; |
| } |
| EmitRun(start, pos, out); |
| unclosed.PrintClosingTags(out); |
| } |
| SnippetEscape snippet_escape; |
| |
| void CleanseAttribute::Modify(const char* in, size_t inlen, |
| const PerExpandData*, |
| ExpandEmitter* out, const string& arg) const { |
| for (size_t i = 0; i < inlen; ++i) { |
| char c = in[i]; |
| switch (c) { |
| case '=': { |
| if (i == 0 || i == (inlen - 1)) |
| out->Emit('_'); |
| else |
| out->Emit(c); |
| break; |
| } |
| case '-': |
| case '.': |
| case '_': |
| case ':': { |
| out->Emit(c); |
| break; |
| } |
| default: { |
| if ((c >= 'a' && c <= 'z') || |
| (c >= 'A' && c <= 'Z') || |
| (c >= '0' && c <= '9')) { |
| out->Emit(c); |
| } else { |
| APPEND("_"); |
| } |
| break; |
| } |
| } |
| } |
| } |
| CleanseAttribute cleanse_attribute; |
| |
| void CleanseCss::Modify(const char* in, size_t inlen, |
| const PerExpandData*, |
| ExpandEmitter* out, const string& arg) const { |
| for (size_t i = 0; i < inlen; ++i) { |
| char c = in[i]; |
| switch (c) { |
| case ' ': |
| case '_': |
| case '.': |
| case ',': |
| case '!': |
| case '#': |
| case '%': |
| case '-': { |
| out->Emit(c); |
| break; |
| } |
| default: { |
| if ((c >= 'a' && c <= 'z') || |
| (c >= 'A' && c <= 'Z') || |
| (c >= '0' && c <= '9')) { |
| out->Emit(c); |
| } |
| break; |
| } |
| } |
| } |
| } |
| CleanseCss cleanse_css; |
| |
| // CssUrlEscape is used as a chained modifier by ValidateUrl |
| // (validate_url_and_css_escape) and is not directly exposed. |
| class CssUrlEscape : public TemplateModifier { |
| public: |
| virtual void Modify(const char* in, size_t inlen, |
| const PerExpandData*, ExpandEmitter* outbuf, |
| const string& arg) const; |
| }; |
| |
| // URL-encodes the characters [\n\r\\'"()<>*] to ensure the URL can be safely |
| // inserted in a CSS context, e.g: |
| // . In an '@import url("URL");' statement |
| // . In a CSS property such as 'background: url("URL");' |
| // In both locations above, enclosing quotes are optional but parens are not. |
| // We want to make sure the URL cannot exit the parens enclosure, close a |
| // STYLE tag or reset the browser's CSS parser (via comments or newlines). |
| // |
| // References: |
| // . CSS 2.1 URLs: http://www.w3.org/TR/CSS21/syndata.html#url |
| // . CSS 1 URLs: http://www.w3.org/TR/REC-CSS1/#url |
| void CssUrlEscape::Modify(const char* in, size_t inlen, |
| const PerExpandData*, |
| ExpandEmitter* out, const string& arg) const { |
| for (size_t i = 0; i < inlen; ++i) { |
| char c = in[i]; |
| switch (c) { |
| case '\n': APPEND("%0A"); break; |
| case '\r': APPEND("%0D"); break; |
| case '"': APPEND("%22"); break; |
| case '\'': APPEND("%27"); break; |
| case '(': APPEND("%28"); break; |
| case ')': APPEND("%29"); break; |
| case '*': APPEND("%2A"); break; |
| case '<': APPEND("%3C"); break; |
| case '>': APPEND("%3E"); break; |
| case '\\': APPEND("%5C"); break; |
| default: out->Emit(c); break; |
| } |
| } |
| } |
| CssUrlEscape css_url_escape; |
| |
| // These URLs replace unsafe URLs for :U and :I url-escaping modes. |
| const char* const ValidateUrl::kUnsafeUrlReplacement = "#"; |
| const char* const ValidateUrl::kUnsafeImgSrcUrlReplacement = |
| "/images/cleardot.gif"; |
| |
| void ValidateUrl::Modify(const char* in, size_t inlen, |
| const PerExpandData* per_expand_data, |
| ExpandEmitter* out, const string& arg) const { |
| const char* slashpos = (char*)memchr(in, '/', inlen); |
| if (slashpos == NULL) { |
| slashpos = in + inlen; |
| } |
| const void* colonpos = memchr(in, ':', slashpos - in); |
| // colon before first slash, could be a protocol |
| if (colonpos != NULL && URL::HasInsecureProtocol(in, inlen)) { |
| // It's a bad protocol, so return something safe |
| chained_modifier_.Modify(unsafe_url_replacement_, |
| unsafe_url_replacement_length_, |
| per_expand_data, |
| out, |
| ""); |
| return; |
| } |
| // If we get here, it's a valid url, so just escape it |
| chained_modifier_.Modify(in, inlen, per_expand_data, out, ""); |
| } |
| ValidateUrl validate_url_and_html_escape( |
| html_escape, |
| ValidateUrl::kUnsafeUrlReplacement); |
| ValidateUrl validate_url_and_javascript_escape( |
| javascript_escape, |
| ValidateUrl::kUnsafeUrlReplacement); |
| ValidateUrl validate_url_and_css_escape( |
| css_url_escape, |
| ValidateUrl::kUnsafeUrlReplacement); |
| ValidateUrl validate_img_src_url_and_html_escape( |
| html_escape, |
| ValidateUrl::kUnsafeImgSrcUrlReplacement); |
| ValidateUrl validate_img_src_url_and_javascript_escape( |
| javascript_escape, |
| ValidateUrl::kUnsafeImgSrcUrlReplacement); |
| ValidateUrl validate_img_src_url_and_css_escape( |
| css_url_escape, |
| ValidateUrl::kUnsafeImgSrcUrlReplacement); |
| |
| void XmlEscape::Modify(const char* in, size_t inlen, |
| const PerExpandData*, |
| ExpandEmitter* out, const string& arg) const { |
| const char* pos = in; |
| const char* start = pos; |
| const char* const limit = in + inlen; |
| while (pos < limit) { |
| char ch = *pos; |
| |
| // According to section 2.2 of the spec |
| // http://www.w3.org/TR/REC-xml/#charsets control characters in range |
| // 0x00-0x1F (except \t, \r and \n) are not valid XML characters. In |
| // particular, conformant parsers are allowed to die when encountering a FF |
| // char in PCDATA sections. These chars are replaced by a space. |
| if (ch >= 0x00 && ch < 0x20 && ch != '\t' && ch != '\r' && ch != '\n') { |
| EmitRun(start, pos, out); |
| out->Emit(' '); |
| start = ++pos; |
| continue; |
| } |
| |
| switch (ch) { |
| default: |
| // Increment our counter and look at the next character. |
| ++pos; |
| continue; |
| |
| case '&': EmitRun(start, pos, out); APPEND("&"); break; |
| case '"': EmitRun(start, pos, out); APPEND("""); break; |
| case '\'': EmitRun(start, pos, out); APPEND("'"); break; |
| case '<': EmitRun(start, pos, out); APPEND("<"); break; |
| case '>': EmitRun(start, pos, out); APPEND(">"); break; |
| } |
| start = ++pos; |
| } |
| EmitRun(start, pos, out); |
| } |
| XmlEscape xml_escape; |
| |
// Maps the first byte of a character to the number of bytes its encoding
// occupies (1, 2 or 3); bytes that do not start a two- or three-byte
// sequence map to 1.  The same information would fit in a 16-byte table
// indexed by the shifted byte, but eliminating the shift gives a
// substantial performance increase.
static const char kCodeLengths[256] = {
  // 0x00 - 0xBF: handled one byte at a time.
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   // 0x00
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   // 0x10
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   // 0x20
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   // 0x30
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   // 0x40
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   // 0x50
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   // 0x60
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   // 0x70
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   // 0x80
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   // 0x90
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   // 0xA0
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   // 0xB0

  // 0xC0 - 0xDF: first byte of a two-byte sequence.
  2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,   // 0xC0
  2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,   // 0xD0

  // 0xE0 - 0xEF: first byte of a three-byte sequence.
  3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,   // 0xE0

  // 0xF0 - 0xFF: handled one byte at a time.
  1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   // 0xF0
};
| |
| // Returns the UTF-8 code-unit starting at start, or the special codepoint |
| // 0xFFFD if the input ends abruptly or is not well-formed UTF-8. |
| // start -- address of the start of the code unit which also receives the |
| // address past the end of the code unit returned. |
| // end -- exclusive end of the string |
| static inline uint16 UTF8CodeUnit(const char** start, const char *end) { |
| // Use kCodeLengths table to calculate the length of the code unit |
| // from the first character. |
| unsigned char first_char = static_cast<unsigned char>(**start); |
| size_t code_unit_len = kCodeLengths[first_char]; |
| if (code_unit_len == 1) { |
| // Return the current byte as a codepoint. |
| // Either it is a valid single byte codepoint, or it's not part of a valid |
| // UTF-8 sequence, and so has to be handled individually. |
| ++*start; |
| return first_char; |
| } |
| const char *code_unit_end = *start + code_unit_len; |
| if (code_unit_end < *start || code_unit_end > end) { // Truncated code unit. |
| ++*start; |
| return 0xFFFDU; |
| } |
| const char* pos = *start; |
| uint16 code_unit = *pos & (0xFFU >> code_unit_len); |
| while (--code_unit_len) { |
| uint16 tail_byte = *(++pos); |
| if ((tail_byte & 0xC0U) != 0x80U) { // Malformed code unit. |
| ++*start; |
| return 0xFFFDU; |
| } |
| code_unit = (code_unit << 6) | (tail_byte & 0x3FU); |
| } |
| *start = code_unit_end; |
| return code_unit; |
| } |
| |
| // A good reference is the ECMA standard (3rd ed), section 7.8.4: |
| // http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf |
| void JavascriptEscape::Modify(const char* in, size_t inlen, |
| const PerExpandData*, |
| ExpandEmitter* out, const string& arg) const { |
| const char* pos = in; |
| const char* start = pos; |
| const char* const limit = in + inlen; |
| |
| if (limit < in) { return; } |
| |
| while (pos < limit) { |
| const char* next_pos = pos; |
| uint16 code_unit = UTF8CodeUnit(&next_pos, limit); |
| |
| // Test for 16-bit values outside the switch below, because gcc |
| // will emit chained branches rather than a jump table for such a |
| // wide range of values. |
| if (code_unit & 0xFF00) { |
| // Linebreaks according to EcmaScript 262 which cannot appear in strings. |
| if (code_unit == 0x2028) { |
| // Line separator |
| EmitRun(start, pos, out); APPEND("\\u2028"); |
| } else if (code_unit == 0x2029) { |
| // Paragraph separator |
| EmitRun(start, pos, out); APPEND("\\u2029"); |
| } else { |
| pos = next_pos; |
| continue; |
| } |
| } else { |
| switch (code_unit) { |
| default: |
| // Increment our counter and look at the next character. |
| pos = next_pos; |
| continue; |
| |
| case '\0': EmitRun(start, pos, out); APPEND("\\x00"); break; |
| case '"': EmitRun(start, pos, out); APPEND("\\x22"); break; |
| case '\'': EmitRun(start, pos, out); APPEND("\\x27"); break; |
| case '\\': EmitRun(start, pos, out); APPEND("\\\\"); break; |
| case '\t': EmitRun(start, pos, out); APPEND("\\t"); break; |
| case '\r': EmitRun(start, pos, out); APPEND("\\r"); break; |
| case '\n': EmitRun(start, pos, out); APPEND("\\n"); break; |
| case '\b': EmitRun(start, pos, out); APPEND("\\b"); break; |
| case '\f': EmitRun(start, pos, out); APPEND("\\f"); break; |
| case '&': EmitRun(start, pos, out); APPEND("\\x26"); break; |
| case '<': EmitRun(start, pos, out); APPEND("\\x3c"); break; |
| case '>': EmitRun(start, pos, out); APPEND("\\x3e"); break; |
| case '=': EmitRun(start, pos, out); APPEND("\\x3d"); break; |
| |
| case '\v': |
| // Do not escape vertical tabs to "\\v" since it is interpreted as 'v' |
| // by JScript according to section 2.1 of |
| // http://wiki.ecmascript.org/lib/exe/fetch.php? |
| // id=resources%3Aresources&cache=cache& |
| // media=resources:jscriptdeviationsfromes3.pdf |
| EmitRun(start, pos, out); APPEND("\\x0b"); break; |
| } |
| } |
| start = pos = next_pos; |
| } |
| EmitRun(start, pos, out); |
| } |
| JavascriptEscape javascript_escape; |
| |
| |
| void JavascriptNumber::Modify(const char* in, size_t inlen, |
| const PerExpandData*, |
| ExpandEmitter* out, const string& arg) const { |
| if (inlen == 0) |
| return; |
| |
| if (STR_IS(in, inlen, "true") || STR_IS(in, inlen, "false")) { |
| out->Emit(in, inlen); |
| return; |
| } |
| |
| bool valid = true; |
| if (in[0] == '0' && inlen > 2 && (in[1] == 'x' || in[1] == 'X')) { |
| // There must be at least one hex digit after the 0x for it to be valid. |
| // Hex number. Check that it is of the form 0(x|X)[0-9A-Fa-f]+ |
| for (size_t i = 2; i < inlen; i++) { |
| char c = in[i]; |
| if (!((c >= 'a' && c <= 'f') || |
| (c >= 'A' && c <= 'F') || |
| (c >= '0' && c <= '9'))) { |
| valid = false; |
| break; |
| } |
| } |
| } else { |
| // Must be a base-10 (or octal) number. |
| // Check that it has the form [0-9+-.eE]+ |
| for (size_t i = 0; i < inlen; i++) { |
| char c = in[i]; |
| if (!((c >= '0' && c <= '9') || |
| c == '+' || c == '-' || c == '.' || |
| c == 'e' || c == 'E')) { |
| valid = false; |
| break; |
| } |
| } |
| } |
| if (valid) { |
| out->Emit(in, inlen); // Number was valid, output it. |
| } else { |
| APPEND("null"); // Number was not valid, output null instead. |
| } |
| } |
| JavascriptNumber javascript_number; |
| |
// Returns true when c may appear unescaped in a URL query, i.e. when it
// matches [0-9a-zA-Z.,_*/~!()-].  Everything else has to be escaped.
static inline bool IsUrlQueryEscapeSafeChar(unsigned char c) {
  // A 256-bit membership set holding 32 characters per word: character c is
  // safe when bit (c & 31) of word (c >> 5) is set.  The table is read-only,
  // hence const.
  static const unsigned long kSafeCharacters[8] = {
    0x00000000UL, 0x03fff702UL, 0x87fffffeUL, 0x47fffffeUL,
    0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL
  };

  // The mask is built from an unsigned 1 so that selecting bit 31 never
  // shifts into the sign bit of an int.
  return (kSafeCharacters[c >> 5] & (1UL << (c & 31))) != 0;
}
| |
| void UrlQueryEscape::Modify(const char* in, size_t inlen, |
| const PerExpandData*, |
| ExpandEmitter* out, const string& arg) const { |
| const char* pos = in; |
| const char* const limit = in + inlen; |
| while (true) { |
| // Peel off any initial runs of safe characters and emit them all |
| // at once. |
| const char* start = pos; |
| while (pos < limit && IsUrlQueryEscapeSafeChar(*pos)) { |
| pos++; |
| } |
| EmitRun(start, pos, out); |
| |
| // Now deal with a single unsafe character. |
| if (pos < limit) { |
| unsigned char c = *pos; |
| if (c == ' ') { |
| out->Emit('+'); |
| } else { |
| out->Emit('%'); |
| out->Emit(((c>>4) < 10 ? ((c>>4) + '0') : (((c>>4) - 10) + 'A'))); |
| out->Emit(((c&0xf) < 10 ? ((c&0xf) + '0') : (((c&0xf) - 10) + 'A'))); |
| } |
| pos++; |
| } else { |
| // We're done! |
| break; |
| } |
| } |
| } |
| UrlQueryEscape url_query_escape; |
| |
| // For more information on escaping JSON, see section 2.5 in |
| // http://www.ietf.org/rfc/rfc4627.txt. |
| // Escaping '&', '<', '>' is optional in the JSON proposed RFC |
| // but alleviates concerns with content sniffing if JSON is used |
| // in a context where the browser may attempt to interpret HTML. |
| void JsonEscape::Modify(const char* in, size_t inlen, |
| const PerExpandData*, |
| ExpandEmitter* out, const string& arg) const { |
| const char* pos = in; |
| const char* start = pos; |
| const char* const limit = in + inlen; |
| while (pos < limit) { |
| switch (*pos) { |
| default: |
| // Increment our counter and look at the next character. |
| ++pos; |
| continue; |
| |
| case '"': EmitRun(start, pos, out); APPEND("\\\""); break; |
| case '\\': EmitRun(start, pos, out); APPEND("\\\\"); break; |
| case '/': EmitRun(start, pos, out); APPEND("\\/"); break; |
| case '\b': EmitRun(start, pos, out); APPEND("\\b"); break; |
| case '\f': EmitRun(start, pos, out); APPEND("\\f"); break; |
| case '\n': EmitRun(start, pos, out); APPEND("\\n"); break; |
| case '\r': EmitRun(start, pos, out); APPEND("\\r"); break; |
| case '\t': EmitRun(start, pos, out); APPEND("\\t"); break; |
| case '&': EmitRun(start, pos, out); APPEND("\\u0026"); break; |
| case '<': EmitRun(start, pos, out); APPEND("\\u003C"); break; |
| case '>': EmitRun(start, pos, out); APPEND("\\u003E"); break; |
| } |
| start = ++pos; |
| } |
| EmitRun(start, pos, out); |
| } |
| JsonEscape json_escape; |
| |
| void PrefixLine::Modify(const char* in, size_t inlen, |
| const PerExpandData*, |
| ExpandEmitter* out, const string& arg) const { |
| while (inlen > 0) { |
| const char* nl = (const char*)memchr(in, '\n', inlen); |
| const char* cr = (const char*)memchr(in, '\r', nl ? nl - in : inlen); |
| size_t linelen; |
| if (nl == NULL && cr == NULL) { |
| // We're at the last line |
| out->Emit(in, inlen); |
| break; |
| } else { |
| // One or both of \r and \n is set; point to the first char past |
| // the newline. Note for \r\n, that's the char after the \n, |
| // otherwise, it's the char past the \r or the \n we see. |
| if ((nl == NULL) != (cr == NULL)) // one is set, the other is NULL |
| linelen = (nl ? nl : cr) + 1 - in; |
| else if (nl == cr + 1 || nl < cr) // \r\n, or \n comes first |
| linelen = nl + 1 - in; |
| else |
| linelen = cr + 1 - in; |
| } |
| out->Emit(in, linelen); |
| out->Emit(arg); // a new line, so emit the prefix |
| in += linelen; |
| inlen -= linelen; |
| assert(inlen >= 0); |
| } |
| } |
| PrefixLine prefix_line; |
| |
| |
// Number of slots in the safe_alt_mods array of each g_modifiers element.
// Must be at least one more than the maximum number of alternative modifiers
// specified in any given element of g_modifiers.
#define MAX_SAFE_ALTERNATIVES 10  // If the compiler complains, increase it.
| |
| // Use the empty string if you want a modifier not to have a long-name. |
| // Use '\0' if you want a modifier not to have a short-name. |
| // Note: not all modifiers are in this array: |
| // 1) SnippetEscape: use html_escape_with_arg=snippet to get this |
| // 2) CleanseAttribute: use html_escape_with_arg=attribute to get this |
| // 3) ValidateUrl: use html_escape_with_arg=url to get this |
| // |
| // Some modifiers define other modifiers that are safe replacements |
| // from an XSS perspective. Replacements are not commutative so for |
| // example H=pre considers H=attribute a safe replacement to it |
| // but H=attribute has no safe replacements. |
| // This struct is not pretty but allows the definitions to be |
| // done without the need for a global initialization method. |
| // Be very careful making a change to g_modifiers as modifiers |
| // point to other ones within that same array so elements |
| // may not be re-ordered easily. Also you need to change |
| // the global g_am_dirs correspondingly. |
| // |
| static struct ModifierWithAlternatives { |
| ModifierInfo modifier_info; |
| ModifierInfo* safe_alt_mods[MAX_SAFE_ALTERNATIVES]; |
| } g_modifiers[] = { |
| /* 0 */ { ModifierInfo("cleanse_css", 'c', |
| XSS_WEB_STANDARD, &cleanse_css), |
| {&g_modifiers[16].modifier_info, // url_escape_with_arg=css |
| // img_src_url_escape_with_arg=css |
| &g_modifiers[19].modifier_info} }, |
| /* 1 */ { ModifierInfo("html_escape", 'h', |
| XSS_WEB_STANDARD, &html_escape), |
| {&g_modifiers[2].modifier_info, // html_escape_with_arg=snippet |
| &g_modifiers[3].modifier_info, // html_escape_with_arg=pre |
| &g_modifiers[4].modifier_info, // html_escape_with_arg=attribute |
| &g_modifiers[5].modifier_info, // html_escape_with_arg=url |
| &g_modifiers[8].modifier_info, // pre_escape |
| &g_modifiers[9].modifier_info, // url_query_escape |
| &g_modifiers[11].modifier_info, // url_escape_with_arg=html |
| &g_modifiers[12].modifier_info, // url_escape_with_arg=query |
| // img_src_url_escape_with_arg=html |
| &g_modifiers[18].modifier_info} }, |
| /* 2 */ { ModifierInfo("html_escape_with_arg=snippet", 'H', |
| XSS_WEB_STANDARD, &snippet_escape), |
| {&g_modifiers[1].modifier_info, // html_escape |
| &g_modifiers[3].modifier_info, // html_escape_with_arg=pre |
| &g_modifiers[4].modifier_info, // html_escape_with_arg=attribute |
| &g_modifiers[8].modifier_info, // pre_escape |
| &g_modifiers[9].modifier_info, // url_query_escape |
| &g_modifiers[12].modifier_info} }, // url_escape_with_arg=query |
| /* 3 */ { ModifierInfo("html_escape_with_arg=pre", 'H', |
| XSS_WEB_STANDARD, &pre_escape), |
| {&g_modifiers[1].modifier_info, // html_escape |
| &g_modifiers[2].modifier_info, // html_escape_with_arg=snippet |
| &g_modifiers[4].modifier_info, // html_escape_with_arg=attribute |
| &g_modifiers[8].modifier_info, // pre_escape |
| &g_modifiers[9].modifier_info, // url_query_escape |
| &g_modifiers[12].modifier_info} }, // url_escape_with_arg=query |
| /* 4 */ { ModifierInfo("html_escape_with_arg=attribute", 'H', |
| XSS_WEB_STANDARD, &cleanse_attribute), {} }, |
| /* 5 */ { ModifierInfo("html_escape_with_arg=url", 'H', |
| XSS_WEB_STANDARD, &validate_url_and_html_escape), |
| // img_src_url_escape_with_arg=html |
| {&g_modifiers[18].modifier_info} }, |
| /* 6 */ { ModifierInfo("javascript_escape", 'j', |
| XSS_WEB_STANDARD, &javascript_escape), |
| {&g_modifiers[7].modifier_info, // json_escape |
| &g_modifiers[10].modifier_info, // url_escape_with_arg=javascript |
| // img_src_url_escape_with_arg=javascript |
| &g_modifiers[17].modifier_info} }, |
| /* 7 */ { ModifierInfo("json_escape", 'o', XSS_WEB_STANDARD, &json_escape), |
| {&g_modifiers[6].modifier_info} }, // javascript_escape |
| /* 8 */ { ModifierInfo("pre_escape", 'p', XSS_WEB_STANDARD, &pre_escape), |
| {&g_modifiers[1].modifier_info, // html_escape |
| &g_modifiers[2].modifier_info, // html_escape_with_arg=snippet |
| &g_modifiers[3].modifier_info, // html_escape_with_arg=pre |
| &g_modifiers[4].modifier_info, // html_escape_with_arg=attr... |
| &g_modifiers[9].modifier_info, // url_query_escape |
| &g_modifiers[12].modifier_info} }, // url_escape_with_arg=query |
| /* 9 */ { ModifierInfo("url_query_escape", 'u', |
| XSS_WEB_STANDARD, &url_query_escape), {} }, |
| /* 10 */ { ModifierInfo("url_escape_with_arg=javascript", 'U', |
| XSS_WEB_STANDARD, |
| &validate_url_and_javascript_escape), |
| // img_src_url_escape_with_arg=javascript |
| {&g_modifiers[17].modifier_info} }, |
| /* 11 */ { ModifierInfo("url_escape_with_arg=html", 'U', |
| XSS_WEB_STANDARD, &validate_url_and_html_escape), |
| // img_src_url_escape_with_arg=html |
| {&g_modifiers[18].modifier_info} }, |
| /* 12 */ { ModifierInfo("url_escape_with_arg=query", 'U', |
| XSS_WEB_STANDARD, &url_query_escape), {} }, |
| /* 13 */ { ModifierInfo("none", '\0', XSS_SAFE, &null_modifier), {} }, |
| /* 14 */ { ModifierInfo("xml_escape", '\0', XSS_WEB_STANDARD, &xml_escape), |
| {&g_modifiers[1].modifier_info, // html_escape |
| &g_modifiers[4].modifier_info,} }, // H=attribute |
| /* 15 */ { ModifierInfo("javascript_escape_with_arg=number", 'J', |
| XSS_WEB_STANDARD, &javascript_number), {} }, |
| /* 16 */ { ModifierInfo("url_escape_with_arg=css", 'U', |
| XSS_WEB_STANDARD, &validate_url_and_css_escape), {} }, |
| /* 17 */ { ModifierInfo("img_src_url_escape_with_arg=javascript", 'I', |
| XSS_WEB_STANDARD, |
| &validate_img_src_url_and_javascript_escape), {} }, |
| /* 18 */ { ModifierInfo("img_src_url_escape_with_arg=html", 'I', |
| XSS_WEB_STANDARD, |
| &validate_img_src_url_and_html_escape), {} }, |
| /* 19 */ { ModifierInfo("img_src_url_escape_with_arg=css", 'I', |
| XSS_WEB_STANDARD, |
| &validate_img_src_url_and_css_escape), {} }, |
| }; |
| |
// Registry of extension ("x-"-prefixed) modifiers.  Entries are appended
// by AddModifierCommon() and searched by FindModifier().
static vector<const ModifierInfo*> g_extension_modifiers;
// Placeholder entries (with a NULL modifier) that FindModifier() creates
// for "x-" modifier names it could not find in g_extension_modifiers.
static vector<const ModifierInfo*> g_unknown_modifiers;
| |
| // Returns whether or not candidate can be safely (w.r.t XSS) |
| // used in lieu of our ModifierInfo. This is true iff: |
| // 1. Both have the same modifier function OR |
| // 2. Candidate's modifier function is in our ModifierInfo's |
| // list (vector) of safe alternative modifier functions. |
| // |
| // This is used with the auto-escaping code, which automatically |
| // figures out which modifier to apply to a variable based on the |
| // variable's context (in an html "<A HREF", for instance). Some |
| // built-in modifiers are considered safe alternatives from the perspective |
| // of preventing XSS (cross-site-scripting) attacks, in which case |
| // the auto-escaper should allow the choice of which to use in the |
| // template. This is intended only for internal use as it is dangerous |
| // and complicated to figure out which modifier is an XSS-safe |
| // replacement for a given one. Custom modifiers currently may not |
| // indicate safe replacements, only built-in ones may do so. |
| // |
| // Note that this function is not commutative therefore |
| // IsSafeXSSAlternative(a, b) may not be equal to IsSafeXSSAlternative(b, a). |
| bool IsSafeXSSAlternative(const ModifierInfo& our, |
| const ModifierInfo& candidate) { |
| // Succeeds even for non built-in modifiers but no harm. |
| if (our.modifier == candidate.modifier) |
| return true; |
| |
| for (const ModifierWithAlternatives* mod_with_alts = g_modifiers; |
| mod_with_alts < g_modifiers + sizeof(g_modifiers)/sizeof(*g_modifiers); |
| ++mod_with_alts) { |
| if (mod_with_alts->modifier_info.long_name == our.long_name) |
| // We found our Modifier in the built-in array g_modifiers. |
| for (int i = 0; mod_with_alts->safe_alt_mods[i] != NULL && |
| i < MAX_SAFE_ALTERNATIVES; ++i) |
| if (mod_with_alts->safe_alt_mods[i]->long_name == candidate.long_name) |
| // We found candidate in our Modifier's list of safe alternatives. |
| return true; |
| } |
| // our is not built-in or candidate is not a safe replacement to our. |
| return false; |
| } |
| |
// Returns true iff long_name starts with the extension prefix "x-".
//
// strncmp is used instead of memcmp so that the comparison stops at a NUL
// terminator: memcmp would always read two bytes, which is an
// out-of-bounds read for an empty string.  Callers that pass a
// length-delimited (not NUL-terminated) buffer must still guarantee at
// least two readable bytes, as FindModifier() does by checking
// modname_len >= 2 first.
static inline bool IsExtensionModifier(const char* long_name) {
  return strncmp(long_name, "x-", 2) == 0;
}
| |
| static bool AddModifierCommon(const char* long_name, |
| const TemplateModifier* modifier, bool xss_safe) { |
| if (!IsExtensionModifier(long_name)) |
| return false; |
| |
| // TODO(csilvers): store in a map or multimap, rather than a vector |
| for (vector<const ModifierInfo*>::const_iterator mod = |
| g_extension_modifiers.begin(); |
| mod != g_extension_modifiers.end(); |
| ++mod) { |
| // Check if mod has the same name as us. For modifiers that also take |
| // values, this is everything before the =. The only time it's ok to |
| // have the same name is when we have different modval specializations: |
| // "foo=bar" and "foo=baz" are both valid names. Note "foo" and |
| // "foo=bar" is not valid: foo has no modval, but "foo=bar" does. |
| const size_t new_modifier_namelen = strcspn(long_name, "="); |
| const size_t existing_modifier_namelen = strcspn((*mod)->long_name.c_str(), |
| "="); |
| if (new_modifier_namelen == existing_modifier_namelen && |
| !memcmp(long_name, (*mod)->long_name.c_str(), new_modifier_namelen)) { |
| if (long_name[new_modifier_namelen] == '=' && |
| (*mod)->long_name[existing_modifier_namelen] == '=' && |
| (*mod)->long_name != long_name) { |
| // It's ok, we're different specializations! |
| } else { |
| // It's not ok: we have the same name and no good excuse. |
| return false; |
| } |
| } |
| } |
| |
| g_extension_modifiers.push_back( |
| new ModifierInfo(long_name, '\0', |
| xss_safe ? XSS_SAFE : XSS_UNIQUE, |
| modifier)); |
| return true; |
| } |
| |
| // Modifier added with XSS_UNIQUE XssClass. |
| bool AddModifier(const char* long_name, |
| const TemplateModifier* modifier) { |
| return AddModifierCommon(long_name, modifier, false); |
| } |
| |
| // Modifier added with XSS_SAFE XssClass. |
| bool AddXssSafeModifier(const char* long_name, |
| const TemplateModifier* modifier) { |
| return AddModifierCommon(long_name, modifier, true); |
| } |
| |
| // If candidate_match is a better match for modname/modval than bestmatch, |
| // update bestmatch. To be a better match, two conditions must be met: |
| // 1) The candidate's name must match modname |
| // 2) If the candidate is a specialization (that is, name is of the form |
| // "foo=bar", then modval matches the specialization value). |
| // 3) If the candidate is not a specialization, bestmatch isn't a |
| // specialization either. |
| // Condition (3) makes sure that if we match the ModifierInfo with name |
| // "foo=bar", we don't claim the ModifierInfo "foo=" is a better match. |
| // Recall that by definition, modval will always start with a '=' if present. |
| static void UpdateBestMatch(const char* modname, size_t modname_len, |
| const char* modval, size_t modval_len, |
| const ModifierInfo* candidate_match, |
| const ModifierInfo** best_match) { |
| // It's easiest to handle the two case differently: (1) candidate_match |
| // refers to a modifier that expects a modifier-value; (2) it doesn't. |
| if (candidate_match->modval_required) { |
| // To be a match, we have to fulfill three requirements: we have a |
| // modval, our modname matches candidate_match's modname (either |
| // shortname or longname), and our modval is consistent with the |
| // value specified in the longname (whatever might follow the =). |
| const char* const longname_start = candidate_match->long_name.c_str(); |
| const char* const equals = strchr(longname_start, '='); |
| assert(equals != NULL); |
| if (modval_len > 0 && |
| ((modname_len == 1 && *modname == candidate_match->short_name) || |
| (modname_len == equals - longname_start && |
| memcmp(modname, longname_start, modname_len) == 0)) && |
| ((equals[1] == '\0') || // name is "foo=" (not a specialization) |
| (modval_len |
| == longname_start + candidate_match->long_name.size() - equals && |
| memcmp(modval, equals, modval_len) == 0))) { |
| // Condition (3) above is satisfied iff our longname is longer than |
| // best-match's longname (so we prefer "foo=bar" to "foo="). |
| if (*best_match == NULL || |
| candidate_match->long_name.size() > (*best_match)->long_name.size()) |
| *best_match = candidate_match; |
| } |
| } else { |
| // In this case, to be a match: we must *not* have a modval. Our |
| // modname still must match modinfo's modname (either short or long). |
| if (modval_len == 0 && |
| ((modname_len == 1 && *modname == candidate_match->short_name) || |
| (modname_len == candidate_match->long_name.size() && |
| !memcmp(modname, candidate_match->long_name.data(), modname_len)))) { |
| // In the no-modval case, only one match should exist. |
| assert(*best_match == NULL); |
| *best_match = candidate_match; |
| } |
| } |
| } |
| |
| const ModifierInfo* FindModifier(const char* modname, size_t modname_len, |
| const char* modval, size_t modval_len) { |
| // More than one modifier can match, in the case of modval specializations |
| // (e.g., the modifier "foo=" and "foo=bar" will both match on input of |
| // modname="foo", modval="bar"). In that case, we take the ModifierInfo |
| // with the longest longname, since that's the most specialized match. |
| const ModifierInfo* best_match = NULL; |
| if (modname_len >= 2 && IsExtensionModifier(modname)) { |
| for (vector<const ModifierInfo*>::const_iterator mod = |
| g_extension_modifiers.begin(); |
| mod != g_extension_modifiers.end(); |
| ++mod) { |
| UpdateBestMatch(modname, modname_len, modval, modval_len, |
| *mod, &best_match); |
| } |
| if (best_match != NULL) |
| return best_match; |
| |
| for (vector<const ModifierInfo*>::const_iterator mod = |
| g_unknown_modifiers.begin(); |
| mod != g_unknown_modifiers.end(); |
| ++mod) { |
| UpdateBestMatch(modname, modname_len, modval, modval_len, |
| *mod, &best_match); |
| } |
| if (best_match != NULL) |
| return best_match; |
| // This is the only situation where we can pass in a modifier of NULL. |
| // It means "we don't know about this modifier-name." |
| string fullname(modname, modname_len); |
| if (modval_len) { |
| fullname.append(modval, modval_len); |
| } |
| // TODO(csilvers): store in a map or multimap, rather than a vector |
| g_unknown_modifiers.push_back(new ModifierInfo(fullname, '\0', |
| XSS_UNIQUE, NULL)); |
| return g_unknown_modifiers.back(); |
| } else { |
| for (const ModifierWithAlternatives* mod_with_alts = g_modifiers; |
| mod_with_alts < g_modifiers + sizeof(g_modifiers)/sizeof(*g_modifiers); |
| ++mod_with_alts) { |
| UpdateBestMatch(modname, modname_len, modval, modval_len, |
| &mod_with_alts->modifier_info, &best_match); |
| } |
| return best_match; |
| } |
| } |
| |
// For escaping variables under the auto-escape mode:
// Each directive below maps to a distinct sequence of
// escaping directives (an array of ModifierAndValue) applied
// to a variable during run-time substitution.
// The sequences are stored in the global array g_am_dirs, which is
// indexed by these enumerators; NUM_ENTRIES_AM is its size, so it must
// remain the last enumerator.
enum AutoModifyDirective {
  AM_EMPTY,                         // Unused, kept as marker.
  AM_HTML,
  AM_HTML_UNQUOTED,
  AM_JS,
  AM_JS_NUMBER,
  AM_URL_HTML,
  AM_URL_QUERY,
  AM_STYLE,
  AM_XML,
  NUM_ENTRIES_AM  // No trailing comma: C++98/03 does not allow one here.
};
| |
| // Populates the global vector of hard-coded modifiers that |
| // Auto-Escape may pick. We point to the appropriate modifier in |
| // the global g_modifiers. |
| // Reference these globals via the global array g_am_dirs[] for consistency. |
| // Note: We allow for more than one ModifierAndValue in the array hence |
| // the need to terminate with a Null marker. However currently all the |
| // escaping directives have exactly one ModifierAndValue. |
| static const ModifierAndValue g_am_empty[] = { |
| ModifierAndValue(NULL, "", 0) |
| }; |
| static const ModifierAndValue g_am_html[] = { |
| ModifierAndValue(&g_modifiers[1].modifier_info, "", 0), |
| ModifierAndValue(NULL, "", 0) |
| }; |
| static const ModifierAndValue g_am_html_unquoted[] = { |
| ModifierAndValue(&g_modifiers[4].modifier_info, "=attribute", 10), |
| ModifierAndValue(NULL, "", 0) |
| }; |
| static const ModifierAndValue g_am_js[] = { |
| ModifierAndValue(&g_modifiers[6].modifier_info, "", 0), |
| ModifierAndValue(NULL, "", 0) |
| }; |
| static const ModifierAndValue g_am_js_number[] = { |
| ModifierAndValue(&g_modifiers[15].modifier_info, "=number", 7), |
| ModifierAndValue(NULL, "", 0) |
| }; |
| static const ModifierAndValue g_am_url_html[] = { |
| ModifierAndValue(&g_modifiers[11].modifier_info, "=html", 5), |
| ModifierAndValue(NULL, "", 0) |
| }; |
| static const ModifierAndValue g_am_url_query[] = { |
| ModifierAndValue(&g_modifiers[9].modifier_info, "", 0), |
| ModifierAndValue(NULL, "", 0) |
| }; |
| static const ModifierAndValue g_am_style[] = { |
| ModifierAndValue(&g_modifiers[0].modifier_info, "", 0), |
| ModifierAndValue(NULL, "", 0) |
| }; |
| static const ModifierAndValue g_am_xml[] = { |
| ModifierAndValue(&g_modifiers[14].modifier_info, "", 0), |
| ModifierAndValue(NULL, "", 0) |
| }; |
| |
| static const ModifierAndValue* g_am_dirs[NUM_ENTRIES_AM] = { |
| g_am_empty, /* AM_EMPTY */ |
| g_am_html, /* AM_HTML */ |
| g_am_html_unquoted, /* AM_HTML_UNQUOTED */ |
| g_am_js, /* AM_JS */ |
| g_am_js_number, /* AM_JS_NUMBER */ |
| g_am_url_html, /* AM_URL_HTML */ |
| g_am_url_query, /* AM_URL_QUERY */ |
| g_am_style, /* AM_STYLE */ |
| g_am_xml, /* AM_XML */ |
| }; |
| |
| string PrettyPrintOneModifier(const ModifierAndValue& modval) { |
| string out; |
| out.append(":"); |
| if (modval.modifier_info->short_name) // short_name is a char. |
| out.append(1, modval.modifier_info->short_name); |
| else |
| out.append(modval.modifier_info->long_name); |
| if (modval.value_len != 0) |
| out.append(modval.value, modval.value_len); |
| return out; |
| } |
| |
| string PrettyPrintModifiers(const vector<const ModifierAndValue*>& modvals, |
| const string& separator) { |
| string out; |
| for (vector<const ModifierAndValue*>::const_iterator it = |
| modvals.begin(); it != modvals.end(); ++it) { |
| if (it != modvals.begin()) |
| out.append(separator); |
| out.append(PrettyPrintOneModifier(**it)); |
| } |
| return out; |
| } |
| |
| // Return the sequence of escaping directives to apply for the given context. |
| // An empty vector indicates an error occurred. Currently we never need |
| // to chain escaping directives hence on success, the vector is always of |
| // size 1. This may change in the future. |
| vector<const ModifierAndValue*> GetModifierForHtmlJs( |
| HtmlParser* htmlparser, string* error_msg) { |
| assert(htmlparser); |
| assert(error_msg); |
| vector<const ModifierAndValue*> modvals; |
| |
| // Two cases of being inside javascript: |
| // 1. Inside raw javascript (within a <script> tag). If the value |
| // is quoted we apply javascript_escape, if not we have to coerce |
| // it to a safe value due to the risk of javascript code execution |
| // hence apply :J=number. If arbitrary code needs to be inserted |
| // at run-time, the developer must use :none. |
| // 2. In the value of an attribute that takes javascript such |
| // as onmouseevent in '<a href="someUrl" onmousevent="{{EVENT}}">'. |
| // That will be covered in the STATE_VALUE state logic below. |
| if (htmlparser->InJavascript() && |
| htmlparser->state() != HtmlParser::STATE_VALUE) { |
| if (htmlparser->IsJavascriptQuoted()) { |
| modvals.push_back(g_am_dirs[AM_JS]); |
| assert(modvals.size() == 1); |
| return modvals; |
| } else { |
| modvals.push_back(g_am_dirs[AM_JS_NUMBER]); |
| assert(modvals.size() == 1); |
| return modvals; |
| } |
| } |
| switch (htmlparser->state()) { |
| case HtmlParser::STATE_VALUE:{ |
| string attribute_name = htmlparser->attribute(); |
| switch (htmlparser->AttributeType()) { |
| case HtmlParser::ATTR_URI: |
| // Case 1: The URL is quoted: |
| // . Apply :U=html if it is a complete URL or :h if it is a fragment. |
| // Case 2: The URL is not quoted: |
| // . If it is a complete URL, we have no safe modifiers that |
| // won't break it so we have to fail. |
| // . If it is a URL fragment, then :u is safe and not likely to |
| // break the URL. |
| if (!htmlparser->IsAttributeQuoted()) { |
| if (htmlparser->IsUrlStart()) { // Complete URL. |
| error_msg->append("Value of URL attribute \"" + attribute_name + |
| "\" must be enclosed in quotes."); |
| assert(modvals.empty()); |
| return modvals; // Empty |
| } else { // URL fragment. |
| modvals.push_back(g_am_dirs[AM_URL_QUERY]); |
| } |
| } else { |
| // Only validate the URL if we have a complete URL, |
| // otherwise simply html_escape. |
| if (htmlparser->IsUrlStart()) |
| modvals.push_back(g_am_dirs[AM_URL_HTML]); |
| else |
| modvals.push_back(g_am_dirs[AM_HTML]); |
| } |
| break; |
| case HtmlParser::ATTR_REGULAR: |
| // If the value is quoted, simply HTML escape, otherwise |
| // apply stricter escaping using H=attribute. |
| if (htmlparser->IsAttributeQuoted()) |
| modvals.push_back(g_am_dirs[AM_HTML]); |
| else |
| modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]); |
| break; |
| case HtmlParser::ATTR_STYLE: |
| // If the value is quoted apply :c, otherwise fail. |
| if (htmlparser->IsAttributeQuoted()) { |
| modvals.push_back(g_am_dirs[AM_STYLE]); |
| } else { |
| error_msg->append("Value of style attribute \"" + attribute_name + |
| "\" must be enclosed in quotes."); |
| assert(modvals.empty()); |
| return modvals; // Empty |
| } |
| break; |
| case HtmlParser::ATTR_JS: |
| // We require javascript accepting attributes (such as onclick) |
| // to be HTML quoted, otherwise they are vulnerable to |
| // HTML attribute insertion via the use of whitespace. |
| if (!htmlparser->IsAttributeQuoted()) { |
| error_msg->append("Value of javascript attribute \"" + |
| attribute_name + |
| "\" must be enclosed in quotes."); |
| assert(modvals.empty()); |
| return modvals; // Empty |
| } |
| // If the variable is quoted apply javascript_escape otherwise |
| // apply javascript_number which will ensure it is safe against |
| // code injection. |
| // Note: We normally need to HTML escape after javascript escape |
| // but the javascript escape implementation provided makes the |
| // HTML escape redundant so simply javascript escape. |
| if (htmlparser->IsJavascriptQuoted()) |
| modvals.push_back(g_am_dirs[AM_JS]); |
| else |
| modvals.push_back(g_am_dirs[AM_JS_NUMBER]); |
| break; |
| case HtmlParser::ATTR_NONE: |
| assert("We should be in attribute!" && 0); |
| default: |
| assert("Should not be able to get here." && 0); |
| return modvals; // Empty |
| } |
| // In STATE_VALUE particularly, the parser may get out of sync with |
| // the correct state - that the browser sees - due to the fact that |
| // it does not get to parse run-time content (variables). So we tell |
| // the parser there is content that will be expanded here. |
| // A good example is: |
| // <a href={{URL}} alt={{NAME}}> |
| // The parser sees <a href= alt=> and interprets 'alt=' to be |
| // the value of href. |
| htmlparser->InsertText(); // Ignore return value. |
| assert(modvals.size() == 1); |
| return modvals; |
| } |
| case HtmlParser::STATE_TAG:{ |
| // Apply H=attribute to tag names since they are alphabetic. |
| // Examples of tag names: TITLE, BODY, A and BR. |
| modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]); |
| assert(modvals.size() == 1); |
| return modvals; |
| } |
| case HtmlParser::STATE_ATTR:{ |
| // Apply H=attribute to attribute names since they are alphabetic. |
| // Examples of attribute names: HREF, SRC and WIDTH. |
| modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]); |
| assert(modvals.size() == 1); |
| return modvals; |
| } |
| case HtmlParser::STATE_COMMENT: |
| case HtmlParser::STATE_TEXT:{ |
| // Apply :h to regular HTML text and :c if within a style tag. |
| if (htmlparser->InCss()) |
| modvals.push_back(g_am_dirs[AM_STYLE]); |
| else |
| modvals.push_back(g_am_dirs[AM_HTML]); |
| assert(modvals.size() == 1); |
| return modvals; |
| } |
| default:{ |
| assert("Should not be able to get here." && 0); |
| return modvals; // Empty |
| } |
| } |
| assert("Should not be able to get here." && 0); |
| return modvals; // Empty |
| } |
| |
| // TODO(jad): Memoize all GetModifierForXXX functions below. |
| // They don't depend on parser context (from csilvers). |
| vector<const ModifierAndValue*> GetModifierForCss(HtmlParser* htmlparser, |
| string* error_msg) { |
| vector<const ModifierAndValue*> modvals; |
| modvals.push_back(g_am_dirs[AM_STYLE]); |
| return modvals; |
| } |
| |
| vector<const ModifierAndValue*> GetModifierForXml(HtmlParser* htmlparser, |
| string* error_msg) { |
| vector<const ModifierAndValue*> modvals; |
| modvals.push_back(g_am_dirs[AM_XML]); |
| return modvals; |
| } |
| |
| vector<const ModifierAndValue*> GetModifierForJson(HtmlParser* htmlparser, |
| string* error_msg) { |
| vector<const ModifierAndValue*> modvals; |
| modvals.push_back(g_am_dirs[AM_JS]); |
| return modvals; |
| } |
| |
| vector<const ModifierAndValue*> GetDefaultModifierForHtml() { |
| vector<const ModifierAndValue*> modvals; |
| modvals.push_back(g_am_dirs[AM_HTML]); |
| return modvals; |
| } |
| |
| vector<const ModifierAndValue*> GetDefaultModifierForJs() { |
| vector<const ModifierAndValue*> modvals; |
| modvals.push_back(g_am_dirs[AM_JS]); |
| return modvals; |
| } |
| |
| vector<const ModifierAndValue*> GetDefaultModifierForCss() { |
| return GetModifierForCss(NULL, NULL); |
| } |
| |
| vector<const ModifierAndValue*> GetDefaultModifierForXml() { |
| return GetModifierForXml(NULL, NULL); |
| } |
| |
| vector<const ModifierAndValue*> GetDefaultModifierForJson() { |
| return GetModifierForJson(NULL, NULL); |
| } |
| |
| } |