Merge commit '3e013f9387c8370dd9bd729e05799cbdcb924005' as 'third_party/ctemplate' Change-Id: Ie20726199c36cc8bf7372e1f6f4a547c29c18cc5

commit: 8ab8a65883d39259a33ea8927c4f3411e44d5e41 [log] [tgz]
author: Brian Silverman <brians> Mon Sep 21 17:49:11 2015 -0400
committer: Brian Silverman <brians> Mon Sep 21 17:49:11 2015 -0400
tree: e95fa14d1cb63e92012afc0a4d09bae7745d422f
parent: f20afaa78874f228ad554a24021ab1dd72f172e9 [diff] [blame]
diff --git a/third_party/ctemplate/src/template_modifiers.cc b/third_party/ctemplate/src/template_modifiers.cc
new file mode 100644
index 0000000..4e35281
--- /dev/null
+++ b/third_party/ctemplate/src/template_modifiers.cc

@@ -0,0 +1,1417 @@
+// Copyright (c) 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: csilvers@google.com (Craig Silverstein)
+//
+// template_modifiers.h has a description of what each escape-routine does.
+//
+// When creating a new modifier, you must subclass TemplateModifier
+// and define your own Modify() method.  This method takes the string
+// to be modified as a char*/int pair.  It then emits the modified
+// version of the string to outbuf.  Outbuf is an ExpandEmitter, as
+// defined in template_modifiers.h.  It's a very simple type that
+// supports appending to a data stream.
+//
+// Be very careful editing an existing modifier.  Subtle changes can
+// introduce the possibility for cross-site scripting attacks.  If you
+// do change a modifier, be careful that it does not affect
+// the list of Safe XSS Alternatives.
+//
+
+#include <config.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <string>
+#include <vector>
+#include "htmlparser/htmlparser_cpp.h"
+#include <ctemplate/template_modifiers.h>
+#include "template_modifiers_internal.h"
+#include <ctemplate/per_expand_data.h>
+using std::string;
+using std::vector;
+
+// Length of a string literal, excluding the terminating NUL, computed at
+// compile time.  The surrounding ""s make the expansion fail to compile if s
+// is not a string literal.
+#define strliterallen(s)  (sizeof("" s "") - 1)
+
+// Really we should be using uint16_t or something, but this is good
+// enough, and more portable...  Note that, despite its name, this type has
+// whatever width 'unsigned int' has on the platform, which may be wider
+// than 16 bits.
+typedef unsigned int uint16;
+
+namespace URL {
+// Returns false when the buffer [in, in + inlen) begins with one of the
+// accepted schemes -- "http://", "https://" or "ftp://", matched without
+// regard to case -- and is strictly longer than that scheme prefix.
+// Returns true for every other input.
+bool HasInsecureProtocol(const char* in, int inlen) {
+  static const struct {
+    const char* prefix;
+    size_t length;
+  } kAcceptedSchemes[] = {
+    { "http://",  strliterallen("http://") },
+    { "https://", strliterallen("https://") },
+    { "ftp://",   strliterallen("ftp://") },
+  };
+  const size_t kNumSchemes =
+      sizeof(kAcceptedSchemes) / sizeof(kAcceptedSchemes[0]);
+
+  // The length is compared as an unsigned quantity against each prefix size.
+  const size_t length = static_cast<size_t>(inlen);
+  for (size_t i = 0; i < kNumSchemes; ++i) {
+    if (length > kAcceptedSchemes[i].length &&
+        strncasecmp(in, kAcceptedSchemes[i].prefix,
+                    kAcceptedSchemes[i].length) == 0) {
+      return false;  // Recognized scheme with something after it.
+    }
+  }
+  return true;
+}
+}  // namespace URL
+
+namespace ctemplate {
+
+using ctemplate_htmlparser::HtmlParser;
+
+// A most-efficient way to append a string literal to the var named 'out':
+// the length is computed at compile time with sizeof, so no strlen() call is
+// needed.  A variable named 'out' providing Emit(ptr, len) must be in scope
+// wherever this macro is used.
+// The ""s ensure literal is actually a string literal.
+#define APPEND(literal)  out->Emit("" literal "", sizeof(literal)-1)
+
+// Check whether the string of length len is identical to the literal.
+// The lengths are compared first, so memcmp only runs when they are equal.
+// The ""s ensure literal is actually a string literal.
+#define STR_IS(str, len, literal) \
+  ((len) == sizeof("" literal "") - 1 && \
+   memcmp(str, literal, sizeof("" literal "") - 1) == 0)
+
+// Out-of-line definition of the (empty) TemplateModifier destructor.
+TemplateModifier::~TemplateModifier() {}
+
+// Identity modifier: emits the inlen bytes starting at in to out unchanged.
+// The PerExpandData and arg parameters are not used.
+void NullModifier::Modify(const char* in, size_t inlen,
+                          const PerExpandData*,
+                          ExpandEmitter* out, const string& arg) const {
+  out->Emit(in, inlen);
+}
+// Global instance of the identity modifier.
+NullModifier null_modifier;
+
+// Writes the bytes in the half-open range [start, limit) to out.  Nothing is
+// emitted when the range is empty (or when start lies past limit).
+static inline void EmitRun(const char* start, const char* limit,
+                           ExpandEmitter* out) {
+  if (!(start < limit)) {
+    return;
+  }
+  out->Emit(start, (limit - start));
+}
+
+// Emits the input with &, ", ', < and > replaced by HTML entities and with
+// \r, \n, \v, \f and \t each replaced by a single space.  Stretches of
+// ordinary characters are written out in one piece via EmitRun.
+void HtmlEscape::Modify(const char* in, size_t inlen,
+                        const PerExpandData*,
+                        ExpandEmitter* out, const string& arg) const {
+  const char* const end = in + inlen;
+  // First byte that has not been written to out yet.
+  const char* run_begin = in;
+  for (const char* cur = in; cur < end; ++cur) {
+    switch (*cur) {
+      case '&':
+        EmitRun(run_begin, cur, out);
+        APPEND("&amp;");
+        break;
+      case '"':
+        EmitRun(run_begin, cur, out);
+        APPEND("&quot;");
+        break;
+      case '\'':
+        EmitRun(run_begin, cur, out);
+        APPEND("&#39;");
+        break;
+      case '<':
+        EmitRun(run_begin, cur, out);
+        APPEND("&lt;");
+        break;
+      case '>':
+        EmitRun(run_begin, cur, out);
+        APPEND("&gt;");
+        break;
+
+      case '\r': case '\n': case '\v': case '\f': case '\t':
+        EmitRun(run_begin, cur, out);
+        APPEND(" ");
+        break;
+
+      default:
+        // Ordinary character: it stays part of the pending run.
+        continue;
+    }
+    // The special character was replaced; the next run starts after it.
+    run_begin = cur + 1;
+  }
+  EmitRun(run_begin, end, out);
+}
+HtmlEscape html_escape;
+
+// Emits the input with &, ", ', < and > replaced by HTML entities.  Unlike
+// HtmlEscape, whitespace characters are passed through untouched.
+void PreEscape::Modify(const char* in, size_t inlen,
+                       const PerExpandData*,
+                       ExpandEmitter* out, const string& arg) const {
+  const char* const end = in + inlen;
+  // First byte that has not been written to out yet.
+  const char* run_begin = in;
+  for (const char* cur = in; cur < end; ++cur) {
+    switch (*cur) {
+      case '&':
+        EmitRun(run_begin, cur, out);
+        APPEND("&amp;");
+        break;
+      case '"':
+        EmitRun(run_begin, cur, out);
+        APPEND("&quot;");
+        break;
+      case '\'':
+        EmitRun(run_begin, cur, out);
+        APPEND("&#39;");
+        break;
+      case '<':
+        EmitRun(run_begin, cur, out);
+        APPEND("&lt;");
+        break;
+      case '>':
+        EmitRun(run_begin, cur, out);
+        APPEND("&gt;");
+        break;
+
+      default:
+        // Ordinary character: it stays part of the pending run.
+        continue;
+    }
+    // The special character was replaced; the next run starts after it.
+    run_begin = cur + 1;
+  }
+  EmitRun(run_begin, end, out);
+}
+PreEscape pre_escape;
+
+// We encode the presence and ordering of unclosed tags in a string, using the
+// letters b, i, s, and e to stand for <b>, <i>, <span>, and <em> respectively.
+// The most recently opened tag is appended onto the end of the string, so in
+// the common case of properly nested tags, we need only look at the last
+// character.  If we don't find it there, we need to continue looking at
+// everything until we find it, because tags may not necessarily be in order.
+// Similarly, when we add a tag, we need to check each existing tag for a match
+// so that we don't nest.
+class UnclosedSnippetTags {
+ public:
+  // We could use ordinary ints for the enum values, but using mnemonic
+  // characters potentially makes debugging easier.
+  typedef enum {
+    TAG_B = 'b',
+    TAG_I = 'i',
+    TAG_EM = 'e',
+    TAG_SPAN = 's',
+  } Tag;
+
+  // Starts with no open tags.  The whole array is zeroed so that tags is
+  // always a NUL-terminated string, as strchr() below requires.
+  UnclosedSnippetTags() : tag_length(0) {
+    memset(tags, 0, sizeof(tags));
+  }
+
+  // Adds a tag to the set of open tags if it's not already open, or otherwise
+  // return false.
+  inline bool MaybeAdd(Tag tag) {
+    if (strchr(tags, tag)) {
+      return false;
+    }
+    // Every tag is stored at most once, so with four distinct tags there is
+    // always room for the new entry plus the terminating NUL.
+    assert(tag_length < static_cast<int>(sizeof(tags)) - 1);
+    tags[tag_length++] = tag;
+    return true;
+  }
+
+  // Removes a tag from the set of open tags if it's open, or otherwise return
+  // false.
+  inline bool MaybeRemove(Tag tag) {
+    char* tag_location = strchr(tags, tag);
+    if (!tag_location) {
+      return false;
+    }
+    // Have to copy all later tags (and the terminating NUL) down by one so
+    // we don't leave a gap in the array.
+    for (char* c = tag_location; *c; ++c) {
+      *c = *(c + 1);
+    }
+    --tag_length;
+    return true;
+  }
+
+  // Emits a closing tag for every tag that is still open, most recently
+  // opened first.
+  inline void PrintClosingTags(ExpandEmitter* out) {
+    // The last stored tag lives at index tag_length - 1; index tag_length is
+    // the NUL terminator and holds no tag.
+    for (int i = tag_length - 1; i >= 0; --i) {
+      switch (tags[i]) {
+        case TAG_B:
+          out->Emit("</b>"); break;
+        case TAG_I:
+          out->Emit("</i>"); break;
+        case TAG_EM:
+          out->Emit("</em>"); break;
+        case TAG_SPAN:
+          out->Emit("</span>"); break;
+      }
+    }
+  }
+
+ private:
+  char tags[5];     // Open tags, oldest first, NUL-terminated.
+  int tag_length;   // Number of tags currently stored in tags.
+};
+
+// Escapes ", ', > and '<' as HTML entities and replaces \r, \n, \v, \f and
+// \t with a space, except that a small set of tags is passed through
+// unescaped: <b>, <i>, <em>, <span dir=ltr>, <span dir=rtl>, the matching
+// closing tags, <br> and <wbr>.  An opening tag is passed through only when
+// that tag is not already open, and a closing tag only when that tag is
+// currently open; otherwise its '<' is emitted as "&lt;".  '&' is emitted
+// unchanged unless it is immediately followed by '{'.  Tags still open when
+// the input ends are closed via PrintClosingTags().
+void SnippetEscape::Modify(const char* in, size_t inlen,
+                           const PerExpandData*,
+                           ExpandEmitter* out, const string& arg) const {
+  UnclosedSnippetTags unclosed;
+  const char* pos = in;
+  const char* start = pos;
+  const char* const limit = in + inlen;
+  while (pos < limit) {
+    switch (*pos) {
+      default:
+        // Increment our counter and look at the next character.
+        ++pos;
+        continue;
+
+      case '<': {
+        // If there is a permissible tag, just advance pos past it to
+        // make it part of the current run.  Notice the use of
+        // "continue" below.
+        const char* const next_pos = pos + 1;
+        // Number of input bytes available after the '<'.
+        const int chars_left = limit - next_pos;
+        if ((chars_left >= 2) && !memcmp(next_pos, "b>", 2)
+            && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_B)) {
+          pos += strliterallen("<b>");
+          continue;
+        } else if ((chars_left >= 2) && !memcmp(next_pos, "i>", 2)
+                   && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_I)) {
+          pos += strliterallen("<i>");
+          continue;
+        } else if ((chars_left >= 3) && !memcmp(next_pos, "em>", 3)
+                   && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_EM)) {
+          pos += strliterallen("<em>");
+          continue;
+        } else if ((chars_left >= 13) && !memcmp(next_pos, "span dir=", 9)
+                   && (!memcmp(next_pos + 9, "ltr>", 4) ||
+                       !memcmp(next_pos + 9, "rtl>", 4))
+                   && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_SPAN)) {
+          // "ltr" and "rtl" have the same length, so one literal serves for
+          // skipping either form of the tag.
+          pos += strliterallen("<span dir=ltr>");
+          continue;
+        } else if ((chars_left >= 3) && !memcmp(next_pos, "/b>", 3)
+                   && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_B)) {
+          pos += strliterallen("</b>");
+          continue;
+        } else if ((chars_left >= 3) && !memcmp(next_pos, "/i>", 3)
+                   && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_I)) {
+          pos += strliterallen("</i>");
+          continue;
+        } else if ((chars_left >= 4) && !memcmp(next_pos, "/em>", 4)
+                   && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_EM)) {
+          pos += strliterallen("</em>");
+          continue;
+        } else if ((chars_left >= 6) && !memcmp(next_pos, "/span>", 6)
+                   && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_SPAN)) {
+          pos += strliterallen("</span>");
+          continue;
+        } else if ((chars_left >= 3) && !memcmp(next_pos, "br>", 3)) {
+          // <br> and <wbr> are not tracked as open tags, so they are always
+          // passed through.
+          pos += strliterallen("<br>");
+          continue;
+        } else if ((chars_left >= 4) && !memcmp(next_pos, "wbr>", 4)) {
+          pos += strliterallen("<wbr>");
+          continue;
+        }
+
+        // Emit the entity and break out of the switch.
+        EmitRun(start, pos, out);
+        APPEND("&lt;");
+        break;
+      }
+
+      case '&':
+        EmitRun(start, pos, out);
+        if (pos + 1 < limit && pos[1] == '{') {
+          // Could be a javascript entity, so we need to escape.
+          // (Javascript entities are an xss risk in Netscape 4.)
+          APPEND("&amp;");
+        } else {
+          // Any other '&' is emitted as is.
+          APPEND("&");
+        }
+        break;
+
+      case '"':  EmitRun(start, pos, out); APPEND("&quot;"); break;
+      case '\'': EmitRun(start, pos, out); APPEND("&#39;");  break;
+      case '>':  EmitRun(start, pos, out); APPEND("&gt;");   break;
+
+      case '\r': case '\n': case '\v': case '\f': case '\t':
+        // non-space whitespace
+        EmitRun(start, pos, out); APPEND(" "); break;
+
+    }
+    // A replacement was emitted for *pos; the next run starts after it.
+    start = ++pos;
+  }
+  // Flush the trailing run, then close whatever tags remain open.
+  EmitRun(start, pos, out);
+  unclosed.PrintClosingTags(out);
+}
+SnippetEscape snippet_escape;
+
+// Emits the input one character at a time, keeping ASCII letters, digits
+// and the characters - . _ : as they are.  An '=' is kept unless it is the
+// first or the last character of the input, in which case it becomes '_'.
+// Every other character is replaced by '_'.
+void CleanseAttribute::Modify(const char* in, size_t inlen,
+                              const PerExpandData*,
+                              ExpandEmitter* out, const string& arg) const {
+  for (size_t idx = 0; idx < inlen; ++idx) {
+    const char ch = in[idx];
+
+    if (ch == '=') {
+      const bool at_either_end = (idx == 0) || (idx == (inlen - 1));
+      out->Emit(at_either_end ? '_' : ch);
+      continue;
+    }
+
+    const bool is_letter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
+    const bool is_digit = (ch >= '0' && ch <= '9');
+    const bool is_allowed_punct =
+        (ch == '-' || ch == '.' || ch == '_' || ch == ':');
+
+    if (is_letter || is_digit || is_allowed_punct) {
+      out->Emit(ch);
+    } else {
+      APPEND("_");
+    }
+  }
+}
+CleanseAttribute cleanse_attribute;
+
+// Emits only the characters of the input that are ASCII letters, digits, a
+// space, or one of _ . , ! # % -.  All other characters are dropped.
+void CleanseCss::Modify(const char* in, size_t inlen,
+                        const PerExpandData*,
+                        ExpandEmitter* out, const string& arg) const {
+  for (size_t idx = 0; idx < inlen; ++idx) {
+    const char ch = in[idx];
+
+    const bool is_letter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
+    const bool is_digit = (ch >= '0' && ch <= '9');
+    const bool is_allowed_punct =
+        (ch == ' ' || ch == '_' || ch == '.' || ch == ',' ||
+         ch == '!' || ch == '#' || ch == '%' || ch == '-');
+
+    if (is_letter || is_digit || is_allowed_punct) {
+      out->Emit(ch);
+    }
+  }
+}
+CleanseCss cleanse_css;
+
+// CssUrlEscape is used as a chained modifier by ValidateUrl
+// (validate_url_and_css_escape) and is not directly exposed.
+// It is declared in this .cc file; its Modify() is defined below.
+class CssUrlEscape : public TemplateModifier {
+ public:
+  // Emits the bytes of [in, in + inlen) to outbuf with the characters that
+  // are unsafe inside a CSS url(...) percent-encoded.
+  virtual void Modify(const char* in, size_t inlen,
+                      const PerExpandData*, ExpandEmitter* outbuf,
+                      const string& arg) const;
+};
+
+// URL-encodes the characters [\n\r\\'"()<>*] to ensure the URL can be safely
+// inserted in a CSS context, e.g:
+// . In an '@import url("URL");' statement
+// . In a CSS property such as 'background: url("URL");'
+// In both locations above, enclosing quotes are optional but parens are not.
+// We want to make sure the URL cannot exit the parens enclosure, close a
+// STYLE tag or reset the browser's CSS parser (via comments or newlines).
+//
+// References:
+// . CSS 2.1 URLs: http://www.w3.org/TR/CSS21/syndata.html#url
+// . CSS 1 URLs: http://www.w3.org/TR/REC-CSS1/#url
+void CssUrlEscape::Modify(const char* in, size_t inlen,
+                          const PerExpandData*,
+                          ExpandEmitter* out, const string& arg) const {
+  const char* const end = in + inlen;
+  for (const char* cur = in; cur != end; ++cur) {
+    // Percent-encoded replacement for *cur, or NULL when *cur is safe.
+    const char* encoded = NULL;
+    switch (*cur) {
+      case '\n': encoded = "%0A"; break;
+      case '\r': encoded = "%0D"; break;
+      case '"':  encoded = "%22"; break;
+      case '\'': encoded = "%27"; break;
+      case '(':  encoded = "%28"; break;
+      case ')':  encoded = "%29"; break;
+      case '*':  encoded = "%2A"; break;
+      case '<':  encoded = "%3C"; break;
+      case '>':  encoded = "%3E"; break;
+      case '\\': encoded = "%5C"; break;
+      default:   break;
+    }
+    if (encoded != NULL) {
+      // Every replacement is exactly three characters long.
+      out->Emit(encoded, strliterallen("%0A"));
+    } else {
+      out->Emit(*cur);
+    }
+  }
+}
+CssUrlEscape css_url_escape;
+
+// These URLs replace unsafe URLs for :U and :I url-escaping modes.
+// kUnsafeUrlReplacement is "#"; kUnsafeImgSrcUrlReplacement is the path of
+// an image, for the img-src variants of the validator.
+const char* const ValidateUrl::kUnsafeUrlReplacement = "#";
+const char* const ValidateUrl::kUnsafeImgSrcUrlReplacement =
+    "/images/cleardot.gif";
+
+// Passes the URL on to the chained modifier, unless it looks like it names
+// a protocol (a ':' occurs before the first '/', or anywhere when there is
+// no '/') and URL::HasInsecureProtocol() rejects it.  In that case the
+// configured replacement URL is passed to the chained modifier instead.
+void ValidateUrl::Modify(const char* in, size_t inlen,
+                         const PerExpandData* per_expand_data,
+                         ExpandEmitter* out, const string& arg) const {
+  // Only the part of the input in front of the first slash can hold a
+  // protocol name; without a slash that is the entire input.
+  const char* const first_slash =
+      static_cast<const char*>(memchr(in, '/', inlen));
+  const char* const scheme_end = (first_slash != NULL) ? first_slash
+                                                       : in + inlen;
+  const bool has_colon = (memchr(in, ':', scheme_end - in) != NULL);
+
+  if (!has_colon || !URL::HasInsecureProtocol(in, inlen)) {
+    // Nothing objectionable was found, so just escape the URL itself.
+    chained_modifier_.Modify(in, inlen, per_expand_data, out, "");
+    return;
+  }
+
+  // The protocol is not an accepted one: escape the safe replacement.
+  chained_modifier_.Modify(unsafe_url_replacement_,
+                           unsafe_url_replacement_length_,
+                           per_expand_data,
+                           out,
+                           "");
+}
+// Global ValidateUrl instances: one per combination of escaping modifier
+// (html, javascript, css-url) and replacement URL (generic or img-src).
+// NOTE(review): the constructor is not visible here; presumably the first
+// argument becomes chained_modifier_ and the second the replacement used by
+// Modify() for rejected URLs -- confirm against the class declaration.
+ValidateUrl validate_url_and_html_escape(
+    html_escape,
+    ValidateUrl::kUnsafeUrlReplacement);
+ValidateUrl validate_url_and_javascript_escape(
+    javascript_escape,
+    ValidateUrl::kUnsafeUrlReplacement);
+ValidateUrl validate_url_and_css_escape(
+    css_url_escape,
+    ValidateUrl::kUnsafeUrlReplacement);
+ValidateUrl validate_img_src_url_and_html_escape(
+    html_escape,
+    ValidateUrl::kUnsafeImgSrcUrlReplacement);
+ValidateUrl validate_img_src_url_and_javascript_escape(
+    javascript_escape,
+    ValidateUrl::kUnsafeImgSrcUrlReplacement);
+ValidateUrl validate_img_src_url_and_css_escape(
+    css_url_escape,
+    ValidateUrl::kUnsafeImgSrcUrlReplacement);
+
+// Emits the input with &, ", ', < and > replaced by entities, and with each
+// control character in the range 0x00-0x1F other than \t, \r and \n
+// replaced by a space.
+void XmlEscape::Modify(const char* in, size_t inlen,
+                       const PerExpandData*,
+                       ExpandEmitter* out, const string& arg) const {
+  const char* const end = in + inlen;
+  // First byte that has not been written to out yet.
+  const char* run_begin = in;
+  for (const char* cur = in; cur < end; ++cur) {
+    const char ch = *cur;
+
+    // Section 2.2 of the XML spec (http://www.w3.org/TR/REC-xml/#charsets)
+    // does not allow control characters in 0x00-0x1F apart from \t, \r and
+    // \n, and a conformant parser may give up on e.g. a FF character in
+    // PCDATA.  Such characters are turned into a space.
+    const bool is_allowed_whitespace = (ch == '\t' || ch == '\r' || ch == '\n');
+    if (ch >= 0x00 && ch < 0x20 && !is_allowed_whitespace) {
+      EmitRun(run_begin, cur, out);
+      out->Emit(' ');
+      run_begin = cur + 1;
+      continue;
+    }
+
+    switch (ch) {
+      case '&':
+        EmitRun(run_begin, cur, out);
+        APPEND("&amp;");
+        break;
+      case '"':
+        EmitRun(run_begin, cur, out);
+        APPEND("&quot;");
+        break;
+      case '\'':
+        EmitRun(run_begin, cur, out);
+        APPEND("&#39;");
+        break;
+      case '<':
+        EmitRun(run_begin, cur, out);
+        APPEND("&lt;");
+        break;
+      case '>':
+        EmitRun(run_begin, cur, out);
+        APPEND("&gt;");
+        break;
+
+      default:
+        // Ordinary character: it stays part of the pending run.
+        continue;
+    }
+    // The special character was replaced; the next run starts after it.
+    run_begin = cur + 1;
+  }
+  EmitRun(run_begin, end, out);
+}
+XmlEscape xml_escape;
+
+// This table maps initial characters to code lengths.  This could be
+// done with a 16-byte table and a shift, but there's a substantial
+// performance increase by eliminating the shift.
+//
+// Indexed by the first byte of a sequence: 0x00-0xBF map to 1, 0xC0-0xDF
+// map to 2, 0xE0-0xEF map to 3, and 0xF0-0xFF map to 1 again, so bytes in
+// that last range are treated as single-byte units by users of the table.
+static const char kCodeLengths[256] = {
+  // 0x00 - 0x3F
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+  // 0x40 - 0x7F
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+  // 0x80 - 0xBF
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+  // 0xC0 - 0xFF
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+};
+
+// Returns the UTF-8 code-unit starting at start, or the special codepoint
+// 0xFFFD if the input ends abruptly or is not well-formed UTF-8.
+// start -- address of the start of the code unit which also receives the
+//          address past the end of the code unit returned.
+// end -- exclusive end of the string
+// When 0xFFFD is returned, *start is advanced by a single byte only, so the
+// caller resumes decoding at the byte after the offending lead byte.
+static inline uint16 UTF8CodeUnit(const char** start, const char *end) {
+  // Use kCodeLengths table to calculate the length of the code unit
+  // from the first character.
+  unsigned char first_char = static_cast<unsigned char>(**start);
+  size_t code_unit_len = kCodeLengths[first_char];
+  if (code_unit_len == 1) {
+    // Return the current byte as a codepoint.
+    // Either it is a valid single byte codepoint, or it's not part of a valid
+    // UTF-8 sequence, and so has to be handled individually.
+    ++*start;
+    return first_char;
+  }
+  const char *code_unit_end = *start + code_unit_len;
+  // The first comparison guards against the pointer addition wrapping.
+  if (code_unit_end < *start || code_unit_end > end) {  // Truncated code unit.
+    ++*start;
+    return 0xFFFDU;
+  }
+  const char* pos = *start;
+  // Keep only the low bits of the lead byte; the mask gets narrower as the
+  // sequence gets longer (0x3F for length 2, 0x1F for length 3).
+  uint16 code_unit = *pos & (0xFFU >> code_unit_len);
+  while (--code_unit_len) {
+    uint16 tail_byte = *(++pos);
+    // Every continuation byte must have the bit pattern 10xxxxxx.
+    if ((tail_byte & 0xC0U) != 0x80U) {  // Malformed code unit.
+      ++*start;
+      return 0xFFFDU;
+    }
+    // Append the six payload bits of the continuation byte.
+    code_unit = (code_unit << 6) | (tail_byte & 0x3FU);
+  }
+  *start = code_unit_end;
+  return code_unit;
+}
+
+// A good reference is the ECMA standard (3rd ed), section 7.8.4:
+// http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
+//
+// Emits the input with quotes, backslashes, control characters, &, <, > and
+// = replaced by javascript escape sequences, and with the line and
+// paragraph separators U+2028 and U+2029 replaced by \u escapes.  The input
+// is decoded with UTF8CodeUnit(); all other code units are copied through
+// as their original bytes.
+void JavascriptEscape::Modify(const char* in, size_t inlen,
+                              const PerExpandData*,
+                              ExpandEmitter* out, const string& arg) const {
+  const char* pos = in;
+  const char* start = pos;
+  const char* const limit = in + inlen;
+
+  // Bail out if computing the end pointer wrapped around.
+  if (limit < in) { return; }
+
+  while (pos < limit) {
+    // next_pos is moved past the code unit that begins at pos.
+    const char* next_pos = pos;
+    uint16 code_unit = UTF8CodeUnit(&next_pos, limit);
+
+    // Test for 16-bit values outside the switch below, because gcc
+    // will emit chained branches rather than a jump table for such a
+    // wide range of values.
+    if (code_unit & 0xFF00) {
+      // Linebreaks according to EcmaScript 262 which cannot appear in strings.
+      if (code_unit == 0x2028) {
+        // Line separator
+        EmitRun(start, pos, out); APPEND("\\u2028");
+      } else if (code_unit == 0x2029) {
+        // Paragraph separator
+        EmitRun(start, pos, out); APPEND("\\u2029");
+      } else {
+        // Any other value above 0xFF stays part of the pending run.
+        pos = next_pos;
+        continue;
+      }
+    } else {
+      switch (code_unit) {
+        default:
+          // Increment our counter and look at the next character.
+          pos = next_pos;
+          continue;
+
+        case '\0': EmitRun(start, pos, out); APPEND("\\x00"); break;
+        case '"':  EmitRun(start, pos, out); APPEND("\\x22"); break;
+        case '\'': EmitRun(start, pos, out); APPEND("\\x27"); break;
+        case '\\': EmitRun(start, pos, out); APPEND("\\\\");  break;
+        case '\t': EmitRun(start, pos, out); APPEND("\\t");   break;
+        case '\r': EmitRun(start, pos, out); APPEND("\\r");   break;
+        case '\n': EmitRun(start, pos, out); APPEND("\\n");   break;
+        case '\b': EmitRun(start, pos, out); APPEND("\\b");   break;
+        case '\f': EmitRun(start, pos, out); APPEND("\\f");   break;
+        case '&':  EmitRun(start, pos, out); APPEND("\\x26"); break;
+        case '<':  EmitRun(start, pos, out); APPEND("\\x3c"); break;
+        case '>':  EmitRun(start, pos, out); APPEND("\\x3e"); break;
+        case '=':  EmitRun(start, pos, out); APPEND("\\x3d"); break;
+
+        case '\v':
+          // Do not escape vertical tabs to "\\v" since it is interpreted as 'v'
+          // by JScript according to section 2.1 of
+          // http://wiki.ecmascript.org/lib/exe/fetch.php?
+          // id=resources%3Aresources&cache=cache&
+          // media=resources:jscriptdeviationsfromes3.pdf
+          EmitRun(start, pos, out); APPEND("\\x0b"); break;
+      }
+    }
+    // An escape was emitted; skip the escaped code unit and start a new run.
+    start = pos = next_pos;
+  }
+  EmitRun(start, pos, out);
+}
+JavascriptEscape javascript_escape;
+
+
+// Emits the input unchanged when it is "true", "false", a hex literal of
+// the form 0(x|X)[0-9A-Fa-f]+, or consists solely of characters from
+// [0-9+-.eE].  Any other non-empty input is replaced by "null"; an empty
+// input produces no output at all.
+void JavascriptNumber::Modify(const char* in, size_t inlen,
+                              const PerExpandData*,
+                              ExpandEmitter* out, const string& arg) const {
+  if (inlen == 0)
+    return;
+
+  if (STR_IS(in, inlen, "true") || STR_IS(in, inlen, "false")) {
+    out->Emit(in, inlen);
+    return;
+  }
+
+  // A hex number needs at least one digit after the "0x"/"0X" prefix.
+  const bool is_hex =
+      inlen > 2 && in[0] == '0' && (in[1] == 'x' || in[1] == 'X');
+
+  // Scan the characters to validate (those after the prefix for hex input)
+  // and stop at the first one that is not allowed.
+  size_t idx = is_hex ? 2 : 0;
+  for (; idx < inlen; ++idx) {
+    const char c = in[idx];
+    const bool is_digit = (c >= '0' && c <= '9');
+    bool allowed;
+    if (is_hex) {
+      allowed = is_digit || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
+    } else {
+      // Base-10 (or octal) number.
+      allowed = is_digit || c == '+' || c == '-' || c == '.' ||
+                c == 'e' || c == 'E';
+    }
+    if (!allowed)
+      break;
+  }
+
+  if (idx == inlen) {
+    out->Emit(in, inlen);   // Every character passed, output the number.
+  } else {
+    APPEND("null");         // An invalid character was hit, output null.
+  }
+}
+JavascriptNumber javascript_number;
+
+// Returns true if c may be emitted as is by the URL query escaper, i.e. if
+// it matches [0-9a-zA-Z.,_*/~!()-].  Everything else must be escaped.
+static inline bool IsUrlQueryEscapeSafeChar(unsigned char c) {
+  // 256-entry bitmap stored as eight 32-bit groups: the bit for character c
+  // is bit (c & 31) of word (c >> 5).  The table is never modified.
+  static const unsigned long kSafeCharacters[8] = {
+    0x00000000L, 0x03fff702L, 0x87fffffeL, 0x47fffffeL,
+    0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L
+  };
+
+  // Shift an unsigned long rather than an int: bit 31 is in use (it marks
+  // '_'), and shifting a signed 1 that far would reach the sign bit.
+  return (kSafeCharacters[c >> 5] & (1UL << (c & 31))) != 0;
+}
+
+// Emits the input with every character that IsUrlQueryEscapeSafeChar()
+// rejects escaped: a space becomes '+', anything else becomes '%' followed
+// by its two-digit upper-case hex value.  Safe characters are copied
+// through in runs.
+void UrlQueryEscape::Modify(const char* in, size_t inlen,
+                            const PerExpandData*,
+                            ExpandEmitter* out, const string& arg) const {
+  static const char kHexDigits[] = "0123456789ABCDEF";
+
+  const char* const end = in + inlen;
+  const char* cur = in;
+  while (cur < end) {
+    // Copy the longest stretch of safe characters in one go.
+    const char* const run_begin = cur;
+    while (cur < end && IsUrlQueryEscapeSafeChar(*cur)) {
+      ++cur;
+    }
+    EmitRun(run_begin, cur, out);
+
+    if (cur == end) {
+      break;  // The run reached the end of the input.
+    }
+
+    // cur now points at exactly one unsafe character.
+    const unsigned char unsafe = *cur;
+    if (unsafe == ' ') {
+      out->Emit('+');
+    } else {
+      out->Emit('%');
+      out->Emit(kHexDigits[unsafe >> 4]);
+      out->Emit(kHexDigits[unsafe & 0xf]);
+    }
+    ++cur;
+  }
+}
+UrlQueryEscape url_query_escape;
+
+// For more information on escaping JSON, see section 2.5 in
+// http://www.ietf.org/rfc/rfc4627.txt.
+// Escaping '&', '<', '>' is optional in the JSON proposed RFC
+// but alleviates concerns with content sniffing if JSON is used
+// in a context where the browser may attempt to interpret HTML.
+void JsonEscape::Modify(const char* in, size_t inlen,
+                        const PerExpandData*,
+                        ExpandEmitter* out, const string& arg) const {
+  const char* const end = in + inlen;
+  // First byte that has not been written to out yet.
+  const char* run_begin = in;
+  for (const char* cur = in; cur < end; ++cur) {
+    switch (*cur) {
+      case '"':
+        EmitRun(run_begin, cur, out);
+        APPEND("\\\"");
+        break;
+      case '\\':
+        EmitRun(run_begin, cur, out);
+        APPEND("\\\\");
+        break;
+      case '/':
+        EmitRun(run_begin, cur, out);
+        APPEND("\\/");
+        break;
+      case '\b':
+        EmitRun(run_begin, cur, out);
+        APPEND("\\b");
+        break;
+      case '\f':
+        EmitRun(run_begin, cur, out);
+        APPEND("\\f");
+        break;
+      case '\n':
+        EmitRun(run_begin, cur, out);
+        APPEND("\\n");
+        break;
+      case '\r':
+        EmitRun(run_begin, cur, out);
+        APPEND("\\r");
+        break;
+      case '\t':
+        EmitRun(run_begin, cur, out);
+        APPEND("\\t");
+        break;
+      case '&':
+        EmitRun(run_begin, cur, out);
+        APPEND("\\u0026");
+        break;
+      case '<':
+        EmitRun(run_begin, cur, out);
+        APPEND("\\u003C");
+        break;
+      case '>':
+        EmitRun(run_begin, cur, out);
+        APPEND("\\u003E");
+        break;
+
+      default:
+        // Ordinary character: it stays part of the pending run.
+        continue;
+    }
+    // The special character was replaced; the next run starts after it.
+    run_begin = cur + 1;
+  }
+  EmitRun(run_begin, end, out);
+}
+JsonEscape json_escape;
+
+// Emits the input with arg inserted after every line break.  A line break
+// is "\n", "\r", or the pair "\r\n" (treated as one break).  Nothing is
+// inserted before the first line; a break that ends the input is still
+// followed by arg.
+void PrefixLine::Modify(const char* in, size_t inlen,
+                        const PerExpandData*,
+                        ExpandEmitter* out, const string& arg) const {
+  while (inlen > 0) {
+    const char* nl = static_cast<const char*>(memchr(in, '\n', inlen));
+    // Only a \r in front of the first \n (if there is one) is of interest.
+    const char* cr = static_cast<const char*>(
+        memchr(in, '\r', nl ? nl - in : inlen));
+    if (nl == NULL && cr == NULL) {
+      // We're at the last line
+      out->Emit(in, inlen);
+      break;
+    }
+
+    // One or both of \r and \n is set; find the last character of the line
+    // break.  For \r\n that's the \n, otherwise it's the \r or the \n we
+    // see first.  Since cr was searched for only in front of nl, cr comes
+    // first whenever both are set.
+    const char* break_end;
+    if (cr == NULL) {
+      break_end = nl;               // plain \n
+    } else if (nl == cr + 1) {
+      break_end = nl;               // \r\n
+    } else {
+      break_end = cr;               // lone \r
+    }
+    const size_t linelen = break_end + 1 - in;
+    // inlen is unsigned, so check before subtracting that it can't wrap.
+    assert(linelen <= inlen);
+
+    out->Emit(in, linelen);
+    out->Emit(arg);               // a new line, so emit the prefix
+    in += linelen;
+    inlen -= linelen;
+  }
+}
+PrefixLine prefix_line;
+
+
+// Must be at least one more than the maximum number of alternative modifiers
+// specified in any given element of g_modifiers.
+# define MAX_SAFE_ALTERNATIVES 10  // If the compiler complains, increase it.
+
+// Table of the built-in modifiers.  Each element pairs a ModifierInfo with
+// the list of other built-in modifiers that may safely replace it.
+//
+// Use the empty string if you want a modifier not to have a long-name.
+// Use '\0' if you want a modifier not to have a short-name.
+// Note: not all modifiers are in this array:
+// 1) SnippetEscape: use html_escape_with_arg=snippet to get this
+// 2) CleanseAttribute: use html_escape_with_arg=attribute to get this
+// 3) ValidateUrl: use html_escape_with_arg=url to get this
+//
+// Some modifiers define other modifiers that are safe replacements
+// from an XSS perspective. Replacements are not commutative so for
+// example H=pre considers H=attribute a safe replacement to it
+// but H=attribute has no safe replacements.
+// This struct is not pretty but allows the definitions to be
+// done without the need for a global initialization method.
+// Be very careful making a change to g_modifiers as modifiers
+// point to other ones within that same array (by numeric index) so
+// elements may not be re-ordered easily. Also you need to change
+// the global g_am_dirs correspondingly: the g_am_* arrays it points to
+// refer to elements of g_modifiers by index as well.
+//
+// safe_alt_mods is a NULL-terminated list: slots that are not named in an
+// element's initializer are zero-initialized, and IsSafeXSSAlternative()
+// stops scanning at the first NULL slot.
+//
+static struct ModifierWithAlternatives {
+  // The built-in modifier itself.
+  ModifierInfo modifier_info;
+  // Built-in modifiers accepted as XSS-safe replacements for modifier_info.
+  ModifierInfo* safe_alt_mods[MAX_SAFE_ALTERNATIVES];
+} g_modifiers[] = {
+  /* 0 */ { ModifierInfo("cleanse_css", 'c',
+                         XSS_WEB_STANDARD, &cleanse_css),
+            {&g_modifiers[16].modifier_info,  // url_escape_with_arg=css
+             // img_src_url_escape_with_arg=css
+             &g_modifiers[19].modifier_info} },
+  /* 1 */ { ModifierInfo("html_escape", 'h',
+                         XSS_WEB_STANDARD, &html_escape),
+            {&g_modifiers[2].modifier_info,   // html_escape_with_arg=snippet
+             &g_modifiers[3].modifier_info,   // html_escape_with_arg=pre
+             &g_modifiers[4].modifier_info,   // html_escape_with_arg=attribute
+             &g_modifiers[5].modifier_info,   // html_escape_with_arg=url
+             &g_modifiers[8].modifier_info,   // pre_escape
+             &g_modifiers[9].modifier_info,   // url_query_escape
+             &g_modifiers[11].modifier_info,  // url_escape_with_arg=html
+             &g_modifiers[12].modifier_info,  // url_escape_with_arg=query
+             // img_src_url_escape_with_arg=html
+             &g_modifiers[18].modifier_info} },
+  /* 2 */ { ModifierInfo("html_escape_with_arg=snippet", 'H',
+                         XSS_WEB_STANDARD, &snippet_escape),
+            {&g_modifiers[1].modifier_info,   // html_escape
+             &g_modifiers[3].modifier_info,   // html_escape_with_arg=pre
+             &g_modifiers[4].modifier_info,   // html_escape_with_arg=attribute
+             &g_modifiers[8].modifier_info,   // pre_escape
+             &g_modifiers[9].modifier_info,   // url_query_escape
+             &g_modifiers[12].modifier_info} },  // url_escape_with_arg=query
+  /* 3 */ { ModifierInfo("html_escape_with_arg=pre", 'H',
+                         XSS_WEB_STANDARD, &pre_escape),
+            {&g_modifiers[1].modifier_info,   // html_escape
+             &g_modifiers[2].modifier_info,   // html_escape_with_arg=snippet
+             &g_modifiers[4].modifier_info,   // html_escape_with_arg=attribute
+             &g_modifiers[8].modifier_info,   // pre_escape
+             &g_modifiers[9].modifier_info,   // url_query_escape
+             &g_modifiers[12].modifier_info} },  // url_escape_with_arg=query
+  /* 4 */ { ModifierInfo("html_escape_with_arg=attribute", 'H',
+                         XSS_WEB_STANDARD, &cleanse_attribute), {} },
+  /* 5 */ { ModifierInfo("html_escape_with_arg=url", 'H',
+                         XSS_WEB_STANDARD, &validate_url_and_html_escape),
+            // img_src_url_escape_with_arg=html
+            {&g_modifiers[18].modifier_info} },
+  /* 6 */ { ModifierInfo("javascript_escape", 'j',
+                         XSS_WEB_STANDARD, &javascript_escape),
+            {&g_modifiers[7].modifier_info,   // json_escape
+             &g_modifiers[10].modifier_info,  // url_escape_with_arg=javascript
+             // img_src_url_escape_with_arg=javascript
+             &g_modifiers[17].modifier_info} },
+  /* 7 */ { ModifierInfo("json_escape", 'o', XSS_WEB_STANDARD, &json_escape),
+            {&g_modifiers[6].modifier_info} },  // javascript_escape
+  /* 8 */ { ModifierInfo("pre_escape", 'p', XSS_WEB_STANDARD, &pre_escape),
+            {&g_modifiers[1].modifier_info,     // html_escape
+             &g_modifiers[2].modifier_info,     // html_escape_with_arg=snippet
+             &g_modifiers[3].modifier_info,     // html_escape_with_arg=pre
+             &g_modifiers[4].modifier_info,     // html_escape_with_arg=attr...
+             &g_modifiers[9].modifier_info,     // url_query_escape
+             &g_modifiers[12].modifier_info} },   // url_escape_with_arg=query
+  /* 9 */ { ModifierInfo("url_query_escape", 'u',
+                         XSS_WEB_STANDARD, &url_query_escape), {} },
+  /* 10 */ { ModifierInfo("url_escape_with_arg=javascript", 'U',
+                          XSS_WEB_STANDARD,
+                          &validate_url_and_javascript_escape),
+             // img_src_url_escape_with_arg=javascript
+             {&g_modifiers[17].modifier_info} },
+  /* 11 */ { ModifierInfo("url_escape_with_arg=html", 'U',
+                          XSS_WEB_STANDARD, &validate_url_and_html_escape),
+             // img_src_url_escape_with_arg=html
+             {&g_modifiers[18].modifier_info} },
+  /* 12 */ { ModifierInfo("url_escape_with_arg=query", 'U',
+                          XSS_WEB_STANDARD, &url_query_escape), {} },
+  /* 13 */ { ModifierInfo("none", '\0', XSS_SAFE, &null_modifier), {} },
+  /* 14 */ { ModifierInfo("xml_escape", '\0', XSS_WEB_STANDARD, &xml_escape),
+             {&g_modifiers[1].modifier_info,      // html_escape
+              &g_modifiers[4].modifier_info,} },  // H=attribute
+  /* 15 */ { ModifierInfo("javascript_escape_with_arg=number", 'J',
+                          XSS_WEB_STANDARD, &javascript_number), {} },
+  /* 16 */ { ModifierInfo("url_escape_with_arg=css", 'U',
+                          XSS_WEB_STANDARD, &validate_url_and_css_escape), {} },
+  /* 17 */ { ModifierInfo("img_src_url_escape_with_arg=javascript", 'I',
+                          XSS_WEB_STANDARD,
+                          &validate_img_src_url_and_javascript_escape), {} },
+  /* 18 */ { ModifierInfo("img_src_url_escape_with_arg=html", 'I',
+                          XSS_WEB_STANDARD,
+                          &validate_img_src_url_and_html_escape), {} },
+  /* 19 */ { ModifierInfo("img_src_url_escape_with_arg=css", 'I',
+                          XSS_WEB_STANDARD,
+                          &validate_img_src_url_and_css_escape), {} },
+};
+
+// Extension ("x-" prefixed) modifiers registered through AddModifierCommon().
+static vector<const ModifierInfo*> g_extension_modifiers;
+// Placeholder entries (NULL modifier, XSS_UNIQUE) that FindModifier() creates
+// for "x-" names it was asked about but that were never registered.
+static vector<const ModifierInfo*> g_unknown_modifiers;
+
+// Returns whether or not candidate can be safely (w.r.t XSS)
+// used in lieu of our ModifierInfo. This is true iff:
+//   1. Both have the same modifier function OR
+//   2. our is a built-in modifier (its long_name is found in g_modifiers)
+//      and candidate's long_name is in that entry's NULL-terminated
+//      array of safe alternative modifiers.
+//
+// This is used with the auto-escaping code, which automatically
+// figures out which modifier to apply to a variable based on the
+// variable's context (in an html "<A HREF", for instance).  Some
+// built-in modifiers are considered safe alternatives from the perspective
+// of preventing XSS (cross-site-scripting) attacks, in which case
+// the auto-escaper should allow the choice of which to use in the
+// template. This is intended only for internal use as it is dangerous
+// and complicated to figure out which modifier is an XSS-safe
+// replacement for a given one. Custom modifiers currently may not
+// indicate safe replacements, only built-in ones may do so.
+//
+// Note that this function is not commutative therefore
+// IsSafeXSSAlternative(a, b) may not be equal to IsSafeXSSAlternative(b, a).
+bool IsSafeXSSAlternative(const ModifierInfo& our,
+                          const ModifierInfo& candidate) {
+  // Succeeds even for non built-in modifiers but no harm.
+  if (our.modifier == candidate.modifier)
+    return true;
+
+  for (const ModifierWithAlternatives* mod_with_alts = g_modifiers;
+       mod_with_alts < g_modifiers + sizeof(g_modifiers)/sizeof(*g_modifiers);
+       ++mod_with_alts) {
+    if (mod_with_alts->modifier_info.long_name != our.long_name)
+      continue;
+    // We found our Modifier in the built-in array g_modifiers.
+    // The bound is tested before the slot is read so that an entry whose
+    // alternatives fill every slot (no NULL terminator left) cannot make
+    // us read past the end of safe_alt_mods.
+    for (int i = 0; i < MAX_SAFE_ALTERNATIVES &&
+             mod_with_alts->safe_alt_mods[i] != NULL; ++i) {
+      if (mod_with_alts->safe_alt_mods[i]->long_name == candidate.long_name)
+        // We found candidate in our Modifier's list of safe alternatives.
+        return true;
+    }
+  }
+  // our is not built-in or candidate is not a safe replacement to our.
+  return false;
+}
+
+// Returns true iff long_name starts with the extension prefix "x-".
+// strncmp() is used rather than memcmp() because it stops at a terminating
+// NUL, so a C string shorter than two characters (e.g. "") is never read
+// past its end.  At most the first two characters are examined, so a
+// buffer of at least two characters need not be NUL-terminated.
+static inline bool IsExtensionModifier(const char* long_name) {
+  return strncmp(long_name, "x-", 2) == 0;
+}
+
+// Registers a new extension modifier by appending a ModifierInfo for it to
+// g_extension_modifiers.
+//   long_name: NUL-terminated modifier name; must start with "x-".  It may
+//              carry a value specialization after an '=' ("x-foo=bar").
+//   modifier:  the modifier object stored in the new ModifierInfo.
+//   xss_safe:  true registers with XSS_SAFE, false with XSS_UNIQUE.
+// Returns true if the modifier was registered.  Returns false if long_name
+// is not an extension name, or if it clashes with an already-registered
+// extension modifier (see the comment in the loop).
+static bool AddModifierCommon(const char* long_name,
+                 const TemplateModifier* modifier, bool xss_safe) {
+  if (!IsExtensionModifier(long_name))
+    return false;
+
+  // The name proper is everything before the '=' (if any).  It depends only
+  // on long_name, so compute it once rather than per registered modifier.
+  const size_t new_modifier_namelen = strcspn(long_name, "=");
+  const bool new_has_modval = long_name[new_modifier_namelen] == '=';
+
+  // TODO(csilvers): store in a map or multimap, rather than a vector
+  for (vector<const ModifierInfo*>::const_iterator mod =
+           g_extension_modifiers.begin();
+       mod != g_extension_modifiers.end();
+       ++mod) {
+    // Check if mod has the same name as us.  For modifiers that also take
+    // values, this is everything before the =.  The only time it's ok to
+    // have the same name is when we have different modval specializations:
+    // "foo=bar" and "foo=baz" are both valid names.  Note "foo" and
+    // "foo=bar" is not valid: foo has no modval, but "foo=bar" does.
+    const string& existing_name = (*mod)->long_name;
+    const size_t existing_modifier_namelen = strcspn(existing_name.c_str(),
+                                                     "=");
+    if (new_modifier_namelen != existing_modifier_namelen ||
+        memcmp(long_name, existing_name.c_str(), new_modifier_namelen) != 0)
+      continue;   // Different name: no conflict with this entry.
+
+    const bool different_specializations =
+        new_has_modval &&
+        existing_name[existing_modifier_namelen] == '=' &&
+        existing_name != long_name;
+    if (!different_specializations) {
+      // It's not ok: we have the same name and no good excuse.
+      return false;
+    }
+    // It's ok, we're different specializations!
+  }
+
+  // The new ModifierInfo is held by raw pointer in the global registry.
+  g_extension_modifiers.push_back(
+      new ModifierInfo(long_name, '\0',
+                       xss_safe ? XSS_SAFE : XSS_UNIQUE,
+                       modifier));
+  return true;
+}
+
+// Registers an extension modifier with the XSS_UNIQUE XssClass.
+// Forwards to AddModifierCommon() and returns its result.
+bool AddModifier(const char* long_name,
+                 const TemplateModifier* modifier) {
+  const bool xss_safe = false;   // false selects XSS_UNIQUE.
+  return AddModifierCommon(long_name, modifier, xss_safe);
+}
+
+// Registers an extension modifier with the XSS_SAFE XssClass.
+// Forwards to AddModifierCommon() and returns its result.
+bool AddXssSafeModifier(const char* long_name,
+                 const TemplateModifier* modifier) {
+  const bool xss_safe = true;    // true selects XSS_SAFE.
+  return AddModifierCommon(long_name, modifier, xss_safe);
+}
+
+// If candidate_match is a better match for modname/modval than bestmatch,
+// update bestmatch.  To be a better match, three conditions must be met:
+//  1) The candidate's name must match modname
+//  2) If the candidate is a specialization (that is, name is of the form
+//     "foo=bar", then modval matches the specialization value).
+//  3) If the candidate is not a specialization, bestmatch isn't a
+//     specialization either.
+// Condition (3) makes sure that if we match the ModifierInfo with name
+// "foo=bar", we don't claim the ModifierInfo "foo=" is a better match.
+// Recall that by definition, modval will always start with a '=' if present.
+//
+//   modname, modname_len: the modifier name to look up (length is explicit);
+//                         it may be either a short name (one character) or
+//                         a long name.
+//   modval, modval_len:   the modifier value including its leading '=';
+//                         modval_len is 0 when there is no value.
+//   candidate_match:      the ModifierInfo being considered.
+//   best_match:           in/out; replaced by candidate_match when the
+//                         candidate is a better match.
+static void UpdateBestMatch(const char* modname, size_t modname_len,
+                            const char* modval, size_t modval_len,
+                            const ModifierInfo* candidate_match,
+                            const ModifierInfo** best_match) {
+  // It's easiest to handle the two cases differently: (1) candidate_match
+  // refers to a modifier that expects a modifier-value; (2) it doesn't.
+  if (candidate_match->modval_required) {
+    // To be a match, we have to fulfill three requirements: we have a
+    // modval, our modname matches candidate_match's modname (either
+    // shortname or longname), and our modval is consistent with the
+    // value specified in the longname (whatever might follow the =).
+    const char* const longname_start = candidate_match->long_name.c_str();
+    const char* const equals = strchr(longname_start, '=');
+    assert(equals != NULL);
+    // (equals - longname_start) is the length of the name before the '='.
+    // (longname_start + long_name.size() - equals) is the length of the
+    // "=value" tail, '=' included, which is why it is compared against
+    // modval starting at the '=' itself.
+    if (modval_len > 0 &&
+        ((modname_len == 1 && *modname == candidate_match->short_name) ||
+         (modname_len == equals - longname_start &&
+          memcmp(modname, longname_start, modname_len) == 0)) &&
+        ((equals[1] == '\0') ||  // name is "foo=" (not a specialization)
+         (modval_len
+          == longname_start + candidate_match->long_name.size() - equals &&
+          memcmp(modval, equals, modval_len) == 0))) {
+      // Condition (3) above is satisfied iff our longname is longer than
+      // best-match's longname (so we prefer "foo=bar" to "foo=").
+      if (*best_match == NULL ||
+          candidate_match->long_name.size() > (*best_match)->long_name.size())
+        *best_match = candidate_match;
+    }
+  } else {
+    // In this case, to be a match: we must *not* have a modval.  Our
+    // modname still must match modinfo's modname (either short or long).
+    if (modval_len == 0 &&
+        ((modname_len == 1 && *modname == candidate_match->short_name) ||
+         (modname_len == candidate_match->long_name.size() &&
+          !memcmp(modname, candidate_match->long_name.data(), modname_len)))) {
+      // In the no-modval case, only one match should exist.
+      assert(*best_match == NULL);
+      *best_match = candidate_match;
+    }
+  }
+}
+
+// Finds the ModifierInfo that best matches the given modifier name and
+// value.  modname/modval are given with explicit lengths; modval_len is 0
+// when there is no value.
+//
+// More than one modifier can match, in the case of modval specializations
+// (e.g., the modifier "foo=" and "foo=bar" will both match on input of
+// modname="foo", modval="bar").  In that case, we take the ModifierInfo
+// with the longest longname, since that's the most specialized match.
+//
+// Names that are not extension ("x-") names are looked up in g_modifiers
+// only, and NULL is returned when nothing matches.  Extension names are
+// looked up in g_extension_modifiers, then in g_unknown_modifiers; if both
+// fail, a new placeholder entry with a NULL modifier is appended to
+// g_unknown_modifiers and returned.
+const ModifierInfo* FindModifier(const char* modname, size_t modname_len,
+                                 const char* modval, size_t modval_len) {
+  typedef vector<const ModifierInfo*>::const_iterator ModifierIter;
+  const ModifierInfo* winner = NULL;
+
+  const bool is_extension = modname_len >= 2 && IsExtensionModifier(modname);
+  if (!is_extension) {
+    // Built-in modifier: scan the static table.
+    const ModifierWithAlternatives* const builtins_end =
+        g_modifiers + sizeof(g_modifiers)/sizeof(*g_modifiers);
+    for (const ModifierWithAlternatives* entry = g_modifiers;
+         entry < builtins_end; ++entry) {
+      UpdateBestMatch(modname, modname_len, modval, modval_len,
+                      &entry->modifier_info, &winner);
+    }
+    return winner;
+  }
+
+  // Extension modifier: registered ones take precedence.
+  for (ModifierIter it = g_extension_modifiers.begin();
+       it != g_extension_modifiers.end(); ++it) {
+    UpdateBestMatch(modname, modname_len, modval, modval_len, *it, &winner);
+  }
+  if (winner != NULL)
+    return winner;
+
+  // Next, the placeholders recorded by earlier failed lookups.
+  for (ModifierIter it = g_unknown_modifiers.begin();
+       it != g_unknown_modifiers.end(); ++it) {
+    UpdateBestMatch(modname, modname_len, modval, modval_len, *it, &winner);
+  }
+  if (winner != NULL)
+    return winner;
+
+  // This is the only situation where we can pass in a modifier of NULL.
+  // It means "we don't know about this modifier-name."
+  string fullname(modname, modname_len);
+  if (modval_len) {
+    fullname.append(modval, modval_len);
+  }
+  // TODO(csilvers): store in a map or multimap, rather than a vector
+  const ModifierInfo* const placeholder =
+      new ModifierInfo(fullname, '\0', XSS_UNIQUE, NULL);
+  g_unknown_modifiers.push_back(placeholder);
+  return g_unknown_modifiers.back();
+}
+
+// For escaping variables under the auto-escape mode:
+// Each directive below maps to a distinct sequence of
+// escaping directives (a NULL-terminated array of ModifierAndValue)
+// applied to a variable during run-time substitution.
+// The sequences are stored in the global array g_am_dirs, which is
+// sized by NUM_ENTRIES_AM and indexed by these enumerators, so the order
+// here must match the order of g_am_dirs' initializer.
+// NOTE(review): this comment used to name the array "g_mods_ae" and say it
+// is initialized under lock in InitializeGlobalModifiers; neither name
+// appears alongside g_am_dirs here - confirm and drop if stale.
+enum AutoModifyDirective {
+  AM_EMPTY,                         // Unused, kept as marker.
+  AM_HTML,
+  AM_HTML_UNQUOTED,
+  AM_JS,
+  AM_JS_NUMBER,
+  AM_URL_HTML,
+  AM_URL_QUERY,
+  AM_STYLE,
+  AM_XML,
+  NUM_ENTRIES_AM,                   // Number of directives; not a directive.
+};
+
+// The hard-coded modifier sequences that Auto-Escape may pick, one static
+// array per directive. Each points to the appropriate modifier in
+// the global g_modifiers by numeric index, so these arrays must be kept
+// in sync with the order of g_modifiers.
+// Reference these globals via the global array g_am_dirs[] for consistency.
+// Note: We allow for more than one ModifierAndValue in the array hence
+// the need to terminate with a Null marker. However currently all the
+// escaping directives have exactly one ModifierAndValue.
+// The second and third constructor arguments are presumably the modifier
+// value text (leading '=' included) and its length, e.g. "=attribute", 10.
+static const ModifierAndValue g_am_empty[] = {
+  ModifierAndValue(NULL, "", 0)
+};
+static const ModifierAndValue g_am_html[] = {
+  ModifierAndValue(&g_modifiers[1].modifier_info, "", 0),   // html_escape
+  ModifierAndValue(NULL, "", 0)
+};
+static const ModifierAndValue g_am_html_unquoted[] = {
+  // html_escape_with_arg=attribute
+  ModifierAndValue(&g_modifiers[4].modifier_info, "=attribute", 10),
+  ModifierAndValue(NULL, "", 0)
+};
+static const ModifierAndValue g_am_js[] = {
+  ModifierAndValue(&g_modifiers[6].modifier_info, "", 0),   // javascript_escape
+  ModifierAndValue(NULL, "", 0)
+};
+static const ModifierAndValue g_am_js_number[] = {
+  // javascript_escape_with_arg=number
+  ModifierAndValue(&g_modifiers[15].modifier_info, "=number", 7),
+  ModifierAndValue(NULL, "", 0)
+};
+static const ModifierAndValue g_am_url_html[] = {
+  // url_escape_with_arg=html
+  ModifierAndValue(&g_modifiers[11].modifier_info, "=html", 5),
+  ModifierAndValue(NULL, "", 0)
+};
+static const ModifierAndValue g_am_url_query[] = {
+  ModifierAndValue(&g_modifiers[9].modifier_info, "", 0),   // url_query_escape
+  ModifierAndValue(NULL, "", 0)
+};
+static const ModifierAndValue g_am_style[] = {
+  ModifierAndValue(&g_modifiers[0].modifier_info, "", 0),   // cleanse_css
+  ModifierAndValue(NULL, "", 0)
+};
+static const ModifierAndValue g_am_xml[] = {
+  ModifierAndValue(&g_modifiers[14].modifier_info, "", 0),  // xml_escape
+  ModifierAndValue(NULL, "", 0)
+};
+
+// Maps each AutoModifyDirective to its modifier sequence.  Indexed by the
+// enumerator value, so the entries must stay in the same order as the enum.
+static const ModifierAndValue* g_am_dirs[NUM_ENTRIES_AM] = {
+  g_am_empty,                  /* AM_EMPTY */
+  g_am_html,                   /* AM_HTML */
+  g_am_html_unquoted,          /* AM_HTML_UNQUOTED */
+  g_am_js,                     /* AM_JS */
+  g_am_js_number,              /* AM_JS_NUMBER */
+  g_am_url_html,               /* AM_URL_HTML */
+  g_am_url_query,              /* AM_URL_QUERY */
+  g_am_style,                  /* AM_STYLE */
+  g_am_xml,                    /* AM_XML */
+};
+
+// Renders a single modifier as text: a ':' followed by the modifier's
+// short name if it has one (otherwise its long name), followed by the
+// modifier value when value_len is non-zero.
+// modval.modifier_info is dereferenced unconditionally.
+string PrettyPrintOneModifier(const ModifierAndValue& modval) {
+  string rendered(":");
+  const char short_name = modval.modifier_info->short_name;
+  if (short_name) {
+    rendered.push_back(short_name);
+  } else {
+    rendered += modval.modifier_info->long_name;
+  }
+  if (modval.value_len != 0) {
+    rendered.append(modval.value, modval.value_len);
+  }
+  return rendered;
+}
+
+// Renders every modifier in modvals with PrettyPrintOneModifier() and joins
+// the results with separator (placed between items only).  Returns the
+// empty string for an empty vector.
+string PrettyPrintModifiers(const vector<const ModifierAndValue*>& modvals,
+                            const string& separator) {
+  string joined;
+  for (size_t i = 0; i < modvals.size(); ++i) {
+    if (i > 0)
+      joined += separator;
+    joined += PrettyPrintOneModifier(*modvals[i]);
+  }
+  return joined;
+}
+
+// Return the sequence of escaping directives to apply for the given context.
+// An empty vector indicates an error occurred. Currently we never need
+// to chain escaping directives hence on success, the vector is always of
+// size 1. This may change in the future.
+//
+//   htmlparser: must be non-NULL (asserted). Its current state decides the
+//               directive. In the STATE_VALUE case InsertText() is called
+//               on it before returning successfully.
+//   error_msg:  must be non-NULL (asserted). A description is appended to
+//               it when an unquoted URL, style or javascript attribute
+//               value makes the lookup fail.
+vector<const ModifierAndValue*> GetModifierForHtmlJs(
+    HtmlParser* htmlparser, string* error_msg) {
+  assert(htmlparser);
+  assert(error_msg);
+  vector<const ModifierAndValue*> modvals;
+
+  // Two cases of being inside javascript:
+  // 1. Inside raw javascript (within a <script> tag). If the value
+  //    is quoted we apply javascript_escape, if not we have to coerce
+  //    it to a safe value due to the risk of javascript code execution
+  //    hence apply :J=number. If arbitrary code needs to be inserted
+  //    at run-time, the developer must use :none.
+  // 2. In the value of an attribute that takes javascript such
+  //    as onmouseevent in '<a href="someUrl" onmousevent="{{EVENT}}">'.
+  //    That will be covered in the STATE_VALUE state logic below.
+  if (htmlparser->InJavascript() &&
+      htmlparser->state() != HtmlParser::STATE_VALUE) {
+    if (htmlparser->IsJavascriptQuoted()) {
+      modvals.push_back(g_am_dirs[AM_JS]);
+      assert(modvals.size() == 1);
+      return modvals;
+    } else {
+      modvals.push_back(g_am_dirs[AM_JS_NUMBER]);
+      assert(modvals.size() == 1);
+      return modvals;
+    }
+  }
+  switch (htmlparser->state()) {
+    case HtmlParser::STATE_VALUE:{
+      // Only used to build the error messages below.
+      string attribute_name = htmlparser->attribute();
+      switch (htmlparser->AttributeType()) {
+        case HtmlParser::ATTR_URI:
+          // Case 1: The URL is quoted:
+          // . Apply :U=html if it is a complete URL or :h if it is a fragment.
+          // Case 2: The URL is not quoted:
+          // .  If it is a complete URL, we have no safe modifiers that
+          //   won't break it so we have to fail.
+          // .  If it is a URL fragment, then :u is safe and not likely to
+          //   break the URL.
+          if (!htmlparser->IsAttributeQuoted()) {
+            if (htmlparser->IsUrlStart()) {   // Complete URL.
+              error_msg->append("Value of URL attribute \"" + attribute_name +
+                                "\" must be enclosed in quotes.");
+              assert(modvals.empty());
+              return modvals;  // Empty
+            } else {                                // URL fragment.
+              modvals.push_back(g_am_dirs[AM_URL_QUERY]);
+            }
+          } else {
+            // Only validate the URL if we have a complete URL,
+            // otherwise simply html_escape.
+            if (htmlparser->IsUrlStart())
+              modvals.push_back(g_am_dirs[AM_URL_HTML]);
+            else
+              modvals.push_back(g_am_dirs[AM_HTML]);
+          }
+          break;
+        case HtmlParser::ATTR_REGULAR:
+          // If the value is quoted, simply HTML escape, otherwise
+          // apply stricter escaping using H=attribute.
+          if (htmlparser->IsAttributeQuoted())
+            modvals.push_back(g_am_dirs[AM_HTML]);
+          else
+            modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
+          break;
+        case HtmlParser::ATTR_STYLE:
+          // If the value is quoted apply :c, otherwise fail.
+          if (htmlparser->IsAttributeQuoted()) {
+            modvals.push_back(g_am_dirs[AM_STYLE]);
+          } else {
+            error_msg->append("Value of style attribute \"" + attribute_name +
+                              "\" must be enclosed in quotes.");
+            assert(modvals.empty());
+            return modvals;   // Empty
+          }
+          break;
+        case HtmlParser::ATTR_JS:
+          // We require javascript accepting attributes (such as onclick)
+          // to be HTML quoted, otherwise they are vulnerable to
+          // HTML attribute insertion via the use of whitespace.
+          if (!htmlparser->IsAttributeQuoted()) {
+            error_msg->append("Value of javascript attribute \"" +
+                              attribute_name +
+                              "\" must be enclosed in quotes.");
+            assert(modvals.empty());
+            return modvals;   // Empty
+          }
+          // If the variable is quoted apply javascript_escape otherwise
+          // apply javascript_number which will ensure it is safe against
+          // code injection.
+          // Note: We normally need to HTML escape after javascript escape
+          // but the javascript escape implementation provided makes the
+          // HTML escape redundant so simply javascript escape.
+          if (htmlparser->IsJavascriptQuoted())
+            modvals.push_back(g_am_dirs[AM_JS]);
+          else
+            modvals.push_back(g_am_dirs[AM_JS_NUMBER]);
+          break;
+        case HtmlParser::ATTR_NONE:
+          assert("We should be in attribute!" && 0);
+          // No break here: when asserts are compiled out this falls through
+          // to default and returns the empty vector (presumably intended).
+        default:
+          assert("Should not be able to get here." && 0);
+          return modvals;  // Empty
+      }
+      // In STATE_VALUE particularly, the parser may get out of sync with
+      // the correct state - that the browser sees - due to the fact that
+      // it does not get to parse run-time content (variables). So we tell
+      // the parser there is content that will be expanded here.
+      // A good example is:
+      //   <a href={{URL}} alt={{NAME}}>
+      // The parser sees <a href= alt=> and interprets 'alt=' to be
+      // the value of href.
+      htmlparser->InsertText();  // Ignore return value.
+      assert(modvals.size() == 1);
+      return modvals;
+    }
+    case HtmlParser::STATE_TAG:{
+      // Apply H=attribute to tag names since they are alphabetic.
+      // Examples of tag names: TITLE, BODY, A and BR.
+      modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
+      assert(modvals.size() == 1);
+      return modvals;
+    }
+    case HtmlParser::STATE_ATTR:{
+      // Apply H=attribute to attribute names since they are alphabetic.
+      // Examples of attribute names: HREF, SRC and WIDTH.
+      modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
+      assert(modvals.size() == 1);
+      return modvals;
+    }
+    case HtmlParser::STATE_COMMENT:
+    case HtmlParser::STATE_TEXT:{
+      // Apply :h to regular HTML text and :c if within a style tag.
+      if (htmlparser->InCss())
+        modvals.push_back(g_am_dirs[AM_STYLE]);
+      else
+        modvals.push_back(g_am_dirs[AM_HTML]);
+      assert(modvals.size() == 1);
+      return modvals;
+    }
+    default:{
+      assert("Should not be able to get here." && 0);
+      return modvals;   // Empty
+    }
+  }
+  // Every case of the switch above returns, so this is not reached.
+  assert("Should not be able to get here." && 0);
+  return modvals;   // Empty
+}
+
+// TODO(jad): Memoize all GetModifierForXXX functions below.
+//            They don't depend on parser context (from csilvers).
+//
+// Returns the escaping directives for a CSS context: a one-element vector
+// holding the AM_STYLE sequence.  Neither parameter is used.
+vector<const ModifierAndValue*> GetModifierForCss(HtmlParser* htmlparser,
+                                                  string* error_msg) {
+  return vector<const ModifierAndValue*>(1, g_am_dirs[AM_STYLE]);
+}
+
+// Returns the escaping directives for an XML context: a one-element vector
+// holding the AM_XML sequence.  Neither parameter is used.
+vector<const ModifierAndValue*> GetModifierForXml(HtmlParser* htmlparser,
+                                                        string* error_msg) {
+  return vector<const ModifierAndValue*>(1, g_am_dirs[AM_XML]);
+}
+
+// Returns the escaping directives for a JSON context: a one-element vector
+// holding the AM_JS sequence.  Neither parameter is used.
+vector<const ModifierAndValue*> GetModifierForJson(HtmlParser* htmlparser,
+                                                         string* error_msg) {
+  return vector<const ModifierAndValue*>(1, g_am_dirs[AM_JS]);
+}
+
+// Default escaping directives for HTML: a one-element vector holding the
+// AM_HTML sequence.
+vector<const ModifierAndValue*> GetDefaultModifierForHtml() {
+  return vector<const ModifierAndValue*>(1, g_am_dirs[AM_HTML]);
+}
+
+// Default escaping directives for javascript: a one-element vector holding
+// the AM_JS sequence.
+vector<const ModifierAndValue*> GetDefaultModifierForJs() {
+  return vector<const ModifierAndValue*>(1, g_am_dirs[AM_JS]);
+}
+
+// Default escaping directives for CSS: whatever GetModifierForCss() returns
+// when given no parser and no error string.
+vector<const ModifierAndValue*> GetDefaultModifierForCss() {
+  HtmlParser* const no_parser = NULL;
+  string* const no_error_msg = NULL;
+  return GetModifierForCss(no_parser, no_error_msg);
+}
+
+// Default escaping directives for XML: whatever GetModifierForXml() returns
+// when given no parser and no error string.
+vector<const ModifierAndValue*> GetDefaultModifierForXml() {
+  HtmlParser* const no_parser = NULL;
+  string* const no_error_msg = NULL;
+  return GetModifierForXml(no_parser, no_error_msg);
+}
+
+// Default escaping directives for JSON: whatever GetModifierForJson()
+// returns when given no parser and no error string.
+vector<const ModifierAndValue*> GetDefaultModifierForJson() {
+  HtmlParser* const no_parser = NULL;
+  string* const no_error_msg = NULL;
+  return GetModifierForJson(no_parser, no_error_msg);
+}
+
+}
commit	8ab8a65883d39259a33ea8927c4f3411e44d5e41	[log] [tgz]
author	Brian Silverman <brians>	Mon Sep 21 17:49:11 2015 -0400
committer	Brian Silverman <brians>	Mon Sep 21 17:49:11 2015 -0400
tree	e95fa14d1cb63e92012afc0a4d09bae7745d422f
parent	f20afaa78874f228ad554a24021ab1dd72f172e9 [diff] [blame]