# Copyright (c) 2008, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ---
#
# Author: falmeida@google.com (Filipe Almeida)
32
# TODO(falmeida): Add more descriptive names to the states and drop the
# abbreviations.
# TODO(falmeida): Reorder the states so that it's easier to read.
# TODO(falmeida): Support CDATA blocks in the form: <![CDATA[.
37
# Name of the state machine defined by this file.
name = 'htmlparser'

# Free-form description of the state machine. This used to read "HTTP 4.1";
# the machine below tokenizes HTML (the 'space' condition cites the HTML 4.01
# specification), so the description now says so.
comment = 'Definition of a finite state machine for a subset of HTML 4.01'
41
# Character conditions. Each condition(name, expression) call declares a
# name that the transition lists of the states below refer to, together with
# the expression describing which characters it matches.

# Single-character markup delimiters.
condition('<', '<')
condition('>', '>')
condition('=', '=')

# TODO(falmeida): This is not the correct expression. tag and attribute names
# can only consist of alpha character.
# 'idtag' is the same set as 'id' with '/' added.
condition('id', 'A-Za-z0-9_:.-')
condition('idtag', 'A-Za-z0-9/_:.-')

# Whitespace according to: http://www.w3.org/TR/html401/struct/text.html#h-9.1
condition('space', ' \t\n\r')
condition('!', '!')
# 'q' is the single quote and 'dq' the double quote.
condition('q', '\'')
condition('dq', '\"')
condition('/', '/')
condition('*', '*')
condition('-', '-')
condition('?', '?')
condition('lf', '\n')
# NOTE(review): despite its name, 'quote' matches the backslash character.
condition('quote', '\\')

# TODO(falmeida): This default rule is a hack and shouldn't be here.
# Listed last in most transition lists below; presumably matches whatever the
# more specific conditions of a state did not -- confirm against the generator.
condition('default', '[:default:]')
65
# Plain text outside of any markup. A '<' moves to tag_start; everything else
# stays in text.
state(name = 'text',
      external = 'text',
      transitions = [
        ['<', 'tag_start'],
        ['default', 'text']
      ])
72
# When we found the < character in text.
# Tag opening is defined in the HTML5 draft here:
# http://www.whatwg.org/specs/web-apps/current-work/#tag-open-state
# We don't exactly follow this and are much more loose in order to mimic the way
# the major browsers behave.
#
# An 'idtag' character starts a tag name, '?' a processing instruction and
# '!' a declaration or comment. Another '<' restarts this state; anything else
# falls back to text.
state(name = 'tag_start',
      external = 'tag',
      transitions = [
        ['idtag', 'tag_name'],
        ['?', 'pi'],
        ['!', 'declaration_start'],
        ['<', 'tag_start'],
        ['default', 'text']
      ])
87
# Name of the tag. Includes the closing tag character '/'.
# Whitespace ends the name and moves to tag_space; '>' closes the tag.
# No 'default' transition is declared for this state.
state(name = 'tag_name',
      external = 'tag',
      transitions = [
        ['idtag', 'tag_name'],
        ['space', 'tag_space'],
        ['>', 'tag_close']
      ])
96
# HTML declaration and comment parsing
#
# We don't expose declaration state because at this point we only want to
# ensure that we are parsing them correctly so we don't get out of sync.
# This is specifically made for DOCTYPE declarations and won't work if DTD's
# are defined inside the declaration.
# The HTML5 spec says we should specifically look for the string
# '<!DOCTYPE HTML' but that will add a lot of unnecessary states, and unless
# we build a simple declarative way to unfold a string match into multiple
# states, I don't think it's worth worrying about for now.
107
# Got '<!'. The next character will decide if we open a declaration or a
# comment: '-' may begin a comment, '>' ends the construct immediately and
# returns to text, and any other character starts a declaration body.
state(name = 'declaration_start',
      external = 'text',
      transitions = [
        ['-', 'comment_open'],
        ['>', 'text'],
        ['default', 'declaration_body']
      ])
117
# Inside a declaration. Ie: <!DOCTYPE. We close when we see a '>'; every other
# character keeps us inside the declaration.
state(name = 'declaration_body',
      external = 'text',
      transitions = [
        ['>', 'text'],
        ['default', 'declaration_body']
      ])
125
# Got '<!-'. A second '-' enters the comment body; anything else returns to
# text.
state(name = 'comment_open',
      external = 'text',
      transitions = [
        ['-', 'comment_body'],
        ['default', 'text']
      ])
133
# Inside a comment. We only close when we see '-->'. A '-' here is the first
# step towards that terminator.
state(name = 'comment_body',
      external = 'comment',
      transitions = [
        ['-', 'comment_dash'],
        ['default', 'comment_body']
      ])
141
# Got '-' inside a comment. A second '-' moves on to comment_dash_dash; any
# other character means it was an ordinary dash and we resume the comment body.
state(name = 'comment_dash',
      external = 'comment',
      transitions = [
        ['-', 'comment_dash_dash'],
        ['default', 'comment_body']
      ])
149
# Got '--' inside a comment. '>' completes '-->' and returns to text. Further
# dashes keep us here, so '--->' also closes; anything else resumes the comment
# body.
state(name = 'comment_dash_dash',
      external = 'comment',
      transitions = [
        ['-', 'comment_dash_dash'],
        ['>', 'text'],
        ['default', 'comment_body']
      ])
158
# XML Processing instruction parsing according to:
# http://www.w3.org/TR/REC-xml/#sec-pi
#
# Everything between the characters <? and ?> is considered to be part of the
# processing instruction. A '?' may be the start of the closing '?>'.
state(name = 'pi',
      external = 'text',
      transitions = [
        ['?', 'pi_may_end'],
        ['default', 'pi']
      ])
170
# Got '?' inside a processing instruction. '>' completes '?>' and returns to
# text. Another '?' keeps us in this state so that input such as '??>' still
# terminates the instruction (the same way comment_dash_dash loops on '-');
# any other character resumes the instruction body.
state(name = 'pi_may_end',
      external = 'text',
      transitions = [
        ['?', 'pi_may_end'],
        ['>', 'text'],
        ['default', 'pi']
      ])
177
# Whitespace between tag name, attributes.
# '>' closes the tag, an 'id' character starts an attribute name, and both
# whitespace and '/' keep us here. No 'default' transition is declared.
state(name = 'tag_space',
      external = 'tag',
      transitions = [
        ['>', 'tag_close'],
        ['space', 'tag_space'],
        ['id', 'attr'],
        ['/', 'tag_space']
      ])
187
# Reached on the '>' that ends a tag. The next character either opens a new
# tag ('<') or is ordinary text.
state(name = 'tag_close',
      external = 'text',
      transitions = [
        ['<', 'tag_start'],
        ['default', 'text']
      ])
194
# Name of the attribute.
# 'id' characters extend the name, '=' introduces its value, whitespace moves
# to attr_space, '/' returns to tag_space and '>' closes the tag.
# No 'default' transition is declared.
state(name = 'attr',
      external = 'attr',
      transitions = [
        ['id', 'attr'],
        ['>', 'tag_close'],
        ['/', 'tag_space'],
        ['=', 'value'],
        ['space', 'attr_space']
      ])
205
# After the attribute name.
# Whitespace is skipped; '=' introduces a value for the attribute, an 'id'
# character starts another attribute name, '/' returns to tag_space and '>'
# closes the tag. No 'default' transition is declared.
state(name = 'attr_space',
      external = 'attr',
      transitions = [
        ['>', 'tag_close'],
        ['space', 'attr_space'],
        ['id', 'attr'],
        ['/', 'tag_space'],
        ['=', 'value']
      ])
216
# Expecting a value, after attribute=
# A quote character selects the matching quoted-value state, whitespace is
# skipped, '>' closes the tag, and any other character starts an unquoted
# value.
state(name = 'value',
      external = 'value',
      transitions = [
        ['q', 'value_q_start'],
        ['dq', 'value_dq_start'],
        ['space', 'value'],
        ['>', 'tag_close'],
        ['default', 'value_text']
      ])
227
# Unquoted attribute value. It ends at whitespace (back to tag_space) or at
# '>' (tag closed); every other character is part of the value.
state(name = 'value_text',
      external = 'value',
      transitions = [
        ['>', 'tag_close'],
        ['space', 'tag_space'],
        ['default', 'value_text']
      ])
236
# First character of a single quoted attribute value. An immediate closing
# quote (empty value) returns to tag_space; anything else enters value_q.
state(name = 'value_q_start',
      external = 'value',
      transitions = [
        ['q', 'tag_space'],
        ['default', 'value_q']
      ])
244
# In the middle of a single quoted attribute value. Only the closing single
# quote ends the value; every other character is part of it.
state(name = 'value_q',
      external = 'value',
      transitions = [
        ['q', 'tag_space'],
        ['default', 'value_q']
      ])
252
# First character of a double quoted attribute value. An immediate closing
# quote (empty value) returns to tag_space; anything else enters value_dq.
state(name = 'value_dq_start',
      external = 'value',
      transitions = [
        ['dq', 'tag_space'],
        ['default', 'value_dq']
      ])
260
# In the middle of a double quoted attribute value. Only the closing double
# quote ends the value; every other character is part of it.
state(name = 'value_dq',
      external = 'value',
      transitions = [
        ['dq', 'tag_space'],
        ['default', 'value_dq']
      ])
268
# CDATA escaping text spans.
# TODO(falmeida): These states should go after cdata_text.
271
# Got '<!' inside a CDATA section. A '-' may begin a comment; anything else
# goes back to cdata_text.
state(name = 'cdata_comment_start',
      external = 'text',
      transitions = [
        ['-', 'cdata_comment_start_dash'],
        ['default', 'cdata_text'],
      ])
279
# Got '<!-' inside a CDATA section. A second '-' enters the comment body;
# anything else goes back to cdata_text.
state(name = 'cdata_comment_start_dash',
      external = 'text',
      transitions = [
        ['-', 'cdata_comment_body'],
        ['default', 'cdata_text']
      ])
287
# Inside a comment within a CDATA section. A '-' is the first step towards
# the closing '-->'; every other character stays in the comment.
state(name = 'cdata_comment_body',
      external = 'text',
      transitions = [
        ['-', 'cdata_comment_dash'],
        ['default', 'cdata_comment_body']
      ])
295
# Got '-' inside a comment. A second '-' moves on to cdata_comment_dash_dash;
# any other character resumes the comment body.
state(name = 'cdata_comment_dash',
      external = 'text',
      transitions = [
        ['-', 'cdata_comment_dash_dash'],
        ['default', 'cdata_comment_body']
      ])
303
# Got '--' inside a comment. '>' completes '-->' and returns to cdata_text.
# Further dashes keep us here; anything else resumes the comment body.
state(name = 'cdata_comment_dash_dash',
      external = 'text',
      transitions = [
        ['-', 'cdata_comment_dash_dash'],
        ['>', 'cdata_text'],
        ['default', 'cdata_comment_body']
      ])
312
# CDATA processing
#
# To simplify the code, we treat RCDATA and CDATA sections the same since the
# differences between them don't affect the context we are in.
# Only '<' is significant here; it may begin a closing tag or a comment.
state(name = 'cdata_text',
      external = 'text',
      transitions = [
        ['<', 'cdata_lt'],
        ['default', 'cdata_text']
      ])
323
# Possible beginning of the closing tag.
# '/' may start the closing tag and '!' may start a comment. Another '<'
# keeps us in this state (as tag_start does for '<<'), so that input such as
# '<</tag>' is still examined as a candidate closing tag instead of dropping
# back to cdata_text. Anything else is ordinary CDATA text.
state(name = 'cdata_lt',
      external = 'text',
      transitions = [
        ['<', 'cdata_lt'],
        ['/', 'cdata_may_close'],
        ['!', 'cdata_comment_start'],
        ['default', 'cdata_text']
      ])
332
# If we encounter </tag where tag matches the last opened tag, we exit the
# CDATA section. Part of this logic is handled in the code.
# 'idtag' characters accumulate the candidate tag name, '>' goes to text and
# whitespace goes to tag_space; any other character returns to cdata_text.
state(name = 'cdata_may_close',
      external = 'text',
      transitions = [
        ['idtag', 'cdata_may_close'],
        ['>', 'text'],
        ['space', 'tag_space'],
        ['default', 'cdata_text']
      ])
343
# The next states are used for specialized parser modes.
# js_file has a single transition that loops back to itself, so the machine
# never leaves this state through the transition table.
state(name = 'js_file',
      external = 'js_file',
      transitions = [
        ['default', 'js_file']
      ])
350
# TODO(falmeida): Having css_file and js_file as the external name doesn't make
#                 sense. This should instead be text and the js/css state be
#                 returned by in_js() and in_css().
# Like js_file, this state only loops back to itself.
state(name = 'css_file',
      external = 'css_file',
      transitions = [
        ['default', 'css_file']
      ])
359
# State whose only transition loops back to itself; reported externally as
# 'text'.
state(name = 'null',
      external = 'text',
      transitions = [
        ['default', 'null']
      ])
365