| Class | HTML5::HTMLParser |
| In: |
lib/feed_tools/vendor/html5/lib/html5/html5parser.rb
|
| Parent: | Object |
HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML
| errors | [R] | |
| first_start_tag | [RW] | |
| inner_html | [RW] | |
| insert_from_table | [RW] | |
| last_phase | [RW] | |
| phase | [RW] | |
| phases | [R] | |
| tokenizer | [R] | |
| tree | [R] |
Options: :strict - raise an exception when a parse error is encountered. :tree - a treebuilder class controlling the type of tree that will be returned. Built-in treebuilders can be accessed through HTML5::TreeBuilders[treeType].
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 41
# Builds a parser and instantiates one phase object per entry in @@phases.
#
# options - Hash of settings. Every key/value pair is stored verbatim as the
#           instance variable "@<key>", so it can override any default set
#           below (e.g. :strict, :tree, :tokenizer, :lowercase_attr_name,
#           :lowercase_element_name).
def initialize(options = {})
  # Defaults. They are assigned *before* the options are applied so that a
  # caller-supplied value always wins. (Testing afterwards with
  # instance_variables.include?("@name") is unreliable: on Ruby >= 1.9
  # instance_variables returns Symbols, so the String never matches and the
  # caller's value would be reset to nil.)
  @strict = false
  @errors = []
  @tokenizer = HTMLTokenizer
  @tree = TreeBuilders::REXML::TreeBuilder
  @lowercase_attr_name = nil
  @lowercase_element_name = nil

  options.each { |name, value| instance_variable_set("@#{name}", value) }

  # @tree holds a treebuilder class up to this point; turn it into an instance.
  @tree = @tree.new

  # Map each phase name to a phase object, e.g. "inBody" -> InBodyPhase.
  # Only the first character is upcased so the camelCase remainder survives.
  # @phases is assigned once the table is complete.
  phase_table = {}
  @@phases.each do |phase_name|
    phase_class_name = phase_name.sub(/(.)/) { $1.upcase } + 'Phase'
    phase_table[phase_name.to_sym] = HTML5.const_get(phase_class_name).new(self, @tree)
  end
  @phases = phase_table
end
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 23
# Convenience entry point: builds a parser from +options+ and parses +stream+
# as a document.
#
# stream  - passed through to the instance method #parse.
# options - Hash of parser options. :encoding is extracted and passed to
#           #parse; everything else goes to the constructor.
#
# Returns whatever the instance method #parse returns.
def self.parse(stream, options = {})
  # Work on a copy so the caller's hash is not modified by the delete below.
  options = options.dup
  encoding = options.delete(:encoding)
  new(options).parse(stream, encoding)
end
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 28
# Convenience entry point: builds a parser from +options+ and parses +stream+
# as a fragment.
#
# stream  - passed through to the instance method #parse_fragment.
# options - Hash of parser options. :container (default 'div') and :encoding
#           are extracted and passed to #parse_fragment; everything else goes
#           to the constructor.
#
# Returns whatever the instance method #parse_fragment returns.
def self.parse_fragment(stream, options = {})
  # Work on a copy so the caller's hash is not modified by the deletes below.
  options = options.dup
  container = options.delete(:container) || 'div'
  encoding = options.delete(:encoding)
  new(options).parse_fragment(stream, container, encoding)
end
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 245
# Returns +string+ unchanged.
# NOTE(review): looks like a gettext-style hook for message strings - confirm
# against callers.
def _(string)
  string
end
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 61
# Shared driver behind #parse and #parse_fragment: resets parser state, builds
# a tokenizer over +stream+, selects the starting phase and feeds every token
# to the current phase.
#
# stream     - input handed to the tokenizer.
# inner_html - truthy when parsing a fragment, falsy for a whole document.
# encoding   - forwarded to the tokenizer as :encoding.
# container  - name of the fragment's context element (fragment mode only).
#
# Returns the result of the final phase's process_eof.
def _parse(stream, inner_html, encoding, container = 'div')
  # Every run starts from an empty tree and an empty error list.
  @tree.reset
  @first_start_tag = false
  @errors = []

  # After a run @tokenizer holds an instance rather than a class; recover the
  # class so a fresh tokenizer can be created.
  @tokenizer = @tokenizer.class unless Class === @tokenizer
  @tokenizer = @tokenizer.new(stream,
                              :encoding => encoding,
                              :parseMeta => !inner_html,
                              :lowercase_attr_name => @lowercase_attr_name,
                              :lowercase_element_name => @lowercase_element_name)

  if inner_html
    # Fragment mode: the container element decides how the tokenizer treats
    # the content it is about to read.
    @inner_html = container.downcase
    @tokenizer.content_model_flag =
      case @inner_html
      when 'title', 'textarea'
        :RCDATA
      when 'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'
        :CDATA
      when 'plaintext'
        :PLAINTEXT
      else
        :PCDATA
      end

    @phase = @phases[:rootElement]
    @phase.insert_html_element
    reset_insertion_mode
  else
    # Document mode.
    @inner_html = false
    @phase = @phases[:initial]
  end

  # We only seem to have InBodyPhase testcases where the following is
  # relevant ... need others too
  @last_phase = nil

  # XXX This is temporary for the moment so there isn't any other
  # changes needed for the parser to work with the iterable tokenizer
  @tokenizer.each do |raw_token|
    token = normalize_token(raw_token)
    kind = token[:type]
    handler = "process#{kind}"

    # @phase is re-read for every token because handlers may switch phases.
    case kind
    when :Characters, :SpaceCharacters, :Comment
      @phase.send(handler, token[:data])
    when :StartTag
      @phase.send(handler, token[:name], token[:data])
    when :EndTag
      @phase.send(handler, token[:name])
    when :Doctype
      @phase.send(handler, token[:name], token[:publicId],
                  token[:systemId], token[:correct])
    else
      # Any other token type is recorded as a parse error.
      parse_error(token[:data], token[:datavars])
    end
  end

  # The tokenizer is exhausted, i.e. EOF.
  @phase.process_eof
end
HTML5-specific normalizations applied to the token stream.
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 157
# Applies HTML5-specific normalizations to a single token, in place.
#
# token - Hash with at least :type; tag tokens also carry :name and :data
#         (a list of [attribute, value] pairs).
#
# * :EmptyTag becomes :StartTag; a parse error is recorded when the name is
#   not listed in VOID_ELEMENTS.
# * :StartTag names are downcased and a non-empty attribute list is turned
#   into a Hash with downcased keys (first occurrence of a name wins).
# * :EndTag names are downcased; attributes on an end tag are a parse error.
#
# Returns the same (mutated) token.
def normalize_token(token)
  if token[:type] == :EmptyTag
    # Self-closing syntax (a solidus inside the tag) is only acceptable on
    # void elements; anywhere else it is reported. Either way the token is
    # handled as an ordinary start tag from here on.
    unless VOID_ELEMENTS.include?(token[:name])
      parse_error("incorrectly-placed-solidus")
    end

    token[:type] = :StartTag
  end

  if token[:type] == :StartTag
    token[:name] = token[:name].downcase

    # Drop duplicate attributes and convert the pair list to a Hash so that
    # [["x", "y"], ["x", "z"]] becomes {"x" => "y"}. Walking the list
    # backwards lets earlier pairs overwrite later ones, so the first
    # occurrence wins. The Hash is filled directly rather than through
    # Hash[*pairs.flatten], which would also flatten array-valued entries
    # and pass every element as a separate argument.
    unless token[:data].empty?
      attributes = {}
      token[:data].reverse_each do |attr, value|
        attributes[attr.downcase] = value
      end
      token[:data] = attributes
    end

  elsif token[:type] == :EndTag
    parse_error("attributes-in-end-tag") unless token[:data].empty?
    token[:name] = token[:name].downcase
  end

  token
end
Parse an HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element)
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 129
# Parses +stream+ as a complete document.
#
# stream   - input handed to _parse.
# encoding - optional encoding handed to _parse (nil by default).
#
# Returns the tree builder's document.
def parse(stream, encoding = nil)
  fragment_mode = false
  _parse(stream, fragment_mode, encoding)
  @tree.get_document
end
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 150
# Records a parse error and, in strict mode, aborts parsing.
#
# code - error identifier (defaults to 'XXX-undefined-error').
# data - extra values stored with the error (defaults to an empty Hash).
#
# Appends [stream position, code, data] to @errors.
# Raises ParseError when @strict is set.
def parse_error(code = 'XXX-undefined-error', data = {})
  # XXX The idea is to make data mandatory.
  position = @tokenizer.stream.position
  @errors << [position, code, data]
  raise ParseError if @strict
end
container - name of the element whose inner_html property is being set; if nil, defaults to 'div'
stream - a filelike object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element)
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 145
# Parses +stream+ as a fragment inside a +container+ element.
#
# stream    - input handed to _parse.
# container - name of the context element; nil falls back to 'div'.
# encoding  - optional encoding handed to _parse (nil by default).
#
# Returns the tree builder's fragment.
def parse_fragment(stream, container = 'div', encoding = nil)
  # An explicit nil is treated like an omitted argument, matching
  # self.parse_fragment; _parse calls container.downcase and would fail on nil.
  container ||= 'div'
  _parse(stream, true, encoding, container)
  @tree.get_fragment
end
# File lib/feed_tools/vendor/html5/lib/html5/html5parser.rb, line 207
# Picks the phase matching the current stack of open elements, walking the
# stack from the most recently opened element back to the first one.
#
# The name of this method is mostly historical. (It's also used in the
# specification.)
def reset_insertion_mode
  @tree.open_elements.reverse.each do |node|
    node_name = node.name
    at_bottom = (node == @tree.open_elements.first)

    # At the first open element, anything other than td/th is replaced by the
    # fragment container name.
    # XXX assert @inner_html
    node_name = @inner_html if at_bottom && !['td', 'th'].include?(node_name)

    # 'select', 'colgroup', 'head' and 'frameset' should only be seen here in
    # the inner_html case.
    # XXX assert @inner_html

    if @@new_modes.has_key?(node_name)
      @phase = @phases[@@new_modes[node_name]]
      break
    end

    if node_name == 'html'
      @phase = @phases[@tree.head_pointer.nil? ? :beforeHead : :afterHead]
      break
    end

    if at_bottom
      @phase = @phases[:inBody]
      break
    end

    # No rule matched: continue with the next element down the stack.
  end
end