| Class | HTML5::HTMLTokenizer |
| In: |
lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb
|
| Parent: | Object |
This class takes care of tokenizing HTML.
| content_model_flag | [RW] | |
| current_token | [RW] | |
| stream | [R] | |
XXX need to fix documentation
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 26
# Builds a tokenizer around +stream+ (wrapped in an HTMLInputStream).
# Options: :lowercase_element_name and :lowercase_attr_name both default
# to true and are only disabled by an explicit +false+.
def initialize(stream, options = {})
  @stream = HTMLInputStream.new(stream, options)

  # Initial state of the tokenizer state machine.
  @content_model_flag = :PCDATA
  @state              = :data_state
  @escapeFlag         = false
  @lastFourChars      = []

  # Token currently under construction, if any.
  @current_token = nil

  # Completed tokens waiting to be yielded by #each.
  @token_queue = []

  @lowercase_element_name = options[:lowercase_element_name] != false
  @lowercase_attr_name    = options[:lowercase_attr_name] != false
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 491
# An attribute name is complete; decide whether a value, another
# attribute, or the end of the tag follows.
def after_attribute_name_state
  ch = @stream.char
  if SPACE_CHARACTERS.include?(ch)
    # Collapse the rest of the whitespace run in one call.
    @stream.chars_until(SPACE_CHARACTERS, true)
  elsif ch == "="
    @state = :before_attribute_value_state
  elsif ch == ">"
    emit_current_token
  elsif ch == :EOF
    @token_queue << {:type => :ParseError, :data => "expected-end-of-tag-but-got-eof"}
    emit_current_token
  elsif ASCII_LETTERS.include?(ch)
    # A new attribute begins here.
    @current_token[:data].push([ch, ""])
    @state = :attribute_name_state
  elsif ch == "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    # Any other character also opens a new attribute name.
    @current_token[:data].push([ch, ""])
    @state = :attribute_name_state
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 762
# After the DOCTYPE name: whitespace is skipped, ">" finishes the token,
# and a six-character lookahead checks (case-insensitively) for the
# PUBLIC or SYSTEM keywords; anything else is a bogus doctype.
def after_doctype_name_state
  data = @stream.char
  if SPACE_CHARACTERS.include? data
    # Skip whitespace.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @current_token[:correct] = false
    @stream.unget(data)
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @token_queue << @current_token
    @state = :data_state
  else
    # Read five more characters so char_stack holds six in total.
    # Use @stream consistently (was a bare `stream` reader call).
    char_stack = [data]
    5.times { char_stack << @stream.char }
    token = char_stack.join('').tr(ASCII_UPPERCASE, ASCII_LOWERCASE)
    if token == "public" and !char_stack.include?(:EOF)
      @state = :before_doctype_public_identifier_state
    elsif token == "system" and !char_stack.include?(:EOF)
      @state = :before_doctype_system_identifier_state
    else
      # Not a keyword: put the lookahead back and fall into bogus mode.
      @stream.unget(char_stack)
      # Consistency fix: use the :datavars symbol key like every other
      # parse error in this tokenizer (was the string "datavars").
      @token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", :datavars => {:data => data}}
      @state = :bogus_doctype_state
    end
  end
  return true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 849
# Between the public identifier and an optional system identifier.
def after_doctype_public_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    # Ignore whitespace here.
  elsif data == "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  elsif data == "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # Bug fix: this branch handles a stray character, not end-of-file,
    # so report "unexpected-char-in-doctype" (was "eof-in-doctype").
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  return true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 929
# After the system identifier only whitespace and ">" are legal.
def after_doctype_system_identifier_state
  data = @stream.char
  if SPACE_CHARACTERS.include?(data)
    # Ignore whitespace here.
  elsif data == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    # Bug fix: this branch handles a stray character, not end-of-file,
    # so report "unexpected-char-in-doctype" (was "eof-in-doctype").
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  return true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 444
# Accumulates the current attribute's name (the last [name, value] pair
# in @current_token[:data]).  leavingThisState marks the name as
# complete so the lowercasing and duplicate-attribute check below run
# exactly once per attribute.
def attribute_name_state
  data = @stream.char
  leavingThisState = true
  emitToken = false
  if data == "="
    @state = :before_attribute_value_state
  elsif data == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-name"}
    @state = :data_state
    emitToken = true
  elsif ASCII_LETTERS.include? data
    # Pull in the whole run of letters in one call.
    @current_token[:data][-1][0] += data + @stream.chars_until(ASCII_LETTERS, true)
    leavingThisState = false
  elsif data == ">"
    # XXX If we emit here the attributes are converted to a dict
    # without being checked and when the code below runs we error
    # because data is a dict not a list
    emitToken = true
  elsif SPACE_CHARACTERS.include? data
    @state = :after_attribute_name_state
  elsif data == "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    @current_token[:data][-1][0] += data
    leavingThisState = false
  end

  if leavingThisState
    # Attributes are not dropped at this stage. That happens when the
    # start tag token is emitted so values can still be safely appended
    # to attributes, but we do want to report the parse error in time.
    if @lowercase_attr_name
      @current_token[:data][-1][0] = @current_token[:data].last.first.downcase
    end
    # Compare the just-finished name against all earlier ones.
    @current_token[:data][0...-1].each {|name,value|
      if @current_token[:data].last.first == name
        @token_queue << {:type => :ParseError, :data => "duplicate-attribute"}
        break # don't report an error more than once
      end
    }
    # XXX Fix for above XXX
    emit_current_token if emitToken
  end
  return true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 538
# Collects a double-quoted attribute value until the closing quote.
def attribute_value_double_quoted_state
  ch = @stream.char
  if ch == "\""
    @state = :before_attribute_name_state
  elsif ch == "&"
    process_entity_in_attribute
  elsif ch == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-double-quote"}
    emit_current_token
  else
    # Append this character plus everything up to the next quote/entity.
    @current_token[:data][-1][1] += ch + @stream.chars_until(["\"", "&"])
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 553
# Collects a single-quoted attribute value until the closing "'".
def attribute_value_single_quoted_state
  ch = @stream.char
  if ch == "'"
    @state = :before_attribute_name_state
  elsif ch == "&"
    process_entity_in_attribute
  elsif ch == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-single-quote"}
    emit_current_token
  else
    # Append this character plus everything up to the next quote/entity.
    @current_token[:data][-1][1] += ch + @stream.chars_until(["'", "&"])
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 569
# Collects an unquoted attribute value; whitespace or ">" terminates it.
def attribute_value_unquoted_state
  ch = @stream.char
  if SPACE_CHARACTERS.include?(ch)
    @state = :before_attribute_name_state
  elsif ch == "&"
    process_entity_in_attribute
  elsif ch == ">"
    emit_current_token
  elsif ch == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-attribute-value-no-quotes"}
    emit_current_token
  else
    # Bulk-read up to the next character that could end or alter the value.
    @current_token[:data][-1][1] += ch + @stream.chars_until(["&", ">","<"] + SPACE_CHARACTERS)
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 423
# Between the tag name (or a previous attribute) and the next attribute.
def before_attribute_name_state
  ch = @stream.char
  if SPACE_CHARACTERS.include?(ch)
    # Swallow the whole whitespace run.
    @stream.chars_until(SPACE_CHARACTERS, true)
  elsif ch == :EOF
    @token_queue << {:type => :ParseError, :data => "expected-attribute-name-but-got-eof"}
    emit_current_token
  elsif ASCII_LETTERS.include?(ch)
    # Start a fresh [name, value] pair for the new attribute.
    @current_token[:data].push([ch, ""])
    @state = :attribute_name_state
  elsif ch == ">"
    emit_current_token
  elsif ch == "/"
    process_solidus_in_tag
  else
    @current_token[:data].push([ch, ""])
    @state = :attribute_name_state
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 515
# Determines how the attribute value is delimited (quoted or bare).
def before_attribute_value_state
  ch = @stream.char
  if SPACE_CHARACTERS.include?(ch)
    @stream.chars_until(SPACE_CHARACTERS, true)
  elsif ch == "\""
    @state = :attribute_value_double_quoted_state
  elsif ch == "&"
    # The "&" belongs to the (unquoted) value itself; push it back.
    @state = :attribute_value_unquoted_state
    @stream.unget(ch)
  elsif ch == "'"
    @state = :attribute_value_single_quoted_state
  elsif ch == ">"
    emit_current_token
  elsif ch == :EOF
    @token_queue << {:type => :ParseError, :data => "expected-attribute-value-but-got-eof"}
    emit_current_token
  else
    @current_token[:data][-1][1] += ch
    @state = :attribute_value_unquoted_state
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 723
# Skips whitespace before the DOCTYPE name; anything else starts the name.
def before_doctype_name_state
  case data = @stream.char
  when *SPACE_CHARACTERS
    # Ignore whitespace.
  when ">"
    @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-right-bracket"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "expected-doctype-name-but-got-eof"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] = data
    @state = :doctype_name_state
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 791
# Expects the opening quote of the DOCTYPE public identifier.
def before_doctype_public_identifier_state
  case data = @stream.char
  when *SPACE_CHARACTERS
    # Ignore whitespace.
  when "\""
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_double_quoted_state
  when "'"
    @current_token[:publicId] = ""
    @state = :doctype_public_identifier_single_quoted_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-end-of-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 873
# Expects the opening quote of the DOCTYPE system identifier.
def before_doctype_system_identifier_state
  case data = @stream.char
  when *SPACE_CHARACTERS
    # Ignore whitespace.
  when "\""
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_double_quoted_state
  when "'"
    @current_token[:systemId] = ""
    @state = :doctype_system_identifier_single_quoted_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
    @state = :bogus_doctype_state
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 586
# Turns everything up to the next ">" (or EOF — chars_until stops there
# automatically) into a Comment token.
def bogus_comment_state
  @token_queue << {:type => :Comment, :data => @stream.chars_until(">")}
  # Swallow the ">" (or the EOF marker) that terminated the bogus comment.
  @stream.char
  @state = :data_state
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 947
# Inside a doctype that could not be parsed; the token is marked as
# incorrect and input is skipped until ">" or EOF.
def bogus_doctype_state
  ch = @stream.char
  @current_token[:correct] = false
  if ch == ">"
    @token_queue << @current_token
    @state = :data_state
  elsif ch == :EOF
    # XXX EMIT
    @stream.unget(ch)
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false # redundant: already cleared above
    @token_queue << @current_token
    @state = :data_state
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 343
# Handles "</".  In RCDATA/CDATA the end tag is only honoured when it
# matches the name of the last emitted start tag (@current_token);
# otherwise "</" is treated as plain character data.
def close_tag_open_state
  if (@content_model_flag == :RCDATA or @content_model_flag == :CDATA)
    if @current_token
      char_stack = []

      # So far we know that "</" has been consumed. We now need to know
      # whether the next few characters match the name of last emitted
      # start tag which also happens to be the current_token. We also need
      # to have the character directly after the characters that could
      # match the start tag name.
      (@current_token[:name].length + 1).times do
        char_stack.push(@stream.char)
        # Make sure we don't get hit by :EOF
        break if char_stack[-1] == :EOF
      end

      # Since this is just for checking. We put the characters back on
      # the stack.
      @stream.unget(char_stack)
    end

    # NOTE(review): char_stack is only assigned when @current_token is
    # set; the leading "@current_token and" below short-circuits before
    # the nil char_stack would be dereferenced.
    if @current_token and
      @current_token[:name].downcase ==
      char_stack[0...-1].join('').downcase and
      (SPACE_CHARACTERS + [">", "/", "<", :EOF]).include? char_stack[-1]
      # Because the characters are correct we can safely switch to
      # PCDATA mode now. This also means we don't have to do it when
      # emitting the end tag token.
      @content_model_flag = :PCDATA
    else
      @token_queue << {:type => :Characters, :data => "</"}
      @state = :data_state

      # Need to return here since we don't want the rest of the
      # method to be walked through.
      return true
    end
  end

  data = @stream.char
  if data == :EOF
    @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-eof"}
    @token_queue << {:type => :Characters, :data => "</"}
    @state = :data_state
  elsif ASCII_LETTERS.include? data
    # A letter starts an end-tag name (EOF was already ruled out above).
    @current_token = {:type => :EndTag, :name => data, :data => []}
    @state = :tag_name_state
  elsif data == ">"
    @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-right-bracket"}
    @state = :data_state
  else
    # XXX data can be _'_...
    @token_queue << {:type => :ParseError, :data => "expected-closing-tag-but-got-char", :datavars => {:data => data}}
    @stream.unget(data)
    @state = :bogus_comment_state
  end

  return true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 671
# A "-" was seen inside a comment; a second "-" may begin "-->".
def comment_end_dash_state
  ch = @stream.char
  if ch == "-"
    @state = :comment_end_state
  elsif ch == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment-end-dash"}
    @token_queue << @current_token
    @state = :data_state
  else
    # False alarm: the dash, this character, and everything up to the
    # next "-" are ordinary comment text.
    @current_token[:data] += "-" + ch + @stream.chars_until("-")
    # Consume the next character which is either a "-" or an :EOF as
    # well so if there's a "-" directly after the "-" we go nicely to
    # the "comment end state" without emitting a ParseError there.
    @stream.char
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 690
# "--" has been seen; ">" closes the comment, anything else re-opens it.
def comment_end_state
  case ch = @stream.char
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when "-"
    @token_queue << {:type => :ParseError, :data => "unexpected-dash-after-double-dash-in-comment"}
    @current_token[:data] += ch
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment-double-dash"}
    @token_queue << @current_token
    @state = :data_state
  else
    # XXX
    @token_queue << {:type => :ParseError, :data => "unexpected-char-in-comment"}
    @current_token[:data] += "--" + ch
    @state = :comment_state
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 638
# "<!---" territory: a second "-" right at the start of the comment.
def comment_start_dash_state
  case ch = @stream.char
  when "-"
    @state = :comment_end_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    # The lone dash was comment text; bulk-read up to the next "-".
    @current_token[:data] += '-' + ch + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 619
# First character after "<!--".
def comment_start_state
  case ch = @stream.char
  when "-"
    @state = :comment_start_dash_state
  when ">"
    @token_queue << {:type => :ParseError, :data => "incorrect-comment"}
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    # Bulk-read comment text up to the next "-".
    @current_token[:data] += ch + @stream.chars_until("-")
    @state = :comment_state
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 657
# Body of a comment; "-" may begin the closing "-->".
def comment_state
  case ch = @stream.char
  when "-"
    @state = :comment_end_dash_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-comment"}
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:data] += ch + @stream.chars_until("-")
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 137
# Consumes a character reference after "&" has been seen.  Returns the
# replacement string, or nil when no entity could be parsed (the caller
# then emits a literal "&").  When +from_attribute+ is true, a named
# entity without a trailing ";" that is followed by a letter or digit is
# not treated as an entity.
def consume_entity(from_attribute=false)
  char = nil
  char_stack = [@stream.char]
  if SPACE_CHARACTERS.include?(char_stack[0]) or [:EOF, '<', '&'].include?(char_stack[0])
    # Not an entity at all; put the character back.
    @stream.unget(char_stack)
  elsif char_stack[0] == '#'
    # We might have a number entity here.
    char_stack += [@stream.char, @stream.char]
    if char_stack[0 .. 1].include? :EOF
      # If we reach the end of the file put everything up to :EOF
      # back in the queue
      char_stack = char_stack[0...char_stack.index(:EOF)]
      @stream.unget(char_stack)
      @token_queue << {:type => :ParseError, :data => "expected-numeric-entity-but-got-eof"}
    else
      if char_stack[1].downcase == "x" and HEX_DIGITS.include? char_stack[2]
        # Hexadecimal entity detected.
        @stream.unget(char_stack[2])
        char = consume_number_entity(true)
      elsif DIGITS.include? char_stack[1]
        # Decimal entity detected.
        @stream.unget(char_stack[1..-1])
        char = consume_number_entity(false)
      else
        # No number entity detected.
        @stream.unget(char_stack)
        @token_queue << {:type => :ParseError, :data => "expected-numeric-entity"}
      end
    end
  else
    # At this point in the process might have named entity. Entities
    # are stored in the global variable "entities".
    #
    # Consume characters and compare to these to a substring of the
    # entity names in the list until the substring no longer matches.
    filteredEntityList = ENTITIES.keys
    filteredEntityList.reject! {|e| e[0].chr != char_stack[0]}
    entityName = nil

    # Try to find the longest entity the string will match to take care
    # of &noti for instance.
    while char_stack.last != :EOF
      name = char_stack.join('')
      if filteredEntityList.any? {|e| e[0...name.length] == name}
        # Narrow the candidate list to names with this exact prefix and
        # read one more character.
        filteredEntityList.reject! {|e| e[0...name.length] != name}
        char_stack.push(@stream.char)
      else
        break
      end

      # Remember the longest complete entity name seen so far; a ";"
      # terminated name cannot be extended, so stop immediately.
      if ENTITIES.include? name
        entityName = name
        break if entityName[-1] == ';'
      end
    end

    if entityName != nil
      char = ENTITIES[entityName]

      # Check whether or not the last character returned can be
      # discarded or needs to be put back.
      if entityName[-1] != ?;
        @token_queue << {:type => :ParseError, :data => "named-entity-without-semicolon"}
      end

      if entityName[-1] != ";" and from_attribute and
        (ASCII_LETTERS.include?(char_stack[entityName.length]) or
         DIGITS.include?(char_stack[entityName.length]))
        # In attributes, "&entity" followed by an alphanumeric is left
        # untouched per the spec.
        @stream.unget(char_stack)
        char = '&'
      else
        # Put back only the characters past the matched entity name.
        @stream.unget(char_stack[entityName.length..-1])
      end
    else
      @token_queue << {:type => :ParseError, :data => "expected-named-entity"}
      @stream.unget(char_stack)
    end
  end
  return char
end
This function returns either U+FFFD or the character corresponding to the decimal or hexadecimal representation. It also discards the ";" if present; if it is not present, a `{:type => :ParseError}` token is appended to the token queue.
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 84
# Consumes the digits of a numeric character reference (consume_entity
# has already handled "#" and, for hex, the "x") and returns the
# corresponding character, or U+FFFD when the code point is invalid.
# A trailing ";" is discarded if present; otherwise the terminating
# character is put back and a ParseError is queued.
def consume_number_entity(isHex)

  # XXX More need to be done here. For instance, #13 should prolly be
  # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
  # such. Thoughts on this appreciated.
  allowed = DIGITS
  radix = 10
  if isHex
    allowed = HEX_DIGITS
    radix = 16
  end

  char_stack = []

  # Consume all the characters that are in range while making sure we
  # don't hit an EOF.
  c = @stream.char
  while allowed.include?(c) and c != :EOF
    char_stack.push(c)
    c = @stream.char
  end

  # Convert the set of characters consumed to an int.
  charAsInt = char_stack.join('').to_i(radix)

  if charAsInt == 13
    # &#13; (carriage return) is normalised to a newline.
    @token_queue << {:type => :ParseError, :data => "incorrect-cr-newline-entity"}
    charAsInt = 10
  elsif (128..159).include? charAsInt
    # If the integer is between 127 and 160 (so 128 and bigger and 159
    # and smaller) we need to do the "windows trick".
    @token_queue << {:type => :ParseError, :data => "illegal-windows-1252-entity"}

    charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
  end

  # Accept only valid Unicode scalar values: 1..0x10FFFF excluding the
  # UTF-16 surrogate range 0xD800..0xDFFF (55296..57343).
  if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343)
    char = [charAsInt].pack('U')
  else
    char = [0xFFFD].pack('U')
    @token_queue << {:type => :ParseError, :data => "cant-convert-numeric-entity", :datavars => {"charAsInt" => charAsInt}}
  end

  # Discard the ; if present. Otherwise, put it back on the queue and
  # invoke parse_error on parser.
  if c != ";"
    @token_queue << {:type => :ParseError, :data => "numeric-entity-without-semicolon"}
    @stream.unget(c)
  end

  return char
end
XXX AT Perhaps we should have Hixie run some evaluation on billions of documents to figure out what the order of the various if and elsif statements should be.
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 249
# Main dispatch state.  Recognises entity starts ("&"), tag opens ("<")
# and, in CDATA/RCDATA, the "<!-- ... -->" escape via a sliding window
# of the last four characters.  Returns false at :EOF, which terminates
# the #each loop.
def data_state
  data = @stream.char

  # Maintain the four-character window used to spot "<!--" and "-->".
  if @content_model_flag == :CDATA or @content_model_flag == :RCDATA
    @lastFourChars << data
    @lastFourChars.shift if @lastFourChars.length > 4
  end

  if data == "&" and [:PCDATA,:RCDATA].include?(@content_model_flag) and !@escapeFlag
    @state = :entity_data_state
  elsif data == "-" && [:CDATA, :RCDATA].include?(@content_model_flag) && !@escapeFlag && @lastFourChars.join('') == "<!--"
    # "<!--" seen inside CDATA/RCDATA: enter the escaped section.
    @escapeFlag = true
    @token_queue << {:type => :Characters, :data => data}
  elsif data == "<" and !@escapeFlag and
    [:PCDATA,:CDATA,:RCDATA].include?(@content_model_flag)
    @state = :tag_open_state
  elsif data == ">" and @escapeFlag and
    [:CDATA,:RCDATA].include?(@content_model_flag) and
    @lastFourChars[1..-1].join('') == "-->"
    # "-->" closes the escaped section.
    @escapeFlag = false
    @token_queue << {:type => :Characters, :data => data}

  elsif data == :EOF
    # Tokenization ends.
    return false

  elsif SPACE_CHARACTERS.include? data
    # Directly after emitting a token you switch back to the "data
    # state". At that point SPACE_CHARACTERS are important so they are
    # emitted separately.
    # XXX need to check if we don't need a special "spaces" flag on
    # characters.
    @token_queue << {:type => :SpaceCharacters, :data => data + @stream.chars_until(SPACE_CHARACTERS, true)}
  else
    @token_queue << {:type => :Characters, :data => data + @stream.chars_until(%w[& < > -])}
  end
  return true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 743
# Accumulates the DOCTYPE name until whitespace or ">".
def doctype_name_state
  case data = @stream.char
  when *SPACE_CHARACTERS
    @state = :after_doctype_name_state
  when ">"
    @token_queue << @current_token
    @state = :data_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype-name"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:name] += data
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 819
# Accumulates the public identifier until the closing double quote.
def doctype_public_identifier_double_quoted_state
  case ch = @stream.char
  when "\""
    @state = :after_doctype_public_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:publicId] += ch
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 834
# Accumulates the public identifier until the closing single quote.
def doctype_public_identifier_single_quoted_state
  case ch = @stream.char
  when "'"
    @state = :after_doctype_public_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:publicId] += ch
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 711
# A space must follow "<!DOCTYPE"; its absence is tolerated with a
# parse error and the character is pushed back for the name state.
def doctype_state
  ch = @stream.char
  unless SPACE_CHARACTERS.include?(ch)
    @token_queue << {:type => :ParseError, :data => "need-space-after-doctype"}
    @stream.unget(ch)
  end
  @state = :before_doctype_name_state
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 899
# Accumulates the system identifier until the closing double quote.
def doctype_system_identifier_double_quoted_state
  case ch = @stream.char
  when "\""
    @state = :after_doctype_system_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:systemId] += ch
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 914
# Accumulates the system identifier until the closing single quote.
def doctype_system_identifier_single_quoted_state
  case ch = @stream.char
  when "'"
    @state = :after_doctype_system_identifier_state
  when :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
    @current_token[:correct] = false
    @token_queue << @current_token
    @state = :data_state
  else
    @current_token[:systemId] += ch
  end
  true
end
This is where the magic happens.
We do our usual processing through the states, and when we have a token to return we yield it, which pauses processing until the next token is requested.
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 49
# Drives the state machine: each call of the current state method may
# append tokens to @token_queue and parse errors to @stream.errors;
# both are drained (stream errors first) after every step and yielded
# to the caller.  Stops when the state method returns false (EOF).
def each
  @token_queue = []
  # Start processing. When EOF is reached @state will return false
  # instead of true and the loop will terminate.
  while send @state
    yield :type => :ParseError, :data => @stream.errors.shift until @stream.errors.empty?
    yield @token_queue.shift until @token_queue.empty?
  end
end
This method is a generic handler for emitting the tags. It also sets the state to "data" because that's what's needed after a token has been emitted.
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 231
# Generic handler for emitting tag tokens: lowercases the tag name when
# configured to, queues the token, and switches back to the data state
# (which is what is needed after a token has been emitted).
def emit_current_token
  token = @current_token
  if [:StartTag, :EndTag, :EmptyTag].include?(token[:type])
    token[:name] = token[:name].downcase if @lowercase_element_name
    @token_queue << token
    @state = :data_state
  end
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 288
# "&" was seen in the data state; emit the resolved entity, or a
# literal "&" when no entity could be consumed.
def entity_data_state
  entity = consume_entity
  @token_queue << {:type => :Characters, :data => entity || "&"}
  @state = :data_state
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 599
# "<!" has been consumed: "--" opens a comment, "DOCTYPE" (any case)
# opens a doctype, anything else becomes a bogus comment.
def markup_declaration_open_state
  char_stack = [@stream.char, @stream.char]
  if char_stack == ["-", "-"]
    @current_token = {:type => :Comment, :data => ""}
    @state = :comment_start_state
    return true
  end
  # Extend the lookahead to seven characters to test for "DOCTYPE",
  # guarding explicitly against :EOF inside the lookahead.
  5.times { char_stack.push(@stream.char) }
  if !char_stack.include?(:EOF) && char_stack.join("").upcase == "DOCTYPE"
    @current_token = {:type => :Doctype, :name => "", :publicId => nil, :systemId => nil, :correct => true}
    @state = :doctype_state
  else
    @token_queue << {:type => :ParseError, :data => "expected-dashes-or-doctype"}
    @stream.unget(char_stack)
    @state = :bogus_comment_state
  end
  true
end
This method replaces the need for "entityInAttributeValueState".
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 219
# Replaces the need for an "entityInAttributeValueState": resolves an
# entity (or appends a literal "&") directly into the current attribute
# value.
def process_entity_in_attribute
  # Bug fix: pass from_attribute=true so consume_entity applies the
  # attribute-specific rule (an unterminated named entity followed by a
  # letter/digit is left as literal text).  That parameter exists for
  # this call and was never being set.
  entity = consume_entity(true)
  if entity
    @current_token[:data][-1][1] += entity
  else
    @current_token[:data][-1][1] += "&"
  end
end
If the next character is a ">", convert the current_token into an EmptyTag.
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 64
# If the character following the "/" is a ">", the current StartTag is
# converted into an EmptyTag; otherwise the solidus is a parse error.
def process_solidus_in_tag
  # Peek at the next character to see whether this is "/>".
  ch = @stream.char

  if @current_token[:type] == :StartTag and ch == ">"
    @current_token[:type] = :EmptyTag
  else
    @token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
  end

  # Put the peeked character back so it is not lost.
  @stream.unget(ch)
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 403
# Accumulates the tag name character by character (letters in bulk).
def tag_name_state
  ch = @stream.char
  if SPACE_CHARACTERS.include?(ch)
    @state = :before_attribute_name_state
  elsif ch == :EOF
    @token_queue << {:type => :ParseError, :data => "eof-in-tag-name"}
    emit_current_token
  elsif ASCII_LETTERS.include?(ch)
    # Pull in the whole run of letters with a single call.
    @current_token[:name] += ch + @stream.chars_until(ASCII_LETTERS, true)
  elsif ch == ">"
    emit_current_token
  elsif ch == "/"
    process_solidus_in_tag
    @state = :before_attribute_name_state
  else
    @current_token[:name] += ch
  end
  true
end
# File lib/feed_tools/vendor/html5/lib/html5/tokenizer.rb, line 299
# Handles the character following "<".  In PCDATA this may start a
# markup declaration ("!"), an end tag ("/"), or a start tag (letter);
# in RCDATA/CDATA only "</" is significant and anything else is text.
def tag_open_state
  data = @stream.char
  if @content_model_flag == :PCDATA
    if data == "!"
      @state = :markup_declaration_open_state
    elsif data == "/"
      @state = :close_tag_open_state
    elsif data != :EOF and ASCII_LETTERS.include? data
      # The explicit :EOF guard keeps the symbol out of the
      # ASCII_LETTERS membership test.
      @current_token = {:type => :StartTag, :name => data, :data => []}
      @state = :tag_name_state
    elsif data == ">"
      # XXX In theory it could be something besides a tag name. But
      # do we really care?
      @token_queue << {:type => :ParseError, :data => "expected-tag-name-but-got-right-bracket"}
      @token_queue << {:type => :Characters, :data => "<>"}
      @state = :data_state
    elsif data == "?"
      # XXX In theory it could be something besides a tag name. But
      # do we really care?
      @token_queue.push({:type => :ParseError, :data => "expected-tag-name-but-got-question-mark"})
      @stream.unget(data)
      @state = :bogus_comment_state
    else
      # XXX
      @token_queue << {:type => :ParseError, :data => "expected-tag-name"}
      @token_queue << {:type => :Characters, :data => "<"}
      @stream.unget(data)
      @state = :data_state
    end
  else
    # We know the content model flag is set to either RCDATA or CDATA
    # now because this state can never be entered with the PLAINTEXT
    # flag.
    if data == "/"
      @state = :close_tag_open_state
    else
      @token_queue << {:type => :Characters, :data => "<"}
      @stream.unget(data)
      @state = :data_state
    end
  end
  return true
end