| Class | HTML5::EncodingParser |
| In: |
lib/feed_tools/vendor/html5/lib/html5/inputstream.rb
|
| Parent: | Object |
Mini parser for detecting character encoding from meta elements
data - the input to scan for an encoding declaration (converted with to_s before use)
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 412
# Prepares a parser over +data+.
#
# data - the input to scan; it is converted with to_s and wrapped in an
#        EncodingBytes instance.
#
# No encoding is known until get_encoding has run, so @encoding starts as nil.
def initialize(data)
  @encoding = nil
  @data = EncodingBytes.new(data.to_s)
end
Returns a [name, value] pair for the next attribute in the stream, or nil if no attribute is found
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 514
# Reads the next attribute starting at the current position of @data.
#
# Returns a two-element array [name, value]. ASCII upper-case bytes in both
# the name and the value are lower-cased; value is '' when the attribute has
# no value. Returns nil when, after skipping spaces and '/', the current byte
# is '>' or '<' (no attribute left); in the '<' case the position is first
# moved back by one.
#
# NOTE(review): the loops below have no explicit end-of-input exit; presumably
# @data.current_byte raises at the end of the data -- confirm in EncodingBytes.
# `while true` is kept (rather than `loop`) so no exception is swallowed here.
def get_attribute
  @data.skip(SPACE_CHARACTERS + ['/'])

  first = @data.current_byte
  if first == '<'
    @data.position -= 1
    return nil
  elsif first == '>'
    return nil
  end

  # Lower-cases ASCII upper-case bytes and leaves every other byte untouched.
  fold = lambda { |b| ASCII_UPPERCASE.include?(b) ? b.downcase : b }

  attr_name = []
  attr_value = []
  space_found = false

  # Step 5: attribute name
  while true
    byte = @data.current_byte
    # An '=' only ends the name once the name is non-empty. An Array is always
    # truthy in Ruby, so the emptiness test has to be explicit; a leading '='
    # is collected as part of the name.
    if byte == '=' && !attr_name.empty?
      break
    elsif SPACE_CHARACTERS.include?(byte)
      space_found = true
      break
    elsif ['/', '<', '>'].include?(byte)
      return [attr_name.join(''), '']
    end
    attr_name.push(fold.call(byte))
    # Step 6
    @data.position += 1
  end

  # Step 7: spaces after the name may still be followed by '='
  if space_found
    @data.skip
    # Step 8: no '=' means the attribute has no value
    unless @data.current_byte == '='
      @data.position -= 1
      return [attr_name.join(''), '']
    end
  end

  # Step 9: move past the '='
  @data.position += 1
  # Step 10
  @data.skip

  # Step 11: attribute value
  byte = @data.current_byte
  if ["'", '"'].include?(byte)
    # Quoted value: collect everything up to the matching quote.
    quote_char = byte
    while true
      @data.position += 1
      byte = @data.current_byte
      if byte == quote_char
        @data.position += 1
        return [attr_name.join(''), attr_value.join('')]
      end
      attr_value.push(fold.call(byte))
    end
  elsif ['>', '<'].include?(byte)
    return [attr_name.join(''), '']
  end

  # Unquoted value: the current byte is its first character; the value runs
  # until a space character, '>' or '<'.
  attr_value.push(fold.call(byte))
  terminators = SPACE_CHARACTERS + ['>', '<']
  while true
    @data.position += 1
    byte = @data.current_byte
    return [attr_name.join(''), attr_value.join('')] if terminators.include?(byte)
    attr_value.push(fold.call(byte))
  end
end
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 426
# Walks @data looking for an encoding declaration and returns it.
#
# At every position yielded by @data.each, the entries of @@method_dispatch
# are tried in order; for the first key that @data.match_bytes accepts
# (second argument true), the associated handler method is invoked. A falsy
# handler result stops the scan. When no key matches, scanning continues.
#
# Returns @encoding with surrounding whitespace stripped, or nil when no
# encoding was recorded.
def get_encoding
  @data.each do |_byte|
    matched = @@method_dispatch.find { |(key, _handler)| @data.match_bytes(key, true) }
    proceed = matched ? send(matched[1]) : true
    break unless proceed
  end

  unless @encoding.nil?
    @encoding = @encoding.strip
  end
  @encoding
end
Skip over comments
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 442
# Skips over a comment by asking @data to jump to the closing '-->'.
#
# Returns whatever @data.jump_to returns; get_encoding treats a falsy result
# as "stop parsing".
def handle_comment
  @data.jump_to('-->')
end
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 446
# Handles a "<meta" match: inspects the element's attributes for an encoding.
#
# Returns true (keep parsing) when "<meta" is not followed by a space
# character, or when the attributes run out without yielding a valid
# encoding. Returns false (stop parsing) after storing a valid encoding in
# @encoding.
def handle_meta
  # "<meta" not followed by a space is not a meta element; just keep going.
  return true unless SPACE_CHARACTERS.include?(@data.current_byte)

  while true
    pair = get_attribute
    return true if pair.nil?

    name, value = pair
    next unless name == 'charset' || name == 'content'

    # A charset attribute names the encoding directly; a content attribute
    # has to be run through ContentAttrParser to extract one.
    candidate =
      if name == 'charset'
        value
      else
        ContentAttrParser.new(EncodingBytes.new(value)).parse
      end

    next unless HTML5.is_valid_encoding(candidate)

    @encoding = candidate
    return false
  end
end
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 508
# Skips the rest of an uninteresting construct by jumping @data to the
# next '>'.
#
# Returns whatever @data.jump_to returns.
def handle_other
  @data.jump_to('>')
end
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 478
# Handles a possible end tag: advances @data by one byte, then delegates to
# handle_possible_tag with end_tag set to true.
#
# Returns the result of handle_possible_tag.
def handle_possible_end_tag
  @data.position = @data.position + 1
  handle_possible_tag(true)
end
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 474
# Handles a possible start tag by delegating to handle_possible_tag with
# end_tag set to false.
#
# Returns the result of handle_possible_tag.
def handle_possible_start_tag
  handle_possible_tag(false)
end
# File lib/feed_tools/vendor/html5/lib/html5/inputstream.rb, line 483
# Processes something that looks like a start or end tag.
#
# end_tag - true when called for an end tag, false for a start tag.
#
# Always returns true (keep parsing).
def handle_possible_tag(end_tag)
  if !ASCII_LETTERS.include?(@data.current_byte)
    # Not a tag name. A possible start tag is simply ignored; a possible end
    # tag is stepped back one byte and skipped via handle_other.
    if end_tag
      @data.position -= 1
      handle_other
    end
    return true
  end

  @data.find_next(SPACE_CHARACTERS + ['<', '>'])

  if @data.current_byte == '<'
    # Step back so the '<' byte is reprocessed by the caller's scan.
    @data.position -= 1
    return true
  end

  # Consume every attribute of the tag; the values themselves are discarded.
  until get_attribute.nil?
  end
  true
end