| Class | String |
| In: |
lib/feedparser/text-output.rb
lib/feedparser/textconverters.rb |
| Parent: | Object |
This class provides various converters
Returns true if the text contains escaped HTML (HTML written with entities such as &amp;lt;p&amp;gt;). Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 13
# Tell whether the text contains entity-escaped HTML markup.
#
# The patterns are spelled with the entities `&lt;` / `&gt;` on purpose:
# this predicate looks for markup that was escaped, not for raw tags.
# It checks for an escaped `<img src=`, `<a href=`, `<br>` / `<br/>` /
# `<br />`, or `<p>`.
#
# Returns the match offset (truthy) of the first pattern that matches,
# or nil when none of them does.
def escaped_html?
  (self =~ /&lt;img src=/) ||
    (self =~ /&lt;a href=/) ||
    (self =~ /&lt;br(\/| \/|)&gt;/) ||
    (self =~ /&lt;p&gt;/)
end
Convert an HTML text to plain text
# File lib/feedparser/text-output.rb, line 6
# Convert an HTML text to plain text.
#
# A copy of the receiver is fed to a FeedParser::HTML2TextParser and the
# parser's saved data is then tidied up: surrounding whitespace removed,
# spaces around newlines removed, and runs of blank lines collapsed to one.
#
# Returns the cleaned-up text produced by the parser. The receiver itself
# is not modified.
def html2text
  # parse HTML (named `parser` so the local does not shadow Kernel#p)
  parser = FeedParser::HTML2TextParser.new(true)
  parser.feed(clone)
  parser.close
  text = parser.savedata

  # remove leading and trailing whitespace
  text.gsub!(/\A\s*/m, '')
  text.gsub!(/\s*\Z/m, '')
  # remove spaces around \n
  text.gsub!(/ *\n/m, "\n")
  text.gsub!(/\n */m, "\n")
  # collapse three or more consecutive \n into exactly two
  text.gsub!(/\n\n+/m, "\n\n")
  text
end
Is this text HTML? Searches for common tags. Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 8
# Tell whether the text looks like HTML by searching for a few common tags:
# `<p>`, `</p>`, any form of `<br>` (`<br>`, `<br/>`, `<br />`), `</a>`,
# or an `<img ...>` tag.
#
# The `<img` pattern uses `[^>]*` rather than `.*` so that a tag whose
# attributes are spread over several lines is still recognised (`.` does
# not match a newline).
#
# Returns the match offset (truthy) of the first pattern that matches,
# or nil when none of them does.
def html?
  (self =~ /<p>/) ||
    (self =~ /<\/p>/) ||
    (self =~ /<br\s*\/?\s*>/) ||
    (self =~ /<\/a>/) ||
    (self =~ /<img[^>]*>/)
end
Remove white space around the text
# File lib/feedparser/textconverters.rb, line 51
# Remove leading and trailing whitespace from the receiver, in place.
#
# The two substitutions are deliberately not chained: String#gsub! returns
# nil when it performs no substitution, so chaining bang methods is fragile.
#
# Returns the receiver itself.
def rmWhiteSpace!
  gsub!(/\A\s*/m, '')
  gsub!(/\s*\Z/m, '')
  self
end
Convert text to HTML.
# File lib/feedparser/textconverters.rb, line 35
# Convert text to HTML.
#
# Works on a copy of the receiver:
# * text for which String#html? is true is returned as is;
# * text for which String#escaped_html? is true is returned after
#   String#unescape_html;
# * anything else is treated as plain text: it is wrapped in <p>...</p>,
#   blank lines become paragraph breaks, and http/ftp/https URIs are
#   turned into links.
#
# Returns the resulting String.
def text2html
  text = clone
  return text if text.html?
  return text.unescape_html if text.escaped_html?

  # paragraphs: wrap the whole text, then split it on blank lines
  text.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
  text.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")
  # uris: the outer group is \1, whatever groups the URI pattern contains
  uri_pattern = URI::regexp(['http', 'ftp', 'https'])
  text.gsub!(/(#{uri_pattern})/, '<a href="\1">\1</a>')
  text
end
Convert a text in inputenc to a text in UTF-8. Must take care of wrong input locales (text declared as not UTF-8 that is in fact already UTF-8).
# File lib/feedparser/textconverters.rb, line 57
# Convert a text declared to be in +inputenc+ to UTF-8, guarding against a
# wrongly declared input encoding.
#
# inputenc - name of the declared encoding; compared case-insensitively
#            with 'utf-8'. nil is treated as "not UTF-8".
#
# Returns the receiver itself when it is declared as UTF-8 or when its
# bytes already form valid UTF-8; otherwise returns a new String in which
# every input byte is taken as one code point and encoded as UTF-8.
def toUTF8(inputenc)
  return self if inputenc.to_s.downcase == 'utf-8'

  # It is said not to be UTF-8. Ensure it is REALLY not UTF-8: decoding and
  # re-encoding must give back the same bytes. Bytes are compared rather
  # than Strings because Strings tagged with different encodings never
  # compare equal once they hold non-ASCII data.
  begin
    return self if unpack('U*').pack('U*').unpack('C*') == unpack('C*')
  rescue StandardError
    # malformed UTF-8: the text really is in another encoding
  end

  begin
    # take each byte as a code point and encode it as UTF-8
    unpack('C*').pack('U*')
  rescue StandardError
    self # failsafe solution, but a dirty one
  end
end
Un-escape HTML entities in the text (in place). Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 18
# Un-escape HTML in the text, in place.
#
# Replaces the entities &lt; &gt; &apos; &#39; &quot; &amp; and &#38; with
# the characters they stand for. The table maps entity => character so
# that two entities can share one character; keyed the other way round,
# duplicate keys would silently drop entries.
#
# All entities are replaced in one pass, so text produced by a replacement
# is never un-escaped a second time ("&amp;lt;" becomes "&lt;", not "<").
#
# Returns the receiver itself.
def unescape_html
  entities = {
    '&lt;'   => '<',
    '&gt;'   => '>',
    '&apos;' => "'",
    '&#39;'  => "'",
    '&quot;' => '"',
    '&amp;'  => '&',
    '&#38;'  => '&'
  }
  gsub!(/&(?:lt|gt|apos|quot|amp|#39|#38);/) { |entity| entities[entity] }
  self
end