| Class | String |
| In: |
lib/feedparser/textconverters.rb
lib/feedparser/text-output.rb |
| Parent: | Object |
This class provides various converters
| MY_ENTITIES | = | {} |
# File lib/feedparser/textconverters.rb, line 17
# Escape the characters that are special in HTML.
#
# '&' is replaced first so that the ampersands introduced by the '<' and '>'
# replacements are not escaped a second time.
#
# Returns a new String; the receiver is left untouched.
def escape_html
  r = self.gsub('&', '&amp;')
  r = r.gsub('<', '&lt;')
  r = r.gsub('>', '&gt;')
  r
end
Returns a true value if the text contains escaped HTML (i.e. HTML markup written with HTML entities). Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 13
# Does this text contain entity-escaped HTML markup?
#
# Looks for the escaped forms of <img src=, <a href=, <br>, <br/>, <br />
# and <p>, case-insensitively.
#
# Returns the position of the earliest such occurrence (usable as a true
# value and comparable with the result of String#html?), or nil when none
# is found.
def escaped_html?
  # A single alternation makes =~ report the leftmost match among all the
  # patterns, not the position of whichever pattern happens to be tried first.
  self =~ /&lt;img src=|&lt;a href=|&lt;br(?:\/| \/|)&gt;|&lt;p&gt;/i
end
Convert an HTML text to plain text
# File lib/feedparser/text-output.rb, line 7
# Render this HTML text as plain text.
#
# A copy of the receiver is fed to FeedParser::HTML2TextParser and the text
# it saved is then tidied up in place, one substitution after another.
#
# wrapto - when truthy, the tidied text and this value are handed to
#          wrap_text (defined outside this method) and its result is
#          returned; when false/nil no wrapping is attempted.
#
# Returns the resulting text.
def html2text(wrapto = false)
  source = self.clone
  parser = FeedParser::HTML2TextParser.new(true)
  parser.feed(source)
  parser.close
  plain = parser.savedata

  cleanups = [
    [/\A\s*/m, ''],        # leading whitespace
    [/\s*\Z/m, ''],        # trailing whitespace
    [/ *\n/m, "\n"],       # spaces before a newline
    [/\n */m, "\n"],       # spaces after a newline
    [/\n\n+/m, "\n\n"],    # runs of blank lines collapse to one blank line
    [/[ \t]+/, ' ']        # runs of spaces/tabs collapse to a single space
  ]
  cleanups.each { |pattern, replacement| plain.gsub!(pattern, replacement) }

  # wrapping happens last, and only on request
  wrapto ? wrap_text(plain, wrapto) : plain
end
Is this text HTML? Searches for HTML tags. Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 8
# Does this text contain real (unescaped) HTML markup?
#
# Looks, case-insensitively, for <p>, </p>, any <br> variant (optional
# whitespace and slash before the closing bracket), </a>, or an <img ...> tag.
#
# Returns the position of the earliest such tag (usable as a true value and
# comparable with the result of String#escaped_html?), or nil when none is
# found.
def html?
  # One alternation instead of a chain of separate matches: =~ then yields
  # the leftmost tag in the text, whatever its kind. The plain <br> case is
  # covered by the general <br ...> alternative.
  self =~ %r{</?p>|<br\s*/?\s*>|</a>|<img.*>}i
end
Remove white space around the text
# File lib/feedparser/textconverters.rb, line 95
# Remove the whitespace at the start and at the end of the text, in place.
#
# Returns the receiver.
def rmWhiteSpace!
  # gsub! answers nil when it performs no substitution, so its result is
  # never used as the receiver of the next call.
  gsub!(/\A\s*/m, '')
  gsub!(/\s*\Z/m, '')
  self
end
Convert text to HTML.
# File lib/feedparser/textconverters.rb, line 40
# Convert this text to an HTML fragment.
#
# The text is first classified with String#html? and String#escaped_html?:
# * real HTML is kept as it is;
# * escaped HTML goes through String#unescape_html;
# * anything else is treated as plain text: it is wrapped in <p>...</p>,
#   blank-line runs become paragraph breaks, and http/ftp/https URIs become
#   links.
# Some feeds mix real and escaped HTML; in that case the kind whose reported
# position is smaller wins.
#
# feed - object answering #link, or nil. When a link is available, src/href
#        values that are not absolute URLs are rewritten relative to it.
#
# Returns the converted text; the work is done on a clone of the receiver.
def text2html(feed)
  html = self.clone
  real_pos = html.html?
  escaped_pos = html.escaped_html?

  # both kinds detected: keep only the one that shows up first
  if real_pos && escaped_pos
    if real_pos < escaped_pos
      escaped_pos = nil
    else
      real_pos = nil
    end
  end

  unless real_pos
    if escaped_pos
      html = html.unescape_html
    else
      # plain text: build paragraphs...
      html.gsub!(/\A\s*(.*)\Z/m, '<p>\1</p>')
      html.gsub!(/\s*\n(\s*\n)+\s*/, "</p>\n<p>")
      # ...and make URIs clickable
      # NOTE(review): recent Ruby versions report URI.regexp as obsolete —
      # confirm before upgrading.
      uri_pattern = URI::regexp(['http','ftp','https'])
      html.gsub!(/(#{uri_pattern})/, '<a href="\1">\1</a>')
    end
  end

  # without a feed link there is nothing to resolve relative URLs against
  return html unless feed && feed.link

  # repair relative src/href attributes in <a> and <img>
  html.gsub!(/(\s(src|href)=['"])([^'"]*)(['"])/) do |attribute|
    begin
      first, url, last = $1, $3, $4
      case url
      when /^\s*\w+:\/\//, /^\s*\w+:\w/
        # already carries a scheme: leave untouched
        attribute
      when /^\//
        # host-relative: keep only scheme://host of the feed link
        first + feed.link.split(/\//)[0..2].join('/') + url + last
      else
        pieces = feed.link.split(/\//)
        if pieces.length == 3 # http://toto with no trailing /
          first + feed.link + '/' + url + last
        elsif feed.link =~ /\/$/
          first + feed.link + url + last
        else
          # drop the last path segment of the feed link
          first + pieces[0...-1].join('/') + '/' + url + last
        end
      end
    rescue
      # best effort: on any error keep the attribute as it was
      attribute
    end
  end

  html
end
Convert text encoded in inputenc to UTF-8. Must take care of wrong input locales (the declared encoding is not always the real one).
# File lib/feedparser/textconverters.rb, line 101
# Convert this text, declared to be encoded in inputenc, to UTF-8.
#
# The declaration is not trusted blindly: text that is said not to be UTF-8
# is first checked, and returned unchanged if its bytes already decode
# cleanly as UTF-8. Otherwise every byte is re-encoded as the UTF-8 code
# point of the same value (i.e. the input is read as a single-byte,
# ISO-8859-1-like encoding).
#
# inputenc - declared encoding name, compared case-insensitively with
#            'utf-8'; nil is treated like any other non-UTF-8 declaration.
#
# Returns the receiver itself when no conversion is needed (or when the
# conversion fails), otherwise a new String.
def toUTF8(inputenc)
  return self if inputenc.to_s.downcase == 'utf-8'

  # It is said not to be UTF-8. Ensure it is REALLY not UTF-8.
  begin
    # Compare raw byte values: String#== also takes the encoding tags of the
    # two strings into account, which would make identical byte sequences
    # compare as different and lead to double encoding.
    return self if unpack('U*').pack('U*').unpack('C*') == unpack('C*')
  rescue
    # the bytes could not be decoded as UTF-8: go on with the conversion
  end

  begin
    unpack('C*').pack('U*')
  rescue
    self # failsafe solution, but a dirty one
  end
end
Un-escape HTML in the text. Used by String#text2html.
# File lib/feedparser/textconverters.rb, line 31
# Undo HTML escaping in the text.
#
# Every (pattern, replacement) pair of MY_ENTITIES is applied with gsub, one
# after the other in the constant's iteration order, each pass working on the
# result of the previous one.
#
# Returns the un-escaped text; the receiver itself is not modified.
def unescape_html
  MY_ENTITIES.inject(self) do |unescaped, (entity, replacement)|
    unescaped.gsub(entity, replacement)
  end
end