| Class | FeedParser::SGMLParser |
| In: |
lib/feedparser/sgml-parser.rb
|
| Parent: | Object |
| Interesting | = | /[&<]/ | Regular expressions used for parsing: | |
| Incomplete | = | Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' + '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' + '![^<>]*)?') | ||
| Entityref | = | /&([a-zA-Z][-.a-zA-Z0-9]*);/ | ||
| Charref | = | /&#([0-9]+);/ | ||
| Starttagopen | = | /<[>a-zA-Z]/ | ||
| Endtagopen | = | /<\/[<>a-zA-Z]/ | ||
| Endbracket | = | /[<>]/ | ||
| Special | = | /<![^<>]*>/ | ||
| Commentopen | = | /<!--/ | ||
| Commentclose | = | /--[ \t\n]*>/ | ||
| Tagfind | = | /[a-zA-Z][a-zA-Z0-9.-]*/ | ||
| Attrfind | = | Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' + '(\s*=\s*' + "('[^']*'" + '|"[^"]*"' + '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?') | ||
| Entitydefs | = | {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''} |
# File lib/feedparser/sgml-parser.rb, line 30
# Builds a parser and brings it into its initial state by calling #reset.
#
# verbose:: stored in @verbose; report_unbalanced only prints when it is truthy.
def initialize(verbose = false)
  @verbose = verbose
  reset
end
# File lib/feedparser/sgml-parser.rb, line 56
# Appends +data+ to the pending input buffer (@rawdata) and then runs the
# scanner over the buffer with the "end of input" flag off.
#
# Returns whatever goahead returns.
def feed(data)
  @rawdata.concat(data)
  goahead(false)
end
# File lib/feedparser/sgml-parser.rb, line 242
# Handles an end tag: pops the matching element off @stack together with every
# element that is still open inside it, invoking the end handler for each one.
#
# tag:: lower-cased element name; the empty string means "close the innermost
#       open element".
#
# For each popped element, handle_endtag is called when the receiver responds
# to "end_<name>", otherwise unknown_endtag is called. An end tag for an
# element that is not open is ignored when an "end_<name>" handler exists and
# reported through unknown_endtag when it does not.
def finish_endtag(tag)
  if tag == ''
    found = @stack.length - 1
    if found < 0
      # Nothing is open, so there is nothing the empty end tag could close.
      unknown_endtag(tag)
      return
    end
  else
    unless @stack.include?(tag)
      unknown_endtag(tag) unless respond_to?('end_' + tag)
      return
    end
    # Use the LAST occurrence: an end tag closes the nearest open element of
    # that name. Taking the first occurrence would also close every enclosing
    # element up to the outermost one with the same name.
    found = @stack.rindex(tag)
  end

  while @stack.length > found
    open_tag = @stack[-1]
    method = 'end_' + open_tag
    if respond_to?(method)
      handle_endtag(open_tag, method)
    else
      unknown_endtag(open_tag)
    end
    @stack.pop
  end
end
# File lib/feedparser/sgml-parser.rb, line 224
# Dispatches a parsed start tag to the matching handler.
#
# tag::   lower-cased element name
# attrs:: list of [name, value] pairs
#
# Returns
#   1  when a "start_<tag>" handler exists (the tag is pushed on @stack first),
#   0  when only a "do_<tag>" handler exists (nothing is pushed),
#  -1  when neither exists and unknown_starttag was called instead.
def finish_starttag(tag, attrs)
  start_handler = 'start_' + tag
  if respond_to?(start_handler)
    @stack << tag
    handle_starttag(tag, start_handler, attrs)
    return 1
  end

  do_handler = 'do_' + tag
  if respond_to?(do_handler)
    handle_starttag(tag, do_handler, attrs)
    return 0
  end

  unknown_starttag(tag, attrs)
  -1
end
# File lib/feedparser/sgml-parser.rb, line 65
# Main scanning loop. Walks the buffered input in @rawdata from the start and
# dispatches plain text to handle_data, tags to parse_starttag/parse_endtag,
# comments to parse_comment, other "<!...>" declarations to parse_special and
# character/entity references to handle_charref/handle_entityref.
#
# _end:: when truthy, whatever is still unprocessed after the loop is passed to
#        handle_data instead of being kept for a later call.
#
# On exit @rawdata is replaced by the unprocessed tail of the buffer.
65: def goahead(_end)
66: rawdata = @rawdata
67: i = 0
68: n = rawdata.length
69: while i < n
# Once @nomoretags is set the rest of the buffer is delivered as plain data.
70: if @nomoretags
71: handle_data(rawdata[i..(n-1)])
72: i = n
73: break
74: end
# Everything up to the next '&' or '<' is plain data.
75: j = rawdata.index(Interesting, i)
76: j = n unless j
77: if i < j
78: handle_data(rawdata[i..(j-1)])
79: end
80: i = j
81: break if (i == n)
82: if rawdata[i] == ?< #
# Each "index(..., i) == i" check accepts a pattern only when it matches
# exactly at the current position.
83: if rawdata.index(Starttagopen, i) == i
# In literal mode the '<' is passed through as data, one character at a time.
84: if @literal
85: handle_data(rawdata[i, 1])
86: i += 1
87: next
88: end
# parse_starttag returns the absolute index to resume at, or nil when the tag
# is not complete yet (then we stop and wait for more input).
89: k = parse_starttag(i)
90: break unless k
91: i = k
92: next
93: end
94: if rawdata.index(Endtagopen, i) == i
95: k = parse_endtag(i)
96: break unless k
97: i = k
# Any end tag switches literal mode off.
98: @literal = false
99: next
100: end
101: if rawdata.index(Commentopen, i) == i
102: if @literal
103: handle_data(rawdata[i,1])
104: i += 1
105: next
106: end
# Unlike the tag parsers, parse_comment returns a length, hence "+=".
107: k = parse_comment(i)
108: break unless k
109: i += k
110: next
111: end
112: if rawdata.index(Special, i) == i
113: if @literal
114: handle_data(rawdata[i, 1])
115: i += 1
116: next
117: end
# parse_special also returns a length rather than an absolute index.
118: k = parse_special(i)
119: break unless k
120: i += k
121: next
122: end
123: elsif rawdata[i] == ?& #
124: if rawdata.index(Charref, i) == i
# $& / $1 refer to the Charref match made by the index call above.
125: i += $&.length
126: handle_charref($1)
# NOTE(review): Charref requires a trailing ';', so this step back looks
# unreachable with the pattern as defined - confirm before relying on it.
127: i -= 1 unless rawdata[i-1] == ?;
128: next
129: end
130: if rawdata.index(Entityref, i) == i
131: i += $&.length
132: handle_entityref($1)
# NOTE(review): same as above - Entityref also requires the trailing ';'.
133: i -= 1 unless rawdata[i-1] == ?;
134: next
135: end
136: else
# Interesting only matches '&' or '<', so any other character here means the
# scanner is in an inconsistent state.
137: raise RuntimeError, 'neither < nor & ??'
138: end
139: # We get here only if incomplete matches but
140: # nothing else
141: match = rawdata.index(Incomplete, i)
142: unless match == i
143: handle_data(rawdata[i, 1])
144: i += 1
145: next
146: end
147: j = match + $&.length
# The partial construct runs to the end of the buffer: keep it for later.
148: break if j == n # Really incomplete
149: handle_data(rawdata[i..(j-1)])
150: i = j
151: end
152: # end while
# At end of input, flush whatever could not be parsed as plain data.
153: if _end and i < n
154: handle_data(@rawdata[i..(n-1)])
155: i = n
156: end
# Keep only the unprocessed tail for the next call.
157: @rawdata = rawdata[i..-1]
158: end
# File lib/feedparser/sgml-parser.rb, line 295
# Handles a numeric character reference such as "&#65;".
#
# name:: the digits between "&#" and ";" as a String
#
# Codes from 0 through 255 are converted with Integer#chr and passed to
# handle_data (whose result is returned). Any other code is handed to
# unknown_charref and nil is returned.
def handle_charref(name)
  code = name.to_i
  unless code.between?(0, 255)
    unknown_charref(name)
    return
  end
  handle_data(code.chr)
end
# File lib/feedparser/sgml-parser.rb, line 284
# Invokes the end-tag handler for an element.
#
# tag::    element name (not used here)
# method:: name of the handler method to call; it is called without arguments
#
# Returns the handler's result.
def handle_endtag(tag, method)
  send(method)
end
# File lib/feedparser/sgml-parser.rb, line 304
# Handles a named entity reference such as "&amp;".
#
# name:: the text between "&" and ";"
#
# When Entitydefs has an entry for +name+, its replacement text is passed to
# handle_data (whose result is returned). Otherwise unknown_entityref is called
# and nil is returned.
def handle_entityref(name)
  unless Entitydefs.key?(name)
    unknown_entityref(name)
    return
  end
  handle_data(Entitydefs[name])
end
# File lib/feedparser/sgml-parser.rb, line 280
# Invokes the start-tag handler for an element.
#
# tag::    element name (not used here)
# method:: name of the handler method to call
# attrs::  passed to the handler as its only argument
#
# Returns the handler's result.
def handle_starttag(tag, method, attrs)
  send(method, attrs)
end
# File lib/feedparser/sgml-parser.rb, line 43
# Tells whether an element named +gi+ is currently on the open-element stack.
#
# Returns true or false.
def has_context(gi)
  @stack.include?(gi)
end
# File lib/feedparser/sgml-parser.rb, line 160
# Parses the comment that starts at index +i+ of @rawdata.
#
# The text between "<!--" and the first Commentclose match found from +i+
# onwards is passed to handle_comment.
#
# Returns the number of characters consumed (from +i+ through the end of the
# closing delimiter), or nil when no closing delimiter is in the buffer yet.
# Raises RuntimeError when the buffer does not contain "<!--" at +i+.
def parse_comment(i)
  buffer = @rawdata
  raise RuntimeError, 'unexpected call to handle_comment' unless buffer[i, 4] == '<!--'

  close_at = buffer.index(Commentclose, i)
  return nil unless close_at

  # Length of the closing delimiter just matched by the index call above.
  close_len = Regexp.last_match(0).length
  handle_comment(buffer[(i + 4)..(close_at - 1)])
  close_at + close_len - i
end
# File lib/feedparser/sgml-parser.rb, line 212
# Parses the end tag that starts at index +i+ of @rawdata.
#
# The tag name is the text after "</" up to the next '<' or '>', stripped and
# lower-cased; it is passed to finish_endtag.
#
# Returns the index at which scanning should resume: just past a closing '>',
# or at the bracket itself when the tag was cut short by a '<'. Returns nil
# when no bracket follows yet.
def parse_endtag(i)
  buffer = @rawdata
  bracket_at = buffer.index(Endbracket, i + 1)
  return nil unless bracket_at

  name = buffer[(i + 2)..(bracket_at - 1)].strip.downcase
  # Only a real '>' is consumed; a '<' is left for the next scan step.
  bracket_at += 1 if buffer[bracket_at] == ?>
  finish_endtag(name)
  bracket_at
end
# File lib/feedparser/sgml-parser.rb, line 271
# Parses the "<!...>" declaration that starts at index +i+ of @rawdata.
#
# The text between the opening '<' and the next '<' or '>' is passed to
# handle_special.
#
# Returns the number of characters consumed, bracket included, or nil when no
# bracket follows yet.
def parse_special(i)
  buffer = @rawdata
  bracket_at = buffer.index(Endbracket, i + 1)
  return nil unless bracket_at

  # Length of the bracket just matched by the index call above.
  bracket_len = Regexp.last_match(0).length
  handle_special(buffer[(i + 1)..(bracket_at - 1)])
  bracket_at - i + bracket_len
end
# File lib/feedparser/sgml-parser.rb, line 174
# Parses the start tag that begins at index +i+ of @rawdata.
#
# The tag name is lower-cased and remembered in @lasttag; the SGML shorthand
# "<>" reuses @lasttag instead. Attributes are collected as [name, value]
# pairs with lower-cased names; an attribute without "=value" gets the value
# '', and surrounding single or double quotes are removed from values.
# The result is passed to finish_starttag.
#
# Returns the index at which scanning should resume: just past a closing '>',
# or at the bracket itself when the tag was cut short by a '<'. Returns nil
# when no bracket follows yet.
# Raises RuntimeError when no tag name can be found.
def parse_starttag(i)
  rawdata = @rawdata
  j = rawdata.index(Endbracket, i + 1)
  return nil unless j

  attrs = []
  if rawdata[i + 1] == ?>
    # SGML shorthand: <> == <last open tag seen>
    k = j
    tag = @lasttag
  else
    match = rawdata.index(Tagfind, i + 1)
    raise RuntimeError, 'unexpected call to parse_starttag' unless match

    name = Regexp.last_match(0)
    k = i + 1 + name.length
    tag = name.downcase
    @lasttag = tag
  end

  while k < j
    # The attribute must start exactly at k. String#index searches forward, so
    # without this check a tag such as "<br />" would pick up "attributes"
    # from whatever text follows the closing bracket.
    break unless rawdata.index(Attrfind, k) == k

    found = Regexp.last_match
    attrname, rest, attrvalue = found[1], found[2], found[3]
    if !rest
      attrvalue = '' # was: = attrname
    elsif (attrvalue.start_with?("'") && attrvalue.end_with?("'")) ||
          (attrvalue.start_with?('"') && attrvalue.end_with?('"'))
      attrvalue = attrvalue[1..-2]
    end
    attrs << [attrname.downcase, attrvalue]
    k += found[0].length
  end

  # Only a real '>' is consumed; a '<' is left for the next scan step.
  j += 1 if rawdata[j] == ?>
  finish_starttag(tag, attrs)
  j
end
# File lib/feedparser/sgml-parser.rb, line 288
# Prints a diagnostic about an end tag that has no matching open element.
# Does nothing unless @verbose is truthy.
#
# tag:: name of the offending end tag
#
# Two lines are written with Kernel#print: the unbalanced tag and the current
# value of self.stack. Returns nil.
def report_unbalanced(tag)
  return unless @verbose

  headline = '*** Unbalanced </' + tag + '>'
  print headline, "\n"
  print '*** Stack:', self.stack, "\n"
end
# File lib/feedparser/sgml-parser.rb, line 35
# Puts the parser into its initial state: empty input buffer, empty
# open-element stack, placeholder last-tag name, and both the "no more tags"
# and "literal" modes switched off.
def reset
  @rawdata = ''
  @stack = []
  @lasttag = '???'
  @nomoretags = @literal = false
end
# File lib/feedparser/sgml-parser.rb, line 47
# Switches on both the "no more tags" and the "literal" mode. goahead passes
# the whole remaining buffer to handle_data while @nomoretags is set.
def setnomoretags
  @nomoretags = @literal = true
end