001 /*
002 // $Id: //open/util/resgen/src/org/eigenbase/xom/wrappers/Annotator.java#4 $
003 // Package org.eigenbase.xom is an XML Object Mapper.
004 // Copyright (C) 2008-2008 The Eigenbase Project
005 // Copyright (C) 2008-2008 Disruptive Tech
006 // Copyright (C) 2008-2008 LucidEra, Inc.
007 //
008 // This library is free software; you can redistribute it and/or modify it
009 // under the terms of the GNU Lesser General Public License as published by the
010 // Free Software Foundation; either version 2 of the License, or (at your
011 // option) any later version approved by The Eigenbase Project.
012 //
013 // This library is distributed in the hope that it will be useful,
014 // but WITHOUT ANY WARRANTY; without even the implied warranty of
015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
016 // GNU Lesser General Public License for more details.
017 //
018 // You should have received a copy of the GNU Lesser General Public License
019 // along with this library; if not, write to the Free Software
020 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
021 */
022 package org.eigenbase.xom.wrappers;
023
024 import org.eigenbase.xom.*;
025 import org.w3c.dom.Node;
026
027 import java.util.*;
028 import java.io.PrintWriter;
029
030 /**
031 * Quick and dirty XML parser that finds the precise start and end
032 * position of all nodes in a document. Also finds all line endings, so
033 * that character offsets can be converted to line/column positions.
034 *
035 * @author jhyde
036 * @since 13 October, 2008
037 * @version $Id: //open/util/resgen/src/org/eigenbase/xom/wrappers/Annotator.java#4 $
038 */
039 public class Annotator {
040 private final List/*<LocInfo>*/ locInfoList = new ArrayList();
041 private int[] lineStartPositions;
042 private final String xml;
043 private final Map/*<DOMWrapper, LocInfo>*/ wrapperLocMap =
044 new HashMap();
045 private final Map/*<Node, LocInfo>*/ nodeLocMap = new HashMap();
046 private int seq; // workspace for populateMap
047
048 /**
049 * Creates an Annotator.
050 *
051 * <p>For testing purposes, <code>wrapper</code> may be null. Parses the XML
052 * but does not build the mapping from location information to DOM nodes.
053 *
054 * @param xml XML source string
055 * @param def Wrapper around root DOM node
056 */
057 Annotator(String xml, DOMWrapper def) {
058 this.xml = xml;
059 parse(xml);
060 if (def != null) {
061 seq = 0;
062 populateMap(def);
063 assert this.nodeLocMap.size() == this.wrapperLocMap.size();
064 }
065 }
066
067 public Location getLocation(DOMWrapper wrapper) {
068 LocInfo location0 = (LocInfo) wrapperLocMap.get(wrapper);
069 if (location0 == null) {
070 location0 = (Annotator.LocInfo)
071 nodeLocMap.get(((W3CDOMWrapper) wrapper).node);
072 if (location0 == null) {
073 return null;
074 }
075 }
076 final LocInfo location = location0;
077 return new Location() {
078 public int getStartLine() {
079 return getLine(getStartPos()) + 1;
080 }
081
082 public int getStartColumn() {
083 return getCol(getStartPos()) + 1;
084 }
085
086 public int getStartPos() {
087 return location.startTagStartPos;
088 }
089
090 public int getEndLine() {
091 return getLine(getEndPos()) + 1;
092 }
093
094 public int getEndColumn() {
095 return getCol(getEndPos()) + 1;
096 }
097
098 public int getEndPos() {
099 return location.endTagEndPos >= 0
100 ? location.endTagEndPos
101 : location.startTagEndPos;
102 }
103
104 public String getText(boolean headOnly) {
105 return location.getText(headOnly);
106 }
107
108 public String toString() {
109 return location.toString(Annotator.this);
110 }
111 };
112 }
113
114 /**
115 * Returns the list of LocInfo. For testing.
116 *
117 * @return list of LocInfo.
118 */
119 List getLocInfoList() {
120 return locInfoList;
121 }
122
123 // enum State
124 private static final int
125 STATE_NORMAL = 0,
126 STATE_TAG = 1,
127 STATE_ENDTAG = 2,
128 STATE_QUOT = 3,
129 STATE_APOS = 4,
130 STATE_COMMENT = 5,
131 STATE_CDATA = 6;
132
133 void parse(String s)
134 {
135 final ArrayStack/*<LocInfo>*/ lockInfoStack = new ArrayStack();
136 final List lineStartPositions = new ArrayList();
137 int state = STATE_NORMAL;
138 final int count = s.length();
139 int i = 0;
140 int last = 0;
141 lineStartPositions.add(new Integer(i));
142 lockInfoStack.push(null);
143 LocInfo location = null;
144 loop:
145 while (i < count) {
146 final char c = s.charAt(i);
147 switch (c) {
148 case '<':
149 stateSwitch:
150 switch (state) {
151 case STATE_NORMAL:
152 if (i > last) {
153 // Unlike other node types, we create the LocInfo
154 // at the end of the element. No need to add the node
155 // to the stack, because we'd just remove it again.
156 LocInfo loc2 =
157 new LocInfo(locInfoList.size(), TYPE_TEXT, last);
158 loc2.endTagEndPos = i;
159 locInfoList.add(loc2);
160 }
161 if (i + 1 < count) {
162 final char c1 = s.charAt(i + 1);
163 switch (c1) {
164 case '/':
165 // ^</Tag>
166 state = STATE_ENDTAG;
167 assert location != null;
168 break stateSwitch;
169 case '?':
170 // ^<?xml ... ?>
171 location =
172 new LocInfo(
173 locInfoList.size(),
174 TYPE_PROCESSING_INSTRUCTION, i);
175 locInfoList.add(location);
176 state = STATE_TAG;
177 i += "<?".length();
178 continue loop;
179 case '!':
180 if (s.startsWith("--", i + 2)) {
181 // ^<!--
182 location =
183 new LocInfo(
184 locInfoList.size(),
185 TYPE_COMMENT, i);
186 locInfoList.add(location);
187 state = STATE_COMMENT;
188 i += "<!--".length();
189 continue loop;
190 }
191 if (s.startsWith("[CDATA[", i + 2)) {
192 // ^<![CDATA[
193 location =
194 new LocInfo(
195 locInfoList.size(),
196 TYPE_CDATA_SECTION, i);
197 locInfoList.add(location);
198 state = STATE_CDATA;
199 i += "<![CDATA[".length();
200 continue loop;
201 }
202 break;
203 }
204 }
205 // Start of an element,
206 // ^<Tag a1=v a2=v>
207 // Don't push until we see end of the head tag <Tag ... ^>
208 state = STATE_TAG;
209 location = new LocInfo(locInfoList.size(), TYPE_ELEMENT, i);
210 locInfoList.add(location);
211 ++i;
212 continue loop;
213 }
214 break;
215
216 case '>':
217 switch (state) {
218 case STATE_TAG:
219 ++i;
220 assert location != null;
221 switch (location.type) {
222 case TYPE_PROCESSING_INSTRUCTION:
223 // <? ... ?^>
224 case TYPE_CDATA_SECTION:
225 // <![CDATA[ ... ]]^>
226 case TYPE_COMMENT:
227 // <!-- ... --^>
228 location.endTagEndPos = i;
229 location = (LocInfo) lockInfoStack.peek();
230 break;
231 default:
232 // <Tag^>
233 location.startTagEndPos = i;
234 lockInfoStack.push(location);
235 break;
236 }
237 last = i;
238 state = STATE_NORMAL;
239 continue loop;
240
241 case STATE_ENDTAG:
242 // </Tag^>
243 ++i;
244 assert location != null;
245 location.endTagEndPos = i;
246 try {
247 location = (LocInfo) lockInfoStack.pop();
248 } catch (IndexOutOfBoundsException e) {
249 throw new RuntimeException(
250 "i=" + i + ", xml=" + xml.substring(i)
251 + ", nodeList=" + locInfoList,
252 e);
253 }
254 last = i;
255 state = STATE_NORMAL;
256 continue loop;
257 }
258 break;
259
260 case '/':
261 switch (state) {
262 case STATE_TAG:
263 ++i;
264 if (i < count && s.charAt(i) == '>') {
265 // <Tag a1=v1 a2=v2 ^/>
266 ++i;
267 location.endTagEndPos = i;
268 // no need to pop; we never pushed when we saw '<'
269 location = (LocInfo) lockInfoStack.peek();
270 last = i;
271 state = STATE_NORMAL;
272 }
273 continue loop;
274 }
275 break;
276
277 case ']':
278 switch (state) {
279 case STATE_CDATA:
280 if (s.startsWith("]>", i + 1)) {
281 // <![CDATA[ ... ^]]>
282 state = STATE_NORMAL;
283 i += "]]>".length();
284 location.endTagEndPos = i;
285 location = (LocInfo) lockInfoStack.peek();
286 last = i;
287 continue loop;
288 }
289 }
290 break;
291
292 case '-':
293 switch (state) {
294 case STATE_COMMENT:
295 if (s.startsWith("->", i + 1)) {
296 // <!-- xxxxx^-->
297 i += "-->".length();
298 location.endTagEndPos = i;
299 last = i;
300 location = (LocInfo) lockInfoStack.peek();
301 state = STATE_NORMAL;
302 continue loop;
303 }
304 }
305 break;
306
307 case '\r':
308 ++i;
309 if (i < count && s.charAt(i) == '\n') {
310 // only count windows line ending CR LF as one line
311 ++i;
312 }
313 lineStartPositions.add(new Integer(i));
314 continue loop;
315
316 case '\n':
317 ++i;
318 lineStartPositions.add(new Integer(i));
319 continue loop;
320
321 case '\'':
322 switch (state) {
323 case STATE_APOS:
324 // a='xxx^'
325 state = STATE_TAG;
326 break;
327 case STATE_TAG:
328 // a=^'xxx'
329 state = STATE_APOS;
330 break;
331 case STATE_QUOT:
332 // a="doesn^'t matter"
333 default:
334 break;
335 }
336 break;
337
338 case '"':
339 switch (state) {
340 case STATE_QUOT:
341 // a="xxx^"
342 state = STATE_TAG;
343 break;
344 case STATE_TAG:
345 // a=^"xxx"
346 state = STATE_QUOT;
347 break;
348 case STATE_APOS:
349 // a='doesn^"t matter'
350 default:
351 break;
352 }
353 break;
354 }
355
356 ++i;
357 }
358 this.lineStartPositions = new int[lineStartPositions.size()];
359 for (int j = 0; j < lineStartPositions.size(); j++) {
360 this.lineStartPositions[j] =
361 ((Integer) lineStartPositions.get(j)).intValue();
362 }
363 }
364
365 private void populateMap(DOMWrapper def)
366 {
367 final int defType = def.getType();
368 LocInfo location;
369 while (true) {
370 location = (LocInfo) locInfoList.get(seq++);
371 if (defType == DOMWrapper.ELEMENT
372 && location.type == TYPE_ELEMENT)
373 {
374 break;
375 }
376 if (defType == DOMWrapper.CDATA
377 && location.type == TYPE_TEXT)
378 {
379 break;
380 }
381 if (seq >= locInfoList.size()) {
382 return;
383 }
384 }
385 wrapperLocMap.put(def, location);
386 nodeLocMap.put(((W3CDOMWrapper) def).node, location);
387 final DOMWrapper[] elementChildren = def.getElementChildren();
388 for (int i = 0; i < elementChildren.length; i++) {
389 DOMWrapper domWrapper = elementChildren[i];
390 populateMap(domWrapper);
391 }
392 }
393
394 /**
395 * Returns the line that a character position falls on. The first line in a
396 * document is numbered 0.
397 *
398 * @param pos Character position
399 * @return Line (starting from 0)
400 */
401 int getLine(int pos)
402 {
403 int index = Arrays.binarySearch(lineStartPositions, pos);
404 if (index >= 0) {
405 return index;
406 } else {
407 return -2 - index;
408 }
409 }
410
411 /**
412 * Returns the column that a character position falls on. The first column
413 * in a line is numbered 0.
414 *
415 * @param pos Character position
416 * @return column (starting from 0)
417 */
418 int getCol(int pos)
419 {
420 int index = Arrays.binarySearch(lineStartPositions, pos);
421 if (index >= 0) {
422 return 0;
423 } else {
424 index = -2 - index;
425 return pos - lineStartPositions[index];
426 }
427 }
428
429 void list(PrintWriter pw)
430 {
431 for (int i = 0; i < locInfoList.size(); i++) {
432 LocInfo location = (LocInfo) locInfoList.get(i);
433 pw.println(
434 location.seq + ": " + location.toString(this) + " ["
435 + location.getText(xml) + "]");
436 }
437 pw.flush();
438 }
439
440 // enum Type
441 private static final int
442 TYPE_ELEMENT = Node.ELEMENT_NODE,
443 TYPE_PROCESSING_INSTRUCTION = Node.PROCESSING_INSTRUCTION_NODE,
444 TYPE_COMMENT = Node.COMMENT_NODE,
445 TYPE_CDATA_SECTION = Node.CDATA_SECTION_NODE,
446 TYPE_TEXT = Node.TEXT_NODE;
447
448 class LocInfo {
449 /** Sequence in document, ordered by start position (prefix order) */
450 final int seq;
451 /** Node type, typically {@link Node#ELEMENT_NODE}. */
452 final int startTagStartPos;
453 final int type;
454 int startTagEndPos = -1; // -1 if entity is a single tag
455 int endTagEndPos = -1;
456
457 /**
458 * Creates a LocInfo.
459 *
460 * @param seq Sequence number in document
461 * @param nodeType Node type, typically {@link Node#ELEMENT_NODE}.
462 * @param startTagStartPos Position of start of element
463 */
464 LocInfo(int seq, int nodeType, int startTagStartPos) {
465 this.seq = seq;
466 this.type = nodeType;
467 this.startTagStartPos = startTagStartPos;
468 }
469
470 public String toString(Annotator annotator) {
471 return "line " + annotator.getLine(startTagStartPos)
472 + ", column " + annotator.getCol(startTagStartPos);
473 }
474
475 /**
476 * Returns the fragment of source XML that this node encompasses.
477 *
478 * @param xml Whole source XML
479 * @return fragment of source XML
480 */
481 public String getText(String xml) {
482 return xml.substring(
483 startTagStartPos,
484 endTagEndPos >= 0 ? endTagEndPos
485 : xml.length());
486 }
487
488 /**
489 * Returns the fragment of source XML corresponding to the head tag
490 * of this element, if this is an element, otherwise the whole node.
491 *
492 * @param xml Whole source XML
493 * @return fragment of source XML
494 */
495 public String getHeadText(String xml) {
496 return xml.substring(
497 startTagStartPos,
498 startTagEndPos >= 0 ? startTagEndPos
499 : endTagEndPos >= 0 ? endTagEndPos
500 : xml.length());
501 }
502
503 public String toString() {
504 return getHeadText(xml);
505 }
506
507 /**
508 * Returns the text of this location. Specification as for
509 * {@link org.eigenbase.xom.Location#getText(boolean)}.
510 *
511 * @param headOnly Whether to return only the head of elements
512 * @return Source text underlying a location
513 */
514 public String getText(boolean headOnly) {
515 return xml.substring(
516 startTagStartPos,
517 headOnly && startTagEndPos >= 0
518 ? startTagEndPos
519 : endTagEndPos >= 0
520 ? endTagEndPos
521 : xml.length());
522 }
523 }
524
525 /**
526 * Similar to {@link Stack} but based on {@link ArrayList} instead of
527 * {@link Vector}, and therefore more efficient.
528 */
529 private static class ArrayStack extends ArrayList {
530 public final void push(Object t)
531 {
532 if (false) System.out.println(size() + " push [" + t + "]");
533 add(t);
534 }
535
536 public final Object peek()
537 {
538 return get(size() - 1);
539 }
540
541 public final Object pop()
542 {
543 final int index = size() - 1;
544 Object t = remove(index);
545 if (false) System.out.println(size() + " pop [" + t + "]");
546 return get(index - 1);
547 }
548 }
549 }
550
551 // End Annotator.java