View Javadoc

1   /*
2    * Copyright (c) 2009 Kathryn Huxtable
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   * 
16   * $Id$
17   */
18  package org.kathrynhuxtable.maven.plugins.htmlfiltersite;
19  
20  import java.io.IOException;
21  import java.io.Writer;
22  import java.util.List;
23  
24  import org.jdom.Attribute;
25  import org.jdom.CDATA;
26  import org.jdom.Comment;
27  import org.jdom.Element;
28  import org.jdom.EntityRef;
29  import org.jdom.Namespace;
30  import org.jdom.ProcessingInstruction;
31  import org.jdom.Text;
32  import org.jdom.Verifier;
33  import org.jdom.output.EscapeStrategy;
34  import org.jdom.output.Format;
35  import org.jdom.output.XMLOutputter;
36  
37  /**
38   * @author Kathryn Huxtable
39   */
40  public class HTMLOutputter extends XMLOutputter {
41  
42      protected Format         nonBreakingFormat  = Format.getRawFormat();
43      protected EscapeStrategy htmlEscapeStrategy = new HTMLEscapeStrategy();
44  
45      public HTMLOutputter() {
46          super();
47          currentFormat.setEscapeStrategy(htmlEscapeStrategy);
48          nonBreakingFormat.setEscapeStrategy(htmlEscapeStrategy);
49      }
50  
51      /**
52       * This will create an <code>XMLOutputter</code> with the specified format
53       * characteristics. Note the format object is cloned internally before use.
54       */
55      public HTMLOutputter(Format format) {
56          super(format);
57          currentFormat.setEscapeStrategy(htmlEscapeStrategy);
58          nonBreakingFormat.setEscapeStrategy(htmlEscapeStrategy);
59      }
60  
61      /**
62       * {@inheritDoc}
63       */
64      public void setFormat(Format newFormat) {
65          super.setFormat(newFormat);
66          currentFormat.setEscapeStrategy(htmlEscapeStrategy);
67      }
68  
69      /**
70       * This will handle printing of a <code>{@link Element}</code>, its
71       * <code>{@link Attribute}</code>s, and all contained (child) elements, etc.
72       * 
73       * @param element
74       *            <code>Element</code> to output.
75       * @param out
76       *            <code>Writer</code> to use.
77       * @param level
78       *            <code>int</code> level of indention.
79       * @param namespaces
80       *            <code>List</code> stack of Namespaces in scope.
81       */
82      protected void printElement(Writer out, Element element, int level, NamespaceStack namespaces) throws IOException {
83  
84          List<Attribute> attributes = getElementAttributes(element);
85          List<?> content = element.getContent();
86  
87          // Check for xml:space and adjust format settings
88          String space = null;
89          if (attributes != null) {
90              space = element.getAttributeValue("space", Namespace.XML_NAMESPACE);
91          }
92  
93          Format previousFormat = currentFormat;
94          if ("default".equals(space)) {
95              currentFormat = getFormat();
96          } else if ("preserve".equals(space) || isNonBreaking(element)) {
97              currentFormat = nonBreakingFormat;
98          }
99  
100         // Print the beginning of the tag plus attributes and any
101         // necessary namespace declarations
102         out.write("<");
103         printQualifiedName(out, element);
104 
105         // Mark our namespace starting point
106         int previouslyDeclaredNamespaces = namespaces.size();
107 
108         // Print the element's namespace, if appropriate
109         printElementNamespace(out, element, namespaces);
110 
111         // Print out additional namespace declarations
112         printAdditionalNamespaces(out, element, namespaces);
113 
114         // Print out attributes
115         if (attributes != null) printAttributes(out, attributes, element, namespaces);
116 
117         // Depending on the settings (newlines, textNormalize, etc), we may
118         // or may not want to print all of the content, so determine the
119         // index of the start of the content we're interested
120         // in based on the current settings.
121 
122         int start = skipLeadingWhite(content, 0);
123         int size = content.size();
124         if (start >= size) {
125             // Case content is empty or all insignificant whitespace
126             if (true || currentFormat.getExpandEmptyElements()) {
127                 out.write("></");
128                 printQualifiedName(out, element);
129                 out.write(">");
130             } else {
131                 out.write(" />");
132             }
133         } else {
134             out.write(">");
135 
136             // For a special case where the content is only CDATA
137             // or Text we don't want to indent after the start or
138             // before the end tag.
139 
140             if (nextNonText(content, start) < size) {
141                 // Case Mixed Content - normal indentation
142                 newline(out);
143                 printContentRange(out, content, start, size, level + 1, namespaces);
144                 newline(out);
145                 indent(out, level);
146             } else {
147                 // Case all CDATA or Text - no indentation
148                 printTextRange(out, content, start, size);
149             }
150             out.write("</");
151             printQualifiedName(out, element);
152             out.write(">");
153         }
154 
155         // remove declared namespaces from stack
156         while (namespaces.size() > previouslyDeclaredNamespaces) {
157             namespaces.pop();
158         }
159 
160         // Restore our format settings
161         currentFormat = previousFormat;
162     }
163 
164     @SuppressWarnings("unchecked")
165     private List<Attribute> getElementAttributes(Element element) {
166         return (List<Attribute>) element.getAttributes();
167     }
168 
169     private boolean isNonBreaking(Element element) {
170         String eName = element.getName();
171 
172         if ("span".equals(eName) || "p".equals(eName) || "li".equals(eName) || "h1".equals(eName) || "h2".equals(eName)
173                 || "h3".equals(eName) || "caption".equals(eName) || "sup".equals(eName) || "sub".equals(eName)) {
174             return true;
175         }
176         return false;
177     }
178 
179     /**
180      * This will handle printing a string. Escapes the element entities, trims
181      * interior whitespace, etc. if necessary.
182      */
183     private void printString(Writer out, String str) throws IOException {
184         if (currentFormat.getTextMode() == Format.TextMode.NORMALIZE) {
185             str = Text.normalizeString(str);
186         } else if (currentFormat.getTextMode() == Format.TextMode.TRIM) {
187             str = str.trim();
188         }
189         out.write(escapeElementEntities(str));
190     }
191 
192     /**
193      * This will print a newline only if indent is not null.
194      * 
195      * @param out
196      *            <code>Writer</code> to use
197      */
198     private void newline(Writer out) throws IOException {
199         if (currentFormat.getIndent() != null) {
200             out.write(currentFormat.getLineSeparator());
201         }
202     }
203 
204     /**
205      * This will print indents only if indent is not null or the empty string.
206      * 
207      * @param out
208      *            <code>Writer</code> to use
209      * @param level
210      *            current indent level
211      */
212     private void indent(Writer out, int level) throws IOException {
213         if (currentFormat.getIndent() == null || currentFormat.getIndent().equals("")) {
214             return;
215         }
216 
217         for (int i = 0; i < level; i++) {
218             out.write(currentFormat.getIndent());
219         }
220     }
221 
222     // Returns the index of the first non-all-whitespace CDATA or Text,
223     // index = content.size() is returned if content contains
224     // all whitespace.
225     // @param start index to begin search (inclusive)
226     private int skipLeadingWhite(List<?> content, int start) {
227         if (start < 0) {
228             start = 0;
229         }
230 
231         int index = start;
232         int size = content.size();
233         if (currentFormat.getTextMode() == Format.TextMode.TRIM_FULL_WHITE
234                 || currentFormat.getTextMode() == Format.TextMode.NORMALIZE || currentFormat.getTextMode() == Format.TextMode.TRIM) {
235             while (index < size) {
236                 if (!isAllWhitespace(content.get(index))) {
237                     return index;
238                 }
239                 index++;
240             }
241         }
242         return index;
243     }
244 
245     // Return the index + 1 of the last non-all-whitespace CDATA or
246     // Text node, index < 0 is returned
247     // if content contains all whitespace.
248     // @param start index to begin search (exclusive)
249     private int skipTrailingWhite(List<?> content, int start) {
250         int size = content.size();
251         if (start > size) {
252             start = size;
253         }
254 
255         int index = start;
256         if (currentFormat.getTextMode() == Format.TextMode.TRIM_FULL_WHITE
257                 || currentFormat.getTextMode() == Format.TextMode.NORMALIZE || currentFormat.getTextMode() == Format.TextMode.TRIM) {
258             while (index >= 0) {
259                 if (!isAllWhitespace(content.get(index - 1))) break;
260                 --index;
261             }
262         }
263         return index;
264     }
265 
266     // Return the next non-CDATA, non-Text, or non-EntityRef node,
267     // index = content.size() is returned if there is no more non-CDATA,
268     // non-Text, or non-EntiryRef nodes
269     // @param start index to begin search (inclusive)
270     private static int nextNonText(List<?> content, int start) {
271         if (start < 0) {
272             start = 0;
273         }
274 
275         int index = start;
276         int size = content.size();
277         while (index < size) {
278             Object node = content.get(index);
279             if (!((node instanceof Text) || (node instanceof EntityRef))) {
280                 return index;
281             }
282             index++;
283         }
284         return size;
285     }
286 
287     // Determine if a Object is all whitespace
288     private boolean isAllWhitespace(Object obj) {
289         String str = null;
290 
291         if (obj instanceof String) {
292             str = (String) obj;
293         } else if (obj instanceof Text) {
294             str = ((Text) obj).getText();
295         } else if (obj instanceof EntityRef) {
296             return false;
297         } else {
298             return false;
299         }
300 
301         for (int i = 0; i < str.length(); i++) {
302             if (!Verifier.isXMLWhitespace(str.charAt(i))) return false;
303         }
304         return true;
305     }
306 
307     // Determine if a string starts with a XML whitespace.
308     private boolean startsWithWhite(String str) {
309         if ((str != null) && (str.length() > 0) && Verifier.isXMLWhitespace(str.charAt(0))) {
310             return true;
311         }
312         return false;
313     }
314 
315     // Determine if a string ends with a XML whitespace.
316     private boolean endsWithWhite(String str) {
317         if ((str != null) && (str.length() > 0) && Verifier.isXMLWhitespace(str.charAt(str.length() - 1))) {
318             return true;
319         }
320         return false;
321     }
322 
323     // Support method to print a name without using elt.getQualifiedName()
324     // and thus avoiding a StringBuffer creation and memory churn
325     private void printQualifiedName(Writer out, Element e) throws IOException {
326         if (e.getNamespace().getPrefix().length() == 0) {
327             out.write(e.getName());
328         } else {
329             out.write(e.getNamespace().getPrefix());
330             out.write(':');
331             out.write(e.getName());
332         }
333     }
334 
335     /**
336      * This will handle printing of content within a given range. The range to
337      * print is specified in typical Java fashion; the starting index is
338      * inclusive, while the ending index is exclusive.
339      * 
340      * @param content
341      *            <code>List</code> of content to output
342      * @param start
343      *            index of first content node (inclusive.
344      * @param end
345      *            index of last content node (exclusive).
346      * @param out
347      *            <code>Writer</code> to use.
348      * @param level
349      *            <code>int</code> level of indentation.
350      * @param namespaces
351      *            <code>List</code> stack of Namespaces in scope.
352      */
353     private void printContentRange(Writer out, List<?> content, int start, int end, int level, NamespaceStack namespaces)
354             throws IOException {
355         boolean firstNode; // Flag for 1st node in content
356         Object next; // Node we're about to print
357         int first, index; // Indexes into the list of content
358 
359         index = start;
360         while (index < end) {
361             firstNode = (index == start) ? true : false;
362             next = content.get(index);
363 
364             //
365             // Handle consecutive CDATA, Text, and EntityRef nodes all at
366             // once
367             //
368             if ((next instanceof Text) || (next instanceof EntityRef)) {
369                 first = skipLeadingWhite(content, index);
370                 // Set index to next node for loop
371                 index = nextNonText(content, first);
372 
373                 // If it's not all whitespace - print it!
374                 if (first < index) {
375                     if (!firstNode) newline(out);
376                     indent(out, level);
377                     printTextRange(out, content, first, index);
378                 }
379                 continue;
380             }
381 
382             //
383             // Handle other nodes
384             //
385             if (!firstNode) {
386                 newline(out);
387             }
388 
389             indent(out, level);
390 
391             if (next instanceof Comment) {
392                 printComment(out, (Comment) next);
393             } else if (next instanceof Element) {
394                 printElement(out, (Element) next, level, namespaces);
395             } else if (next instanceof ProcessingInstruction) {
396                 printProcessingInstruction(out, (ProcessingInstruction) next);
397             } else {
398                 // XXX if we get here then we have a illegal content, for
399                 // now we'll just ignore it (probably should throw
400                 // a exception)
401             }
402 
403             index++;
404         } /* while */
405     }
406 
407     /**
408      * This will handle printing of a sequence of <code>{@link CDATA}</code> or
409      * <code>{@link Text}</code> nodes. It is an error to have any other pass
410      * this method any other type of node.
411      * 
412      * @param content
413      *            <code>List</code> of content to output
414      * @param start
415      *            index of first content node (inclusive).
416      * @param end
417      *            index of last content node (exclusive).
418      * @param out
419      *            <code>Writer</code> to use.
420      */
421     private void printTextRange(Writer out, List<?> content, int start, int end) throws IOException {
422         String previous; // Previous text printed
423         Object node; // Next node to print
424         String next; // Next text to print
425 
426         previous = null;
427 
428         // Remove leading whitespace-only nodes
429         start = skipLeadingWhite(content, start);
430 
431         int size = content.size();
432         if (start < size) {
433             // And remove trialing whitespace-only nodes
434             end = skipTrailingWhite(content, end);
435 
436             for (int i = start; i < end; i++) {
437                 node = content.get(i);
438 
439                 // Get the unmangled version of the text
440                 // we are about to print
441                 if (node instanceof Text) {
442                     next = ((Text) node).getText();
443                 } else if (node instanceof EntityRef) {
444                     next = "&" + ((EntityRef) node).getValue() + ";";
445                 } else {
446                     throw new IllegalStateException("Should see only " + "CDATA, Text, or EntityRef");
447                 }
448 
449                 // This may save a little time
450                 if (next == null || "".equals(next)) {
451                     continue;
452                 }
453 
454                 // Determine if we need to pad the output (padding is
455                 // only need in trim or normalizing mode)
456                 if (previous != null) { // Not 1st node
457                     if (currentFormat.getTextMode() == Format.TextMode.NORMALIZE
458                             || currentFormat.getTextMode() == Format.TextMode.TRIM) {
459                         if ((endsWithWhite(previous)) || (startsWithWhite(next))) {
460                             out.write(" ");
461                         }
462                     }
463                 }
464 
465                 // Print the node
466                 if (node instanceof CDATA) {
467                     printCDATA(out, (CDATA) node);
468                 } else if (node instanceof EntityRef) {
469                     printEntityRef(out, (EntityRef) node);
470                 } else {
471                     printString(out, next);
472                 }
473 
474                 previous = next;
475             }
476         }
477     }
478 
479     /**
480      * This will handle printing of any needed <code>{@link Namespace}</code>
481      * declarations.
482      * 
483      * @param ns
484      *            <code>Namespace</code> to print definition of
485      * @param out
486      *            <code>Writer</code> to use.
487      */
488     private void printNamespace(Writer out, Namespace ns, NamespaceStack namespaces) throws IOException {
489         String prefix = ns.getPrefix();
490         String uri = ns.getURI();
491 
492         // Already printed namespace decl?
493         if (uri.equals(namespaces.getURI(prefix))) {
494             return;
495         }
496 
497         out.write(" xmlns");
498         if (!prefix.equals("")) {
499             out.write(":");
500             out.write(prefix);
501         }
502         out.write("=\"");
503         out.write(escapeAttributeEntities(uri));
504         out.write("\"");
505         namespaces.push(ns);
506     }
507 
508     private void printElementNamespace(Writer out, Element element, NamespaceStack namespaces) throws IOException {
509         // Add namespace decl only if it's not the XML namespace and it's
510         // not the NO_NAMESPACE with the prefix "" not yet mapped
511         // (we do output xmlns="" if the "" prefix was already used and we
512         // need to reclaim it for the NO_NAMESPACE)
513         Namespace ns = element.getNamespace();
514         if (ns == Namespace.XML_NAMESPACE) {
515             return;
516         }
517         if (!((ns == Namespace.NO_NAMESPACE) && (namespaces.getURI("") == null))) {
518             printNamespace(out, ns, namespaces);
519         }
520     }
521 
522     private void printAdditionalNamespaces(Writer out, Element element, NamespaceStack namespaces) throws IOException {
523         List<Namespace> list = getElementAdditionalNamespaces(element);
524         if (list != null) {
525             for (int i = 0; i < list.size(); i++) {
526                 Namespace additional = list.get(i);
527                 printNamespace(out, additional, namespaces);
528             }
529         }
530     }
531 
532     @SuppressWarnings("unchecked")
533     private List<Namespace> getElementAdditionalNamespaces(Element element) {
534         return (List<Namespace>) element.getAdditionalNamespaces();
535     }
536 
537     public class HTMLEscapeStrategy implements EscapeStrategy {
538         public boolean shouldEscape(char ch) {
539             // Magic numbers for ASCII. If the character isn't
540             // printable ASCII, then escape it. Normal XML
541             // syntax characters will always be escaped.
542             if (ch < ' ' || ch > 127) {
543                 return true;
544             } else {
545                 return false;
546             }
547         }
548     }
549 }