SU3 News: Fix parsing of the XHTML nodes

2014-10-22 18:20:31 +00:00
parent a36ef62358
commit 7955b8ae71
2 changed files with 179 additions and 4 deletions
--- a/apps/routerconsole/java/src/net/i2p/router/news/NewsXMLParser.java
+++ b/apps/routerconsole/java/src/net/i2p/router/news/NewsXMLParser.java
@ -18,7 +18,6 @@ import net.i2p.util.Log;
 import org.cybergarage.util.Debug;
 import org.cybergarage.xml.Node;
 import org.cybergarage.xml.ParserException;
-import org.cybergarage.xml.parser.JaxpParser;

 /**
 *  Parse out the news.xml file which is in Atom format (RFC4287).
@ -39,7 +38,9 @@ public class NewsXMLParser {
        "del", "ins", "em", "strong", "mark", "sub", "sup", "tt", "code", "strike", "s", "u",
        "h4", "h5", "h6",
        "ol", "ul", "li", "dl", "dt", "dd",
-        "table", "tr", "td", "th"
+        "table", "tr", "td", "th",
+        // put in by parser
+        XMLParser.TEXT_NAME
    }));

    /**
@ -94,7 +95,7 @@ public class NewsXMLParser {
    public void parse(InputStream in) throws IOException {
        _entries = null;
        _metadata = null;
-        JaxpParser parser = new JaxpParser();
+        XMLParser parser = new XMLParser(_context);
        try {
            Node root = parser.parse(in);
            extract(root);
@ -255,7 +256,7 @@ public class NewsXMLParser {
                    }
                    if (e == null)
                        break;
-                    buf.append(sn.toString());
+                    XMLParser.toString(buf, sn);
                }
                if (e == null)
                    continue;
--- a/apps/routerconsole/java/src/net/i2p/router/news/XMLParser.java
+++ b/apps/routerconsole/java/src/net/i2p/router/news/XMLParser.java
@ -0,0 +1,174 @@
+package net.i2p.router.news;
+
+/******************************************************************
+*  Contains code modified from JaxpParser:
+*
+*    CyberXML for Java
+*
+*    Copyright (C) Satoshi Konno 2004
+*
+*    Author: Markus Thurner (http://thoean.com)
+*
+*  Contains code modified from Node:
+*
+*    CyberXML for Java
+*
+*    Copyright (C) Satoshi Konno 2002
+******************************************************************/
+
+import org.w3c.dom.NamedNodeMap;
+
+import net.i2p.I2PAppContext;
+import net.i2p.util.Log;
+import org.cybergarage.xml.Attribute;
+import org.cybergarage.xml.Node;
+import org.cybergarage.xml.XML;
+import org.cybergarage.xml.parser.JaxpParser;
+
+
+/**
+ *  Override so that XHTML is parsed correctly.
+ *
+ *  XHTML elements may contain text and child elements interleaved, so we
+ *  must keep both, in order, and output both.
+ *  Text found in an element that has no child nodes yet is kept as that
+ *  element's value. When a child node is added to an element that has a
+ *  non-empty value, the value is first moved into a child node named
+ *  {@link #TEXT_NAME}; later text is added as further {@link #TEXT_NAME} nodes.
+ *  Text consisting only of spaces, tabs, CRs and LFs is dropped.
+ *
+ *  Serialize the resulting tree with {@link #toString(StringBuilder, Node)}.
+ *
+ *  @since 0.9.17
+ */
+public class XMLParser extends JaxpParser {
+    private final Log _log;
+
+    /** Name of the child nodes that hold text in mixed content. */
+    public static final String TEXT_NAME = "#text";
+
+    /**
+     *  @param ctx used to obtain the log for this class
+     */
+    public XMLParser(I2PAppContext ctx) {
+        super();
+        _log = ctx.logManager().getLog(XMLParser.class);
+    }
+
+    /**
+     *  Modified from UPnP JaxpParser
+     *
+     *  Converts domNode, and recursively its children, and attaches the result
+     *  to parentNode. Only element and text nodes are converted; any other
+     *  node type is ignored.
+     *
+     *  @param parentNode null if at top
+     *  @param domNode the DOM node to convert
+     *  @param rank parse level, only for debug
+     *  @return the parsed node, or the parent node, unused except at top level
+     */
+    @Override
+    public org.cybergarage.xml.Node parse(Node parentNode, org.w3c.dom.Node domNode, int rank) {
+        int domNodeType = domNode.getNodeType();
+        String domNodeName = domNode.getNodeName();
+        String domNodeValue = domNode.getNodeValue();
+        NamedNodeMap attrs = domNode.getAttributes();
+        int attrsLen = (attrs != null) ? attrs.getLength() : 0;
+
+        if (_log.shouldLog(Log.DEBUG)) {
+            String val = domNodeValue != null ?
+                         " = \"" + domNodeValue.replace("\n", "\\n").replace("\r", "\\r") + '"' :
+                         "";
+            _log.debug("[" + rank + "] ELEM : \"" + domNodeName + '"' + val +
+                       " type = " + domNodeType + " with " + attrsLen + " attrs");
+        }
+
+        // I2P -
+        // If it's only whitespace, skip it altogether.
+        // Only add it to the value if we don't have any other nodes.
+        // Otherwise, add it as a node.
+        if (domNodeType == org.w3c.dom.Node.TEXT_NODE) {
+            // Text with no parent to attach to, or with no content, is dropped
+            // instead of throwing a NullPointerException.
+            if (parentNode == null || domNodeValue == null || isWhitespaceOnly(domNodeValue)) {
+                return parentNode;
+            }
+            if (!parentNode.hasNodes()) {
+                parentNode.addValue(domNodeValue);
+                return parentNode;
+            }
+            // else we will add it as a node below
+        } else if (domNodeType != org.w3c.dom.Node.ELEMENT_NODE) {
+            return parentNode;
+        }
+
+        // For a text node the DOM node name is what becomes the node's name here.
+        Node node = new Node();
+        node.setName(domNodeName);
+        node.setValue(domNodeValue);
+
+        if (parentNode != null) {
+            // I2P - the parent is about to get a child node, so any text it has
+            // collected as its value must become a text node first, to keep the order.
+            // Whitespace-only text never reaches the value; it is skipped above.
+            String oldValue = parentNode.getValue();
+            if (oldValue != null && oldValue.length() > 0) {
+                parentNode.setValue("");
+                Node text = new Node();
+                text.setName(TEXT_NAME);
+                text.setValue(oldValue);
+                parentNode.addNode(text);
+                if (_log.shouldLog(Log.DEBUG))
+                    _log.debug("Converted value to node");
+            }
+            parentNode.addNode(node);
+        }
+        // Text nodes have no attributes or children to process.
+        if (domNodeType == org.w3c.dom.Node.TEXT_NODE)
+            return parentNode;
+
+        // attrsLen is 0 when attrs is null, so no separate null check is needed.
+        for (int n = 0; n < attrsLen; n++) {
+            org.w3c.dom.Node attr = attrs.item(n);
+            node.setAttribute(attr.getNodeName(), attr.getNodeValue());
+        }
+
+        org.w3c.dom.Node child = domNode.getFirstChild();
+        if (child == null) {
+            // Empty element: replace the null element value with an empty string.
+            node.setValue("");
+            return node;
+        }
+        for (; child != null; child = child.getNextSibling()) {
+            parse(node, child, rank + 1);
+        }
+
+        return node;
+    }
+
+    /**
+     *  Same result as removing all of " \t\r\n" and testing for empty,
+     *  without compiling a regular expression for every text node.
+     *
+     *  @param s non-null
+     *  @return true if s is empty or contains only space, tab, CR and LF
+     */
+    private static boolean isWhitespaceOnly(String s) {
+        for (int i = 0; i < s.length(); i++) {
+            char c = s.charAt(i);
+            if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
+                return false;
+        }
+        return true;
+    }
+
+    /**
+     *  A replacement for Node.toString(), which does not recognize #text.
+     *
+     *  @param buf the output is appended here
+     *  @param node the top node to output, together with everything below it
+     */
+    public static void toString(StringBuilder buf, Node node) {
+        output(buf, node, 0);
+    }
+
+
+    /**
+     *  A replacement for Node.output(), which does not recognize #text.
+     *  Also, we use the empty-element form, so {@code <br/>} does not turn
+     *  into {@code <br></br>}.
+     *
+     *  @param buf the output is appended here
+     *  @param node the node to output, together with everything below it
+     *  @param indentLevel nesting depth, used for the indent of element tags
+     */
+    private static void output(StringBuilder buf, Node node, int indentLevel) {
+        String name = node.getName();
+        String value = XML.escapeXMLChars(node.getValue());
+        if (name.equals(TEXT_NAME)) {
+            // Text nodes are output as bare escaped text, no tag and no indent.
+            // Don't output the string "null" if there is no value.
+            if (value != null)
+                buf.append(value);
+            return;
+        }
+
+        String indentString = node.getIndentLevelString(indentLevel);
+        buf.append(indentString).append('<').append(name);
+        int nAttributes = node.getNAttributes();
+        for (int n = 0; n < nAttributes; n++) {
+            Attribute attr = node.getAttribute(n);
+            buf.append(' ').append(attr.getName()).append("=\"").append(XML.escapeXMLChars(attr.getValue())).append('"');
+        }
+
+        // As in Node, output either the nodes or the value.
+        // If mixed values and nodes, the values must be text nodes. See parser above.
+        if (node.hasNodes()) {
+            buf.append(">\n");
+            int nChildNodes = node.getNNodes();
+            for (int n = 0; n < nChildNodes; n++) {
+                output(buf, node.getNode(n), indentLevel + 1);
+            }
+            buf.append(indentString).append("</").append(name).append(">\n");
+        } else if (value == null || value.length() == 0) {
+            buf.append("/>");
+        } else {
+            buf.append('>').append(value).append("</").append(name).append('>');
+        }
+    }
+}