package semdiff; /* * Unibz - XML Data Management 2010/2011 - Miniproject * * semdiff * * we would like to develop a command line application * that tells us whether two XML documents are *semantically* * identical or not * * note that being semantically identical is a "weaker" condition * than being syntactically identical - XML doesn't for example * mandate the order of attributes in an element and ignores * whitespace in certain cases; documents that differ only by * such things would be considered semantically identical while * being syntactically different * * we start from the DOM pretty printer code seen in lesson 05 * and work from there * * -- chris */ import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import java.io.File; import java.util.ArrayList; import org.w3c.dom.Document; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; public class Main { public static void main(String[] argv) throws Exception { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); // factory.setValidating(true); // factory.setNamespaceAware(true); DocumentBuilder builder = factory.newDocumentBuilder(); Document doca; Document docb; doca = builder.parse(new File("/Users/chris/prj/unibz/xml/06_samples/recipe1.xml")); docb = builder.parse(new File("/Users/chris/prj/unibz/xml/06_samples/recipe2.xml")); StringBuffer bufa = new StringBuffer(""); StringBuffer bufb = new StringBuffer(""); // let's call our traversal code traverse(doca.getDocumentElement(), 0, bufa); traverse(docb.getDocumentElement(), 0, bufb); // System.out.println(bufa.toString()); // System.out.println(bufb.toString()); if (bufa.toString().equals(bufb.toString())) { System.out.println("documents are semantically identical"); } else { System.out.println("documents are semantically different"); } } private static final int OPEN=1; private static final int CLOSE=2; private static void traverse(Node n, int level, StringBuffer buf) { int i; // find out what type of node this is short t = n.getNodeType(); if (t == Document.ELEMENT_NODE) { buf.append(print_element(level, n, OPEN)); } else if (t == Document.TEXT_NODE) { if (!is_empty(n)) { buf.append(print_text(level, n)); } } NodeList nl = n.getChildNodes(); if (nl.getLength() == 0) { return; } for (i = 0; i < nl.getLength(); i++) { traverse(nl.item(i), level + 1, buf); } if (t == Document.ELEMENT_NODE) { buf.append(print_element(level, n, CLOSE)); } } static String print_element(int level, Node n, int mode) { String slash = ""; String attr = ""; if (mode == CLOSE) { slash = "/"; } if (mode == OPEN) { NamedNodeMap amap = n.getAttributes(); ArrayList attr_list = new ArrayList(); int i; for (i = 0; i < amap.getLength(); i++) { String p = amap.item(i).getNodeName() + "=\"" + amap.item(i).getNodeValue() + "\""; attr_list.add(p); } java.util.Collections.sort(attr_list); for (i = 0; i < attr_list.size(); i++) { attr += " " + attr_list.get(i); } } return get_indent(level) + "<" + slash + n.getNodeName() + attr + ">"; } static String print_text(int level, Node n) { String txt = n.getNodeValue().trim(); txt = txt.replaceAll("\\s+", " "); return get_indent(level) + txt; } static String get_indent(int level) { int i; StringBuffer buf = new StringBuffer(); /* for (i = 0; i < level; i++) { buf.append(" "); } */ return buf.toString(); } static boolean is_empty(Node n) { String val = n.getNodeValue(); val = val.replaceAll("\\s+", ""); val = val.replaceAll("\n+", ""); if (val.equals("")) { return true; } return false; } }