Kagee · October 2, 2011 21:22
diff --git a/dom.java b/dom.java
 /*

 1. The input documents sometimes have characters that are not valid as UTF-8 text. Where/how can I filter/remove them?

 hildenae@sektober:~/Dokumenter/Kildekode/statsparser$ java dom
 com.sun.org.apache.xerces.internal.impl.io.MalformedByteSequenceException: Invalid byte 1 of 1-byte UTF-8 sequence.
        at com.sun.org.apache.xerces.internal.impl.io.UTF8Reader.invalidByte(UTF8Reader.java:684)
        at com.sun.org.apache.xerces.internal.impl.io.UTF8Reader.read(UTF8Reader.java:554)
        at com.sun.org.apache.xerces.internal.impl.XMLEntityScanner.load(XMLEntityScanner.java:1742)
        at com.sun.org.apache.xerces.internal.impl.XMLEntityScanner.skipChar(XMLEntityScanner.java:1416)
        at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl$FragmentContentDriver.next(XMLDocumentFragmentScannerImpl.java:2792)
        at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl.next(XMLDocumentScannerImpl.java:648)
        at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl.scanDocument(XMLDocumentFragmentScannerImpl.java:511)
        at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(XML11Configuration.java:808)
        at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(XML11Configuration.java:737)
        at com.sun.org.apache.xerces.internal.parsers.XMLParser.parse(XMLParser.java:119)
        at com.sun.org.apache.xerces.internal.parsers.DOMParser.parse(DOMParser.java:235)
        at com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderImpl.parse(DocumentBuilderImpl.java:284)
        at javax.xml.parsers.DocumentBuilder.parse(DocumentBuilder.java:124)
        at dom.main(dom.java:65)



 */
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Enumeration;
 import java.util.Hashtable;
 import java.io.*;
 import java.io.FilterInputStream;
 import java.util.Arrays;
 import java.net.*;
 import javax.xml.parsers.*;
 import org.w3c.dom.*;
 import org.xml.sax.*;

 /**
  * Command-line scraper: parses the XHTML page {@code week.html} with a DOM
  * parser and prints six text columns taken from the ninth {@code <table>}.
  */
 public class dom {
 	/**
 	 * Reads {@code week.html} from the working directory, parses it as
 	 * ISO-8859-1 XML and prints the extracted cells with
 	 * {@link Arrays#deepToString(Object[])}. Any failure is reported by
 	 * printing the stack trace.
 	 *
 	 * @param argv ignored
 	 */
 	public static void main(String argv[]) {
 		InputStream is = null;
 		try {
 			is = new BufferedInputStream(new FileInputStream("week.html"));

 			// The encoding only takes effect when the InputSource itself is
 			// handed to the parser. Passing just its byte stream discards the
 			// setting, the parser then auto-detects the encoding and rejects
 			// bytes that are not valid UTF-8.
 			InputSource inputSource = new InputSource(new ReplacingInputStream(is));
 			inputSource.setEncoding("iso-8859-1");

 			DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
 			DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();

 			LocalDTDResolver localDTDResolver = new LocalDTDResolver(
 				"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
 				new File("xhtml-lat1.ent"));
 			dBuilder.setEntityResolver(localDTDResolver);

 			Document doc = dBuilder.parse(inputSource);

 			doc.getDocumentElement().normalize();
 			NodeList tableList = doc.getElementsByTagName("table");

 			// Only the first user row is extracted for now.
 			int numUsers = 1;

 			String[][] a = new String[numUsers][6];
 			Node nNode = tableList.item(8);
 			if (nNode == null) {
 				throw new IllegalStateException("week.html contains " + tableList.getLength()
 					+ " <table> elements; expected at least 9");
 			}
 			NodeList rows = nNode.getChildNodes();
 			for (int i = 0; i < numUsers; i++) {
 				// Child (i+1)*2 of the table node is used as row i; presumably the
 				// children in between are whitespace text nodes - confirm against the page.
 				NodeList cells = rows.item((i + 1) * 2).getChildNodes();

 				a[i][0] = cells.item(2).getFirstChild().getFirstChild().getNodeValue();
 				a[i][1] = cells.item(3).getFirstChild().getNodeValue();
 				a[i][2] = cells.item(4).getFirstChild().getNodeValue();
 				a[i][3] = cells.item(5).getFirstChild().getNodeValue();
 				a[i][4] = cells.item(6).getFirstChild().getNodeValue();

 				// This cell is text in the HTML that may be split over several
 				// child nodes; when it is, take the text of the whole cell so
 				// the full URL is recovered.
 				Node n = cells.item(7).getFirstChild();
 				if (n.getParentNode().getChildNodes().getLength() > 1) {
 					n = n.getParentNode();
 					n.normalize();
 				}
 				a[i][5] = n.getTextContent();
 			}
 			System.out.println(Arrays.deepToString(a));

 		} catch (Exception e) {
 			e.printStackTrace();
 		} finally {
 			// Release the file handle even when parsing fails.
 			if (is != null) {
 				try {
 					is.close();
 				} catch (IOException ignored) {
 					// Nothing useful can be done if closing the input fails.
 				}
 			}
 		}
 	}
 }

 /**
  * Stream filter that replaces every byte below 0x20 which is not a legal
  * XML 1.0 character (anything other than TAB 0x09, LF 0x0A and CR 0x0D)
  * with a space (0x20), so that an XML parser does not reject the input.
  *
  * The filter works on raw bytes, so it is only meaningful for encodings in
  * which bytes below 0x20 always stand for themselves (e.g. ISO-8859-1, UTF-8).
  */
 class ReplacingInputStream extends FilterInputStream {
 	public ReplacingInputStream(InputStream in) {
 		super(in);
 	}

 	/**
 	 * Reads one byte, substituting a space for illegal control bytes.
 	 *
 	 * @return the (possibly replaced) byte, or -1 at end of stream
 	 */
 	@Override
 	public int read() throws IOException {
 		int read = super.read();
 		if (read != -1 && isIllegalControl(read)) {
 			read = 0x20;
 		}
 		return read;
 	}

 	/**
 	 * Bulk read with the same substitution applied to every byte delivered.
 	 * Without this override FilterInputStream forwards bulk reads directly to
 	 * the wrapped stream and the filter is silently bypassed.
 	 *
 	 * @return number of bytes read, or -1 at end of stream
 	 */
 	@Override
 	public int read(byte[] b, int off, int len) throws IOException {
 		int count = super.read(b, off, len);
 		// count is -1 at end of stream, in which case the loop does not run.
 		for (int i = off; i < off + count; i++) {
 			if (isIllegalControl(b[i] & 0xFF)) {
 				b[i] = 0x20;
 			}
 		}
 		return count;
 	}

 	/** True for byte values below 0x20 other than TAB, LF and CR. */
 	private static boolean isIllegalControl(int value) {
 		return value < 0x20 && value != 0x9 && value != 0xA && value != 0xD;
 	}
 }



 /**
  * Entity resolver that redirects one specific system identifier to a local
  * file and leaves every other entity to the parser's default handling.
  */
 class LocalDTDResolver implements EntityResolver {
 	String mySystemIdToIntercept;
 	File myLocalDtdPath;
 	URL localDtdFileAsUrl;

 	/**
 	 * @param systemIdToIntercept system identifier to redirect
 	 * @param localDtdPath local file served in place of that identifier
 	 * @throws MalformedURLException if the file path cannot be turned into a URL
 	 */
 	public LocalDTDResolver( String systemIdToIntercept, File localDtdPath ) throws MalformedURLException {
 		mySystemIdToIntercept = systemIdToIntercept;
 		myLocalDtdPath = localDtdPath;
 		localDtdFileAsUrl = myLocalDtdPath.toURI().toURL();
 	}

 	/**
 	 * @param publicId public identifier of the entity (not used)
 	 * @param systemId system identifier of the entity; may be null
 	 * @return an InputSource pointing at the local file when systemId matches
 	 *         the intercepted identifier, otherwise null so the parser falls
 	 *         back to its default resolution
 	 */
 	@Override
 	public InputSource resolveEntity (String publicId, String systemId) {
 		// Compare from the configured side so a null systemId cannot throw.
 		if (mySystemIdToIntercept != null && mySystemIdToIntercept.equals( systemId )) {
 			return new InputSource( localDtdFileAsUrl.toString() );
 		}
 		return null;
 	}
 }
	/*

	1. The input documents sometimes have characters that are not valid as UTF-8 text. Where/how can I filter/remove them?

	hildenae@sektober:~/Dokumenter/Kildekode/statsparser$ java dom
	com.sun.org.apache.xerces.internal.impl.io.MalformedByteSequenceException: Invalid byte 1 of 1-byte UTF-8 sequence.
	at com.sun.org.apache.xerces.internal.impl.io.UTF8Reader.invalidByte(UTF8Reader.java:684)
	at com.sun.org.apache.xerces.internal.impl.io.UTF8Reader.read(UTF8Reader.java:554)
	at com.sun.org.apache.xerces.internal.impl.XMLEntityScanner.load(XMLEntityScanner.java:1742)
	at com.sun.org.apache.xerces.internal.impl.XMLEntityScanner.skipChar(XMLEntityScanner.java:1416)
	at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl$FragmentContentDriver.next(XMLDocumentFragmentScannerImpl.java:2792)
	at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl.next(XMLDocumentScannerImpl.java:648)
	at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl.scanDocument(XMLDocumentFragmentScannerImpl.java:511)
	at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(XML11Configuration.java:808)
	at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(XML11Configuration.java:737)
	at com.sun.org.apache.xerces.internal.parsers.XMLParser.parse(XMLParser.java:119)
	at com.sun.org.apache.xerces.internal.parsers.DOMParser.parse(DOMParser.java:235)
	at com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderImpl.parse(DocumentBuilderImpl.java:284)
	at javax.xml.parsers.DocumentBuilder.parse(DocumentBuilder.java:124)
	at dom.main(dom.java:65)



	*/
	import java.io.IOException;
	import java.io.InputStream;
	import java.util.Enumeration;
	import java.util.Hashtable;
	import java.io.*;
	import java.io.FilterInputStream;
	import java.util.Arrays;
	import java.net.*;
	import javax.xml.parsers.*;
	import org.w3c.dom.*;
	import org.xml.sax.*;

	/**
	 * Command-line scraper: parses the XHTML page {@code week.html} with a DOM
	 * parser and prints six text columns taken from the ninth {@code <table>}.
	 */
	public class dom {
		/**
		 * Reads {@code week.html} from the working directory, parses it as
		 * ISO-8859-1 XML and prints the extracted cells with
		 * {@link Arrays#deepToString(Object[])}. Any failure is reported by
		 * printing the stack trace.
		 *
		 * @param argv ignored
		 */
		public static void main(String argv[]) {
			InputStream is = null;
			try {
				is = new BufferedInputStream(new FileInputStream("week.html"));

				// The encoding only takes effect when the InputSource itself is
				// handed to the parser. Passing just its byte stream discards the
				// setting, the parser then auto-detects the encoding and rejects
				// bytes that are not valid UTF-8.
				InputSource inputSource = new InputSource(new ReplacingInputStream(is));
				inputSource.setEncoding("iso-8859-1");

				DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
				DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();

				LocalDTDResolver localDTDResolver = new LocalDTDResolver(
					"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
					new File("xhtml-lat1.ent"));
				dBuilder.setEntityResolver(localDTDResolver);

				Document doc = dBuilder.parse(inputSource);

				doc.getDocumentElement().normalize();
				NodeList tableList = doc.getElementsByTagName("table");

				// Only the first user row is extracted for now.
				int numUsers = 1;

				String[][] a = new String[numUsers][6];
				Node nNode = tableList.item(8);
				if (nNode == null) {
					throw new IllegalStateException("week.html contains " + tableList.getLength()
						+ " <table> elements; expected at least 9");
				}
				NodeList rows = nNode.getChildNodes();
				for (int i = 0; i < numUsers; i++) {
					// Child (i+1)*2 of the table node is used as row i; presumably the
					// children in between are whitespace text nodes - confirm against the page.
					NodeList cells = rows.item((i + 1) * 2).getChildNodes();

					a[i][0] = cells.item(2).getFirstChild().getFirstChild().getNodeValue();
					a[i][1] = cells.item(3).getFirstChild().getNodeValue();
					a[i][2] = cells.item(4).getFirstChild().getNodeValue();
					a[i][3] = cells.item(5).getFirstChild().getNodeValue();
					a[i][4] = cells.item(6).getFirstChild().getNodeValue();

					// This cell is text in the HTML that may be split over several
					// child nodes; when it is, take the text of the whole cell so
					// the full URL is recovered.
					Node n = cells.item(7).getFirstChild();
					if (n.getParentNode().getChildNodes().getLength() > 1) {
						n = n.getParentNode();
						n.normalize();
					}
					a[i][5] = n.getTextContent();
				}
				System.out.println(Arrays.deepToString(a));

			} catch (Exception e) {
				e.printStackTrace();
			} finally {
				// Release the file handle even when parsing fails.
				if (is != null) {
					try {
						is.close();
					} catch (IOException ignored) {
						// Nothing useful can be done if closing the input fails.
					}
				}
			}
		}
	}

	/**
	 * Stream filter that replaces every byte below 0x20 which is not a legal
	 * XML 1.0 character (anything other than TAB 0x09, LF 0x0A and CR 0x0D)
	 * with a space (0x20), so that an XML parser does not reject the input.
	 *
	 * The filter works on raw bytes, so it is only meaningful for encodings in
	 * which bytes below 0x20 always stand for themselves (e.g. ISO-8859-1, UTF-8).
	 */
	class ReplacingInputStream extends FilterInputStream {
		public ReplacingInputStream(InputStream in) {
			super(in);
		}

		/**
		 * Reads one byte, substituting a space for illegal control bytes.
		 *
		 * @return the (possibly replaced) byte, or -1 at end of stream
		 */
		@Override
		public int read() throws IOException {
			int read = super.read();
			if (read != -1 && isIllegalControl(read)) {
				read = 0x20;
			}
			return read;
		}

		/**
		 * Bulk read with the same substitution applied to every byte delivered.
		 * Without this override FilterInputStream forwards bulk reads directly to
		 * the wrapped stream and the filter is silently bypassed.
		 *
		 * @return number of bytes read, or -1 at end of stream
		 */
		@Override
		public int read(byte[] b, int off, int len) throws IOException {
			int count = super.read(b, off, len);
			// count is -1 at end of stream, in which case the loop does not run.
			for (int i = off; i < off + count; i++) {
				if (isIllegalControl(b[i] & 0xFF)) {
					b[i] = 0x20;
				}
			}
			return count;
		}

		/** True for byte values below 0x20 other than TAB, LF and CR. */
		private static boolean isIllegalControl(int value) {
			return value < 0x20 && value != 0x9 && value != 0xA && value != 0xD;
		}
	}



	/**
	 * Entity resolver that redirects one specific system identifier to a local
	 * file and leaves every other entity to the parser's default handling.
	 */
	class LocalDTDResolver implements EntityResolver {
		String mySystemIdToIntercept;
		File myLocalDtdPath;
		URL localDtdFileAsUrl;

		/**
		 * @param systemIdToIntercept system identifier to redirect
		 * @param localDtdPath local file served in place of that identifier
		 * @throws MalformedURLException if the file path cannot be turned into a URL
		 */
		public LocalDTDResolver( String systemIdToIntercept, File localDtdPath ) throws MalformedURLException {
			mySystemIdToIntercept = systemIdToIntercept;
			myLocalDtdPath = localDtdPath;
			localDtdFileAsUrl = myLocalDtdPath.toURI().toURL();
		}

		/**
		 * @param publicId public identifier of the entity (not used)
		 * @param systemId system identifier of the entity; may be null
		 * @return an InputSource pointing at the local file when systemId matches
		 *         the intercepted identifier, otherwise null so the parser falls
		 *         back to its default resolution
		 */
		@Override
		public InputSource resolveEntity (String publicId, String systemId) {
			// Compare from the configured side so a null systemId cannot throw.
			if (mySystemIdToIntercept != null && mySystemIdToIntercept.equals( systemId )) {
				return new InputSource( localDtdFileAsUrl.toString() );
			}
			return null;
		}
	}