Created
October 2, 2011 21:22
-
-
Save Kagee/1257983 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
1. The input documents sometimes contain bytes that are not valid UTF-8 text. Where/how can I filter or remove them?
hildenae@sektober:~/Dokumenter/Kildekode/statsparser$ java dom | |
com.sun.org.apache.xerces.internal.impl.io.MalformedByteSequenceException: Invalid byte 1 of 1-byte UTF-8 sequence. | |
at com.sun.org.apache.xerces.internal.impl.io.UTF8Reader.invalidByte(UTF8Reader.java:684) | |
at com.sun.org.apache.xerces.internal.impl.io.UTF8Reader.read(UTF8Reader.java:554) | |
at com.sun.org.apache.xerces.internal.impl.XMLEntityScanner.load(XMLEntityScanner.java:1742) | |
at com.sun.org.apache.xerces.internal.impl.XMLEntityScanner.skipChar(XMLEntityScanner.java:1416) | |
at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl$FragmentContentDriver.next(XMLDocumentFragmentScannerImpl.java:2792) | |
at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl.next(XMLDocumentScannerImpl.java:648) | |
at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl.scanDocument(XMLDocumentFragmentScannerImpl.java:511) | |
at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(XML11Configuration.java:808) | |
at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(XML11Configuration.java:737) | |
at com.sun.org.apache.xerces.internal.parsers.XMLParser.parse(XMLParser.java:119) | |
at com.sun.org.apache.xerces.internal.parsers.DOMParser.parse(DOMParser.java:235) | |
at com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderImpl.parse(DocumentBuilderImpl.java:284) | |
at javax.xml.parsers.DocumentBuilder.parse(DocumentBuilder.java:124) | |
at dom.main(dom.java:65) | |
*/ | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.util.Enumeration; | |
import java.util.Hashtable; | |
import java.io.*; | |
import java.io.FilterInputStream; | |
import java.util.Arrays; | |
import java.net.*; | |
import javax.xml.parsers.*; | |
import org.w3c.dom.*; | |
import org.xml.sax.*; | |
public class dom { | |
public static void main(String argv[]) { | |
try { | |
InputStream is = new BufferedInputStream( | |
new FileInputStream("week.html")); | |
//InputSource inputSource = new InputSource("week.html"); | |
InputSource inputSource = new InputSource(is); | |
inputSource.setEncoding("iso-8859-1"); | |
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); | |
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); | |
LocalDTDResolver localDTDResolver = | |
new LocalDTDResolver("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd", new File("xhtml-lat1.ent")); | |
dBuilder.setEntityResolver(localDTDResolver); | |
Document doc = dBuilder.parse(new ReplacingInputStream( inputSource.getByteStream() )); | |
doc.getDocumentElement().normalize(); | |
NodeList tableList = doc.getElementsByTagName("table"); | |
int numUsers = 30; | |
numUsers = 1; | |
String[][] a = new String[numUsers][6]; | |
Node nNode = tableList.item(8); | |
for(int i = 0; i < numUsers; i++) { | |
a[i][0] = nNode.getChildNodes().item((i+1)*2).getChildNodes().item(2).getFirstChild().getFirstChild().getNodeValue(); | |
a[i][1] = nNode.getChildNodes().item((i+1)*2).getChildNodes().item(3).getFirstChild().getNodeValue(); | |
a[i][2] = nNode.getChildNodes().item((i+1)*2).getChildNodes().item(4).getFirstChild().getNodeValue(); | |
a[i][3] = nNode.getChildNodes().item((i+1)*2).getChildNodes().item(5).getFirstChild().getNodeValue(); | |
a[i][4] = nNode.getChildNodes().item((i+1)*2).getChildNodes().item(6).getFirstChild().getNodeValue(); | |
//This is a text int the html. Do some more work to get full urls | |
Node n = nNode.getChildNodes().item((i+1)*2).getChildNodes().item(7).getFirstChild(); | |
if(n.getParentNode().getChildNodes().getLength() > 1) { | |
n = n.getParentNode(); | |
n.normalize(); | |
a[i][5] = n.getTextContent(); | |
} else { | |
a[i][5] = n.getTextContent(); | |
} | |
} | |
System.out.println(Arrays.deepToString(a)); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
} | |
/**
 * Input stream filter that replaces control bytes which are not legal XML 1.0
 * characters with a space (0x20), so that a stray control byte in the input
 * does not abort parsing.
 *
 * <p>Bytes below 0x20 are replaced unless they are tab (0x09), line feed
 * (0x0A) or carriage return (0x0D). All other bytes pass through unchanged.
 */
class ReplacingInputStream extends FilterInputStream {

    /**
     * Creates a filter over the given stream.
     *
     * @param in the stream whose bytes are to be sanitised
     */
    public ReplacingInputStream(InputStream in) {
        super(in);
    }

    /**
     * Reads one byte and sanitises it.
     *
     * @return the (possibly replaced) byte as a value in 0..255, or -1 at end of stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read() throws IOException {
        return sanitize(super.read());
    }

    /**
     * Reads up to {@code len} bytes into {@code b} and sanitises each byte read.
     *
     * <p>This override is required: {@link FilterInputStream} forwards bulk
     * reads directly to the wrapped stream, so without it any consumer that
     * reads in blocks would receive unfiltered data.
     *
     * @param b   destination buffer
     * @param off offset in {@code b} at which to start storing bytes
     * @param len maximum number of bytes to read
     * @return the number of bytes read, or -1 at end of stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        int count = super.read(b, off, len);
        // count is -1 at end of stream, in which case the loop does not run.
        for (int i = off; i < off + count; i++) {
            b[i] = (byte) sanitize(b[i] & 0xFF);
        }
        return count;
    }

    /**
     * Maps a disallowed control byte to a space and leaves everything else,
     * including the end-of-stream marker -1, untouched.
     */
    private static int sanitize(int value) {
        boolean isControl = value >= 0 && value < 0x20;
        boolean isAllowedWhitespace = value == 0x9 || value == 0xA || value == 0xD;
        return (isControl && !isAllowedWhitespace) ? 0x20 : value;
    }
}
/**
 * Entity resolver that redirects requests for one specific system identifier
 * to a file on the local disk, and defers to the parser's default handling for
 * every other entity.
 */
class LocalDTDResolver implements EntityResolver {
    /** System identifier that triggers the redirect. */
    final String mySystemIdToIntercept;
    /** Local file served in place of the intercepted entity. */
    final File myLocalDtdPath;
    /** {@link #myLocalDtdPath} converted to a URL once, at construction time. */
    final URL localDtdFileAsUrl;

    /**
     * Creates a resolver for a single system identifier.
     *
     * @param systemIdToIntercept system identifier to redirect
     * @param localDtdPath        local file to serve instead
     * @throws MalformedURLException if the file's URI cannot be converted to a URL
     */
    public LocalDTDResolver( String systemIdToIntercept, File localDtdPath ) throws MalformedURLException {
        mySystemIdToIntercept = systemIdToIntercept;
        myLocalDtdPath = localDtdPath;
        localDtdFileAsUrl = myLocalDtdPath.toURI().toURL();
    }

    /**
     * Resolves the entity to the local file when its system identifier matches
     * the intercepted one.
     *
     * @param publicId public identifier of the entity; not used
     * @param systemId system identifier of the entity; may be {@code null}
     * @return an input source pointing at the local file, or {@code null} to
     *         let the parser resolve the entity itself
     */
    @Override
    public InputSource resolveEntity (String publicId, String systemId) {
        // Compare from the stored id so that a null systemId simply does not
        // match instead of raising a NullPointerException.
        if (mySystemIdToIntercept != null && mySystemIdToIntercept.equals( systemId )) {
            return new InputSource( localDtdFileAsUrl.toString() );
        }
        // Returning null asks the parser to fall back to its default resolution.
        return null;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment