Created
October 1, 2011 18:58
-
-
Save Kagee/1256495 to your computer and use it in GitHub Desktop.
How to filter out invalid UTF-8 characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
The input documents sometimes have characters that are not valid as UTF-8 text. Where/how can I filter/remove them?
[Fatal Error] week.html:191:320: An invalid XML character (Unicode: 0x19) was found in the element content of the document.
org.xml.sax.SAXParseException: An invalid XML character (Unicode: 0x19) was found in the element content of the document.
at com.sun.org.apache.xerces.internal.parsers.DOMParser.parse(DOMParser.java:249)
at com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderImpl.parse(DocumentBuilderImpl.java:284)
at dom.main(dom.java:22)
*/
import java.io.*; | |
import java.util.Arrays; | |
import java.net.*; | |
import javax.xml.parsers.*; | |
import org.w3c.dom.*; | |
import org.xml.sax.*; | |
public class dom { | |
public static void main(String argv[]) { | |
try { | |
InputSource inputSource = new InputSource("week.html"); | |
inputSource.setEncoding("iso-8859-1"); | |
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); | |
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); | |
LocalDTDResolver localDTDResolver = | |
new LocalDTDResolver("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd", | |
new File("xhtml-lat1.ent")); | |
dBuilder.setEntityResolver(localDTDResolver); | |
Document doc = dBuilder.parse(inputSource); | |
doc.getDocumentElement().normalize(); | |
NodeList tableList = doc.getElementsByTagName("table"); | |
int numUsers = 30; | |
numUsers = 1; | |
String[][] a = new String[numUsers][6]; | |
Node nNode = tableList.item(8); | |
for(int i = 0; i < numUsers; i++) { | |
a[i][0] = nNode.getChildNodes().item((i+1)*2).getChildNodes().item(2).getFirstChild().getFirstChild().getNodeValue(); | |
a[i][1] = nNode.getChildNodes().item((i+1)*2).getChildNodes().item(3).getFirstChild().getNodeValue(); | |
a[i][2] = nNode.getChildNodes().item((i+1)*2).getChildNodes().item(4).getFirstChild().getNodeValue(); | |
a[i][3] = nNode.getChildNodes().item((i+1)*2).getChildNodes().item(5).getFirstChild().getNodeValue(); | |
a[i][4] = nNode.getChildNodes().item((i+1)*2).getChildNodes().item(6).getFirstChild().getNodeValue(); | |
//This is a text int the html. Do some more work to get full urls | |
Node n = nNode.getChildNodes().item((i+1)*2).getChildNodes().item(7).getFirstChild(); | |
if(n.getParentNode().getChildNodes().getLength() > 1) { | |
n = n.getParentNode(); | |
n.normalize(); | |
a[i][5] = n.getTextContent(); | |
} else { | |
a[i][5] = n.getTextContent(); | |
} | |
} | |
System.out.println(Arrays.deepToString(a)); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
} | |
/**
 * Entity resolver that answers requests for one specific system identifier
 * with a local file and leaves every other entity to the parser's default
 * resolution.
 */
class LocalDTDResolver implements EntityResolver {
    /** System identifier that is redirected to the local file. */
    String mySystemIdToIntercept;
    /** Local file served in place of the intercepted entity. */
    File myLocalDtdPath;
    /** URL form of {@link #myLocalDtdPath}, computed once in the constructor. */
    URL localDtdFileAsUrl;

    /**
     * Creates a resolver.
     *
     * @param systemIdToIntercept system identifier to redirect
     * @param localDtdPath        local file to serve for that identifier
     * @throws MalformedURLException if the file path cannot be converted to a URL
     */
    public LocalDTDResolver( String systemIdToIntercept, File localDtdPath ) throws MalformedURLException {
        mySystemIdToIntercept = systemIdToIntercept;
        myLocalDtdPath = localDtdPath;
        localDtdFileAsUrl = myLocalDtdPath.toURI().toURL();
    }

    /**
     * Resolves an external entity.
     *
     * @param publicId public identifier of the entity; not used
     * @param systemId system identifier of the entity; may be {@code null}
     * @return an input source pointing at the local file when {@code systemId}
     *         equals the intercepted identifier, otherwise {@code null} so the
     *         parser falls back to its default behaviour
     */
    @Override
    public InputSource resolveEntity (String publicId, String systemId) {
        // Guard against a null system ID instead of failing with a NullPointerException.
        if (systemId != null && systemId.equals( mySystemIdToIntercept )) {
            return new InputSource( localDtdFileAsUrl.toString() );
        }
        // Not the intercepted entity: let the parser use its default resolution.
        return null;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment