Skip to content

Instantly share code, notes, and snippets.

@spullara
Created November 10, 2010 05:29
Show Gist options
  • Select an option

  • Save spullara/670401 to your computer and use it in GitHub Desktop.

Select an option

Save spullara/670401 to your computer and use it in GitHub Desktop.
package bagcheck.scraper;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import org.w3c.dom.Document;
import org.w3c.tidy.Tidy;
import org.w3c.tidy.TidyUtils;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.Proxy;
import java.net.URL;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
/**
 * Scrape HTML pages: fetches a URL over HTTP (optionally through the injected
 * {@link Proxy}), runs the response through JTidy to obtain XML output, and
 * parses that output into a DOM {@link Document}.
 * <p/>
 * User: sam
 * Date: Jun 22, 2010
 * Time: 12:55:38 PM
 */
@Singleton
public class Scrape {
    /** Input encoding used when the response declares no charset, or one JTidy does not support. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /** Name of the Content-Type parameter carrying the character set, including the '='. */
    private static final String CHARSET_PARAM = "charset=";

    /** Proxy used for outgoing connections; when null a direct connection is opened. */
    @Inject
    Proxy proxy;

    /**
     * Shared factory for the DOM builders used by {@link #scrape(String)}.
     * NOTE(review): JAXP does not promise that a DocumentBuilderFactory is safe for
     * concurrent use; confirm whether this singleton is called from several threads.
     */
    public static final DocumentBuilderFactory DBF = DocumentBuilderFactory.newInstance();

    static {
        DBF.setNamespaceAware(false);
        DBF.setValidating(false);
        try {
            DBF.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        } catch (ParserConfigurationException e) {
            // Best effort: a parser that does not know this feature is still usable.
            e.printStackTrace();
        }
    }

    /**
     * Downloads the page at the given URL and returns it as a DOM document.
     *
     * @param u the URL to fetch; it must resolve to an HTTP(S) connection
     * @return the tidied page parsed into a DOM document
     * @throws IOException if the URL is malformed, the transfer fails, or the
     *                     tidied markup cannot be parsed as XML
     * @throws ParserConfigurationException if a DOM builder cannot be created
     */
    public Document scrape(String u) throws IOException, ParserConfigurationException {
        URL url = new URL(u);
        HttpURLConnection urlc;
        if (proxy != null) {
            urlc = (HttpURLConnection) url.openConnection(proxy);
        } else {
            urlc = (HttpURLConnection) url.openConnection();
        }
        urlc.addRequestProperty("User-Agent", "Mozilla/5.0 (en-us; BagCheck) AppleWebKit (KHTML, like Gecko) Version/5.0 Safari");
        urlc.addRequestProperty("Accept-Encoding", "gzip, deflate");
        urlc.setInstanceFollowRedirects(true);

        Tidy tidy = newTidy(resolveCharset(urlc.getContentType()));
        String html = tidyResponse(urlc, tidy);
        // Remove the literal "<?xml?>" (presumably an empty declaration Tidy can emit)
        // before handing the markup to the XML parser.
        html = html.replace("<?xml?>", "");

        DocumentBuilder builder = DBF.newDocumentBuilder();
        try {
            return builder.parse(new InputSource(new StringReader(html)));
        } catch (SAXException e) {
            throw new IOException("Failed to parse html", e);
        }
    }

    /**
     * Reads the response body, undoing gzip/deflate content encoding, and returns
     * JTidy's output. The stream and any explicitly created Inflater are released
     * on every path, including failures while wrapping or parsing.
     */
    private static String tidyResponse(HttpURLConnection urlc, Tidy tidy) throws IOException {
        StringWriter sw = new StringWriter();
        String encoding = urlc.getContentEncoding();
        InputStream is = urlc.getInputStream();
        Inflater inflater = null;
        try {
            if ("gzip".equalsIgnoreCase(encoding)) {
                is = new GZIPInputStream(is);
            } else if ("deflate".equalsIgnoreCase(encoding)) {
                // Raw (headerless) deflate, as in the original implementation.
                inflater = new Inflater(true);
                is = new InflaterInputStream(is, inflater);
            }
            tidy.parse(is, sw);
        } finally {
            try {
                is.close();
            } finally {
                // InflaterInputStream.close() does not end an inflater supplied by the caller.
                if (inflater != null) {
                    inflater.end();
                }
            }
        }
        return sw.toString();
    }

    /**
     * Extracts the charset parameter from a Content-Type header value.
     * Handles a missing header, parameters following the charset, and quoted
     * values; returns {@link #DEFAULT_CHARSET} when nothing usable is found or
     * JTidy does not support the declared encoding.
     */
    private static String resolveCharset(String contentType) {
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        int index = indexOfIgnoreCase(contentType, CHARSET_PARAM);
        if (index == -1) {
            return DEFAULT_CHARSET;
        }
        String value = contentType.substring(index + CHARSET_PARAM.length());
        int end = value.indexOf(';');
        if (end != -1) {
            value = value.substring(0, end);
        }
        value = value.trim();
        if (value.length() >= 2
                && (value.charAt(0) == '"' || value.charAt(0) == '\'')
                && value.charAt(value.length() - 1) == value.charAt(0)) {
            value = value.substring(1, value.length() - 1).trim();
        }
        if (value.length() == 0 || !TidyUtils.isCharEncodingSupported(value)) {
            return DEFAULT_CHARSET;
        }
        return value;
    }

    /** Case-insensitive {@code indexOf}; returns -1 when {@code needle} does not occur. */
    private static int indexOfIgnoreCase(String haystack, String needle) {
        int last = haystack.length() - needle.length();
        for (int i = 0; i <= last; i++) {
            if (haystack.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }

    /** Creates a JTidy instance configured for quiet XML output without a doctype. */
    private static Tidy newTidy(String charSet) {
        Tidy tidy = new Tidy();
        tidy.setErrout(new PrintWriter(System.out));
        tidy.setInputEncoding(charSet);
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);
        tidy.setXmlOut(true);
        tidy.setForceOutput(true);
        tidy.setQuoteMarks(true);
        tidy.setXmlSpace(true);
        tidy.setWord2000(true);
        tidy.setDropEmptyParas(true);
        tidy.setDropProprietaryAttributes(true);
        tidy.setEncloseBlockText(true);
        tidy.setEncloseText(true);
        tidy.setEscapeCdata(true);
        tidy.setFixBackslash(true);
        tidy.setLogicalEmphasis(true);
        tidy.setLowerLiterals(true);
        tidy.setNumEntities(true);
        tidy.setDocType("omit");
        return tidy;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment