spullara · November 10, 2010 05:29
diff --git a/gistfile1.txt b/gistfile1.txt
 package bagcheck.scraper;

 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import org.w3c.dom.Document;
 import org.w3c.tidy.Tidy;
 import org.w3c.tidy.TidyUtils;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;

 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.PrintWriter;
 import java.io.StringReader;
 import java.io.StringWriter;
 import java.net.HttpURLConnection;
 import java.net.Proxy;
 import java.net.URL;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.Inflater;
 import java.util.zip.InflaterInputStream;

 /**
  * Scrapes an HTML page over HTTP and returns it as a DOM {@link Document}.
  * <p/>
  * The page is fetched with {@link HttpURLConnection} (through the injected
  * {@link Proxy} when one is set), converted to XML by JTidy and then parsed
  * with a {@link DocumentBuilder} obtained from {@link #DBF}.
  * <p/>
  * User: sam
  * Date: Jun 22, 2010
  * Time: 12:55:38 PM
  */
 @Singleton
 public class Scrape {

   /** Encoding handed to Tidy when the response names no charset, or one JTidy does not support. */
   private static final String DEFAULT_CHARSET = "utf-8";

   /** Parameter prefix searched for in the Content-Type header. */
   private static final String CHARSET_PARAM = "charset=";

   /** Optional proxy; when {@code null} the connection is opened directly. */
   @Inject
   Proxy proxy;

   /** Shared factory: not namespace aware, non-validating, external DTD loading switched off. */
   public static final DocumentBuilderFactory DBF = DocumentBuilderFactory.newInstance();

   static {
     DBF.setNamespaceAware(false);
     DBF.setValidating(false);
     try {
       DBF.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
     } catch (ParserConfigurationException e) {
       // Best effort: the factory is still used when the feature cannot be set,
       // so report the problem instead of failing class initialisation.
       e.printStackTrace();
     }
   }

   /**
    * Fetches {@code u}, tidies the response into XML and parses it into a DOM document.
    *
    * @param u absolute URL of the page to scrape; it must open as an {@link HttpURLConnection}
    * @return the parsed document
    * @throws IOException if the URL is malformed, the transfer fails, or the tidied markup
    *     still cannot be parsed (the {@link SAXException} is kept as the cause)
    * @throws ParserConfigurationException if {@link #DBF} cannot create a builder
    */
   public Document scrape(String u) throws IOException, ParserConfigurationException {
     HttpURLConnection urlc = openConnection(u);
     Tidy tidy = newTidy(charsetOf(urlc.getContentType()));

     String html = fetchTidied(urlc, tidy);
     html = html.replace("<?xml?>", "");

     DocumentBuilder builder = DBF.newDocumentBuilder();
     try {
       return builder.parse(new InputSource(new StringReader(html)));
     } catch (SAXException e) {
       throw new IOException("Failed to parse html", e);
     }
   }

   /** Opens the connection (via {@link #proxy} when set) and adds the request headers. */
   private HttpURLConnection openConnection(String u) throws IOException {
     URL url = new URL(u);
     HttpURLConnection urlc;
     if (proxy != null) {
       urlc = (HttpURLConnection) url.openConnection(proxy);
     } else {
       urlc = (HttpURLConnection) url.openConnection();
     }
     urlc.addRequestProperty("User-Agent", "Mozilla/5.0 (en-us; BagCheck) AppleWebKit (KHTML, like Gecko) Version/5.0 Safari");
     urlc.addRequestProperty("Accept-Encoding", "gzip, deflate");
     urlc.setInstanceFollowRedirects(true);
     return urlc;
   }

   /**
    * Picks the input encoding for Tidy from a Content-Type header value.
    * <p/>
    * The {@code charset} parameter name is matched case-insensitively, the value ends at the
    * next {@code ;} and surrounding double quotes are removed, so headers such as
    * {@code text/html; charset="UTF-8"; foo=bar} are understood.
    *
    * @param contentType header value, may be {@code null}
    * @return the declared charset when JTidy supports it, otherwise {@link #DEFAULT_CHARSET}
    */
   private static String charsetOf(String contentType) {
     if (contentType == null) {
       return DEFAULT_CHARSET;
     }
     int keyLength = CHARSET_PARAM.length();
     for (int i = 0; i + keyLength <= contentType.length(); i++) {
       if (!contentType.regionMatches(true, i, CHARSET_PARAM, 0, keyLength)) {
         continue;
       }
       String value = contentType.substring(i + keyLength);
       int end = value.indexOf(';');
       if (end != -1) {
         value = value.substring(0, end);
       }
       value = value.trim();
       if (value.length() >= 2 && value.startsWith("\"") && value.endsWith("\"")) {
         value = value.substring(1, value.length() - 1).trim();
       }
       // Only the first charset parameter is considered.
       return TidyUtils.isCharEncodingSupported(value) ? value : DEFAULT_CHARSET;
     }
     return DEFAULT_CHARSET;
   }

   /** Creates a Tidy instance configured for quiet XML output, reading input as {@code charSet}. */
   private static Tidy newTidy(String charSet) {
     Tidy tidy = new Tidy();
     tidy.setErrout(new PrintWriter(System.out));
     tidy.setInputEncoding(charSet);
     tidy.setQuiet(true);
     tidy.setShowWarnings(false);
     tidy.setXmlOut(true);
     tidy.setForceOutput(true);
     tidy.setQuoteMarks(true);
     tidy.setXmlSpace(true);
     tidy.setWord2000(true);
     tidy.setDropEmptyParas(true);
     tidy.setDropProprietaryAttributes(true);
     tidy.setEncloseBlockText(true);
     tidy.setEncloseText(true);
     tidy.setEscapeCdata(true);
     tidy.setFixBackslash(true);
     tidy.setLogicalEmphasis(true);
     tidy.setLowerLiterals(true);
     tidy.setNumEntities(true);
     tidy.setDocType("omit");
     return tidy;
   }

   /**
    * Reads the response body, undoing gzip/deflate content encoding, and runs it through
    * {@code tidy}. The stream is closed on every path, including failures while wrapping
    * or tidying it.
    *
    * @return Tidy's output as a string
    */
   private static String fetchTidied(HttpURLConnection urlc, Tidy tidy) throws IOException {
     StringWriter sw = new StringWriter();
     String encoding = urlc.getContentEncoding();
     InputStream is = urlc.getInputStream();
     Inflater inflater = null;
     try {
       if ("gzip".equalsIgnoreCase(encoding)) {
         is = new GZIPInputStream(is);
       } else if ("deflate".equalsIgnoreCase(encoding)) {
         // nowrap=true: the body is read as raw deflate data without a zlib header.
         inflater = new Inflater(true);
         is = new InflaterInputStream(is, inflater);
       }
       tidy.parse(is, sw);
     } finally {
       try {
         is.close();
       } finally {
         // InflaterInputStream.close() does not end an inflater that was passed in.
         if (inflater != null) {
           inflater.end();
         }
       }
     }
     return sw.toString();
   }
 }
	package bagcheck.scraper;

	import com.google.inject.Inject;
	import com.google.inject.Singleton;
	import org.w3c.dom.Document;
	import org.w3c.tidy.Tidy;
	import org.w3c.tidy.TidyUtils;
	import org.xml.sax.InputSource;
	import org.xml.sax.SAXException;

	import javax.xml.parsers.DocumentBuilder;
	import javax.xml.parsers.DocumentBuilderFactory;
	import javax.xml.parsers.ParserConfigurationException;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.PrintWriter;
	import java.io.StringReader;
	import java.io.StringWriter;
	import java.net.HttpURLConnection;
	import java.net.Proxy;
	import java.net.URL;
	import java.util.zip.GZIPInputStream;
	import java.util.zip.Inflater;
	import java.util.zip.InflaterInputStream;

	/**
	* Scrape HTML pages
	* <p/>
	* User: sam
	* Date: Jun 22, 2010
	* Time: 12:55:38 PM
	*/
	@Singleton
	public class Scrape {

	@Inject
	Proxy proxy;

	public static final DocumentBuilderFactory DBF = DocumentBuilderFactory.newInstance();

	static {
	DBF.setNamespaceAware(false);
	DBF.setValidating(false);
	try {
	DBF.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
	} catch (ParserConfigurationException e) {
	e.printStackTrace();
	}
	}

	public Document scrape(String u) throws IOException, ParserConfigurationException {
	URL url = new URL(u);
	HttpURLConnection urlc;
	if (proxy != null) {
	urlc = (HttpURLConnection) url.openConnection(proxy);
	} else {
	urlc = (HttpURLConnection) url.openConnection();
	}
	urlc.addRequestProperty("User-Agent", "Mozilla/5.0 (en-us; BagCheck) AppleWebKit (KHTML, like Gecko) Version/5.0 Safari");
	urlc.addRequestProperty("Accept-Encoding", "gzip, deflate");
	urlc.setInstanceFollowRedirects(true);

	String contentType = urlc.getContentType();
	int index = contentType == null ? -1 : contentType.indexOf("charset=");
	String charSet;
	if (index == -1 \|\| !TidyUtils.isCharEncodingSupported(charSet = contentType.substring(index + 8).trim())) {
	charSet = "utf-8";
	}

	Tidy tidy = new Tidy();
	tidy.setErrout(new PrintWriter(System.out));
	tidy.setInputEncoding(charSet);
	tidy.setQuiet(true);
	tidy.setShowWarnings(false);
	tidy.setXmlOut(true);
	tidy.setForceOutput(true);
	tidy.setQuoteMarks(true);
	tidy.setXmlSpace(true);
	tidy.setWord2000(true);
	tidy.setDropEmptyParas(true);
	tidy.setDropProprietaryAttributes(true);
	tidy.setEncloseBlockText(true);
	tidy.setEncloseText(true);
	tidy.setEscapeCdata(true);
	tidy.setFixBackslash(true);
	tidy.setLogicalEmphasis(true);
	tidy.setLowerLiterals(true);
	tidy.setNumEntities(true);
	tidy.setDocType("omit");

	StringWriter sw = new StringWriter();
	String encoding = urlc.getContentEncoding();
	InputStream is = urlc.getInputStream();
	if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
	is = new GZIPInputStream(is);
	} else if (encoding != null && encoding.equalsIgnoreCase("deflate")) {
	is = new InflaterInputStream(is, new Inflater(true));
	}
	tidy.parse(is, sw);
	String html = sw.toString();
	html = html.replace("<?xml?>", "");
	DocumentBuilder builder = DBF.newDocumentBuilder();
	try {
	return builder.parse(new InputSource(new StringReader(html)));
	} catch (SAXException e) {
	throw new IOException("Failed to parse html", e);
	} finally {
	if (is != null) {
	is.close();
	}
	}
	}
	}
No results found