Created
November 10, 2010 05:29
-
-
Save spullara/670401 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package bagcheck.scraper; | |
| import com.google.inject.Inject; | |
| import com.google.inject.Singleton; | |
| import org.w3c.dom.Document; | |
| import org.w3c.tidy.Tidy; | |
| import org.w3c.tidy.TidyUtils; | |
| import org.xml.sax.InputSource; | |
| import org.xml.sax.SAXException; | |
| import javax.xml.parsers.DocumentBuilder; | |
| import javax.xml.parsers.DocumentBuilderFactory; | |
| import javax.xml.parsers.ParserConfigurationException; | |
| import java.io.IOException; | |
| import java.io.InputStream; | |
| import java.io.PrintWriter; | |
| import java.io.StringReader; | |
| import java.io.StringWriter; | |
| import java.net.HttpURLConnection; | |
| import java.net.Proxy; | |
| import java.net.URL; | |
| import java.util.zip.GZIPInputStream; | |
| import java.util.zip.Inflater; | |
| import java.util.zip.InflaterInputStream; | |
| /** | |
| * Scrape HTML pages | |
| * <p/> | |
| * User: sam | |
| * Date: Jun 22, 2010 | |
| * Time: 12:55:38 PM | |
| */ | |
| @Singleton | |
| public class Scrape { | |
| @Inject | |
| Proxy proxy; | |
| public static final DocumentBuilderFactory DBF = DocumentBuilderFactory.newInstance(); | |
| static { | |
| DBF.setNamespaceAware(false); | |
| DBF.setValidating(false); | |
| try { | |
| DBF.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); | |
| } catch (ParserConfigurationException e) { | |
| e.printStackTrace(); | |
| } | |
| } | |
| public Document scrape(String u) throws IOException, ParserConfigurationException { | |
| URL url = new URL(u); | |
| HttpURLConnection urlc; | |
| if (proxy != null) { | |
| urlc = (HttpURLConnection) url.openConnection(proxy); | |
| } else { | |
| urlc = (HttpURLConnection) url.openConnection(); | |
| } | |
| urlc.addRequestProperty("User-Agent", "Mozilla/5.0 (en-us; BagCheck) AppleWebKit (KHTML, like Gecko) Version/5.0 Safari"); | |
| urlc.addRequestProperty("Accept-Encoding", "gzip, deflate"); | |
| urlc.setInstanceFollowRedirects(true); | |
| String contentType = urlc.getContentType(); | |
| int index = contentType == null ? -1 : contentType.indexOf("charset="); | |
| String charSet; | |
| if (index == -1 || !TidyUtils.isCharEncodingSupported(charSet = contentType.substring(index + 8).trim())) { | |
| charSet = "utf-8"; | |
| } | |
| Tidy tidy = new Tidy(); | |
| tidy.setErrout(new PrintWriter(System.out)); | |
| tidy.setInputEncoding(charSet); | |
| tidy.setQuiet(true); | |
| tidy.setShowWarnings(false); | |
| tidy.setXmlOut(true); | |
| tidy.setForceOutput(true); | |
| tidy.setQuoteMarks(true); | |
| tidy.setXmlSpace(true); | |
| tidy.setWord2000(true); | |
| tidy.setDropEmptyParas(true); | |
| tidy.setDropProprietaryAttributes(true); | |
| tidy.setEncloseBlockText(true); | |
| tidy.setEncloseText(true); | |
| tidy.setEscapeCdata(true); | |
| tidy.setFixBackslash(true); | |
| tidy.setLogicalEmphasis(true); | |
| tidy.setLowerLiterals(true); | |
| tidy.setNumEntities(true); | |
| tidy.setDocType("omit"); | |
| StringWriter sw = new StringWriter(); | |
| String encoding = urlc.getContentEncoding(); | |
| InputStream is = urlc.getInputStream(); | |
| if (encoding != null && encoding.equalsIgnoreCase("gzip")) { | |
| is = new GZIPInputStream(is); | |
| } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) { | |
| is = new InflaterInputStream(is, new Inflater(true)); | |
| } | |
| tidy.parse(is, sw); | |
| String html = sw.toString(); | |
| html = html.replace("<?xml?>", ""); | |
| DocumentBuilder builder = DBF.newDocumentBuilder(); | |
| try { | |
| return builder.parse(new InputSource(new StringReader(html))); | |
| } catch (SAXException e) { | |
| throw new IOException("Failed to parse html", e); | |
| } finally { | |
| if (is != null) { | |
| is.close(); | |
| } | |
| } | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment