Skip to content

Instantly share code, notes, and snippets.

@hkarakose
Last active December 25, 2015 05:09
Show Gist options
  • Save hkarakose/6922357 to your computer and use it in GitHub Desktop.
Save hkarakose/6922357 to your computer and use it in GitHub Desktop.
Read and parse a web page
package reader;
import bilgiturk.commons.ParserFactory;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.htmlparser.Parser;
import org.htmlparser.filters.CssSelectorNodeFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
/**
 * Command-line utility that downloads a fixed web page, extracts the nodes
 * matching a CSS selector and writes their HTML to {@code output.html} in the
 * current working directory.
 */
public class ReadAndParseApp {
    /** Page that is downloaded and parsed. */
    private static final String PAGE_URL =
            "http://www.javacodegeeks.com/2013/10/how-to-network-less-for-geeks.html";

    /** CSS selector of the nodes that are kept in the output. */
    private static final String CONTENT_SELECTOR = "div[class='entry entry-content']";

    /** File the extracted HTML is written to. */
    private static final String OUTPUT_FILE = "output.html";

    /** Charset used both to decode the downloaded page and to encode the output file. */
    private static final String ENCODING = "UTF-8";

    /** Value passed to the JDK network timeout properties (milliseconds per the JDK networking docs). */
    private static final String TIMEOUT_MILLIS = "10000";

    /**
     * Fetches the page, filters it with the CSS selector and saves the result.
     *
     * @param args ignored
     * @throws IOException     if the page cannot be read or the output file cannot be written
     * @throws ParserException if the HTML parser fails on the downloaded document
     */
    public static void main(String[] args) throws IOException, ParserException {
        // Set timeouts so a stalled server cannot block the program indefinitely.
        System.setProperty("sun.net.client.defaultReadTimeout", TIMEOUT_MILLIS);
        System.setProperty("sun.net.client.defaultConnectTimeout", TIMEOUT_MILLIS);

        URL url = new URL(PAGE_URL);

        // openStream() is the typed equivalent of casting getContent(), and the
        // try-with-resources block guarantees the stream is closed even on failure.
        String html;
        try (InputStream htmlInputStream = url.openStream()) {
            // Decode explicitly instead of relying on the platform default charset,
            // so the text round-trips cleanly into the UTF-8 output file.
            // NOTE(review): assumes the page is served as UTF-8 - confirm if the URL changes.
            html = IOUtils.toString(htmlInputStream, ENCODING);
        }

        Parser parser = ParserFactory.newParser();
        parser.setInputHTML(html);
        NodeList parsed = parser.parse(new CssSelectorNodeFilter(CONTENT_SELECTOR));

        FileUtils.writeStringToFile(new File(OUTPUT_FILE), parsed.toHtml(), ENCODING);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment