Created
January 12, 2021 15:26
-
-
Save saasindustries/679b70bea866feac3cafe79f24fbef2b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.IOException; | |
import java.net.MalformedURLException; | |
import com.gargoylesoftware.htmlunit.BrowserVersion; | |
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; | |
import com.gargoylesoftware.htmlunit.WebClient; | |
import com.gargoylesoftware.htmlunit.html.DomNode; | |
import com.gargoylesoftware.htmlunit.html.DomNodeList; | |
import com.gargoylesoftware.htmlunit.html.HtmlPage; | |
public class myhtmlunit { | |
public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException { | |
//initialize a headless browser | |
WebClient webClient = new WebClient(BrowserVersion.CHROME); | |
//configuring options | |
webClient.getOptions().setUseInsecureSSL(true); | |
webClient.getOptions().setCssEnabled(false); | |
//webClient.getOptions().setJavaScriptEnabled(false); | |
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); | |
webClient.getOptions().setThrowExceptionOnScriptError(false); | |
//fetching the web page | |
HtmlPage page = webClient.getPage("https://www.reddit.com/r/scraping/"); | |
//selecting all headings | |
DomNodeList<DomNode> headings = page.querySelectorAll("h3._eYtD2XCVieq6emjKBH3m"); | |
//iterating and extracting | |
for (DomNode content: headings) { | |
System.out.println(content.asText()); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment