Created
March 9, 2015 11:34
-
-
Save revox/f6eefce45209e1d76c45 to your computer and use it in GitHub Desktop.
A simple jsoup routine that extracts all href links from a page as absolute URLs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import org.jsoup.Jsoup; | |
| import org.jsoup.nodes.Document; | |
| import org.jsoup.nodes.Element; | |
| import org.jsoup.select.Elements; | |
| import java.io.IOException; | |
| public class ExtractLinks | |
| { | |
| public static void main(String[] args) throws IOException | |
| { | |
| if (args.length > 0) { | |
| String url = args[0]; | |
| Document doc = Jsoup.connect(url).get(); | |
| Elements links = doc.select("a[href]"); | |
| for (Element link : links) System.out.println(link.attr("abs:href")); | |
| } else { | |
| System.out.println("full URL required"); | |
| } | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment