Last active
May 20, 2021 04:58
-
-
Save JonasCz/a3b81def26ecc047ceb5 to your computer and use it in GitHub Desktop.
Email and link / URL extraction using Jsoup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package jsouptest; | |
import java.io.IOException; | |
import java.util.HashSet; | |
import java.util.Set; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
public class JSoupTest { | |
public static void main(String[] args) throws IOException { | |
Document doc = Jsoup.connect("http://stackoverflow.com/questions/15893655/magento-ecomdev-phpunit-customer-fixtures-are-not-being-loaded/16668990#16668990").get(); | |
Pattern p = Pattern.compile("[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+"); | |
Matcher matcher = p.matcher(doc.text()); | |
Set<String> emails = new HashSet<String>(); | |
while (matcher.find()) { | |
emails.add(matcher.group()); | |
} | |
Set<String> links = new HashSet<String>(); | |
Elements elements = doc.select("a[href]"); | |
for (Element e : elements) { | |
links.add(e.attr("href")); | |
} | |
System.out.println(emails); | |
System.out.println(links); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How do you handle stray leading and trailing characters in the matches? Using this code, I picked up the following "email address": [email protected]