Skip to content

Instantly share code, notes, and snippets.

@revox
Created March 9, 2015 14:03
Show Gist options
  • Save revox/a1e33759668bf3aa3f95 to your computer and use it in GitHub Desktop.
Save revox/a1e33759668bf3aa3f95 to your computer and use it in GitHub Desktop.
JSoup crawler variation
import java.util.ArrayList;
import java.util.HashSet;
import java.util.HashSet.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
 * Small jsoup-based crawler: starting from one page it follows links whose URL
 * contains a given substring, then reports every discovered URL that answers
 * with HTTP 404 (or is not a usable URL at all).
 *
 * <p>Usage: {@code java Brokens <start-url> <url-substring>}
 */
public class Brokens
{
    /** Maximum number of distinct URLs {@link #main} lets the crawl examine. */
    private static final int MAX_PAGES = 100;

    /**
     * Tells whether a URL should be reported as a broken link.
     *
     * @param url absolute URL to probe
     * @return {@code true} if the server answers 404 or the URL is malformed;
     *         {@code false} for any other status and for network failures
     */
    static boolean broken(String url)
    {
        try {
            // Ask for the raw response instead of a parsed document: with HTTP
            // errors and content-type checks disabled, a 404 or a non-HTML
            // resource yields a normal response whose status can be inspected,
            // rather than an exception whose message would have to be parsed.
            int status = Jsoup.connect(url)
                    .ignoreHttpErrors(true)
                    .ignoreContentType(true)
                    .execute()
                    .statusCode();
            return status == 404;
        }
        catch (java.net.MalformedURLException e) {
            return true;
        }
        catch (IllegalArgumentException e) {
            // jsoup rejects null, empty or unparseable URLs with this exception;
            // such a link can never be followed, so it counts as broken.
            return true;
        }
        catch (IOException e) {
            // Timeout, unknown host, refused connection...: possibly transient,
            // so only a definite 404 is reported as broken.
            return false;
        }
        catch (Exception e) {
            // For any other error assume not broken - this is a guess.
            return false;
        }
    }

    /**
     * Fetches a page and collects the absolute http/https targets of its
     * {@code <a href>} elements.
     *
     * @param url page to download
     * @return the distinct link targets; empty when the page cannot be fetched
     *         or parsed (the failure is logged to stderr, never thrown)
     */
    static HashSet<String> links (String url)
    {
        HashSet<String> found = new HashSet<String>();
        try {
            Document doc = Jsoup.connect(url).get();
            Elements anchors = doc.select("a[href]");
            for (Element anchor : anchors) {
                // "abs:href" is the empty string when the href cannot be made
                // absolute; those and non-HTTP schemes (mailto:, ftp:, ...) can
                // be neither crawled nor checked, so they are left out.
                String target = anchor.attr("abs:href");
                if (isHttpUrl(target)) {
                    found.add(target);
                }
            }
        }
        catch (Exception e) {
            // Best effort: a page that cannot be read simply contributes no links.
            System.err.println("Could not read links from " + url + ": " + e);
        }
        return found;
    }

    /** Returns true when the text starts with "http://" or "https://", ignoring case. */
    private static boolean isHttpUrl(String url)
    {
        return url.regionMatches(true, 0, "http://", 0, 7)
            || url.regionMatches(true, 0, "https://", 0, 8);
    }

    /**
     * Crawls outward from a start page and prints the broken links it finds.
     *
     * <p>Each followed URL is printed as it is crawled; afterwards every URL
     * seen (followed or not) is probed with {@link #broken} and the failures
     * are printed with a {@code "Broken: "} prefix.
     *
     * @param url      start page; its links seed the crawl
     * @param n        maximum number of distinct discovered URLs to examine
     * @param contains only URLs containing this substring are fetched for
     *                 further links; other URLs are still checked for breakage
     */
    static void Spider (String url, int n, String contains)
    {
        HashSet<String> alreadyVisited = new HashSet<String>();
        HashSet<String> toVisit = new HashSet<String>();
        toVisit.addAll(links(url));
        alreadyVisited.add(url);

        int examined = 0;
        while (examined < n && !toVisit.isEmpty())
        {
            // Take one pending URL and drop it from the frontier right away so
            // it is never picked a second time.
            java.util.Iterator<String> pending = toVisit.iterator();
            String next = pending.next();
            pending.remove();

            // add() returns false when the URL was already seen: skip it
            // without counting it against the limit.
            if (!alreadyVisited.add(next)) {
                continue;
            }
            if (next.contains(contains))
            {
                System.out.println(next);
                toVisit.addAll(links(next));
            }
            examined++;
        }

        for (String seen : alreadyVisited) {
            if (broken(seen)) {
                System.out.println("Broken: " + seen);
            }
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the start URL, {@code args[1]} the
     *             substring a URL must contain to be crawled further
     */
    public static void main(String[] args) throws IOException
    {
        if (args.length < 2) {
            System.err.println("Usage: java Brokens <start-url> <url-substring>");
            System.exit(1);
        }
        Spider(args[0], MAX_PAGES, args[1]);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment