@revox
Created March 9, 2015 12:33
Basic crawler: accepts a single starting URL and the total number of pages to crawl as an int.
import java.util.HashSet;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Crawler
{
    // extracts the links from a URL, puts them in a HashSet and then returns it
    static HashSet<String> links(String url)
    {
        HashSet<String> found = new HashSet<String>();
        try {
            Document doc = Jsoup.connect(url).get();
            Elements anchors = doc.select("a[href]");
            // "abs:href" resolves relative hrefs against the page's base URL
            for (Element link : anchors) found.add(link.attr("abs:href"));
        }
        catch (Exception e) {
            System.out.println(e);
        }
        return found;
    }

    // crawls up to n pages, starting from the links found on the given URL
    static void spider(String url, int n)
    {
        HashSet<String> alreadyVisited = new HashSet<String>();
        HashSet<String> toVisit = new HashSet<String>();
        toVisit.addAll(links(url));
        alreadyVisited.add(url);
        int i = 0;
        while (i < n && !toVisit.isEmpty())
        {
            // HashSet has no ordering, so this picks an arbitrary URL from the frontier
            String next = toVisit.iterator().next();
            if (alreadyVisited.contains(next))
            {
                toVisit.remove(next); // already fetched, drop it from the frontier
            }
            else
            {
                System.out.println(next);
                toVisit.addAll(links(next)); // queue the links found on this page
                alreadyVisited.add(next);
                System.out.printf("observed: %d visited: %d \n", toVisit.size(), alreadyVisited.size());
                i++;
            }
        }
    }

    public static void main(String[] args)
    {
        String url = args[0];
        spider(url, Integer.parseInt(args[1]));
    }
}
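
For a quick sanity check of the link extractor on its own, a minimal sketch along these lines can be used; the class name LinksDemo is hypothetical, and it assumes the compiled Crawler class and the Jsoup jar are both on the classpath.

// LinksDemo.java -- hypothetical helper: prints the absolute links that
// Crawler.links() extracts from a single page, without starting a full crawl.
public class LinksDemo
{
    public static void main(String[] args)
    {
        // the fallback URL here is only an example; pass a real page as the first argument
        String start = args.length > 0 ? args[0] : "https://example.com";
        for (String href : Crawler.links(start))
        {
            System.out.println(href);
        }
    }
}

The crawler itself is started the same way, passing the starting URL and the page count as the two command-line arguments, with the Jsoup jar on the classpath.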