Extra credit stub for Monday, February 9th
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.util.ArrayList;
import java.util.NoSuchElementException;
import java.util.Scanner;
/**
 * Crawls the web.
 */
public class Spider {
  public static void main(String[] args) {
    WebCrawler crawler = new WebCrawler("http://www.google.com", 3);
    crawler.crawl();
  }
}
class WebCrawler {
  // List of URLs to crawl. This only contains startingUrl when created; more URLs are added by
  // addUrls().
  private ArrayList<String> urls;
  // How many levels of links to follow.
  private int depth;
  // Used to give unique names to downloaded files.
  private int counter;

  public WebCrawler(String startingUrl, int depth) {
    this.depth = depth;
    this.urls = new ArrayList<String>();
    urls.add(startingUrl);
    this.counter = 0;
  }

  public void crawl() {
    // TODO: fill in. (One possible approach is sketched after this class.)
  }

  /**
   * Downloads the page at the given URL. Returns the local filename that the page was saved
   * under, or null if there was an error downloading the file.
   *
   * The downloaded files are saved in the top-level folder of your project (i.e., as siblings to
   * the src/ directory).
   */
  private String downloadPage(String url) {
    String filename = "page" + counter + ".html";
    counter = counter + 1;
    try {
      URL website = new URL(url);
      System.out.println("At depth " + depth + ", downloading " + url);
      // try-with-resources guarantees the channel and stream are closed even if the
      // download fails partway through.
      try (ReadableByteChannel rbc = Channels.newChannel(website.openStream());
           FileOutputStream fos = new FileOutputStream(filename)) {
        // Copy the entire page into the numbered pageN.html file.
        fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
      }
    } catch (IOException e) {
      System.out.println("Failed to download URL " + url + ": " + e.getMessage());
      return null;
    }
    return filename;
  }

  /**
   * Adds any URLs found in the given file to the list of URLs to crawl.
   */
  private void addUrls(String filename) {
    // Read the contents of the file into a single String ("\\Z" matches the end of input, so
    // next() returns the whole file at once).
    String content;
    try (Scanner scanner = new Scanner(new File(filename))) {
      content = scanner.useDelimiter("\\Z").next();
    } catch (FileNotFoundException e) {
      System.out.println("Could not read file " + filename + ": " + e.getMessage());
      return;
    } catch (NoSuchElementException e) {
      System.out.println(filename + " was empty, skipping (" + e.getMessage() + ")");
      return;
    }
    // Look for quoted absolute URLs, e.g. href="http://...", and queue each one.
    int urlPosition = content.indexOf("\"http://");
    while (urlPosition >= 0) {
      int endOfUrlPosition = content.indexOf("\"", urlPosition + 1);
      if (endOfUrlPosition < 0) {
        break;  // Unterminated quote; no more complete URLs in this file.
      }
      String nextUrl = content.substring(urlPosition + 1, endOfUrlPosition);
      urls.add(nextUrl);
      content = content.substring(endOfUrlPosition + 1);
      urlPosition = content.indexOf("\"http://");
    }
  }
}
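
For reference, here is one possible shape for crawl(). This is a minimal sketch rather than the assignment's intended answer, and it assumes that depth means how many levels of links to follow from the starting page. It uses only the fields and helpers already defined above (urls, depth, downloadPage(), addUrls()).

public void crawl() {
  // Process the URL list one level at a time: every URL currently queued belongs
  // to the current level, and the links harvested from those pages (via addUrls())
  // form the next level.
  for (int level = 0; level < depth; level++) {
    ArrayList<String> currentLevel = urls;
    urls = new ArrayList<String>();  // addUrls() fills this with the next level's links.
    for (String url : currentLevel) {
      String filename = downloadPage(url);
      if (filename != null) {
        addUrls(filename);
      }
    }
  }
}

A real crawler would also remember which URLs it has already visited so it never downloads the same page twice; that bookkeeping is left out to keep the sketch short.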