Extra credit stub for Monday, February 9th
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.util.ArrayList;
import java.util.NoSuchElementException;
import java.util.Scanner;

/**
 * Crawls the web.
 */
public class Spider {
  public static void main(String[] args) {
    WebCrawler crawler = new WebCrawler("http://www.google.com", 3);
    crawler.crawl();
  }
}

class WebCrawler {
  // List of URLs to crawl. This only contains startingUrl when created; more URLs are added by
  // addUrls().
  private ArrayList<String> urls;

  // How many links to follow.
  private int depth;

  // Used to give unique names to downloaded files.
  private int counter;

  public WebCrawler(String startingUrl, int depth) {
    this.depth = depth;
    this.urls = new ArrayList<String>();
    urls.add(startingUrl);
    this.counter = 0;
  }
  public void crawl() {
    // TODO: fill in. See the sketch below the file for one possible approach.
  }
  /**
   * Downloads the page at a given URL. Returns the local filename that the page was saved under.
   * Returns null if there was an error downloading the file.
   *
   * The downloaded files are saved in the top-level folder of your project (i.e., as siblings to
   * the src/ directory).
   */
  private String downloadPage(String url) {
    String filename = "page" + counter + ".html";
    counter = counter + 1;
    try {
      URL website = new URL(url);
      ReadableByteChannel rbc = Channels.newChannel(website.openStream());
      FileOutputStream fos = new FileOutputStream(filename);
      System.out.println("At depth " + depth + ", downloading " + url);
      // Download the page at the given URL to the file named above (page0.html, page1.html, ...).
      fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
      fos.close();
    } catch (IOException e) {
      System.out.println("Failed to download URL " + url + ": " + e.getMessage());
      return null;
    }
    return filename;
  }
  /**
   * Adds any URLs found in the given file to the list of URLs to crawl.
   */
  private void addUrls(String filename) {
    // Read the contents of the file into a String.
    String content;
    try {
      content = new Scanner(new File(filename)).useDelimiter("\\Z").next();
    } catch (FileNotFoundException e) {
      System.out.println("Could not read file " + filename + ": " + e.getMessage());
      return;
    } catch (NoSuchElementException e) {
      System.out.println(filename + " was empty, skipping (" + e.getMessage() + ")");
      return;
    }

    // Look for quoted absolute URLs (e.g. href="http://...") and add each one to the list.
    int urlPosition = content.indexOf("\"http://");
    while (urlPosition >= 0) {
      // The URL runs from just after the opening quote up to the closing quote.
      int endOfUrlPosition = content.indexOf("\"", urlPosition + 1);
      String nextUrl = content.substring(urlPosition + 1, endOfUrlPosition);
      urls.add(nextUrl);
      // Continue scanning the remainder of the file after this URL.
      content = content.substring(endOfUrlPosition + 1);
      urlPosition = content.indexOf("\"http://");
    }
  }
}
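For reference, here is one possible way the crawl() stub could be filled in. This is only a sketch under an assumed interpretation of depth (the number of link-following rounds), not the assignment's intended solution: each round downloads every URL currently in the list with downloadPage(), then refills the list with the URLs harvested from those pages via addUrls().

  // A possible sketch only; the intended semantics of "depth" may differ.
  public void crawl() {
    for (int level = 0; level < depth; level++) {
      // Snapshot the current frontier; addUrls() will refill "urls" with newly found links.
      ArrayList<String> frontier = new ArrayList<String>(urls);
      urls.clear();
      for (String url : frontier) {
        String filename = downloadPage(url);
        if (filename != null) {
          addUrls(filename);
        }
      }
    }
  }

A real crawler would also remember which URLs it has already visited (for example in a HashSet<String>) so the same page is not downloaded twice; that bookkeeping is omitted here to keep the sketch short.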